diff options
author | Peng Wu <alexepico@gmail.com> | 2015-04-16 13:54:15 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2015-04-16 13:54:15 +0800 |
commit | dcabdc5b28f8cac72ac57f8d70590b79e321c2f5 (patch) | |
tree | 5e753bd368ea1c33098582cd19afbbe7506a9589 /src | |
parent | b54e2c1991d62f852f42e7689c6c156e4ca0cf47 (diff) | |
download | libpinyin-dcabdc5b28f8cac72ac57f8d70590b79e321c2f5.tar.gz libpinyin-dcabdc5b28f8cac72ac57f8d70590b79e321c2f5.tar.xz libpinyin-dcabdc5b28f8cac72ac57f8d70590b79e321c2f5.zip |
add flexible_single_gram.h
Diffstat (limited to 'src')
-rw-r--r-- | src/storage/flexible_ngram.h | 298 | ||||
-rw-r--r-- | src/storage/flexible_single_gram.h | 323 |
2 files changed, 324 insertions, 297 deletions
diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h index 9589d45..dc32dae 100644 --- a/src/storage/flexible_ngram.h +++ b/src/storage/flexible_ngram.h @@ -29,303 +29,7 @@ * struct MagicHeader, ArrayHeader, ArrayItem. */ -namespace pinyin{ - -typedef GArray * FlexibleBigramPhraseArray; - -/** - * FlexibleSingleGram: - * @ArrayHeader: the struct ArrayHeader. - * @ArrayItem: the struct ArrayItem. - * - * The flexible single gram is mainly used for training purpose. - * - */ - -template<typename ArrayHeader, typename ArrayItem> -class FlexibleSingleGram{ - template<typename MH, typename AH, - typename AI> - friend class FlexibleBigram; -private: - MemoryChunk m_chunk; - FlexibleSingleGram(void * buffer, size_t length){ - m_chunk.set_chunk(buffer, length, NULL); - } -public: - /** - * ArrayItemWithToken: - * - * Define the struct ArrayItemWithToken type. - * - */ - typedef struct{ - phrase_token_t m_token; - ArrayItem m_item; - } ArrayItemWithToken; - -private: - static bool token_less_than(const ArrayItemWithToken & lhs, - const ArrayItemWithToken & rhs){ - return lhs.m_token < rhs.m_token; - } - -public: - /** - * FlexibleSingleGram::FlexibleSingleGram: - * - * The constructor of the FlexibleSingleGram. - * - */ - FlexibleSingleGram(){ - m_chunk.set_size(sizeof(ArrayHeader)); - memset(m_chunk.begin(), 0, sizeof(ArrayHeader)); - } - - /** - * FlexibleSingleGram::retrieve_all: - * @array: the array to store all items in this single gram. - * @returns: whether the retrieve operation is successful. - * - * Retrieve all items in this single gram. - * - */ - bool retrieve_all(/* out */ FlexibleBigramPhraseArray array){ - const ArrayItemWithToken * begin = (const ArrayItemWithToken *) - ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); - const ArrayItemWithToken * end = (const ArrayItemWithToken *) - m_chunk.end(); - - ArrayItemWithToken item; - for ( const ArrayItemWithToken * cur_item = begin; - cur_item != end; - ++cur_item){ - /* Note: optimize this with g_array_append_vals? */ - item.m_token = cur_item->m_token; - item.m_item = cur_item->m_item; - g_array_append_val(array, item); - } - - return true; - } - - /** - * FlexibleSingleGram::search: - * @range: the token range. - * @array: the array to store the array items with token in the range. - * @returns: whether the search operation is successful. - * - * Search the array items with token in the range. - * - * Note: The array result may contain many items. - * - */ - bool search(/* in */ PhraseIndexRange * range, - /* out */ FlexibleBigramPhraseArray array){ - const ArrayItemWithToken * begin = (const ArrayItemWithToken *) - ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); - const ArrayItemWithToken * end = (const ArrayItemWithToken *) - m_chunk.end(); - - ArrayItemWithToken compare_item; - compare_item.m_token = range->m_range_begin; - const ArrayItemWithToken * cur_item = std_lite::lower_bound - (begin, end, compare_item, token_less_than); - - ArrayItemWithToken item; - for ( ; cur_item != end; ++cur_item){ - if ( cur_item->m_token >= range->m_range_end ) - break; - item.m_token = cur_item->m_token; - item.m_item = cur_item->m_item; - g_array_append_val(array, item); - } - - return true; - } - - /** - * FlexibleSingleGram::insert_array_item: - * @token: the phrase token to be inserted. - * @item: the array item of this token. - * @returns: whether the insert operation is successful. - * - * Insert the array item of the token. - * - */ - bool insert_array_item(/* in */ phrase_token_t token, - /* in */ const ArrayItem & item){ - ArrayItemWithToken * begin = (ArrayItemWithToken *) - ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); - ArrayItemWithToken * end = (ArrayItemWithToken *) - m_chunk.end(); - - ArrayItemWithToken compare_item; - compare_item.m_token = token; - ArrayItemWithToken * cur_item = std_lite::lower_bound - (begin, end, compare_item, token_less_than); - - ArrayItemWithToken insert_item; - insert_item.m_token = token; - insert_item.m_item = item; - - for ( ; cur_item != end; ++cur_item ){ - if ( cur_item->m_token > token ){ - size_t offset = sizeof(ArrayHeader) + - sizeof(ArrayItemWithToken) * (cur_item - begin); - m_chunk.insert_content(offset, &insert_item, - sizeof(ArrayItemWithToken)); - return true; - } - if ( cur_item->m_token == token ){ - return false; - } - } - m_chunk.insert_content(m_chunk.size(), &insert_item, - sizeof(ArrayItemWithToken)); - return true; - } - - /** - * FlexibleSingleGram::remove_array_item: - * @token: the phrase token to be removed. - * @item: the content of the removed array item. - * @returns: whether the remove operation is successful. - * - * Remove the array item of the token. - * - */ - bool remove_array_item(/* in */ phrase_token_t token, - /* out */ ArrayItem & item) - { - /* clear retval */ - memset(&item, 0, sizeof(ArrayItem)); - - const ArrayItemWithToken * begin = (const ArrayItemWithToken *) - ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); - const ArrayItemWithToken * end = (const ArrayItemWithToken *) - m_chunk.end(); - - ArrayItemWithToken compare_item; - compare_item.m_token = token; - const ArrayItemWithToken * cur_item = std_lite::lower_bound - (begin, end, compare_item, token_less_than); - - for ( ; cur_item != end; ++cur_item){ - if ( cur_item->m_token > token ) - return false; - if ( cur_item->m_token == token ){ - memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem)); - size_t offset = sizeof(ArrayHeader) + - sizeof(ArrayItemWithToken) * (cur_item - begin); - m_chunk.remove_content(offset, sizeof(ArrayItemWithToken)); - return true; - } - } - return false; - } - - /** - * FlexibleSingleGram::get_array_item: - * @token: the phrase token. - * @item: the array item of the token. - * @returns: whether the get operation is successful. - * - * Get the array item of the token. - * - */ - bool get_array_item(/* in */ phrase_token_t token, - /* out */ ArrayItem & item) - { - /* clear retval */ - memset(&item, 0, sizeof(ArrayItem)); - - const ArrayItemWithToken * begin = (const ArrayItemWithToken *) - ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); - const ArrayItemWithToken * end = (const ArrayItemWithToken *) - m_chunk.end(); - - ArrayItemWithToken compare_item; - compare_item.m_token = token; - const ArrayItemWithToken * cur_item = std_lite::lower_bound - (begin, end, compare_item, token_less_than); - - for ( ; cur_item != end; ++cur_item){ - if ( cur_item->m_token > token ) - return false; - if ( cur_item->m_token == token ){ - memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem)); - return true; - } - } - return false; - } - - /** - * FlexibleSingleGram::set_array_item: - * @token: the phrase token. - * @item: the array item of the token. - * @returns: whether the set operation is successful. - * - * Set the array item of the token. - * - */ - bool set_array_item(/* in */ phrase_token_t token, - /* in */ const ArrayItem & item){ - ArrayItemWithToken * begin = (ArrayItemWithToken *) - ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); - ArrayItemWithToken * end = (ArrayItemWithToken *) - m_chunk.end(); - - ArrayItemWithToken compare_item; - compare_item.m_token = token; - ArrayItemWithToken * cur_item = std_lite::lower_bound - (begin, end, compare_item, token_less_than); - - for ( ; cur_item != end; ++cur_item ){ - if ( cur_item->m_token > token ){ - return false; - } - if ( cur_item->m_token == token ){ - memcpy(&(cur_item->m_item), &item, sizeof(ArrayItem)); - return true; - } - } - return false; - } - - /** - * FlexibleSingleGram::get_array_header: - * @header: the array header of this single gram. - * @returns: whether the get operation is successful. - * - * Get the array header of this single gram. - * - */ - bool get_array_header(/* out */ ArrayHeader & header){ - /* clear retval */ - memset(&header, 0, sizeof(ArrayHeader)); - char * buf_begin = (char *)m_chunk.begin(); - memcpy(&header, buf_begin, sizeof(ArrayHeader)); - return true; - } - - /** - * FlexibleSingleGram::set_array_header: - * @header: the array header of this single gram. - * @returns: whether the set operation is successful. - * - * Set the array header of this single gram. - * - */ - bool set_array_header(/* in */ const ArrayHeader & header){ - char * buf_begin = (char *)m_chunk.begin(); - memcpy(buf_begin, &header, sizeof(ArrayHeader)); - return true; - } -}; - -}; +#include "flexible_single_gram.h" #ifdef HAVE_BERKELEY_DB #include "flexible_ngram_bdb.h" diff --git a/src/storage/flexible_single_gram.h b/src/storage/flexible_single_gram.h new file mode 100644 index 0000000..ded2c50 --- /dev/null +++ b/src/storage/flexible_single_gram.h @@ -0,0 +1,323 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2015 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef FLEXIBLE_SINGLE_GRAM_H +#define FLEXIBLE_SINGLE_GRAM_H + +namespace pinyin{ + +typedef GArray * FlexibleBigramPhraseArray; + +/** + * FlexibleSingleGram: + * @ArrayHeader: the struct ArrayHeader. + * @ArrayItem: the struct ArrayItem. + * + * The flexible single gram is mainly used for training purpose. + * + */ + +template<typename ArrayHeader, typename ArrayItem> +class FlexibleSingleGram{ + template<typename MH, typename AH, + typename AI> + friend class FlexibleBigram; +private: + MemoryChunk m_chunk; + FlexibleSingleGram(void * buffer, size_t length){ + m_chunk.set_chunk(buffer, length, NULL); + } +public: + /** + * ArrayItemWithToken: + * + * Define the struct ArrayItemWithToken type. + * + */ + typedef struct{ + phrase_token_t m_token; + ArrayItem m_item; + } ArrayItemWithToken; + +private: + static bool token_less_than(const ArrayItemWithToken & lhs, + const ArrayItemWithToken & rhs){ + return lhs.m_token < rhs.m_token; + } + +public: + /** + * FlexibleSingleGram::FlexibleSingleGram: + * + * The constructor of the FlexibleSingleGram. + * + */ + FlexibleSingleGram(){ + m_chunk.set_size(sizeof(ArrayHeader)); + memset(m_chunk.begin(), 0, sizeof(ArrayHeader)); + } + + /** + * FlexibleSingleGram::retrieve_all: + * @array: the array to store all items in this single gram. + * @returns: whether the retrieve operation is successful. + * + * Retrieve all items in this single gram. + * + */ + bool retrieve_all(/* out */ FlexibleBigramPhraseArray array){ + const ArrayItemWithToken * begin = (const ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + const ArrayItemWithToken * end = (const ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken item; + for ( const ArrayItemWithToken * cur_item = begin; + cur_item != end; + ++cur_item){ + /* Note: optimize this with g_array_append_vals? */ + item.m_token = cur_item->m_token; + item.m_item = cur_item->m_item; + g_array_append_val(array, item); + } + + return true; + } + + /** + * FlexibleSingleGram::search: + * @range: the token range. + * @array: the array to store the array items with token in the range. + * @returns: whether the search operation is successful. + * + * Search the array items with token in the range. + * + * Note: The array result may contain many items. + * + */ + bool search(/* in */ PhraseIndexRange * range, + /* out */ FlexibleBigramPhraseArray array){ + const ArrayItemWithToken * begin = (const ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + const ArrayItemWithToken * end = (const ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = range->m_range_begin; + const ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + ArrayItemWithToken item; + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token >= range->m_range_end ) + break; + item.m_token = cur_item->m_token; + item.m_item = cur_item->m_item; + g_array_append_val(array, item); + } + + return true; + } + + /** + * FlexibleSingleGram::insert_array_item: + * @token: the phrase token to be inserted. + * @item: the array item of this token. + * @returns: whether the insert operation is successful. + * + * Insert the array item of the token. + * + */ + bool insert_array_item(/* in */ phrase_token_t token, + /* in */ const ArrayItem & item){ + ArrayItemWithToken * begin = (ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + ArrayItemWithToken * end = (ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = token; + ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + ArrayItemWithToken insert_item; + insert_item.m_token = token; + insert_item.m_item = item; + + for ( ; cur_item != end; ++cur_item ){ + if ( cur_item->m_token > token ){ + size_t offset = sizeof(ArrayHeader) + + sizeof(ArrayItemWithToken) * (cur_item - begin); + m_chunk.insert_content(offset, &insert_item, + sizeof(ArrayItemWithToken)); + return true; + } + if ( cur_item->m_token == token ){ + return false; + } + } + m_chunk.insert_content(m_chunk.size(), &insert_item, + sizeof(ArrayItemWithToken)); + return true; + } + + /** + * FlexibleSingleGram::remove_array_item: + * @token: the phrase token to be removed. + * @item: the content of the removed array item. + * @returns: whether the remove operation is successful. + * + * Remove the array item of the token. + * + */ + bool remove_array_item(/* in */ phrase_token_t token, + /* out */ ArrayItem & item) + { + /* clear retval */ + memset(&item, 0, sizeof(ArrayItem)); + + const ArrayItemWithToken * begin = (const ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + const ArrayItemWithToken * end = (const ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = token; + const ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem)); + size_t offset = sizeof(ArrayHeader) + + sizeof(ArrayItemWithToken) * (cur_item - begin); + m_chunk.remove_content(offset, sizeof(ArrayItemWithToken)); + return true; + } + } + return false; + } + + /** + * FlexibleSingleGram::get_array_item: + * @token: the phrase token. + * @item: the array item of the token. + * @returns: whether the get operation is successful. + * + * Get the array item of the token. + * + */ + bool get_array_item(/* in */ phrase_token_t token, + /* out */ ArrayItem & item) + { + /* clear retval */ + memset(&item, 0, sizeof(ArrayItem)); + + const ArrayItemWithToken * begin = (const ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + const ArrayItemWithToken * end = (const ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = token; + const ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem)); + return true; + } + } + return false; + } + + /** + * FlexibleSingleGram::set_array_item: + * @token: the phrase token. + * @item: the array item of the token. + * @returns: whether the set operation is successful. + * + * Set the array item of the token. + * + */ + bool set_array_item(/* in */ phrase_token_t token, + /* in */ const ArrayItem & item){ + ArrayItemWithToken * begin = (ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + ArrayItemWithToken * end = (ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = token; + ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item ){ + if ( cur_item->m_token > token ){ + return false; + } + if ( cur_item->m_token == token ){ + memcpy(&(cur_item->m_item), &item, sizeof(ArrayItem)); + return true; + } + } + return false; + } + + /** + * FlexibleSingleGram::get_array_header: + * @header: the array header of this single gram. + * @returns: whether the get operation is successful. + * + * Get the array header of this single gram. + * + */ + bool get_array_header(/* out */ ArrayHeader & header){ + /* clear retval */ + memset(&header, 0, sizeof(ArrayHeader)); + char * buf_begin = (char *)m_chunk.begin(); + memcpy(&header, buf_begin, sizeof(ArrayHeader)); + return true; + } + + /** + * FlexibleSingleGram::set_array_header: + * @header: the array header of this single gram. + * @returns: whether the set operation is successful. + * + * Set the array header of this single gram. + * + */ + bool set_array_header(/* in */ const ArrayHeader & header){ + char * buf_begin = (char *)m_chunk.begin(); + memcpy(buf_begin, &header, sizeof(ArrayHeader)); + return true; + } +}; + +}; + +#endif |