diff options
Diffstat (limited to 'src/storage/chewing_large_table.cpp')
-rw-r--r-- | src/storage/chewing_large_table.cpp | 1047 |
1 files changed, 1047 insertions, 0 deletions
diff --git a/src/storage/chewing_large_table.cpp b/src/storage/chewing_large_table.cpp new file mode 100644 index 0000000..2eb8658 --- /dev/null +++ b/src/storage/chewing_large_table.cpp @@ -0,0 +1,1047 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "chewing_large_table.h" +#include <assert.h> +#include "pinyin_phrase2.h" +#include "pinyin_parser2.h" + + +/* internal class definition */ + +namespace pinyin{ +class ChewingLengthIndexLevel{ + +protected: + GArray * m_chewing_array_indexes; + +public: + /* constructor/destructor */ + ChewingLengthIndexLevel(); + ~ChewingLengthIndexLevel(); + + /* load/store method */ + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, + table_offset_t & end); + + /* search method */ + int search(pinyin_option_t options, int phrase_length, + /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const; + + /* add/remove index method */ + int add_index(int phrase_length, /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token); + int remove_index(int phrase_length, /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token); + + /* get length method */ + int get_length() const; + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + + +template<size_t phrase_length> +class ChewingArrayIndexLevel{ +protected: + typedef PinyinIndexItem2<phrase_length> IndexItem; + +protected: + MemoryChunk m_chunk; + + /* compress consecutive tokens */ + int convert(pinyin_option_t options, + const ChewingKey keys[], + IndexItem * begin, + IndexItem * end, + PhraseIndexRanges ranges) const; + +public: + /* load/store method */ + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, + table_offset_t & end); + + /* search method */ + int search(pinyin_option_t options, /* in */const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const; + + /* add/remove index method */ + int add_index(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token); + int remove_index(/* in */ const ChewingKey keys[], + /* in */ phrase_token_t token); + + /* get length method */ + int get_length() const; + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + +}; + + +using namespace pinyin; + +/* class implementation */ + +ChewingBitmapIndexLevel::ChewingBitmapIndexLevel(pinyin_option_t options) + : m_options(options) { + memset(m_chewing_length_indexes, 0, sizeof(m_chewing_length_indexes)); +} + +void ChewingBitmapIndexLevel::reset() { + for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k) + for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l) + for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m) + for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES; + ++n) { + ChewingLengthIndexLevel * & length_array = + m_chewing_length_indexes[k][l][m][n]; + if (length_array) + delete length_array; + length_array = NULL; + } +} + + +/* search method */ + +int ChewingBitmapIndexLevel::search(int phrase_length, + /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + assert(phrase_length > 0); + return initial_level_search(phrase_length, keys, ranges); +} + +int ChewingBitmapIndexLevel::initial_level_search (int phrase_length, + /* in */ const ChewingKey keys[], /* out */ PhraseIndexRanges ranges) const { + +/* macros */ +#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \ + { \ + result |= middle_and_final_level_search(ORIGIN, phrase_length, \ + keys, ranges); \ + if (m_options & AMBIGUITY) { \ + result |= middle_and_final_level_search(ANOTHER, \ + phrase_length, \ + keys, ranges); \ + } \ + return result; \ + } + + /* deal with ambiguities */ + int result = SEARCH_NONE; + const ChewingKey & first_key = keys[0]; + + switch(first_key.m_initial) { + MATCH(PINYIN_AMB_C_CH, CHEWING_C, CHEWING_CH); + MATCH(PINYIN_AMB_C_CH, CHEWING_CH, CHEWING_C); + MATCH(PINYIN_AMB_Z_ZH, CHEWING_Z, CHEWING_ZH); + MATCH(PINYIN_AMB_Z_ZH, CHEWING_ZH, CHEWING_Z); + MATCH(PINYIN_AMB_S_SH, CHEWING_S, CHEWING_SH); + MATCH(PINYIN_AMB_S_SH, CHEWING_SH, CHEWING_S); + MATCH(PINYIN_AMB_L_R, CHEWING_R, CHEWING_L); + MATCH(PINYIN_AMB_L_N, CHEWING_N, CHEWING_L); + MATCH(PINYIN_AMB_F_H, CHEWING_F, CHEWING_H); + MATCH(PINYIN_AMB_F_H, CHEWING_H, CHEWING_F); + MATCH(PINYIN_AMB_G_K, CHEWING_G, CHEWING_K); + MATCH(PINYIN_AMB_G_K, CHEWING_K, CHEWING_G); + + case CHEWING_L: + { + result |= middle_and_final_level_search + (CHEWING_L, phrase_length, keys, ranges); + + if (m_options & PINYIN_AMB_L_N) + result |= middle_and_final_level_search + (CHEWING_N, phrase_length, keys,ranges); + + if (m_options & PINYIN_AMB_L_R) + result |= middle_and_final_level_search + (CHEWING_R, phrase_length, keys, ranges); + return result; + } + default: + { + result |= middle_and_final_level_search + ((ChewingInitial) first_key.m_initial, + phrase_length, keys, ranges); + return result; + } + } +#undef MATCH + return result; +} + + +int ChewingBitmapIndexLevel::middle_and_final_level_search +(ChewingInitial initial, int phrase_length, /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + +/* macros */ +#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \ + { \ + result = tone_level_search \ + (initial, middle, \ + ORIGIN, phrase_length, keys, ranges); \ + if (m_options & AMBIGUITY) { \ + result |= tone_level_search \ + (initial, middle, \ + ANOTHER, phrase_length, keys, ranges); \ + } \ + return result; \ + } + + int result = SEARCH_NONE; + const ChewingKey & first_key = keys[0]; + const ChewingMiddle middle = (ChewingMiddle)first_key.m_middle; + + switch(first_key.m_final) { + case CHEWING_ZERO_FINAL: + { + if (middle == CHEWING_ZERO_MIDDLE) { /* in-complete pinyin */ + if (!(m_options & PINYIN_INCOMPLETE)) + return result; + for (int m = CHEWING_ZERO_MIDDLE; + m < CHEWING_NUMBER_OF_MIDDLES; ++m) + for (int n = CHEWING_ZERO_FINAL; + n < CHEWING_NUMBER_OF_FINALS; ++n) { + + if (CHEWING_ZERO_MIDDLE == m && + CHEWING_ZERO_FINAL == n) + continue; + + result |= tone_level_search + (initial, (ChewingMiddle) m, (ChewingFinal) n, + phrase_length, keys, ranges); + } + return result; + } else { /* normal pinyin */ + result |= tone_level_search + (initial, middle, CHEWING_ZERO_FINAL, + phrase_length, keys, ranges); + return result; + } + } + + MATCH(PINYIN_AMB_AN_ANG, CHEWING_AN, CHEWING_ANG); + MATCH(PINYIN_AMB_AN_ANG, CHEWING_ANG, CHEWING_AN); + MATCH(PINYIN_AMB_EN_ENG, CHEWING_EN, CHEWING_ENG); + MATCH(PINYIN_AMB_EN_ENG, CHEWING_ENG, CHEWING_EN); + MATCH(PINYIN_AMB_IN_ING, PINYIN_IN, PINYIN_ING); + MATCH(PINYIN_AMB_IN_ING, PINYIN_ING, PINYIN_IN); + + default: + { + result |= tone_level_search + (initial, middle, (ChewingFinal) first_key.m_final, + phrase_length, keys, ranges); + return result; + } + } +#undef MATCH + return result; +} + + +int ChewingBitmapIndexLevel::tone_level_search +(ChewingInitial initial, ChewingMiddle middle, ChewingFinal final, + int phrase_length, /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + + int result = SEARCH_NONE; + const ChewingKey & first_key = keys[0]; + + switch (first_key.m_tone) { + case CHEWING_ZERO_TONE: + { + /* deal with zero tone in chewing large table. */ + for (int i = CHEWING_ZERO_TONE; i < CHEWING_NUMBER_OF_TONES; ++i) { + ChewingLengthIndexLevel * phrases = + m_chewing_length_indexes + [initial][middle][final][(ChewingTone)i]; + if (phrases) + result |= phrases->search + (m_options, phrase_length - 1, keys + 1, ranges); + } + return result; + } + default: + { + ChewingLengthIndexLevel * phrases = + m_chewing_length_indexes + [initial][middle][final][CHEWING_ZERO_TONE]; + if (phrases) + result |= phrases->search + (m_options, phrase_length - 1, keys + 1, ranges); + + phrases = m_chewing_length_indexes + [initial][middle][final][(ChewingTone) first_key.m_tone]; + if (phrases) + result |= phrases->search + (m_options, phrase_length - 1, keys + 1, ranges); + return result; + } + } + return result; +} + + +ChewingLengthIndexLevel::ChewingLengthIndexLevel() { + m_chewing_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *)); +} + +ChewingLengthIndexLevel::~ChewingLengthIndexLevel() { +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \ + if (array) \ + delete array; \ + array = NULL; \ + break; \ + } + + for (guint i = 0; i < m_chewing_array_indexes->len; ++i) { + switch (i){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + } +#undef CASE + g_array_free(m_chewing_array_indexes, TRUE); +} + + +int ChewingLengthIndexLevel::search(pinyin_option_t options, int phrase_length, + /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + int result = SEARCH_NONE; + if (m_chewing_array_indexes->len < phrase_length + 1) + return result; + if (m_chewing_array_indexes->len > phrase_length + 1) + result |= SEARCH_CONTINUED; + +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \ + if (!array) \ + return result; \ + result |= array->search(options, keys, ranges); \ + return result; \ + } + + switch (phrase_length) { + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + +#undef CASE +} + + +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::search +(pinyin_option_t options, /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + IndexItem * chunk_begin = NULL, * chunk_end = NULL; + chunk_begin = (IndexItem *) m_chunk.begin(); + chunk_end = (IndexItem *) m_chunk.end(); + + /* do the search */ + ChewingKey left_keys[phrase_length], right_keys[phrase_length]; + compute_lower_value2(options, keys, left_keys, phrase_length); + compute_upper_value2(options, keys, right_keys, phrase_length); + + IndexItem left(left_keys, -1), right(right_keys, -1); + + IndexItem * begin = std_lite::lower_bound + (chunk_begin, chunk_end, left, + phrase_exact_less_than2<phrase_length>); + IndexItem * end = std_lite::upper_bound + (chunk_begin, chunk_end, right, + phrase_exact_less_than2<phrase_length>); + + return convert(options, keys, begin, end, ranges); +} + +/* compress consecutive tokens */ +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::convert +(pinyin_option_t options, const ChewingKey keys[], + IndexItem * begin, IndexItem * end, + PhraseIndexRanges ranges) const { + IndexItem * iter = NULL; + PhraseIndexRange cursor; + GArray * head, * cursor_head = NULL; + + int result = SEARCH_NONE; + /* TODO: check the below code */ + cursor.m_range_begin = null_token; cursor.m_range_end = null_token; + for (iter = begin; iter != end; ++iter) { + if (0 != pinyin_compare_with_ambiguities2 + (options, keys, iter->m_keys, phrase_length)) + continue; + + phrase_token_t token = iter->m_token; + head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)]; + if (NULL == head) + continue; + + result |= SEARCH_OK; + + if (null_token == cursor.m_range_begin) { + cursor.m_range_begin = token; + cursor.m_range_end = token + 1; + cursor_head = head; + } else if (cursor.m_range_end == token && + PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_begin) == + PHRASE_INDEX_LIBRARY_INDEX(token)) { + ++cursor.m_range_end; + } else { + g_array_append_val(cursor_head, cursor); + cursor.m_range_begin = token; cursor.m_range_end = token + 1; + cursor_head = head; + } + } + + if (null_token == cursor.m_range_begin) + return result; + + g_array_append_val(cursor_head, cursor); + return result; +} + + +/* add/remove index method */ + +int ChewingBitmapIndexLevel::add_index(int phrase_length, + /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + const ChewingKey first_key = keys[0]; + ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes + [first_key.m_initial][first_key.m_middle] + [first_key.m_final][first_key.m_tone]; + + if (NULL == length_array) { + length_array = new ChewingLengthIndexLevel(); + } + + return length_array->add_index(phrase_length - 1, keys + 1, token); +} + +int ChewingBitmapIndexLevel::remove_index(int phrase_length, + /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + const ChewingKey first_key = keys[0]; + ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes + [first_key.m_initial][first_key.m_middle] + [first_key.m_final][first_key.m_tone]; + + if (NULL == length_array) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + + int retval = length_array->remove_index(phrase_length - 1, keys + 1, token); + + /* remove empty array. */ + if (0 == length_array->get_length()) { + delete length_array; + length_array = NULL; + } + + return retval; +} + +int ChewingLengthIndexLevel::add_index(int phrase_length, + /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + if (!(phrase_length + 1 < MAX_PHRASE_LENGTH)) + return ERROR_PHRASE_TOO_LONG; + + if (m_chewing_array_indexes->len <= phrase_length) + g_array_set_size(m_chewing_array_indexes, phrase_length + 1); + +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, \ + ChewingArrayIndexLevel<len> *, len); \ + if (NULL == array) \ + array = new ChewingArrayIndexLevel<len>; \ + return array->add_index(keys, token); \ + } + + switch(phrase_length) { + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + +#undef CASE +} + +int ChewingLengthIndexLevel::remove_index(int phrase_length, + /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + if (!(phrase_length + 1 < MAX_PHRASE_LENGTH)) + return ERROR_PHRASE_TOO_LONG; + + if (m_chewing_array_indexes->len <= phrase_length) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, \ + ChewingArrayIndexLevel<len> *, len); \ + if (NULL == array) \ + return ERROR_REMOVE_ITEM_DONOT_EXISTS; \ + int retval = array->remove_index(keys, token); \ + \ + /* remove empty array. */ \ + if (0 == array->get_length()) { \ + delete array; \ + array = NULL; \ + \ + /* shrink self array. */ \ + g_array_set_size(m_chewing_array_indexes, \ + get_length()); \ + } \ + return retval; \ + } + + switch (phrase_length) { + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + +#undef CASE +} + +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::add_index +(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) { + IndexItem * begin, * end; + + IndexItem add_elem(keys, token); + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + std_lite::pair<IndexItem *, IndexItem *> range; + range = std_lite::equal_range + (begin, end, add_elem, phrase_exact_less_than2<phrase_length>); + + IndexItem * cur_elem; + for (cur_elem = range.first; + cur_elem != range.second; ++cur_elem) { + if (cur_elem->m_token == token) + return ERROR_INSERT_ITEM_EXISTS; + if (cur_elem->m_token > token) + break; + } + + int offset = (cur_elem - begin) * sizeof(IndexItem); + m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem)); + return ERROR_OK; +} + +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::remove_index +(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) { + IndexItem * begin, * end; + + IndexItem remove_elem(keys, token); + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + std_lite::pair<IndexItem *, IndexItem *> range; + range = std_lite::equal_range + (begin, end, remove_elem, phrase_exact_less_than2<phrase_length>); + + IndexItem * cur_elem; + for (cur_elem = range.first; + cur_elem != range.second; ++cur_elem) { + if (cur_elem->m_token == token) + break; + } + + if (cur_elem == range.second) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + + int offset = (cur_elem - begin) * sizeof(IndexItem); + m_chunk.remove_content(offset, sizeof(IndexItem)); + return ERROR_OK; +} + + +/* load text method */ +bool ChewingLargeTable::load_text(FILE * infile) { + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + + while (!feof(infile)) { + int num = fscanf(infile, "%s %s %u %ld", + pinyin, phrase, &token, &freq); + + if (4 != num) + continue; + + if(feof(infile)) + break; + + glong len = g_utf8_strlen(phrase, -1); + + FullPinyinParser2 parser; + ChewingKeyVector keys; + ChewingKeyRestVector key_rests; + + keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + pinyin_option_t options = USE_TONE; + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + + if (len != keys->len) { + fprintf(stderr, "ChewingLargeTable::load_text:%s\t%s\t%u\t%ld\n", + pinyin, phrase, token, freq); + continue; + } + + add_index(keys->len, (ChewingKey *)keys->data, token); + + g_array_free(keys, TRUE); + g_array_free(key_rests, TRUE); + } + + return true; +} + + +/* load/store method */ + +bool ChewingBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, + table_offset_t end) { + reset(); + char * begin = (char *) chunk->begin(); + table_offset_t phrase_begin, phrase_end; + table_offset_t * index = (table_offset_t *) (begin + offset); + phrase_end = *index; + + for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k) + for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l) + for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m) + for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) { + phrase_begin = phrase_end; + index++; + phrase_end = *index; + + if (phrase_begin == phrase_end) /* null pointer */ + continue; + + /* after reset() all phrases are null pointer. */ + ChewingLengthIndexLevel * phrases = new ChewingLengthIndexLevel; + m_chewing_length_indexes[k][l][m][n] = phrases; + + phrases->load(chunk, phrase_begin, phrase_end - 1); + assert(phrase_end <= end); + assert(*(begin + phrase_end - 1) == c_separate); + } + + offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t); + assert(c_separate == *(begin + offset)); + return true; +} + +bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & end) { + table_offset_t phrase_end; + table_offset_t index = offset; + offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t); + + /* add '#' */ + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + + for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k) + for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l) + for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m) + for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) { + ChewingLengthIndexLevel * phrases = + m_chewing_length_indexes[k][l][m][n]; + + if (NULL == phrases) { /* null pointer */ + new_chunk->set_content(index, &offset, + sizeof(table_offset_t)); + index += sizeof(table_offset_t); + continue; + } + + /* has a end '#' */ + phrases->store(new_chunk, offset, phrase_end); + offset = phrase_end; + + /* add '#' */ + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, + sizeof(table_offset_t)); + index += sizeof(table_offset_t); + } + + end = offset; + return true; +} + +bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, + table_offset_t end) { + char * begin = (char *) chunk->begin(); + guint32 nindex = *((guint32 *)(begin + offset)); /* number of index */ + table_offset_t * index = (table_offset_t *) + (begin + offset + sizeof(guint32)); + + table_offset_t phrase_begin, phrase_end = *index; + g_array_set_size(m_chewing_array_indexes, 0); + for (guint32 i = 0; i < nindex; ++i) { + phrase_begin = phrase_end; + index++; + phrase_end = *index; + + if (phrase_begin == phrase_end) { + void * null = NULL; + g_array_append_val(m_chewing_array_indexes, null); + continue; + } + +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * phrase = \ + new ChewingArrayIndexLevel<len>; \ + phrase->load(chunk, phrase_begin, phrase_end - 1); \ + assert(*(begin + phrase_end - 1) == c_separate); \ + assert(phrase_end <= end); \ + g_array_append_val(m_chewing_array_indexes, phrase); \ + break; \ + } + + switch ( i ){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + +#undef CASE + } + + /* check '#' */ + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + assert(c_separate == *(begin + offset)); + return true; +} + +bool ChewingLengthIndexLevel::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & end) { + guint32 nindex = m_chewing_array_indexes->len; /* number of index */ + new_chunk->set_content(offset, &nindex, sizeof(guint32)); + table_offset_t index = offset + sizeof(guint32); + + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + + table_offset_t phrase_end; + for (guint32 i = 0; i < nindex; ++i) { +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * phrase = g_array_index \ + (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \ + if (NULL == phrase) { \ + new_chunk->set_content \ + (index, &offset, sizeof(table_offset_t)); \ + index += sizeof(table_offset_t); \ + continue; \ + } \ + phrase->store(new_chunk, offset, phrase_end); \ + offset = phrase_end; \ + break; \ + } + + switch ( i ){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } +#undef CASE + + /* add '#' */ + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + } + + end = offset; + return true; +} + +template<size_t phrase_length> +bool ChewingArrayIndexLevel<phrase_length>:: +load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end) { + char * begin = (char *) chunk->begin(); + m_chunk.set_chunk(begin + offset, end - offset, NULL); + return true; +} + +template<size_t phrase_length> +bool ChewingArrayIndexLevel<phrase_length>:: +store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) { + new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size()); + end = offset + m_chunk.size(); + return true; +} + + +/* get length method */ + +int ChewingLengthIndexLevel::get_length() const { + int length = m_chewing_array_indexes->len; + + /* trim trailing zero. */ + for (int i = length - 1; i >= 0; --i) { + void * array = g_array_index(m_chewing_array_indexes, void *, i); + + if (NULL != array) + break; + + --length; + } + + return length; +} + +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::get_length() const { + IndexItem * chunk_begin = NULL, * chunk_end = NULL; + chunk_begin = (IndexItem *) m_chunk.begin(); + chunk_end = (IndexItem *) m_chunk.end(); + + return chunk_end - chunk_begin; +} + + +/* mask out method */ + +bool ChewingBitmapIndexLevel::mask_out(phrase_token_t mask, + phrase_token_t value) { + for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k) + for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l) + for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m) + for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES; + ++n) { + ChewingLengthIndexLevel * & length_array = + m_chewing_length_indexes[k][l][m][n]; + + if (NULL == length_array) + continue; + + length_array->mask_out(mask, value); + + if (0 == length_array->get_length()) { + delete length_array; + length_array = NULL; + } + } + return true; +} + +bool ChewingLengthIndexLevel::mask_out(phrase_token_t mask, + phrase_token_t value) { +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, \ + ChewingArrayIndexLevel<len> *, len); \ + \ + if (NULL == array) \ + continue; \ + \ + array->mask_out(mask, value); \ + \ + if (0 == array->get_length()) { \ + delete array; \ + array = NULL; \ + } \ + break; \ + } + + for (guint i = 0; i < m_chewing_array_indexes->len; ++i) { + switch (i){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + } +#undef CASE + g_array_set_size(m_chewing_array_indexes, get_length()); + return true; +} + +template<size_t phrase_length> +bool ChewingArrayIndexLevel<phrase_length>::mask_out +(phrase_token_t mask, phrase_token_t value) { + IndexItem * begin = NULL, * end = NULL; + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + for (IndexItem * cur = begin; cur != end; ++cur) { + if ((cur->m_token & mask) != value) + continue; + + int offset = (cur - begin) * sizeof(IndexItem); + m_chunk.remove_content(offset, sizeof(IndexItem)); + + /* update chunk end. */ + end = (IndexItem *) m_chunk.end(); + --cur; + } + + return true; +} |