From 0634ab933695ef0ecbbffd7495da351b8ea0022f Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Wed, 27 Jun 2012 15:53:05 +0800 Subject: write pinyin_iterator_add_phrase --- src/pinyin.cpp | 90 ++++++++++++++++++++++++++++++++++++- src/storage/phrase_index.cpp | 2 + tests/lookup/test_pinyin_lookup.cpp | 2 +- 3 files changed, 91 insertions(+), 3 deletions(-) diff --git a/src/pinyin.cpp b/src/pinyin.cpp index 3ea6f08..7550368 100644 --- a/src/pinyin.cpp +++ b/src/pinyin.cpp @@ -247,6 +247,92 @@ import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, return iter; } +bool pinyin_iterator_add_phrase(import_iterator_t * iter, + const char * phrase, + const char * pinyin, + gint count){ + /* if -1 == count, use the default value. */ + const int default_count = 100; + if (-1 == count) + count = default_count; + + pinyin_context_t * & context = iter->m_context; + FacadePhraseTable * & phrase_table = context->m_phrase_table; + FacadeChewingTable * & pinyin_table = context->m_pinyin_table; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + /* check whether the phrase exists in phrase table */ + glong len_phrase = 0; + ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL); + phrase_token_t token = null_token; + + bool result = false; + + pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; + FullPinyinParser2 parser; + ChewingKeyVector keys = + g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + PhraseItem item; + int retval = phrase_table->search(len_phrase, ucs4_phrase, token); + if (!(retval & SEARCH_OK)) { + /* if not exists, get the maximum token, + then add it directly with maximum token + 1; */ + PhraseIndexRange range; + retval = phrase_index->get_range(iter->m_phrase_index, range); + + if (ERROR_OK == retval) { + token = range.m_range_end; + + /* parse the pinyin. */ + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + + if ( len_phrase == keys->len ) { /* valid pinyin */ + phrase_table->add_index(len_phrase, ucs4_phrase, token); + pinyin_table->add_index + (keys->len, (ChewingKey *)(keys->data), token); + + item.set_phrase_string(len_phrase, ucs4_phrase); + item.append_pronunciation((ChewingKey *)(keys->data), count); + phrase_index->add_phrase_item(token, &item); + result = true; + } + } + } else { + /* if exists, check whether in the same sub phrase; */ + if (PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) { + /* if so, remove the phrase, add the pinyin for the phrase item, + then add it back;*/ + phrase_index->get_phrase_item(token, item); + assert(len_phrase == item.get_phrase_length()); + ucs4_t tmp_phrase[MAX_PHRASE_LENGTH]; + item.get_phrase_string(tmp_phrase); + assert(0 == memcmp + (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase)); + + /* parse the pinyin. */ + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + + PhraseItem * removed_item = NULL; + retval = phrase_index->remove_phrase_item(token, removed_item); + if (ERROR_OK == retval) { + removed_item->append_pronunciation((ChewingKey *)keys->data, + count); + phrase_index->add_phrase_item(token, removed_item); + delete removed_item; + } + } else { + /* if not, return false; */ + } + } + + g_array_free(key_rests, TRUE); + g_array_free(keys, TRUE); + g_free(ucs4_phrase); + return result; +} void pinyin_end_add_phrases(import_iterator_t * iter){ /* compact the content memory chunk of phrase index. */ @@ -847,7 +933,7 @@ static bool _try_divided_table(pinyin_instance_t * instance, ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; assert(pinyin_keys->len == pinyin_key_rests->len); - gint num_keys = pinyin_keys->len; + guint num_keys = pinyin_keys->len; assert(offset < num_keys); /* handle "^xian$" -> "xi'an" here */ @@ -941,7 +1027,7 @@ static bool _try_resplit_table(pinyin_instance_t * instance, ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; assert(pinyin_keys->len == pinyin_key_rests->len); - gint num_keys = pinyin_keys->len; + guint num_keys = pinyin_keys->len; assert(offset + 1 < num_keys); guint16 next_tone = CHEWING_ZERO_TONE; diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp index 8e73517..64eb649 100644 --- a/src/storage/phrase_index.cpp +++ b/src/storage/phrase_index.cpp @@ -490,7 +490,9 @@ bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){ add_phrase_item( cur_token, item_ptr); delete item_ptr; +#if 0 m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq(); +#endif return true; } diff --git a/tests/lookup/test_pinyin_lookup.cpp b/tests/lookup/test_pinyin_lookup.cpp index 25f7a2b..eea0437 100644 --- a/tests/lookup/test_pinyin_lookup.cpp +++ b/tests/lookup/test_pinyin_lookup.cpp @@ -61,7 +61,7 @@ int main( int argc, char * argv[]){ break; FullPinyinParser2 parser; - ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); -- cgit