From 30283199261a82a3807342d61ab36e8103f04063 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 18 Aug 2011 15:56:12 +0800 Subject: write get candidates --- src/pinyin.cpp | 46 +++++++++++++++++++++++++++++++------------- src/storage/phrase_index.cpp | 12 ++++++++++++ src/storage/phrase_index.h | 5 ++++- 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/src/pinyin.cpp b/src/pinyin.cpp index 2711240..48f8c59 100644 --- a/src/pinyin.cpp +++ b/src/pinyin.cpp @@ -39,7 +39,7 @@ pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ context->m_pinyin_table = new PinyinLargeTable(&(context->m_custom)); MemoryChunk * chunk = new MemoryChunk; gchar * filename = g_build_filename - (context->m_system_dir, "pinyin_index.bin"); + (context->m_system_dir, "pinyin_index.bin", NULL); chunk->load(filename); context->m_pinyin_table->load(chunk); @@ -49,32 +49,32 @@ pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ context->m_phrase_table = new PhraseLargeTable; chunk = new MemoryChunk; - filename = g_build_filename(context->m_system_dir, "phrase_index.bin"); + filename = g_build_filename(context->m_system_dir, "phrase_index.bin", NULL); chunk->load(filename); context->m_phrase_table->load(chunk); context->m_phrase_index = new FacadePhraseIndex; MemoryChunk * log = new MemoryChunk; chunk = new MemoryChunk; - filename = g_build_filename(context->m_system_dir, "gb_char.bin"); + filename = g_build_filename(context->m_system_dir, "gb_char.bin", NULL); chunk->load(filename); context->m_phrase_index->load(1, chunk); - filename = g_build_filename(context->m_user_dir, "gb_char.dbin"); + filename = g_build_filename(context->m_user_dir, "gb_char.dbin", NULL); log->load(filename); context->m_phrase_index->merge(1, log); log = new MemoryChunk; chunk = new MemoryChunk; - filename = g_build_filename(context->m_system_dir, "gbk_char.bin"); + filename = g_build_filename(context->m_system_dir, "gbk_char.bin", NULL); chunk->load(filename); context->m_phrase_index->load(2, chunk); - filename = g_build_filename(context->m_user_dir, "gbk_char.dbin"); + filename = g_build_filename(context->m_user_dir, "gbk_char.dbin", NULL); log->load(filename); context->m_phrase_index->merge(2, log); context->m_system_bigram = new Bigram; - filename = g_build_filename(context->m_system_dir, "system.db"); + filename = g_build_filename(context->m_system_dir, "system.db", NULL); context->m_system_bigram->attach(filename, ATTACH_READONLY); context->m_user_bigram = new Bigram; - filename = g_build_filename(context->m_user_dir, "user.db"); + filename = g_build_filename(context->m_user_dir, "user.db", NULL); context->m_user_bigram->attach(filename, ATTACH_CREATE|ATTACH_READWRITE); context->m_pinyin_lookup = new PinyinLookup @@ -238,9 +238,12 @@ bool pinyin_get_candidates(pinyin_context_t * context, size_t pinyin_len = context->m_pinyin_keys->len - offset; PhraseIndexRanges ranges; - size_t min_index = 1, max_index = 2; memset(ranges, 0, sizeof(ranges)); + guint8 min_index, max_index; + assert( ERROR_OK == context->m_phrase_index-> + get_sub_phrase_range(min_index, max_index)); + for (size_t m = min_index; m <= max_index; ++m) { ranges[m] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange)); } @@ -249,9 +252,19 @@ bool pinyin_get_candidates(pinyin_context_t * context, for (ssize_t i = pinyin_len; i >= 1; --i) { g_array_set_size(tokens, 0); + + /* clear ranges. */ + for ( size_t m = min_index; m <= max_index; ++m ) { + g_array_set_size(ranges[m], 0); + } + /* do pinyin search. */ int retval = context->m_pinyin_table->search (i, pinyin_keys, ranges); + + if ( !(retval & SEARCH_OK) ) + continue; + /* reduce to a single GArray. */ for (size_t m = min_index; m <= max_index; ++m) { g_array_append_vals(tokens, ranges[m]->data, ranges[m]->len); @@ -274,6 +287,9 @@ bool pinyin_get_candidates(pinyin_context_t * context, /* copy out candidates. */ g_array_append_vals(candidates, tokens->data, tokens->len); + + if ( !(retval & SEARCH_CONTINUED) ) + break; } g_array_free(tokens, TRUE); @@ -331,17 +347,21 @@ bool pinyin_save(pinyin_context_t * context){ MemoryChunk * oldchunk = new MemoryChunk; MemoryChunk * newlog = new MemoryChunk; - gchar * filename = g_build_filename(context->m_system_dir, "gb_char.bin"); + gchar * filename = g_build_filename(context->m_system_dir, + "gb_char.bin", NULL); oldchunk->load(filename); context->m_phrase_index->diff(1, oldchunk, newlog); - filename = g_build_filename(context->m_user_dir, "gb_char.dbin"); + filename = g_build_filename(context->m_user_dir, + "gb_char.dbin", NULL); newlog->save(filename); delete newlog; oldchunk = new MemoryChunk; newlog = new MemoryChunk; - filename = g_build_filename(context->m_system_dir, "gbk_char.bin"); + filename = g_build_filename(context->m_system_dir, + "gbk_char.bin", NULL); context->m_phrase_index->diff(2, oldchunk, newlog); - filename = g_build_filename(context->m_user_dir, "gbk_char.dbin"); + filename = g_build_filename(context->m_user_dir, + "gbk_char.dbin", NULL); newlog->save(filename); delete newlog; diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp index 4e1eb70..5517169 100644 --- a/src/storage/phrase_index.cpp +++ b/src/storage/phrase_index.cpp @@ -456,6 +456,18 @@ bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){ return true; } +int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index, + guint8 & max_index){ + min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0; + for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){ + if ( m_sub_phrase_indices[i] ) { + min_index = std_lite::min(min_index, i); + max_index = std_lite::max(max_index, i); + } + } + return ERROR_OK; +} + int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){ SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index]; if ( !sub_phrase ) diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h index 0c5c824..17d81c4 100644 --- a/src/storage/phrase_index.h +++ b/src/storage/phrase_index.h @@ -237,9 +237,12 @@ public: MemoryChunk * newlog); bool merge(guint8 phrase_index, MemoryChunk * log); - /* compat all SubPhraseIndex m_phrase_content memory usage.*/ + /* compat all SubPhraseIndex m_phrase_content memory usage. */ bool compat(); + /* get all available sub phrase indices. */ + int get_sub_phrase_range(guint8 & min_index, guint8 & max_index); + /* get each sub phrase token range with phrase_index added */ int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range); -- cgit