From 70571740ae2b8a71cfec139d674c8f3b3fd1c232 Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Wed, 17 Aug 2011 12:01:22 +0800
Subject: pinyin apis WIP

---
 src/pinyin.cpp                      | 159 +++++++++++++++++++++++++++++++++---
 tests/lookup/test_simple_lookup.cpp |   6 +-
 2 files changed, 156 insertions(+), 9 deletions(-)

diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index ca46287..381bd1f 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -25,12 +25,110 @@ struct _pinyin_context_t{
     MatchResults m_match_results;
     CandidateConstraints m_constraints;
 
-    const char * m_system_dir;
-    const char * m_user_dir;
+    char * m_system_dir;
+    char * m_user_dir;
 };
 
-pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir);
-void pinyin_fini(pinyin_context_t * context);
+/* Build a context: copy both directory paths, load the tables and phrase
+ * indices from systemdir, merge the user logs from userdir and attach the
+ * bigram databases.  The result is released with pinyin_fini(). */
+pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
+    pinyin_context_t * context = new pinyin_context_t;
+
+    context->m_system_dir = g_strdup(systemdir);
+    context->m_user_dir = g_strdup(userdir);
+
+    context->m_pinyin_table = new PinyinLargeTable(&(context->m_custom));
+    MemoryChunk * chunk = new MemoryChunk;
+    /* g_build_filename() is variadic: the argument list must end with
+     * NULL, and the returned string must be released with g_free(). */
+    gchar * filename = g_build_filename
+        (context->m_system_dir, "pinyin_index.bin", NULL);
+    chunk->load(filename);
+    g_free(filename);
+    context->m_pinyin_table->load(chunk);
+
+    context->m_validator.initialize(context->m_pinyin_table);
+    context->m_default_parser = new PinyinDefaultParser;
+    context->m_shuang_pin_parser = new PinyinShuangPinParser;
+
+    context->m_phrase_table = new PhraseLargeTable;
+    chunk = new MemoryChunk;
+    filename = g_build_filename
+        (context->m_system_dir, "phrase_index.bin", NULL);
+    chunk->load(filename);
+    g_free(filename);
+    context->m_phrase_table->load(chunk);
+
+    context->m_phrase_index = new FacadePhraseIndex;
+    MemoryChunk * log = new MemoryChunk; chunk = new MemoryChunk;
+    filename = g_build_filename(context->m_system_dir, "gb_char.bin", NULL);
+    chunk->load(filename);
+    g_free(filename);
+    context->m_phrase_index->load(1, chunk);
+    filename = g_build_filename(context->m_user_dir, "gb_char.dbin", NULL);
+    log->load(filename);
+    g_free(filename);
+    context->m_phrase_index->merge(1, log);
+
+    log = new MemoryChunk; chunk = new MemoryChunk;
+    filename = g_build_filename(context->m_system_dir, "gbk_char.bin", NULL);
+    chunk->load(filename);
+    g_free(filename);
+    context->m_phrase_index->load(2, chunk);
+    filename = g_build_filename(context->m_user_dir, "gbk_char.dbin", NULL);
+    log->load(filename);
+    g_free(filename);
+    context->m_phrase_index->merge(2, log);
+
+    context->m_system_bigram = new Bigram;
+    filename = g_build_filename(context->m_system_dir, "system.db", NULL);
+    context->m_system_bigram->attach(filename, ATTACH_READONLY);
+    g_free(filename);
+    context->m_user_bigram = new Bigram;
+    filename = g_build_filename(context->m_user_dir, "user.db", NULL);
+    context->m_user_bigram->attach(filename, ATTACH_CREATE|ATTACH_READWRITE);
+    g_free(filename);
+
+    context->m_pinyin_lookup = new PinyinLookup
+        ( &(context->m_custom), context->m_pinyin_table,
+          context->m_phrase_index, context->m_system_bigram,
+          context->m_user_bigram);
+
+    context->m_phrase_lookup = new PhraseLookup
+        (context->m_phrase_table, context->m_phrase_index,
+         context->m_system_bigram, context->m_user_bigram);
+
+    context->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(PinyinKey));
+    context->m_match_results = g_array_new
+        (FALSE, FALSE, sizeof(phrase_token_t));
+    context->m_constraints = g_array_new
+        (FALSE, FALSE, sizeof(lookup_constraint_t));
+
+    return context;
+}
+
+/* Release everything pinyin_init() created, then the context itself. */
+void pinyin_fini(pinyin_context_t * context){
+    delete context->m_default_parser;
+    delete context->m_shuang_pin_parser;
+    delete context->m_pinyin_table;
+    delete context->m_phrase_table;
+    delete context->m_phrase_index;
+    delete context->m_system_bigram;
+    delete context->m_user_bigram;
+    delete context->m_pinyin_lookup;
+    delete context->m_phrase_lookup;
+
+    g_array_free(context->m_pinyin_keys, true);
+    g_array_free(context->m_match_results, true);
+    g_array_free(context->m_constraints, true);
+
+    g_free(context->m_system_dir);
+    g_free(context->m_user_dir);
+
+    delete context;
+}
 
 /* copy from custom to context->m_custom. */
 bool pinyin_set_options(pinyin_context_t * context,
@@ -43,9 +141,22 @@ bool pinyin_set_options(pinyin_context_t * context,
 /* copy from pinyin_keys to m_pinyin_keys. */
 bool pinyin_set_pinyin_keys(pinyin_context_t * context,
                             PinyinKeyVector pinyin_keys){
+    size_t key_len = context->m_pinyin_keys->len;
     g_array_set_size(context->m_pinyin_keys, 0);
     g_array_append_vals(context->m_pinyin_keys,
                         pinyin_keys->data, pinyin_keys->len);
+
+    g_array_set_size(context->m_constraints, context->m_pinyin_keys->len);
+    /* Entries past the previous key count start with no constraint. */
+    for (size_t i = key_len; i < context->m_pinyin_keys->len; ++i ) {
+        lookup_constraint_t * constraint =
+            &g_array_index(context->m_constraints, lookup_constraint_t, i);
+        constraint->m_type = NO_CONSTRAINT;
+    }
+
+    context->m_pinyin_lookup->validate_constraint
+        (context->m_constraints, context->m_pinyin_keys);
+
     return true;
 }
 
@@ -157,9 +268,45 @@ bool pinyin_train(pinyin_context_t * context){
     return retval;
 }
 
-bool pinyin_save(pinyin_context_t * context);
+/* Save user changes: for phrase indices 1 and 2, diff the in-memory index
+ * against the system .bin file and write the log to the user .dbin file.
+ * NOTE(review): oldchunk is never deleted here; presumably diff() takes
+ * ownership of it -- confirm. */
+bool pinyin_save(pinyin_context_t * context){
+    MemoryChunk * oldchunk = new MemoryChunk;
+    MemoryChunk * newlog = new MemoryChunk;
+
+    gchar * filename = g_build_filename
+        (context->m_system_dir, "gb_char.bin", NULL);
+    oldchunk->load(filename);
+    g_free(filename);
+    context->m_phrase_index->diff(1, oldchunk, newlog);
+    filename = g_build_filename(context->m_user_dir, "gb_char.dbin", NULL);
+    newlog->save(filename);
+    g_free(filename);
+    delete newlog;
 
-bool pinyin_reset(pinyin_context_t * context);
+    oldchunk = new MemoryChunk; newlog = new MemoryChunk;
+    filename = g_build_filename(context->m_system_dir, "gbk_char.bin", NULL);
+    /* Load the system baseline before diffing, as for index 1 above. */
+    oldchunk->load(filename);
+    g_free(filename);
+    context->m_phrase_index->diff(2, oldchunk, newlog);
+    filename = g_build_filename(context->m_user_dir, "gbk_char.dbin", NULL);
+    newlog->save(filename);
+    g_free(filename);
+    delete newlog;
+
+    return true;
+}
+
+/* Empty the pinyin keys, match results and constraints of the context. */
+bool pinyin_reset(pinyin_context_t * context){
+    g_array_set_size(context->m_pinyin_keys, 0);
+    g_array_set_size(context->m_match_results, 0);
+    g_array_set_size(context->m_constraints, 0);
+    return true;
+}
 
 /** TODO: to be implemented.
  * bool pinyin_get_guessed_sentence_with_prefix(...);
diff --git a/tests/lookup/test_simple_lookup.cpp b/tests/lookup/test_simple_lookup.cpp
index 7e4d256..96c512a 100644
--- a/tests/lookup/test_simple_lookup.cpp
+++ b/tests/lookup/test_simple_lookup.cpp
@@ -29,12 +29,12 @@ int main( int argc, char * argv[]){
     PinyinCustomSettings custom;
     PinyinLargeTable largetable(&custom);
 
-    BitmapPinyinValidator validator;
-    validator.initialize(&largetable);
-
     MemoryChunk * new_chunk = new MemoryChunk;
     new_chunk->load("../../data/pinyin_index.bin");
     largetable.load(new_chunk);
+
+    BitmapPinyinValidator validator;
+    validator.initialize(&largetable);
 
     FacadePhraseIndex phrase_index;
     new_chunk = new MemoryChunk;
-- 
cgit