diff options
author | Peng Wu <alexepico@gmail.com> | 2011-09-09 15:06:57 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-09-09 15:06:57 +0800 |
commit | 65d222d080b2bea07a49b981460435f39bff4f4b (patch) | |
tree | 6a6804df29d882753d1c9cb4989883491871a0c0 /src | |
parent | 5846c125ad7eda5d8cd914dd32cebe2b34df9b97 (diff) | |
download | libpinyin-65d222d080b2bea07a49b981460435f39bff4f4b.tar.gz libpinyin-65d222d080b2bea07a49b981460435f39bff4f4b.tar.xz libpinyin-65d222d080b2bea07a49b981460435f39bff4f4b.zip |
refine pinyin header
Diffstat (limited to 'src')
-rw-r--r-- | src/lookup/lookup.cpp | 4 | ||||
-rw-r--r-- | src/lookup/pinyin_lookup.h | 3 | ||||
-rw-r--r-- | src/pinyin.cpp | 288 | ||||
-rw-r--r-- | src/pinyin.h | 8 |
4 files changed, 140 insertions, 163 deletions
diff --git a/src/lookup/lookup.cpp b/src/lookup/lookup.cpp index e0a91dc..e0ccce9 100644 --- a/src/lookup/lookup.cpp +++ b/src/lookup/lookup.cpp @@ -22,7 +22,7 @@ #include "phrase_index.h" #include "lookup.h" -using namespace pinyin; +namespace pinyin{ bool convert_to_utf8(FacadePhraseIndex * phrase_index, MatchResults match_results, @@ -56,3 +56,5 @@ bool convert_to_utf8(FacadePhraseIndex * phrase_index, } return true; } + +}; diff --git a/src/lookup/pinyin_lookup.h b/src/lookup/pinyin_lookup.h index da70f24..47cc4a9 100644 --- a/src/lookup/pinyin_lookup.h +++ b/src/lookup/pinyin_lookup.h @@ -127,11 +127,10 @@ public: bool train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results); bool convert_to_utf8(MatchResults results, - /* in */ const char * delimiter, /* out */ char * & result_string) { return pinyin::convert_to_utf8(m_phrase_index, results, - delimiter, result_string); + NULL, result_string); } /* user interactions */ diff --git a/src/pinyin.cpp b/src/pinyin.cpp index afe359a..706250d 100644 --- a/src/pinyin.cpp +++ b/src/pinyin.cpp @@ -85,6 +85,35 @@ pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ return context; } +bool pinyin_save(pinyin_context_t * context){ + MemoryChunk * oldchunk = new MemoryChunk; + MemoryChunk * newlog = new MemoryChunk; + + gchar * filename = g_build_filename(context->m_system_dir, + "gb_char.bin", NULL); + oldchunk->load(filename); + context->m_phrase_index->diff(1, oldchunk, newlog); + filename = g_build_filename(context->m_user_dir, + "gb_char.dbin", NULL); + newlog->save(filename); + delete newlog; + + oldchunk = new MemoryChunk; newlog = new MemoryChunk; + filename = g_build_filename(context->m_system_dir, + "gbk_char.bin", NULL); + oldchunk->load(filename); + context->m_phrase_index->diff(2, oldchunk, newlog); + filename = g_build_filename(context->m_user_dir, + "gbk_char.dbin", NULL); + newlog->save(filename); + delete newlog; + + filename = g_build_filename(context->m_user_dir, "user.db", NULL); + context->m_user_bigram->save_db(filename); + + return true; +} + void pinyin_fini(pinyin_context_t * context){ delete context->m_default_parser; delete context->m_shuang_pin_parser; @@ -103,69 +132,44 @@ void pinyin_fini(pinyin_context_t * context){ delete context; } -static bool pinyin_alloc_auxiliary_arrays(pinyin_context_t * context, - PinyinKeyVector * pinyin_keys, - PinyinKeyPosVector * pinyin_poses, - CandidateConstraints * constraints, - MatchResults * match_results){ - - *pinyin_keys = g_array_new(FALSE, FALSE, sizeof(PinyinKey)); - *pinyin_poses = g_array_new(FALSE, FALSE, sizeof(PinyinKeyPos)); - *constraints = g_array_new(FALSE, FALSE, sizeof(lookup_constraint_t)); - *match_results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - +/* copy from custom to context->m_custom. */ +bool pinyin_set_options(pinyin_context_t * context, + PinyinCustomSettings * custom){ + guint32 option = custom->to_value(); + context->m_custom.from_value(option); return true; } -static bool pinyin_free_auxiliary_arrays(pinyin_context_t * context, - PinyinKeyVector * pinyin_keys, - PinyinKeyPosVector * pinyin_poses, - CandidateConstraints * constraints, - MatchResults * match_results){ - g_array_free(*pinyin_keys, TRUE); - *pinyin_keys = NULL; - g_array_free(*pinyin_poses, TRUE); - *pinyin_poses = NULL; - g_array_free(*constraints, TRUE); - *constraints = NULL; - g_array_free(*match_results, TRUE); - *match_results = NULL; - - return true; -} pinyin_instance_t * pinyin_get_instance(pinyin_context_t * context){ pinyin_instance_t * instance = new pinyin_instance_t; instance->m_context = context; - pinyin_alloc_auxiliary_arrays - (instance->m_context, &(instance->m_pinyin_keys), - &(instance->m_pinyin_poses), - &(instance->m_constraints), - &(instance->m_match_results)); + + instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(PinyinKey)); + instance->m_pinyin_poses = g_array_new(FALSE, FALSE, sizeof(PinyinKeyPos)); + instance->m_constraints = g_array_new + (FALSE, FALSE, sizeof(lookup_constraint_t)); + instance->m_match_results = + g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + return instance; } void pinyin_release_instance(pinyin_instance_t * instance){ - pinyin_free_auxiliary_arrays - (instance->m_context, &(instance->m_pinyin_keys), - &(instance->m_pinyin_poses), - &(instance->m_constraints), - &(instance->m_match_results)); + g_array_free(instance->m_pinyin_keys, TRUE); + g_array_free(instance->m_pinyin_poses, TRUE); + g_array_free(instance->m_constraints, TRUE); + g_array_free(instance->m_match_results, TRUE); + delete instance; } -/* copy from custom to context->m_custom. */ -bool pinyin_set_options(pinyin_context_t * context, - PinyinCustomSettings * custom){ - guint32 option = custom->to_value(); - context->m_custom.from_value(option); - return true; -} +static bool pinyin_update_constraints(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; + PinyinKeyVector & pinyin_keys = instance->m_pinyin_keys; + CandidateConstraints & constraints = instance->m_constraints; -bool pinyin_update_constraints(pinyin_context_t * context, - PinyinKeyVector pinyin_keys, - CandidateConstraints constraints){ size_t key_len = constraints->len; g_array_set_size(constraints, pinyin_keys->len); for (size_t i = key_len; i < pinyin_keys->len; ++i ) { @@ -181,89 +185,112 @@ bool pinyin_update_constraints(pinyin_context_t * context, } -bool pinyin_get_guessed_tokens(pinyin_context_t * context, - PinyinKeyVector pinyin_keys, - CandidateConstraints constraints, - MatchResults match_results){ +bool pinyin_guess_sentence(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; + + pinyin_update_constraints(instance); bool retval = context->m_pinyin_lookup->get_best_match - (pinyin_keys, constraints, match_results); + (instance->m_pinyin_keys, + instance->m_constraints, + instance->m_match_results); return retval; } +bool pinyin_phrase_segment(pinyin_instance_t * instance, + const char * sentence){ + pinyin_context_t * & context = instance->m_context; + + const glong num_of_chars = g_utf8_strlen(sentence, -1); + glong utf16_len = 0; + utf16_t * utf16 = g_utf8_to_utf16(sentence, -1, NULL, &utf16_len, NULL); + + g_return_val_if_fail(num_of_chars == utf16_len, false); + + bool retval = context->m_phrase_lookup->get_best_match + (utf16_len, utf16, instance->m_match_results); + + g_free(utf16); + return retval; +} + /* the returned sentence should be freed by g_free(). */ -bool pinyin_get_sentence(pinyin_context_t * context, - MatchResults match_results, +bool pinyin_get_sentence(pinyin_instance_t * instance, char ** sentence){ + pinyin_context_t * & context = instance->m_context; - bool retval = context->m_pinyin_lookup->convert_to_utf8 - (match_results, *sentence); + bool retval = pinyin::convert_to_utf8 + (context->m_phrase_index, instance->m_match_results, + NULL, *sentence); return retval; } -bool pinyin_parse_full(pinyin_context_t * context, +bool pinyin_parse_full(pinyin_instance_t * instance, const char * onepinyin, PinyinKey * onekey){ + pinyin_context_t * & context = instance->m_context; + int pinyin_len = strlen(onepinyin); int parse_len = context->m_default_parser->parse_one_key ( context->m_validator, *onekey, onepinyin, pinyin_len); return pinyin_len == parse_len; } -bool pinyin_parse_more_fulls(pinyin_context_t * context, - const char * pinyins, - PinyinKeyVector pinyin_keys, - PinyinKeyPosVector pinyin_poses){ +bool pinyin_parse_more_fulls(pinyin_instance_t * instance, + const char * pinyins){ + pinyin_context_t * & context = instance->m_context; int pinyin_len = strlen(pinyins); int parse_len = context->m_default_parser->parse - ( context->m_validator, pinyin_keys, - pinyin_poses, pinyins, pinyin_len); + ( context->m_validator, instance->m_pinyin_keys, + instance->m_pinyin_poses, pinyins, pinyin_len); return pinyin_len == parse_len; } -bool pinyin_parse_double(pinyin_context_t * context, +bool pinyin_parse_double(pinyin_instance_t * instance, const char * onepinyin, PinyinKey * onekey){ + pinyin_context_t * & context = instance->m_context; + int pinyin_len = strlen(onepinyin); int parse_len = context->m_shuang_pin_parser->parse_one_key ( context->m_validator, *onekey, onepinyin, pinyin_len); return pinyin_len == parse_len; } -bool pinyin_parse_more_doubles(pinyin_context_t * context, - const char * pinyins, - PinyinKeyVector pinyin_keys, - PinyinKeyPosVector pinyin_poses){ +bool pinyin_parse_more_doubles(pinyin_instance_t * instance, + const char * pinyins){ + pinyin_context_t * & context = instance->m_context; int pinyin_len = strlen(pinyins); int parse_len = context->m_shuang_pin_parser->parse - ( context->m_validator, pinyin_keys, - pinyin_poses, pinyins, pinyin_len); + ( context->m_validator, instance->m_pinyin_keys, + instance->m_pinyin_poses, pinyins, pinyin_len); return pinyin_len == parse_len; } -bool pinyin_parse_chewing(pinyin_context_t * context, +bool pinyin_parse_chewing(pinyin_instance_t * instance, const char * onechewing, PinyinKey * onekey){ + pinyin_context_t * & context = instance->m_context; + int chewing_len = strlen(onechewing); int parse_len = context->m_chewing_parser->parse_one_key ( context->m_validator, *onekey, onechewing, chewing_len ); return chewing_len == parse_len; } -bool pinyin_parse_more_chewings(pinyin_context_t * context, - const char * chewings, - PinyinKeyVector pinyin_keys, - PinyinKeyPosVector pinyin_poses){ +bool pinyin_parse_more_chewings(pinyin_instance_t * instance, + const char * chewings){ + pinyin_context_t * & context = instance->m_context; int chewing_len = strlen(chewings); int parse_len = context->m_chewing_parser->parse - ( context->m_validator, pinyin_keys, - pinyin_poses, chewings, chewing_len); + ( context->m_validator, instance->m_pinyin_keys, + instance->m_pinyin_poses, chewings, chewing_len); return chewing_len == parse_len; } @@ -291,10 +318,11 @@ static gint compare_token_with_unigram_freq(gconstpointer lhs, return freq_lhs - freq_rhs; } -bool pinyin_get_candidates(pinyin_context_t * context, +bool pinyin_get_candidates(pinyin_instance_t * instance, size_t offset, - PinyinKeyVector pinyin_keys, TokenVector candidates){ + pinyin_context_t * & context = instance->m_context; + PinyinKeyVector & pinyin_keys = instance->m_pinyin_keys; g_array_set_size(candidates, 0); PinyinKey * keys = &g_array_index @@ -364,62 +392,46 @@ bool pinyin_get_candidates(pinyin_context_t * context, return true; } -bool pinyin_choose_candidate(pinyin_context_t * context, +bool pinyin_choose_candidate(pinyin_instance_t * instance, size_t offset, - PinyinKeyVector pinyin_keys, - CandidateConstraints constraints, phrase_token_t token){ + pinyin_context_t * & context = instance->m_context; + bool retval = context->m_pinyin_lookup->add_constraint - (constraints, offset, token); + (instance->m_constraints, offset, token); retval = context->m_pinyin_lookup->validate_constraint - (constraints, pinyin_keys) && retval; + (instance->m_constraints, instance->m_pinyin_keys) && retval; return retval; } -bool pinyin_clear_constraint(pinyin_context_t * context, - size_t offset, - CandidateConstraints constraints){ +bool pinyin_clear_constraint(pinyin_instance_t * instance, + size_t offset){ + pinyin_context_t * & context = instance->m_context; + bool retval = context->m_pinyin_lookup->clear_constraint - (constraints, offset); + (instance->m_constraints, offset); return retval; } -bool pinyin_clear_constraints(pinyin_context_t * context, - CandidateConstraints constraints){ +bool pinyin_clear_constraints(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; bool retval = true; - for ( size_t i = 0; i < constraints->len; ++i ) { + for ( size_t i = 0; i < instance->m_constraints->len; ++i ) { retval = context->m_pinyin_lookup->clear_constraint - (constraints, i) && retval; + (instance->m_constraints, i) && retval; } return retval; } -bool pinyin_phrase_segment(pinyin_context_t * context, - const char * sentence, - MatchResults match_results){ - - const glong num_of_chars = g_utf8_strlen(sentence, -1); - glong utf16_len = 0; - utf16_t * utf16 = g_utf8_to_utf16(sentence, -1, NULL, &utf16_len, NULL); - - g_return_val_if_fail(num_of_chars == utf16_len, false); - - bool retval = context->m_phrase_lookup-> - get_best_match(utf16_len, utf16, match_results); - - g_free(utf16); - - return retval; -} - /* the returned word should be freed by g_free. */ -bool pinyin_translate_token(pinyin_context_t * context, +bool pinyin_translate_token(pinyin_instance_t * instance, phrase_token_t token, char ** word){ + pinyin_context_t * & context = instance->m_context; PhraseItem item; utf16_t buffer[MAX_PHRASE_LENGTH]; @@ -430,54 +442,22 @@ bool pinyin_translate_token(pinyin_context_t * context, return retval; } -bool pinyin_train(pinyin_context_t * context, - PinyinKeyVector pinyin_keys, - CandidateConstraints constraints, - MatchResults match_results){ - bool retval = context->m_pinyin_lookup->train_result - (pinyin_keys, constraints, match_results); - return retval; -} - -bool pinyin_save(pinyin_context_t * context){ - MemoryChunk * oldchunk = new MemoryChunk; - MemoryChunk * newlog = new MemoryChunk; - - gchar * filename = g_build_filename(context->m_system_dir, - "gb_char.bin", NULL); - oldchunk->load(filename); - context->m_phrase_index->diff(1, oldchunk, newlog); - filename = g_build_filename(context->m_user_dir, - "gb_char.dbin", NULL); - newlog->save(filename); - delete newlog; - - oldchunk = new MemoryChunk; newlog = new MemoryChunk; - filename = g_build_filename(context->m_system_dir, - "gbk_char.bin", NULL); - oldchunk->load(filename); - context->m_phrase_index->diff(2, oldchunk, newlog); - filename = g_build_filename(context->m_user_dir, - "gbk_char.dbin", NULL); - newlog->save(filename); - delete newlog; +bool pinyin_train(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; - filename = g_build_filename(context->m_user_dir, "user.db", NULL); - context->m_user_bigram->save_db(filename); + bool retval = context->m_pinyin_lookup->train_result + (instance->m_pinyin_keys, instance->m_constraints, + instance->m_match_results); - return true; + return retval; } -bool pinyin_reset(pinyin_context_t * context, - PinyinKeyVector pinyin_keys, - CandidateConstraints constraints, - MatchResults match_results){ - - g_array_set_size(pinyin_keys, 0); - g_array_set_size(constraints, 0); - g_array_set_size(match_results, 0); +bool pinyin_reset(pinyin_instance_t * instance){ + g_array_set_size(instance->m_pinyin_keys, 0); + g_array_set_size(instance->m_pinyin_poses, 0); + g_array_set_size(instance->m_constraints, 0); + g_array_set_size(instance->m_match_results, 0); - /* TODO: to be implemented. */ return true; } diff --git a/src/pinyin.h b/src/pinyin.h index e2099a0..9f88cd6 100644 --- a/src/pinyin.h +++ b/src/pinyin.h @@ -43,6 +43,7 @@ typedef struct { } pinyin_instance_t; pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir); +bool pinyin_save(pinyin_context_t * context); void pinyin_fini(pinyin_context_t * context); bool pinyin_set_options(pinyin_context_t * context, @@ -51,10 +52,7 @@ bool pinyin_set_options(pinyin_context_t * context, pinyin_instance_t * pinyin_get_instance(pinyin_context_t * context); void pinyin_release_instance(pinyin_instance_t * instance); -static bool pinyin_update_constraints(pinyin_instance_t * instance); - -bool pinyin_get_guessed_tokens(pinyin_instance_t * instance, - MatchResults * match_results); +bool pinyin_guess_sentence(pinyin_instance_t * instance); bool pinyin_phrase_segment(pinyin_instance_t * instance, const char * sentence); @@ -71,7 +69,6 @@ bool pinyin_parse_more_fulls(pinyin_instance_t * instance, bool pinyin_parse_double(pinyin_instance_t * instance, const char * onepinyin, PinyinKey * onekey); - bool pinyin_parse_more_doubles(pinyin_instance_t * instance, const char * pinyins); @@ -97,7 +94,6 @@ bool pinyin_translate_token(pinyin_instance_t * instance, phrase_token_t token, char ** word); bool pinyin_train(pinyin_instance_t * instance); -bool pinyin_save(pinyin_instance_t * instance); bool pinyin_reset(pinyin_instance_t * instance); }; |