diff options
author | Peng Wu <alexepico@gmail.com> | 2012-01-12 11:20:55 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-01-12 11:20:55 +0800 |
commit | ee0b9cde31d362a95f514cbab043229940d04973 (patch) | |
tree | bb48cf621c97edddf194bf687b9f94401baf7597 | |
parent | a9909e795ba81830439a1007a2b3ef24c9fc1f53 (diff) | |
download | libpinyin-ee0b9cde31d362a95f514cbab043229940d04973.tar.gz libpinyin-ee0b9cde31d362a95f514cbab043229940d04973.tar.xz libpinyin-ee0b9cde31d362a95f514cbab043229940d04973.zip |
write train result2
-rw-r--r-- | src/lookup/pinyin_lookup.cpp | 81 | ||||
-rw-r--r-- | src/lookup/pinyin_lookup.h | 2 | ||||
-rw-r--r-- | src/pinyin.cpp | 2 |
3 files changed, 83 insertions, 2 deletions
diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
index 93b5547..2c6974f 100644
--- a/src/lookup/pinyin_lookup.cpp
+++ b/src/lookup/pinyin_lookup.cpp
@@ -419,6 +419,7 @@ bool PinyinLookup::final_step(MatchResults & results){
     return true;
 }
 
+#if 0
 bool PinyinLookup::train_result(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results){
     bool train_next = false;
     ChewingKey * pinyin_keys = (ChewingKey *)keys->data;
@@ -478,6 +479,86 @@ bool PinyinLookup::train_result(ChewingKeyVector keys, CandidateConstraints cons
     }
     return true;
 }
+#endif
+
+
+bool PinyinLookup::train_result2(ChewingKeyVector keys,
+                                 CandidateConstraints constraints,
+                                 MatchResults results) {
+    const guint32 initial_seed = 23 * 15;
+    const guint32 expand_factor = 2;
+    const guint32 unigram_factor = 7;
+    const guint32 pinyin_factor = 1;
+    const guint32 ceiling_seed = 23 * 15 * 64;
+
+    /* begin training based on constraints and results. */
+    bool train_next = false;
+    ChewingKey * pinyin_keys = (ChewingKey *) keys->data;
+
+    phrase_token_t last_token = sentence_start;
+    /* constraints->len + 1 == results->len */
+    for (size_t i = 0; i < constraints->len; ++i) {
+        phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+        if (null_token == *token)
+            continue;
+
+        lookup_constraint_t * constraint = &g_array_index
+            (constraints, lookup_constraint_t, i);
+        if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) {
+            if (CONSTRAINT_ONESTEP == constraint->m_type) {
+                assert(*token == constraint->m_token);
+                train_next = true;
+            } else {
+                train_next = false;
+            }
+
+            guint32 seed = initial_seed;
+            /* train bi-gram first, and get train seed. */
+            if (last_token) {
+                SingleGram * user = NULL;
+                m_user_bigram->load(last_token, user);
+
+                guint32 total_freq = 0;
+                if (!user) {
+                    user = new SingleGram;
+                }
+                assert(user->get_total_freq(total_freq));
+
+                guint32 freq = 0;
+                /* compute train factor */
+                if (!user->get_freq(*token, freq)) {
+                    assert(user->insert_freq(*token, 0));
+                    seed = initial_seed;
+                } else {
+                    seed = std_lite::max(freq, initial_seed);
+                    seed *= expand_factor;
+                    seed = std_lite::min(seed, ceiling_seed);
+                }
+
+                /* protect against total_freq overflow */
+                if (seed > 0 && total_freq > total_freq + seed)
+                    goto next;
+
+                assert(user->set_total_freq(total_freq + seed));
+                /* if total_freq is not overflow, then freq won't overflow. */
+                assert(user->set_freq(*token, freq + seed));
+                assert(m_user_bigram->store(last_token, user));
+            next:
+                if (user)
+                    delete user;
+            }
+
+            /* train uni-gram */
+            m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+            m_cache_phrase_item.increase_pronunciation_possibility
+                (m_options, pinyin_keys + i, seed * pinyin_factor);
+            m_phrase_index->add_unigram_frequency
+                (*token, seed * unigram_factor);
+        }
+        last_token = *token;
+    }
+    return true;
+}
 
 guint8 PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){
     if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
diff --git a/src/lookup/pinyin_lookup.h b/src/lookup/pinyin_lookup.h
index aaf380f..9275f36 100644
--- a/src/lookup/pinyin_lookup.h
+++ b/src/lookup/pinyin_lookup.h
@@ -136,7 +136,7 @@ public:
 
     bool get_best_match(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
 
-    bool train_result(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+    bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results);
 
     bool convert_to_utf8(MatchResults results,
                          /* out */ char * & result_string)
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 14b3604..4f32c68 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -605,7 +605,7 @@ bool pinyin_train(pinyin_instance_t * instance){
     pinyin_context_t * & context = instance->m_context;
     context->m_modified = true;
 
-    bool retval = context->m_pinyin_lookup->train_result
+    bool retval = context->m_pinyin_lookup->train_result2
         (instance->m_pinyin_keys, instance->m_constraints,
          instance->m_match_results);
 