summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2012-01-12 11:20:55 +0800
committer Peng Wu <alexepico@gmail.com> 2012-01-12 11:20:55 +0800
commit ee0b9cde31d362a95f514cbab043229940d04973 (patch)
tree bb48cf621c97edddf194bf687b9f94401baf7597
parent a9909e795ba81830439a1007a2b3ef24c9fc1f53 (diff)
downloadlibpinyin-ee0b9cde31d362a95f514cbab043229940d04973.tar.gz
libpinyin-ee0b9cde31d362a95f514cbab043229940d04973.tar.xz
libpinyin-ee0b9cde31d362a95f514cbab043229940d04973.zip
write train result2
-rw-r--r-- src/lookup/pinyin_lookup.cpp 81
-rw-r--r-- src/lookup/pinyin_lookup.h 2
-rw-r--r-- src/pinyin.cpp 2
3 files changed, 83 insertions, 2 deletions
diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
index 93b5547..2c6974f 100644
--- a/src/lookup/pinyin_lookup.cpp
+++ b/src/lookup/pinyin_lookup.cpp
@@ -419,6 +419,7 @@ bool PinyinLookup::final_step(MatchResults & results){
return true;
}
+#if 0
bool PinyinLookup::train_result(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results){
bool train_next = false;
ChewingKey * pinyin_keys = (ChewingKey *)keys->data;
@@ -478,6 +479,86 @@ bool PinyinLookup::train_result(ChewingKeyVector keys, CandidateConstraints cons
}
return true;
}
+#endif
+
+
+/**
+ * PinyinLookup::train_result2:
+ * @keys: the pinyin keys; keys->data is read as an array of ChewingKey.
+ * @constraints: the lookup constraints, one entry per position.
+ * @results: the phrase tokens, one entry per position; positions holding
+ *           null_token are skipped.
+ * @returns: always true.
+ *
+ * Train the user bi-gram, the pronunciation possibility and the uni-gram
+ * frequency for every token that is constrained with CONSTRAINT_ONESTEP,
+ * and for the first non-null token that follows such a token.
+ *
+ * Every call that reads or modifies the model is evaluated outside of
+ * assert() and only its result is asserted, so that the training is still
+ * performed when the assertions are compiled out (NDEBUG).
+ */
+bool PinyinLookup::train_result2(ChewingKeyVector keys,
+                                 CandidateConstraints constraints,
+                                 MatchResults results) {
+    const guint32 initial_seed = 23 * 15;
+    const guint32 expand_factor = 2;
+    const guint32 unigram_factor = 7;
+    const guint32 pinyin_factor = 1;
+    const guint32 ceiling_seed = 23 * 15 * 64;
+
+    /* begin training based on constraints and results. */
+    bool train_next = false;
+    ChewingKey * pinyin_keys = (ChewingKey *) keys->data;
+
+    phrase_token_t last_token = sentence_start;
+    /* the caller is expected to pass constraints->len + 1 == results->len. */
+    for (size_t i = 0; i < constraints->len; ++i) {
+        phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+        if (null_token == *token)
+            continue;
+
+        lookup_constraint_t * constraint = &g_array_index
+            (constraints, lookup_constraint_t, i);
+        if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) {
+            /* a one-step constraint also schedules the next token. */
+            if (CONSTRAINT_ONESTEP == constraint->m_type) {
+                assert(*token == constraint->m_token);
+                train_next = true;
+            } else {
+                train_next = false;
+            }
+
+            guint32 seed = initial_seed;
+            /* train bi-gram first, and get train seed. */
+            if (last_token) {
+                SingleGram * user = NULL;
+                m_user_bigram->load(last_token, user);
+                /* no user bi-gram was loaded, start from an empty one. */
+                if (!user)
+                    user = new SingleGram;
+
+                guint32 total_freq = 0;
+                bool ok = user->get_total_freq(total_freq);
+                assert(ok);
+
+                guint32 freq = 0;
+                /* compute train factor */
+                if (!user->get_freq(*token, freq)) {
+                    ok = user->insert_freq(*token, 0);
+                    assert(ok);
+                    seed = initial_seed;
+                } else {
+                    seed = std_lite::max(freq, initial_seed);
+                    seed *= expand_factor;
+                    seed = std_lite::min(seed, ceiling_seed);
+                }
+
+                /* protect against total_freq overflow: the unsigned sum
+                 * wraps around, in which case the bi-gram is left as is. */
+                const bool overflow =
+                    seed > 0 && total_freq > total_freq + seed;
+                if (!overflow) {
+                    ok = user->set_total_freq(total_freq + seed);
+                    assert(ok);
+                    /* if total_freq does not overflow, freq won't either. */
+                    ok = user->set_freq(*token, freq + seed);
+                    assert(ok);
+                    ok = m_user_bigram->store(last_token, user);
+                    assert(ok);
+                }
+
+                /* keep 'ok' referenced when assert() expands to nothing. */
+                (void) ok;
+                /* 'user' is never NULL at this point. */
+                delete user;
+            }
+
+            /* train uni-gram */
+            /* NOTE(review): the result of get_phrase_item is not checked
+             * here, unlike in add_constraint - confirm this is intended. */
+            m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+            m_cache_phrase_item.increase_pronunciation_possibility
+                (m_options, pinyin_keys + i, seed * pinyin_factor);
+            m_phrase_index->add_unigram_frequency
+                (*token, seed * unigram_factor);
+        }
+        last_token = *token;
+    }
+    return true;
+}
guint8 PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){
if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
diff --git a/src/lookup/pinyin_lookup.h b/src/lookup/pinyin_lookup.h
index aaf380f..9275f36 100644
--- a/src/lookup/pinyin_lookup.h
+++ b/src/lookup/pinyin_lookup.h
@@ -136,7 +136,7 @@ public:
bool get_best_match(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
- bool train_result(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+ bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results);
bool convert_to_utf8(MatchResults results,
/* out */ char * & result_string)
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 14b3604..4f32c68 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -605,7 +605,7 @@ bool pinyin_train(pinyin_instance_t * instance){
pinyin_context_t * & context = instance->m_context;
context->m_modified = true;
- bool retval = context->m_pinyin_lookup->train_result
+ bool retval = context->m_pinyin_lookup->train_result2
(instance->m_pinyin_keys, instance->m_constraints,
instance->m_match_results);