summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2017-01-22 15:55:26 +0800
committerPeng Wu <alexepico@gmail.com>2017-01-22 16:12:11 +0800
commit29ac48475fe23f6f2cf3b3b9677b3902b9cf8f0b (patch)
treee3af5cbd057cfb1f5b3c97a67aabd3e73504a2bc /src
parentec83c02c3779a82489829d8284506a88bc73534b (diff)
downloadlibpinyin-29ac48475fe23f6f2cf3b3b9677b3902b9cf8f0b.tar.gz
libpinyin-29ac48475fe23f6f2cf3b3b9677b3902b9cf8f0b.tar.xz
libpinyin-29ac48475fe23f6f2cf3b3b9677b3902b9cf8f0b.zip
write train_result3 method
Diffstat (limited to 'src')
-rw-r--r--src/lookup/phonetic_lookup.h102
1 files changed, 98 insertions, 4 deletions
diff --git a/src/lookup/phonetic_lookup.h b/src/lookup/phonetic_lookup.h
index 3506068..94a9863 100644
--- a/src/lookup/phonetic_lookup.h
+++ b/src/lookup/phonetic_lookup.h
@@ -422,6 +422,7 @@ public:
bool clear_constraint(size_t index);
bool validate_constraint(PhoneticKeyMatrix * matrix);
+ /* Reports m_constraints->len, the same bound that get_constraint()
+    tests its index against. */
+ size_t length() const {
+     const size_t count = m_constraints->len;
+     return count;
+ }
bool get_constraint(size_t index,
const trellis_constraint_t * & constraint) const {
if (index >= m_constraints->len)
@@ -631,10 +632,10 @@ protected:
public:
- bool get_best_match(TokenVector prefixes,
- PhoneticKeyMatrix * matrix,
- ForwardPhoneticConstraints constraints,
- NBestMatchResults & results) {
+ bool get_nbest_match(TokenVector prefixes,
+ PhoneticKeyMatrix * matrix,
+ ForwardPhoneticConstraints constraints,
+ NBestMatchResults & results) {
m_constraints = constraints;
m_matrix = matrix;
@@ -740,6 +741,99 @@ public:
return true;
}
+ /* Train the user bi-gram and uni-gram data from the tokens in result.
+  *
+  * Positions whose token is null_token are skipped.  A token is trained
+  * when its constraint is CONSTRAINT_ONESTEP; the first non-null token
+  * that follows a trained CONSTRAINT_ONESTEP token is trained as well
+  * (tracked by train_next).
+  *
+  * matrix      - passed through to increase_pronunciation_possibility.
+  * constraints - supplies the number of positions and the constraint
+  *               stored for each position.
+  * result      - token array, indexed by position.
+  *
+  * Always returns true.
+  *
+  * Every call that loads or modifies data is made outside of assert(),
+  * so it still runs when assert() is compiled out by NDEBUG; assert()
+  * only checks the status such a call returned.
+  */
+ bool train_result3(const PhoneticKeyMatrix * matrix,
+                    const ForwardPhoneticConstraints * constraints,
+                    MatchResults result) {
+     const guint32 initial_seed = 23 * 3;
+     const guint32 expand_factor = 2;
+     const guint32 unigram_factor = 7;
+     const guint32 pinyin_factor = 1;
+     const guint32 ceiling_seed = 23 * 15 * 64;
+
+     /* status of the latest checked call; the void cast keeps NDEBUG
+        builds, where assert() expands to nothing, free of
+        unused-variable warnings. */
+     bool ok = false;
+     (void) ok;
+
+     /* begin training based on constraints and result. */
+     bool train_next = false;
+
+     phrase_token_t last_token = sentence_start;
+
+     for (size_t i = 0; i < constraints->length(); ++i) {
+         phrase_token_t token = g_array_index(result, phrase_token_t, i);
+         if (null_token == token)
+             continue;
+
+         /* the lookup must happen in every build: constraint is
+            dereferenced right below. */
+         const trellis_constraint_t * constraint = NULL;
+         ok = constraints->get_constraint(i, constraint);
+         assert(ok);
+
+         if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) {
+             if (CONSTRAINT_ONESTEP == constraint->m_type) {
+                 assert(token == constraint->m_token);
+                 train_next = true;
+             } else {
+                 train_next = false;
+             }
+
+             guint32 seed = initial_seed;
+             /* train bi-gram first, and get train seed. */
+             if (last_token) {
+                 SingleGram * user = NULL;
+                 m_user_bigram->load(last_token, user);
+                 /* start from an empty gram when load() left user unset. */
+                 if (!user)
+                     user = new SingleGram;
+
+                 guint32 total_freq = 0;
+                 ok = user->get_total_freq(total_freq);
+                 assert(ok);
+
+                 guint32 freq = 0;
+                 /* compute train factor */
+                 if (!user->get_freq(token, freq)) {
+                     ok = user->insert_freq(token, 0);
+                     assert(ok);
+                     seed = initial_seed;
+                 } else {
+                     seed = std_lite::max(freq, initial_seed);
+                     seed *= expand_factor;
+                     seed = std_lite::min(seed, ceiling_seed);
+                 }
+
+                 /* protect against total_freq overflow: when the sum
+                    would wrap, the gram is neither updated nor stored. */
+                 const bool overflow =
+                     seed > 0 && total_freq > total_freq + seed;
+                 if (!overflow) {
+                     ok = user->set_total_freq(total_freq + seed);
+                     assert(ok);
+                     /* if total_freq is not overflow, then freq won't
+                        overflow. */
+                     ok = user->set_freq(token, freq + seed);
+                     assert(ok);
+                     ok = m_user_bigram->store(last_token, user);
+                     assert(ok);
+                 }
+
+                 delete user;
+             }
+
+             /* compute the position of next token. */
+             size_t next_pos = i + 1;
+             for (; next_pos < constraints->length(); ++next_pos) {
+                 phrase_token_t next_token = g_array_index
+                     (result, phrase_token_t, next_pos);
+
+                 if (null_token != next_token)
+                     break;
+             }
+             /* safe guard for last token. */
+             next_pos = std_lite::min(next_pos, constraints->length() - 1);
+
+             /* train uni-gram */
+             m_phrase_index->get_phrase_item(token, m_cached_phrase_item);
+             increase_pronunciation_possibility
+                 (matrix, i, next_pos,
+                  m_cached_keys, m_cached_phrase_item, seed * pinyin_factor);
+             m_phrase_index->add_unigram_frequency
+                 (token, seed * unigram_factor);
+         }
+
+         last_token = token;
+     }
+
+     return true;
+ }
};
};