summaryrefslogtreecommitdiffstats
path: root/src/lookup
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2017-01-20 13:54:55 +0800
committerPeng Wu <alexepico@gmail.com>2017-01-20 13:54:55 +0800
commit8b603e06de3cad13918970175478333cca687bfc (patch)
treec438537538b78370fc6542f8cc70ed28ec991e65 /src/lookup
parentd6b988a24c1895127b1d95582372cac3fd6b339a (diff)
downloadlibpinyin-8b603e06de3cad13918970175478333cca687bfc.tar.gz
libpinyin-8b603e06de3cad13918970175478333cca687bfc.tar.xz
libpinyin-8b603e06de3cad13918970175478333cca687bfc.zip
write unigram_gen_next_step and bigram_gen_next_step method
Diffstat (limited to 'src/lookup')
-rw-r--r--src/lookup/phonetic_lookup.h55
1 files changed, 53 insertions, 2 deletions
diff --git a/src/lookup/phonetic_lookup.h b/src/lookup/phonetic_lookup.h
index 6415b27..a21aca6 100644
--- a/src/lookup/phonetic_lookup.h
+++ b/src/lookup/phonetic_lookup.h
@@ -437,6 +437,8 @@ private:
const gfloat unigram_lambda;
/* memory cache */
+ GArray * m_cached_keys;
+ PhraseItem m_cached_phrase_item;
SingleGram m_merged_single_gram;
protected:
@@ -558,11 +560,60 @@ protected:
bool unigram_gen_next_step(int start, int end,
trellis_value_t * cur_step,
- phrase_token_t token);
+ phrase_token_t token) {
+ if (m_phrase_index->get_phrase_item(token, m_cached_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cached_phrase_item.get_phrase_length();
+ gdouble elem_poss = m_cached_phrase_item.get_unigram_frequency() /
+ (gdouble) m_phrase_index->get_phrase_index_total_freq();
+ if ( elem_poss < DBL_EPSILON )
+ return false;
+
+ gfloat pinyin_poss = compute_pronunciation_possibility
+ (m_matrix, start, end, m_cached_keys, m_cached_phrase_item);
+ if (pinyin_poss < FLT_EPSILON )
+ return false;
+
+ trellis_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_sentence_length = cur_step->m_sentence_length + phrase_length;
+ next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
+ next_step.m_last_step = start;
+ next_step.m_sub_index = cur_step->m_current_index;
+
+ return save_next_step(end, cur_step, &next_step);
+ }
+
bool bigram_gen_next_step(int start, int end,
trellis_value_t * cur_step,
phrase_token_t token,
- gfloat bigram_poss);
+ gfloat bigram_poss) {
+ if (m_phrase_index->get_phrase_item(token, m_cached_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cached_phrase_item.get_phrase_length();
+ gdouble unigram_poss = m_cached_phrase_item.get_unigram_frequency() /
+ (gdouble) m_phrase_index->get_phrase_index_total_freq();
+ if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
+ return false;
+
+ gfloat pinyin_poss = compute_pronunciation_possibility
+ (m_matrix, start, end,
+ m_cached_keys, m_cached_phrase_item);
+ if ( pinyin_poss < FLT_EPSILON )
+ return false;
+
+ trellis_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_sentence_length = cur_step->m_sentence_length + phrase_length;
+ next_step.m_poss = cur_step->m_poss +
+ log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss);
+ next_step.m_last_step = start;
+ next_step.m_sub_index = cur_step->m_current_index;
+
+ return save_next_step(end, cur_step, &next_step);
+ }
bool save_next_step(int next_step_pos, trellis_value_t * cur_step, trellis_value_t * next_step);