diff options
author | Peng Wu <alexepico@gmail.com> | 2017-01-20 13:54:55 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2017-01-20 13:54:55 +0800 |
commit | 8b603e06de3cad13918970175478333cca687bfc (patch) | |
tree | c438537538b78370fc6542f8cc70ed28ec991e65 /src/lookup | |
parent | d6b988a24c1895127b1d95582372cac3fd6b339a (diff) | |
download | libpinyin-8b603e06de3cad13918970175478333cca687bfc.tar.gz libpinyin-8b603e06de3cad13918970175478333cca687bfc.tar.xz libpinyin-8b603e06de3cad13918970175478333cca687bfc.zip |
write unigram_gen_next_step and bigram_gen_next_step method
Diffstat (limited to 'src/lookup')
-rw-r--r-- | src/lookup/phonetic_lookup.h | 55 |
1 files changed, 53 insertions, 2 deletions
diff --git a/src/lookup/phonetic_lookup.h b/src/lookup/phonetic_lookup.h index 6415b27..a21aca6 100644 --- a/src/lookup/phonetic_lookup.h +++ b/src/lookup/phonetic_lookup.h @@ -437,6 +437,8 @@ private: const gfloat unigram_lambda; /* memory cache */ + GArray * m_cached_keys; + PhraseItem m_cached_phrase_item; SingleGram m_merged_single_gram; protected: @@ -558,11 +560,60 @@ protected: bool unigram_gen_next_step(int start, int end, trellis_value_t * cur_step, - phrase_token_t token); + phrase_token_t token) { + if (m_phrase_index->get_phrase_item(token, m_cached_phrase_item)) + return false; + + size_t phrase_length = m_cached_phrase_item.get_phrase_length(); + gdouble elem_poss = m_cached_phrase_item.get_unigram_frequency() / + (gdouble) m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < DBL_EPSILON ) + return false; + + gfloat pinyin_poss = compute_pronunciation_possibility + (m_matrix, start, end, m_cached_keys, m_cached_phrase_item); + if (pinyin_poss < FLT_EPSILON ) + return false; + + trellis_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_sentence_length = cur_step->m_sentence_length + phrase_length; + next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda); + next_step.m_last_step = start; + next_step.m_sub_index = cur_step->m_current_index; + + return save_next_step(end, cur_step, &next_step); + } + bool bigram_gen_next_step(int start, int end, trellis_value_t * cur_step, phrase_token_t token, - gfloat bigram_poss); + gfloat bigram_poss) { + if (m_phrase_index->get_phrase_item(token, m_cached_phrase_item)) + return false; + + size_t phrase_length = m_cached_phrase_item.get_phrase_length(); + gdouble unigram_poss = m_cached_phrase_item.get_unigram_frequency() / + (gdouble) m_phrase_index->get_phrase_index_total_freq(); + if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) + return false; + + gfloat pinyin_poss = compute_pronunciation_possibility + (m_matrix, start, end, + m_cached_keys, m_cached_phrase_item); + if ( pinyin_poss < FLT_EPSILON ) + return false; + + trellis_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_sentence_length = cur_step->m_sentence_length + phrase_length; + next_step.m_poss = cur_step->m_poss + + log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss); + next_step.m_last_step = start; + next_step.m_sub_index = cur_step->m_current_index; + + return save_next_step(end, cur_step, &next_step); + } bool save_next_step(int next_step_pos, trellis_value_t * cur_step, trellis_value_t * next_step); |