From c6aaa8f66fb0267fadb6b72ef91b0143b7676c16 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 11 Sep 2012 14:11:04 +0800 Subject: write get_best_match --- src/lookup/pinyin_lookup2.cpp | 143 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp index 8834a44..a87cf89 100644 --- a/src/lookup/pinyin_lookup2.cpp +++ b/src/lookup/pinyin_lookup2.cpp @@ -373,3 +373,146 @@ bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep, g_array_free(bigram_phrase_items, TRUE); return found; } + + +bool PinyinLookup2::unigram_gen_next_step(int nstep, + lookup_value_t * cur_step, + phrase_token_t token) { + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) + m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < DBL_EPSILON ) + return false; + + ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep; + gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys); + if (pinyin_poss < FLT_EPSILON ) + return false; + + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup2::bigram_gen_next_step(int nstep, + lookup_value_t * cur_step, + phrase_token_t token, + gfloat bigram_poss) { + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / + (gdouble) m_phrase_index->get_phrase_index_total_freq(); + if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) + return false; + + ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep; + gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys); + if ( pinyin_poss < FLT_EPSILON ) + return false; + + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + + log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup2::save_next_step(int next_step_pos, + lookup_value_t * cur_step, + lookup_value_t * next_step){ + + lookup_key_t next_key = next_step->m_handles[1]; + LookupStepIndex next_lookup_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, next_step_pos); + LookupStepContent next_lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, next_step_pos); + + gpointer key = NULL, value = NULL; + gboolean lookup_result = g_hash_table_lookup_extended + (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); + + if ( !lookup_result ){ + g_array_append_val(next_lookup_content, *next_step); + g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1)); + return true; + }else{ + size_t step_index = GPOINTER_TO_UINT(value); + lookup_value_t * orig_next_value = &g_array_index + (next_lookup_content, lookup_value_t, step_index); + + if ( orig_next_value->m_poss < next_step->m_poss) { + /* found better result. */ + orig_next_value->m_handles[0] = next_step->m_handles[0]; + assert(orig_next_value->m_handles[1] == next_step->m_handles[1]); + orig_next_value->m_poss = next_step->m_poss; + orig_next_value->m_last_step = next_step->m_last_step; + return true; + } + + return false; + } +} + +bool PinyinLookup2::final_step(MatchResults & results){ + + /* reset results */ + g_array_set_size(results, m_steps_content->len - 1); + for (size_t i = 0; i < results->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + *token = null_token; + } + + /* find max element */ + size_t last_step_pos = m_steps_content->len - 1; + GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos); + if ( last_step_array->len == 0 ) + return false; + + lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0); + for ( size_t i = 1; i < last_step_array->len; ++i){ + lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + /* backtracing */ + while( true ){ + int cur_step_pos = max_value->m_last_step; + if ( -1 == cur_step_pos ) + break; + + phrase_token_t * token = &g_array_index + (results, phrase_token_t, cur_step_pos); + *token = max_value->m_handles[1]; + + phrase_token_t last_token = max_value->m_handles[0]; + LookupStepIndex lookup_step_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, cur_step_pos); + + gpointer key = NULL, value = NULL; + gboolean result = g_hash_table_lookup_extended + (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); + if (!result) + return false; + + LookupStepContent lookup_step_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, cur_step_pos); + max_value = &g_array_index + (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); + } + + /* no need to reverse the result */ + return true; +} -- cgit