summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2012-09-11 14:11:04 +0800
committerPeng Wu <alexepico@gmail.com>2012-09-11 14:16:18 +0800
commitc6aaa8f66fb0267fadb6b72ef91b0143b7676c16 (patch)
treeeaa253435491c619dc53f023c6e78cde7aa38e29
parent8ebdaf1f72ad559c9c1fab72ca22e978697e64c9 (diff)
downloadlibpinyin-c6aaa8f66fb0267fadb6b72ef91b0143b7676c16.tar.gz
libpinyin-c6aaa8f66fb0267fadb6b72ef91b0143b7676c16.tar.xz
libpinyin-c6aaa8f66fb0267fadb6b72ef91b0143b7676c16.zip
write get_best_match
-rw-r--r--src/lookup/pinyin_lookup2.cpp143
1 files changed, 143 insertions, 0 deletions
diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp
index 8834a44..a87cf89 100644
--- a/src/lookup/pinyin_lookup2.cpp
+++ b/src/lookup/pinyin_lookup2.cpp
@@ -373,3 +373,146 @@ bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
g_array_free(bigram_phrase_items, TRUE);
return found;
}
+
+
+bool PinyinLookup2::unigram_gen_next_step(int nstep,
+ lookup_value_t * cur_step,
+ phrase_token_t token) {
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
+ m_phrase_index->get_phrase_index_total_freq();
+ if ( elem_poss < DBL_EPSILON )
+ return false;
+
+ ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
+ gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
+ if (pinyin_poss < FLT_EPSILON )
+ return false;
+
+ lookup_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
+ next_step.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup2::bigram_gen_next_step(int nstep,
+ lookup_value_t * cur_step,
+ phrase_token_t token,
+ gfloat bigram_poss) {
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
+ (gdouble) m_phrase_index->get_phrase_index_total_freq();
+ if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
+ return false;
+
+ ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
+ gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
+ if ( pinyin_poss < FLT_EPSILON )
+ return false;
+
+ lookup_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_poss = cur_step->m_poss +
+ log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss);
+ next_step.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup2::save_next_step(int next_step_pos,
+ lookup_value_t * cur_step,
+ lookup_value_t * next_step){
+
+ lookup_key_t next_key = next_step->m_handles[1];
+ LookupStepIndex next_lookup_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, next_step_pos);
+ LookupStepContent next_lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, next_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean lookup_result = g_hash_table_lookup_extended
+ (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
+
+ if ( !lookup_result ){
+ g_array_append_val(next_lookup_content, *next_step);
+ g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1));
+ return true;
+ }else{
+ size_t step_index = GPOINTER_TO_UINT(value);
+ lookup_value_t * orig_next_value = &g_array_index
+ (next_lookup_content, lookup_value_t, step_index);
+
+ if ( orig_next_value->m_poss < next_step->m_poss) {
+ /* found better result. */
+ orig_next_value->m_handles[0] = next_step->m_handles[0];
+ assert(orig_next_value->m_handles[1] == next_step->m_handles[1]);
+ orig_next_value->m_poss = next_step->m_poss;
+ orig_next_value->m_last_step = next_step->m_last_step;
+ return true;
+ }
+
+ return false;
+ }
+}
+
+bool PinyinLookup2::final_step(MatchResults & results){
+
+ /* reset results */
+ g_array_set_size(results, m_steps_content->len - 1);
+ for (size_t i = 0; i < results->len; ++i){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ *token = null_token;
+ }
+
+ /* find max element */
+ size_t last_step_pos = m_steps_content->len - 1;
+ GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos);
+ if ( last_step_array->len == 0 )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0);
+ for ( size_t i = 1; i < last_step_array->len; ++i){
+ lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ /* backtracing */
+ while( true ){
+ int cur_step_pos = max_value->m_last_step;
+ if ( -1 == cur_step_pos )
+ break;
+
+ phrase_token_t * token = &g_array_index
+ (results, phrase_token_t, cur_step_pos);
+ *token = max_value->m_handles[1];
+
+ phrase_token_t last_token = max_value->m_handles[0];
+ LookupStepIndex lookup_step_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, cur_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean result = g_hash_table_lookup_extended
+ (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
+ if (!result)
+ return false;
+
+ LookupStepContent lookup_step_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, cur_step_pos);
+ max_value = &g_array_index
+ (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+ }
+
+ /* no need to reverse the result */
+ return true;
+}