From aced528d37c803442cd1f39e51e36e96c59d94f6 Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Wed, 10 Oct 2012 13:55:07 +0800
Subject: re-factor phrase lookup

---
 src/lookup/phrase_lookup.cpp | 110 +++++++++++++++++++++++++++++++++++++------
 src/lookup/phrase_lookup.h | 4 +-
 2 files changed, 98 insertions(+), 16 deletions(-)

(limited to 'src/lookup')

diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp
index 9f4cf45..3cfdd3a 100644
--- a/src/lookup/phrase_lookup.cpp
+++ b/src/lookup/phrase_lookup.cpp
@@ -133,20 +133,8 @@ bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
 
             /* found next phrase */
             if ( result & SEARCH_OK ) {
-                /* iterate every token. */
-                for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
-                    GArray * array = tokens[n];
-                    if (NULL == array)
-                        continue;
-
-                    /* just skip the loop when the length is zero. */
-                    for (size_t k = 0; k < array->len; ++k) {
-                        phrase_token_t next_token =
-                            g_array_index(array, phrase_token_t, k);
-                        search_bigram(i, next_token),
-                        search_unigram(i, next_token);
-                    }
-                }
+                search_bigram2(i, tokens),
+                search_unigram2(i, tokens); /* comma operator: both calls run, bigram first, and both bool results are discarded */
             }
 
             /* no longer phrase */
@@ -160,6 +148,8 @@ bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
     return final_step(results);
 }
 
+#if 0 /* the per-token search_unigram and search_bigram below are compiled out */
+
 bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
 
     LookupStepContent lookup_content = (LookupStepContent)
@@ -203,6 +213,98 @@ bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
     return found;
 }
 
+#endif /* end of the compiled-out per-token implementations */
+
+bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){ /* extends step nstep with every token in tokens, starting only from the entry of that step with the largest m_poss; returns true if any unigram_gen_next_step call returned true */
+    bool found = false;
+
+    LookupStepContent lookup_content = (LookupStepContent)
+        g_ptr_array_index(m_steps_content, nstep);
+    if ( 0 == lookup_content->len ) /* no entries at this step, so there is nothing to extend */
+        return found;
+
+    /* find the maximum node: the entry with the largest m_poss (strict comparison, so the first one wins on ties) */
+    lookup_value_t * max_value = &g_array_index
+        (lookup_content, lookup_value_t, 0);
+
+    for (size_t i = 1; i < lookup_content->len; ++i) {
+        lookup_value_t * cur_value = &g_array_index (lookup_content, lookup_value_t, i);
+        if (cur_value->m_poss > max_value->m_poss)
+            max_value = cur_value;
+    }
+
+    /* iterate over tokens: one GArray slot per index below PHRASE_INDEX_LIBRARY_COUNT, NULL slots are skipped */
+    for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+        GArray * array = tokens[n];
+        if (NULL == array)
+            continue;
+
+        /* just skip the loop when the length is zero. */
+        for (size_t k = 0; k < array->len; ++k) {
+            phrase_token_t token =
+                g_array_index(array, phrase_token_t, k);
+
+            found = unigram_gen_next_step
+                (nstep, max_value, token) || found; /* the call is the left operand so it still runs for every token once found is already true */
+        }
+    }
+
+    return found;
+}
+
+bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){ /* for every entry at step nstep, merges the system and user single grams loaded for the entry m_handles[1] and calls bigram_gen_next_step for each token that has a frequency there; returns true if any of those calls returned true */
+    bool found = false;
+
+    LookupStepContent lookup_content = (LookupStepContent)
+        g_ptr_array_index(m_steps_content, nstep);
+    if (0 == lookup_content->len) /* no entries at this step, so there is nothing to extend */
+        return found;
+
+    for (size_t i = 0; i < lookup_content->len; ++i) {
+        lookup_value_t * cur_value = &g_array_index
+            (lookup_content, lookup_value_t, i);
+        phrase_token_t index_token = cur_value->m_handles[1]; /* presumably the most recent token of this candidate -- TODO confirm against lookup_value_t */
+
+        SingleGram * system = NULL, * user = NULL; /* presumably filled in by the load calls below when an entry exists -- confirm */
+        m_system_bigram->load(index_token, system);
+        m_user_bigram->load(index_token, user);
+
+        if (!merge_single_gram
+            (&m_merged_single_gram, system, user))
+            continue; /* NOTE(review): this skips the deletes at the end of the loop body; safe only if a false return implies both pointers are NULL -- confirm */
+
+        /* iterate over tokens: one GArray slot per index below PHRASE_INDEX_LIBRARY_COUNT, NULL slots are skipped */
+        for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+            GArray * array = tokens[n];
+            if (NULL == array)
+                continue;
+
+            /* just skip the loop when the length is zero. */
+            for (size_t k = 0; k < array->len; ++k) {
+                phrase_token_t token =
+                    g_array_index(array, phrase_token_t, k);
+
+                guint32 freq = 0;
+                if (m_merged_single_gram.get_freq(token, freq)) {
+                    guint32 total_freq = 0;
+                    m_merged_single_gram.get_total_freq(total_freq); /* NOTE(review): result is not checked; total_freq keeps its initial 0 if the call does not set it, and the division below would then divide by zero -- confirm this cannot happen here */
+
+                    gfloat bigram_poss = freq / (gfloat) total_freq; /* share of token in the merged single gram total */
+                    found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; /* call first, so it runs for every matching token */
+                }
+            }
+        }
+
+        if (system) /* free the per-entry single grams before moving to the next entry */
+            delete system;
+        if (user)
+            delete user;
+    }
+
+    return found;
+}
+
 bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
 phrase_token_t token){
 
diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h
index c2537ef..65bd2cd 100644
--- a/src/lookup/phrase_lookup.h
+++ b/src/lookup/phrase_lookup.h
@@ -69,8 +69,8 @@ protected:
     /* Explicitly search the next phrase,
      * to avoid double phrase lookup as the next token has only one.
      */
-    bool search_unigram(int nstep, phrase_token_t token);
-    bool search_bigram(int nstep, phrase_token_t token);
+    bool search_unigram2(int nstep, PhraseTokens tokens); /* takes the whole PhraseTokens set instead of a single token */
+    bool search_bigram2(int nstep, PhraseTokens tokens); /* takes the whole PhraseTokens set instead of a single token */
     bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
     bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);
 
--
cgit