From eb63539270fab81678da97c292d247796bd5a367 Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Mon, 10 Sep 2012 15:48:26 +0800
Subject: write search_unigram2 and search_bigram2

---
 src/lookup/pinyin_lookup2.cpp | 106 ++++++++++++++++++++++++++++++++++++++++--
 src/lookup/pinyin_lookup2.h | 4 +-
 2 files changed, 105 insertions(+), 5 deletions(-)

diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp
index 40730ae..bb57295 100644
--- a/src/lookup/pinyin_lookup2.cpp
+++ b/src/lookup/pinyin_lookup2.cpp
@@ -20,6 +20,7 @@
  */

 #include
+#include "facade_chewing_table.h"
 #include "pinyin_lookup2.h"
 #include "stl_lite.h"

@@ -243,7 +244,7 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
                 break;

             lookup_constraint_t * next_constraint = &g_array_index
-                (m_constraints, lookup_constraint_t, m);
+                (m_constraints, lookup_constraint_t, m - 1);

             if (CONSTRAINT_NOSEARCH == next_constraint->m_type)
                 break;
@@ -255,8 +256,8 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
             populate_candidates(candidates, step);
             get_top_results(topresults, candidates);

-            search_bigram(topresults, i, m, ranges),
-            search_unigram(topresults, i, m, ranges);
+            search_bigram2(topresults, i, ranges),
+            search_unigram2(topresults, i, ranges);

             /* no longer pinyin */
             if (!(result & SEARCH_CONTINUED))
@@ -271,3 +272,102 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,

     return final_step(results);
 }
+
+bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep, /* unigram search for step nstep; returns true if any unigram_gen_next_step() call returned true */
+                                    PhraseIndexRanges ranges) {
+    if (0 == topresults->len) /* no top results: nothing is generated */
+        return false;
+
+    lookup_value_t * max = (lookup_value_t *) /* only entry 0 of topresults is used here; presumably the best one - confirm get_top_results() ordering */
+        g_ptr_array_index(topresults, 0);
+
+    lookup_constraint_t * constraint = /* constraint record stored for this step */
+        &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+    if (CONSTRAINT_ONESTEP == constraint->m_type) { /* constrained step: only the constraint's own token is tried */
+        return unigram_gen_next_step(nstep, max, constraint->m_token);
+    }
+
+    bool found = false;
+
+    if (NO_CONSTRAINT == constraint->m_type) { /* unconstrained step: every token of every PhraseIndexRange in ranges is tried */
+        for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ GArray * array = ranges[m];
+            if ( !array ) continue; /* no range array at index m */
+            for ( size_t n = 0; n < array->len; ++n){
+                PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+                for ( phrase_token_t token = range->m_range_begin; /* half-open walk: m_range_end itself is not visited */
+                      token != range->m_range_end; ++token){
+                    found = unigram_gen_next_step(nstep, max, token)|| found; /* call is the left operand, so it always runs */
+                }
+            }
+        }
+    }
+
+    return found; /* still false for any constraint type other than the two handled above */
+}
+
+bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep, /* bigram search for step nstep over every top result; returns true if any bigram_gen_next_step() call returned true */
+                                   PhraseIndexRanges ranges) {
+    if (0 == topresults->len) /* no top results: nothing is generated */
+        return false;
+
+    lookup_constraint_t* constraint = /* constraint record stored for this step */
+        &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+    bool found = false;
+    BigramPhraseArray bigram_phrase_items = g_array_new /* scratch array reused by every search() call; freed before returning */
+        (FALSE, FALSE, sizeof(BigramPhraseItem));
+
+    for (size_t i = 0; i < topresults->len; ++i) {
+        lookup_value_t * value = (lookup_value_t *)
+            g_ptr_array_index(topresults, i);
+
+        phrase_token_t index_token = value->m_handles[1]; /* NOTE(review): looks like the latest token of this candidate - confirm m_handles layout */
+
+        SingleGram * system = NULL, * user = NULL; /* released at the bottom of this iteration when non-NULL */
+        m_system_bigram->load(index_token, system); /* presumably sets `system` on success - return value is not checked */
+        m_user_bigram->load(index_token, user);
+
+        if ( !merge_single_gram(&m_merged_single_gram, system, user) ) /* NOTE(review): the continue below skips the deletes at the end of the loop body - confirm merge fails only when both inputs are NULL */
+            continue;
+
+        if ( CONSTRAINT_ONESTEP == constraint->m_type ){ /* constrained step: only the constraint's own token is tried */
+            phrase_token_t token = constraint->m_token;
+
+            guint32 freq;
+            if( m_merged_single_gram.get_freq(token, freq) ){
+                guint32 total_freq;
+                m_merged_single_gram.get_total_freq(total_freq); /* NOTE(review): result unchecked; total_freq is used as a divisor on the next line */
+                gfloat bigram_poss = freq / (gfloat) total_freq; /* cast forces floating-point division */
+                found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found;
+            }
+        }
+
+        if (NO_CONSTRAINT == constraint->m_type) { /* unconstrained step: search the merged single gram within every range */
+            for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+                GArray * array = ranges[m];
+                if ( !array ) continue; /* no range array at index m */
+
+                for ( size_t n = 0; n < array->len; ++n){
+                    PhraseIndexRange * range =
+                        &g_array_index(array, PhraseIndexRange, n);
+
+                    g_array_set_size(bigram_phrase_items, 0); /* empty the scratch array before each search */
+                    m_merged_single_gram.search(range, bigram_phrase_items);
+                    for( size_t k = 0; k < bigram_phrase_items->len; ++k) {
+                        BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+                        found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found; /* m_freq is passed where the ONESTEP branch passes freq/total_freq - presumably already normalized by search(); confirm */
+                    }
+                }
+            }
+        }
+        if (system)
+            delete system;
+        if (user)
+            delete user;
+    }
+
+    g_array_free(bigram_phrase_items, TRUE); /* TRUE: release the element storage together with the GArray */
+    return found;
+}
diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h
index bf59f1c..e5bf127 100644
--- a/src/lookup/pinyin_lookup2.h
+++ b/src/lookup/pinyin_lookup2.h
@@ -104,9 +104,9 @@ protected:
     /* Array of LookupStepContent */


-    bool search_unigram2(GPtrArray * topresults, int nstep, int npinyin,
+    bool search_unigram2(GPtrArray * topresults, int nstep,
                          PhraseIndexRanges ranges);
-    bool search_bigram2(GPtrArray * topresults, int nstep, int npinyin,
+    bool search_bigram2(GPtrArray * topresults, int nstep,
                         PhraseIndexRanges ranges);

     bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
--
cgit