diff options
author | Peng Wu <alexepico@gmail.com> | 2012-09-10 15:48:26 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-09-10 15:51:00 +0800 |
commit | eb63539270fab81678da97c292d247796bd5a367 (patch) | |
tree | bdb4bf0b72cb82b557fd6b213982c099c1580a7e | |
parent | 7230dbfe956c6a3fc5c060f2859867f825039d25 (diff) | |
download | libpinyin-eb63539270fab81678da97c292d247796bd5a367.tar.gz libpinyin-eb63539270fab81678da97c292d247796bd5a367.tar.xz libpinyin-eb63539270fab81678da97c292d247796bd5a367.zip |
write search_unigram2 and search_bigram2
-rw-r--r-- | src/lookup/pinyin_lookup2.cpp | 106 | ||||
-rw-r--r-- | src/lookup/pinyin_lookup2.h | 4 |
2 files changed, 105 insertions, 5 deletions
diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp
index 40730ae..bb57295 100644
--- a/src/lookup/pinyin_lookup2.cpp
+++ b/src/lookup/pinyin_lookup2.cpp
@@ -20,6 +20,7 @@
  */
 
 #include <math.h>
+#include "facade_chewing_table.h"
 #include "pinyin_lookup2.h"
 #include "stl_lite.h"
 
@@ -243,7 +244,7 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
                 break;
 
             lookup_constraint_t * next_constraint = &g_array_index
-                (m_constraints, lookup_constraint_t, m);
+                (m_constraints, lookup_constraint_t, m - 1);
 
             if (CONSTRAINT_NOSEARCH == next_constraint->m_type)
                 break;
@@ -255,8 +256,8 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
         populate_candidates(candidates, step);
         get_top_results(topresults, candidates);
 
-        search_bigram(topresults, i, m, ranges),
-            search_unigram(topresults, i, m, ranges);
+        search_bigram2(topresults, i, ranges),
+            search_unigram2(topresults, i, ranges);
 
         /* no longer pinyin */
         if (!(result & SEARCH_CONTINUED))
@@ -271,3 +272,102 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
 
     return final_step(results);
 }
+
+bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
+                                    PhraseIndexRanges ranges) {
+    if (0 == topresults->len)
+        return false;
+
+    lookup_value_t * max = (lookup_value_t *)
+        g_ptr_array_index(topresults, 0);
+
+    lookup_constraint_t * constraint =
+        &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+    if (CONSTRAINT_ONESTEP == constraint->m_type) {
+        return unigram_gen_next_step(nstep, max, constraint->m_token);
+    }
+
+    bool found = false;
+
+    if (NO_CONSTRAINT == constraint->m_type) {
+        for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+            GArray * array = ranges[m];
+            if ( !array ) continue;
+            for ( size_t n = 0; n < array->len; ++n){
+                PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+                for ( phrase_token_t token = range->m_range_begin;
+                      token != range->m_range_end; ++token){
+                    found = unigram_gen_next_step(nstep, max, token)|| found;
+                }
+            }
+        }
+    }
+
+    return found;
+}
+
+bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
+                                   PhraseIndexRanges ranges) {
+    if (0 == topresults->len)
+        return false;
+
+    lookup_constraint_t* constraint =
+        &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+    bool found = false;
+    BigramPhraseArray bigram_phrase_items = g_array_new
+        (FALSE, FALSE, sizeof(BigramPhraseItem));
+
+    for (size_t i = 0; i < topresults->len; ++i) {
+        lookup_value_t * value = (lookup_value_t *)
+            g_ptr_array_index(topresults, i);
+
+        phrase_token_t index_token = value->m_handles[1];
+
+        SingleGram * system = NULL, * user = NULL;
+        m_system_bigram->load(index_token, system);
+        m_user_bigram->load(index_token, user);
+
+        if ( !merge_single_gram(&m_merged_single_gram, system, user) )
+            continue;
+
+        if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+            phrase_token_t token = constraint->m_token;
+
+            guint32 freq;
+            if( m_merged_single_gram.get_freq(token, freq) ){
+                guint32 total_freq;
+                m_merged_single_gram.get_total_freq(total_freq);
+                gfloat bigram_poss = freq / (gfloat) total_freq;
+                found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found;
+            }
+        }
+
+        if (NO_CONSTRAINT == constraint->m_type) {
+            for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+                GArray * array = ranges[m];
+                if ( !array ) continue;
+
+                for ( size_t n = 0; n < array->len; ++n){
+                    PhraseIndexRange * range =
+                        &g_array_index(array, PhraseIndexRange, n);
+
+                    g_array_set_size(bigram_phrase_items, 0);
+                    m_merged_single_gram.search(range, bigram_phrase_items);
+                    for( size_t k = 0; k < bigram_phrase_items->len; ++k) {
+                        BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+                        found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found;
+                    }
+                }
+            }
+        }
+        if (system)
+            delete system;
+        if (user)
+            delete user;
+    }
+
+    g_array_free(bigram_phrase_items, TRUE);
+    return found;
+}
diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h
index bf59f1c..e5bf127 100644
--- a/src/lookup/pinyin_lookup2.h
+++ b/src/lookup/pinyin_lookup2.h
@@ -104,9 +104,9 @@ protected:
 
     /* Array of LookupStepContent */
 
-    bool search_unigram2(GPtrArray * topresults, int nstep, int npinyin,
+    bool search_unigram2(GPtrArray * topresults, int nstep,
                          PhraseIndexRanges ranges);
-    bool search_bigram2(GPtrArray * topresults, int nstep, int npinyin,
+    bool search_bigram2(GPtrArray * topresults, int nstep,
                         PhraseIndexRanges ranges);
 
     bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);