summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2012-09-10 15:48:26 +0800
committer: Peng Wu <alexepico@gmail.com> 2012-09-10 15:51:00 +0800
commit: eb63539270fab81678da97c292d247796bd5a367 (patch)
tree: bdb4bf0b72cb82b557fd6b213982c099c1580a7e
parent: 7230dbfe956c6a3fc5c060f2859867f825039d25 (diff)
downloadlibpinyin-eb63539270fab81678da97c292d247796bd5a367.tar.gz
libpinyin-eb63539270fab81678da97c292d247796bd5a367.tar.xz
libpinyin-eb63539270fab81678da97c292d247796bd5a367.zip
write search_unigram2 and search_bigram2
-rw-r--r-- src/lookup/pinyin_lookup2.cpp 106
-rw-r--r-- src/lookup/pinyin_lookup2.h 4
2 files changed, 105 insertions, 5 deletions
diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp
index 40730ae..bb57295 100644
--- a/src/lookup/pinyin_lookup2.cpp
+++ b/src/lookup/pinyin_lookup2.cpp
@@ -20,6 +20,7 @@
*/
#include <math.h>
+#include "facade_chewing_table.h"
#include "pinyin_lookup2.h"
#include "stl_lite.h"
@@ -243,7 +244,7 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
break;
lookup_constraint_t * next_constraint = &g_array_index
- (m_constraints, lookup_constraint_t, m);
+ (m_constraints, lookup_constraint_t, m - 1);
if (CONSTRAINT_NOSEARCH == next_constraint->m_type)
break;
@@ -255,8 +256,8 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
populate_candidates(candidates, step);
get_top_results(topresults, candidates);
- search_bigram(topresults, i, m, ranges),
- search_unigram(topresults, i, m, ranges);
+ search_bigram2(topresults, i, ranges),
+ search_unigram2(topresults, i, ranges);
/* no longer pinyin */
if (!(result & SEARCH_CONTINUED))
@@ -271,3 +272,102 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
return final_step(results);
}
+
+/**
+ * PinyinLookup2::search_unigram2:
+ * @topresults: array of lookup_value_t pointers; only element 0 is used.
+ * @nstep: index into m_constraints selecting the constraint for this step.
+ * @ranges: per-library arrays of PhraseIndexRange to enumerate.
+ *
+ * Extends the first entry of @topresults through unigram_gen_next_step().
+ * With a CONSTRAINT_ONESTEP constraint only the constrained token is tried;
+ * with NO_CONSTRAINT every token of every range in @ranges is tried.
+ * Any other constraint type generates nothing.
+ *
+ * Returns: true when at least one unigram_gen_next_step() call returned
+ * true, false otherwise (including when @topresults is empty).
+ */
+bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
+                                    PhraseIndexRanges ranges) {
+    if (topresults->len == 0)
+        return false;
+
+    /* the unigram search only ever extends the first top result */
+    lookup_value_t * best = (lookup_value_t *)
+        g_ptr_array_index(topresults, 0);
+
+    lookup_constraint_t * cons =
+        &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+    /* a one-step constraint pins the token, so there is nothing to scan */
+    if (cons->m_type == CONSTRAINT_ONESTEP)
+        return unigram_gen_next_step(nstep, best, cons->m_token);
+
+    if (cons->m_type != NO_CONSTRAINT)
+        return false;
+
+    bool any = false;
+
+    for (size_t lib = 0; lib < PHRASE_INDEX_LIBRARY_COUNT; ++lib) {
+        GArray * lib_ranges = ranges[lib];
+        if (NULL == lib_ranges)
+            continue;
+
+        for (size_t r = 0; r < lib_ranges->len; ++r) {
+            PhraseIndexRange * cur =
+                &g_array_index(lib_ranges, PhraseIndexRange, r);
+
+            /* walk the half-open token interval [m_range_begin, m_range_end) */
+            phrase_token_t tok = cur->m_range_begin;
+            while (tok != cur->m_range_end) {
+                if (unigram_gen_next_step(nstep, best, tok))
+                    any = true;
+                ++tok;
+            }
+        }
+    }
+
+    return any;
+}
+
+/**
+ * PinyinLookup2::search_bigram2:
+ * @topresults: array of lookup_value_t pointers to extend.
+ * @nstep: index into m_constraints selecting the constraint for this step.
+ * @ranges: per-library arrays of PhraseIndexRange to search.
+ *
+ * For every entry of @topresults, loads the system and user single grams
+ * keyed by the entry's m_handles[1], merges them into m_merged_single_gram
+ * and feeds matching successors to bigram_gen_next_step().
+ * With a CONSTRAINT_ONESTEP constraint only the constrained token is
+ * looked up; with NO_CONSTRAINT each range in @ranges is searched.
+ *
+ * Returns: true when at least one bigram_gen_next_step() call returned
+ * true, false otherwise (including when @topresults is empty).
+ */
+bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
+                                   PhraseIndexRanges ranges) {
+    if (0 == topresults->len)
+        return false;
+
+    lookup_constraint_t * constraint =
+        &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+    bool found = false;
+    /* scratch buffer reused for every range search below */
+    BigramPhraseArray bigram_phrase_items = g_array_new
+        (FALSE, FALSE, sizeof(BigramPhraseItem));
+
+    for (size_t i = 0; i < topresults->len; ++i) {
+        lookup_value_t * value = (lookup_value_t *)
+            g_ptr_array_index(topresults, i);
+
+        phrase_token_t index_token = value->m_handles[1];
+
+        SingleGram * system = NULL, * user = NULL;
+        m_system_bigram->load(index_token, system);
+        m_user_bigram->load(index_token, user);
+
+        if ( !merge_single_gram(&m_merged_single_gram, system, user) ) {
+            /* release whatever was loaded before skipping this entry;
+               the cleanup at the end of the loop body is not reached
+               from here (delete on a null pointer is a no-op) */
+            delete system;
+            delete user;
+            continue;
+        }
+
+        if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+            phrase_token_t token = constraint->m_token;
+
+            guint32 freq;
+            if( m_merged_single_gram.get_freq(token, freq) ){
+                guint32 total_freq;
+                m_merged_single_gram.get_total_freq(total_freq);
+                /* relative frequency of token after index_token */
+                gfloat bigram_poss = freq / (gfloat) total_freq;
+                found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found;
+            }
+        }
+
+        if (NO_CONSTRAINT == constraint->m_type) {
+            for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+                GArray * array = ranges[m];
+                if ( !array ) continue;
+
+                for ( size_t n = 0; n < array->len; ++n){
+                    PhraseIndexRange * range =
+                        &g_array_index(array, PhraseIndexRange, n);
+
+                    /* empty the scratch buffer before each search */
+                    g_array_set_size(bigram_phrase_items, 0);
+                    m_merged_single_gram.search(range, bigram_phrase_items);
+                    for( size_t k = 0; k < bigram_phrase_items->len; ++k) {
+                        BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+                        found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found;
+                    }
+                }
+            }
+        }
+
+        delete system;
+        delete user;
+    }
+
+    g_array_free(bigram_phrase_items, TRUE);
+    return found;
+}
diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h
index bf59f1c..e5bf127 100644
--- a/src/lookup/pinyin_lookup2.h
+++ b/src/lookup/pinyin_lookup2.h
@@ -104,9 +104,9 @@ protected:
/* Array of LookupStepContent */
- bool search_unigram2(GPtrArray * topresults, int nstep, int npinyin,
+ bool search_unigram2(GPtrArray * topresults, int nstep,
PhraseIndexRanges ranges);
- bool search_bigram2(GPtrArray * topresults, int nstep, int npinyin,
+ bool search_bigram2(GPtrArray * topresults, int nstep,
PhraseIndexRanges ranges);
bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);