summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com>, 2016-05-25 15:50:39 +0800
committer: Peng Wu <alexepico@gmail.com>, 2016-05-25 15:50:39 +0800
commit: 955b6ea0f53ce0f8708c08d113ea1c44478d685d (patch)
tree: 3030d2fe9c0fb7cb41ae72d278971cddaaf87374
parent: e3b1c578005402c45cc27049fafe4c732f7e493c (diff)
download: libpinyin-955b6ea0f53ce0f8708c08d113ea1c44478d685d.tar.gz
          libpinyin-955b6ea0f53ce0f8708c08d113ea1c44478d685d.tar.xz
          libpinyin-955b6ea0f53ce0f8708c08d113ea1c44478d685d.zip
update search_*gram method
-rw-r--r--  src/lookup/lookup.h            6
-rw-r--r--  src/lookup/pinyin_lookup2.cpp  37
-rw-r--r--  src/lookup/pinyin_lookup2.h    15
3 files changed, 36 insertions, 22 deletions
diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h
index bcc7006..902ed0d 100644
--- a/src/lookup/lookup.h
+++ b/src/lookup/lookup.h
@@ -37,13 +37,17 @@ typedef phrase_token_t lookup_key_t;
struct lookup_value_t{
/* previous and current tokens of the node */
phrase_token_t m_handles[2];
+ /* the sentence length */
+ gint32 m_length;
/* maximum possibility of current node */
gfloat m_poss;
/* trace back information for final step */
gint32 m_last_step;
lookup_value_t(gfloat poss = FLT_MAX){
- m_handles[0] = null_token; m_handles[1] = null_token;
+ m_handles[0] = null_token;
+ m_handles[1] = null_token;
+ m_length = 0;
m_poss = poss;
m_last_step = -1;
}
diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp
index b5f115c..8c2ed50 100644
--- a/src/lookup/pinyin_lookup2.cpp
+++ b/src/lookup/pinyin_lookup2.cpp
@@ -288,7 +288,8 @@ bool PinyinLookup2::get_best_match(TokenVector prefixes,
return final_step(results);
}
-bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
+bool PinyinLookup2::search_unigram2(GPtrArray * topresults,
+ int start, int end,
PhraseIndexRanges ranges) {
if (0 == topresults->len)
@@ -298,10 +299,10 @@ bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
g_ptr_array_index(topresults, 0);
lookup_constraint_t * constraint =
- &g_array_index(m_constraints, lookup_constraint_t, nstep);
+ &g_array_index(m_constraints, lookup_constraint_t, start);
if (CONSTRAINT_ONESTEP == constraint->m_type) {
- return unigram_gen_next_step(nstep, max, constraint->m_token);
+ return unigram_gen_next_step(start, end, max, constraint->m_token);
}
bool found = false;
@@ -315,7 +316,8 @@ bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
for ( phrase_token_t token = range->m_range_begin;
token != range->m_range_end; ++token){
- found = unigram_gen_next_step(nstep, max, token)|| found;
+ found = unigram_gen_next_step(start, end, max, token) ||
+ found;
}
}
}
@@ -324,11 +326,12 @@ bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
return found;
}
-bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
+bool PinyinLookup2::search_bigram2(GPtrArray * topresults,
+ int start, int end,
PhraseIndexRanges ranges) {
lookup_constraint_t * constraint =
- &g_array_index(m_constraints, lookup_constraint_t, nstep);
+ &g_array_index(m_constraints, lookup_constraint_t, start);
bool found = false;
BigramPhraseArray bigram_phrase_items = g_array_new
@@ -340,24 +343,24 @@ bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
phrase_token_t index_token = value->m_handles[1];
- SingleGram * system = NULL, * user = NULL;
- m_system_bigram->load(index_token, system);
+ SingleGram * system = NULL, * user = NULL;
+ m_system_bigram->load(index_token, system);
m_user_bigram->load(index_token, user);
if ( !merge_single_gram(&m_merged_single_gram, system, user) )
continue;
- if ( CONSTRAINT_ONESTEP == constraint->m_type ){
- phrase_token_t token = constraint->m_token;
+ if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+ phrase_token_t token = constraint->m_token;
guint32 freq;
if( m_merged_single_gram.get_freq(token, freq) ){
guint32 total_freq;
m_merged_single_gram.get_total_freq(total_freq);
gfloat bigram_poss = freq / (gfloat) total_freq;
- found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found;
+ found = bigram_gen_next_step(start, end, value, token, bigram_poss) || found;
}
- }
+ }
if (NO_CONSTRAINT == constraint->m_type) {
for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
@@ -372,7 +375,7 @@ bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
m_merged_single_gram.search(range, bigram_phrase_items);
for( size_t k = 0; k < bigram_phrase_items->len; ++k) {
BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
- found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found;
+ found = bigram_gen_next_step(start, end, value, item->m_token, item->m_freq) || found;
}
}
}
@@ -393,18 +396,18 @@ bool PinyinLookup2::unigram_gen_next_step(int nstep,
phrase_token_t token) {
if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
- return false;
+ return false;
size_t phrase_length = m_cache_phrase_item.get_phrase_length();
gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
- m_phrase_index->get_phrase_index_total_freq();
+ m_phrase_index->get_phrase_index_total_freq();
if ( elem_poss < DBL_EPSILON )
- return false;
+ return false;
ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
if (pinyin_poss < FLT_EPSILON )
- return false;
+ return false;
lookup_value_t next_step;
next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h
index 25cae10..1a8e888 100644
--- a/src/lookup/pinyin_lookup2.h
+++ b/src/lookup/pinyin_lookup2.h
@@ -107,13 +107,20 @@ protected:
/* Array of LookupStepContent */
- bool search_unigram2(GPtrArray * topresults, int nstep,
+ bool search_unigram2(GPtrArray * topresults,
+ int start, int end,
PhraseIndexRanges ranges);
- bool search_bigram2(GPtrArray * topresults, int nstep,
+ bool search_bigram2(GPtrArray * topresults,
+ int start, int end,
PhraseIndexRanges ranges);
- bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
- bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss);
+ bool unigram_gen_next_step(int start, int end,
+ lookup_value_t * cur_step,
+ phrase_token_t token);
+ bool bigram_gen_next_step(int start, int end,
+ lookup_value_t * cur_step,
+ phrase_token_t token,
+ gfloat bigram_poss);
bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step);