summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2012-10-10 13:55:07 +0800
committer: Peng Wu <alexepico@gmail.com> 2012-10-10 13:55:07 +0800
commit: aced528d37c803442cd1f39e51e36e96c59d94f6 (patch)
tree: 801b0a828d28b90469fd2d026c37d2a10a0add48
parent: 1f6d0a9f5155f1e95d1717625fe0a5b4a679aac2 (diff)
downloadlibpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.tar.gz
libpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.tar.xz
libpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.zip
re-factor phrase lookup
-rw-r--r-- src/lookup/phrase_lookup.cpp 110
-rw-r--r-- src/lookup/phrase_lookup.h 4
2 files changed, 98 insertions, 16 deletions
diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp
index 9f4cf45..3cfdd3a 100644
--- a/src/lookup/phrase_lookup.cpp
+++ b/src/lookup/phrase_lookup.cpp
@@ -133,20 +133,8 @@ bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
/* found next phrase */
if ( result & SEARCH_OK ) {
- /* iterate every token. */
- for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
- GArray * array = tokens[n];
- if (NULL == array)
- continue;
-
- /* just skip the loop when the length is zero. */
- for (size_t k = 0; k < array->len; ++k) {
- phrase_token_t next_token =
- g_array_index(array, phrase_token_t, k);
- search_bigram(i, next_token),
- search_unigram(i, next_token);
- }
- }
+ search_bigram2(i, tokens),
+ search_unigram2(i, tokens);
}
/* no longer phrase */
@@ -160,6 +148,8 @@ bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
return final_step(results);
}
+#if 0
+
bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
LookupStepContent lookup_content = (LookupStepContent)
@@ -213,6 +203,98 @@ bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
return found;
}
+#endif
+
+/* Extend step nstep with every token in tokens using the unigram model.
+ *
+ * Only the entry of step nstep with the largest m_poss is extended; when
+ * several entries share that value the earliest one is used.
+ *
+ * Returns true when unigram_gen_next_step() reported success for at least
+ * one token, false otherwise (including when step nstep has no entries).
+ */
+bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
+    LookupStepContent step_values = (LookupStepContent)
+        g_ptr_array_index(m_steps_content, nstep);
+    if ( step_values->len == 0 )
+        return false;
+
+    /* locate the entry with the highest m_poss */
+    lookup_value_t * best = &g_array_index
+        (step_values, lookup_value_t, 0);
+    for (size_t idx = 1; idx < step_values->len; ++idx) {
+        lookup_value_t * candidate = &g_array_index
+            (step_values, lookup_value_t, idx);
+        if (!(candidate->m_poss > best->m_poss))
+            continue;
+        best = candidate;
+    }
+
+    /* walk each per-library token array; absent (NULL) arrays are skipped */
+    bool any_generated = false;
+    for (size_t lib = 0; lib < PHRASE_INDEX_LIBRARY_COUNT; ++lib) {
+        GArray * lib_tokens = tokens[lib];
+        if (NULL == lib_tokens)
+            continue;
+
+        /* an empty array simply yields no iterations */
+        for (size_t pos = 0; pos < lib_tokens->len; ++pos) {
+            phrase_token_t token =
+                g_array_index(lib_tokens, phrase_token_t, pos);
+
+            /* always invoke the generator, then fold its result in */
+            if (unigram_gen_next_step(nstep, best, token))
+                any_generated = true;
+        }
+    }
+
+    return any_generated;
+}
+
+/* Extend step nstep with every token in tokens using the bigram model.
+ *
+ * For each entry of step nstep, m_handles[1] is used as the index token to
+ * load the system and user single grams, which are merged into
+ * m_merged_single_gram.  Every token that has a frequency in the merged
+ * gram is passed to bigram_gen_next_step() with the probability
+ * freq / total_freq.
+ *
+ * Returns true when bigram_gen_next_step() reported success at least once.
+ */
+bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
+    bool found = false;
+
+    LookupStepContent lookup_content = (LookupStepContent)
+        g_ptr_array_index(m_steps_content, nstep);
+    if (0 == lookup_content->len)
+        return found;
+
+    for (size_t i = 0; i < lookup_content->len; ++i) {
+        lookup_value_t * cur_value = &g_array_index
+            (lookup_content, lookup_value_t, i);
+        phrase_token_t index_token = cur_value->m_handles[1];
+
+        /* NOTE(review): load() presumably allocates the grams and hands
+         * ownership to the caller -- this function releases them below. */
+        SingleGram * system = NULL, * user = NULL;
+        m_system_bigram->load(index_token, system);
+        m_user_bigram->load(index_token, user);
+
+        /* The token scan is nested under the merge check (instead of
+         * using "continue") so the grams are released on every path. */
+        if (merge_single_gram
+            (&m_merged_single_gram, system, user)) {
+
+            /* iterate over tokens */
+            for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+                GArray * array = tokens[n];
+                if (NULL == array)
+                    continue;
+
+                /* just skip the loop when the length is zero. */
+                for (size_t k = 0; k < array->len; ++k) {
+                    phrase_token_t token =
+                        g_array_index(array, phrase_token_t, k);
+
+                    guint32 freq = 0;
+                    if (!m_merged_single_gram.get_freq(token, freq))
+                        continue;
+
+                    guint32 total_freq = 0;
+                    m_merged_single_gram.get_total_freq(total_freq);
+                    /* a zero total would turn the probability into
+                     * inf/NaN, so such a token is not extended. */
+                    if (0 == total_freq)
+                        continue;
+
+                    gfloat bigram_poss = freq / (gfloat) total_freq;
+                    found = bigram_gen_next_step
+                        (nstep, cur_value, token, bigram_poss) || found;
+                }
+            }
+        }
+
+        /* delete on a null pointer is a no-op, no guard needed */
+        delete system;
+        delete user;
+    }
+
+    return found;
+}
+
bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
phrase_token_t token){
diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h
index c2537ef..65bd2cd 100644
--- a/src/lookup/phrase_lookup.h
+++ b/src/lookup/phrase_lookup.h
@@ -69,8 +69,8 @@ protected:
/* Explicitly search the next phrase,
* to avoid double phrase lookup as the next token has only one.
*/
- bool search_unigram(int nstep, phrase_token_t token);
- bool search_bigram(int nstep, phrase_token_t token);
+ bool search_unigram2(int nstep, PhraseTokens tokens);
+ bool search_bigram2(int nstep, PhraseTokens tokens);
bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);