re-factor phrase lookup

author: Peng Wu <alexepico@gmail.com> 2012-10-10 13:55:07 +0800
committer: Peng Wu <alexepico@gmail.com> 2012-10-10 13:55:07 +0800
commit: aced528d37c803442cd1f39e51e36e96c59d94f6 (patch)
tree: 801b0a828d28b90469fd2d026c37d2a10a0add48
parent: 1f6d0a9f5155f1e95d1717625fe0a5b4a679aac2 (diff)
download: libpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.tar.gz
libpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.tar.xz
libpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.zip
2 files changed, 98 insertions, 16 deletions
diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp
index 9f4cf45..3cfdd3a 100644
--- a/src/lookup/phrase_lookup.cpp
+++ b/src/lookup/phrase_lookup.cpp
@@ -133,20 +133,8 @@ bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
 
             /* found next phrase */
             if ( result & SEARCH_OK ) {
-                /* iterate every token. */
-                for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
-                    GArray * array = tokens[n];
-                    if (NULL == array)
-                        continue;
-
-                    /* just skip the loop when the length is zero. */
-                    for (size_t k = 0; k < array->len; ++k) {
-                        phrase_token_t next_token =
-                            g_array_index(array, phrase_token_t, k);
-                        search_bigram(i, next_token),
-                            search_unigram(i, next_token);
-                    }
-                }
+                search_bigram2(i, tokens),
+                    search_unigram2(i, tokens);
             }
 
             /* no longer phrase */
@@ -160,6 +148,8 @@ bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
     return final_step(results);
 }
 
+#if 0
+
 bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
 
     LookupStepContent lookup_content = (LookupStepContent)
@@ -213,6 +203,98 @@ bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
     return found;
 }
 
+#endif
+
+/* Try to extend the entry of step nstep with the largest m_poss by each
+ * token in tokens; returns true if any unigram_gen_next_step call did. */
+bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
+    LookupStepContent step = (LookupStepContent)
+        g_ptr_array_index(m_steps_content, nstep);
+    const size_t num_values = step->len;
+    if (0 == num_values)
+        return false;
+
+    /* the first entry holding the largest m_poss wins. */
+    lookup_value_t * best = &g_array_index(step, lookup_value_t, 0);
+    for (size_t idx = 1; idx < num_values; ++idx) {
+        lookup_value_t * candidate =
+            &g_array_index(step, lookup_value_t, idx);
+        if (candidate->m_poss > best->m_poss)
+            best = candidate;
+    }
+
+    /* feed every token of every non-NULL array in tokens to that entry. */
+    bool found = false;
+    for (size_t lib = 0; lib < PHRASE_INDEX_LIBRARY_COUNT; ++lib) {
+        GArray * lib_tokens = tokens[lib];
+        if (NULL == lib_tokens)
+            continue;
+
+        /* an empty array simply runs zero iterations. */
+        for (size_t pos = 0; pos < lib_tokens->len; ++pos) {
+            phrase_token_t token =
+                g_array_index(lib_tokens, phrase_token_t, pos);
+            /* always call the step generator, then record success. */
+            if (unigram_gen_next_step(nstep, best, token))
+                found = true;
+        }
+    }
+
+    return found;
+}
+
+/* For each entry of step nstep: merge the system and user bigrams of its
+ * m_handles[1] token, then call bigram_gen_next_step with freq / total_freq
+ * for every token the merged gram knows.  True if any such call was true. */
+bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
+    bool found = false;
+
+    LookupStepContent lookup_content = (LookupStepContent)
+        g_ptr_array_index(m_steps_content, nstep);
+
+    for (size_t i = 0; i < lookup_content->len; ++i) {
+        lookup_value_t * cur_value = &g_array_index
+            (lookup_content, lookup_value_t, i);
+        phrase_token_t index_token = cur_value->m_handles[1];
+
+        SingleGram * system = NULL, * user = NULL;
+        m_system_bigram->load(index_token, system);
+        m_user_bigram->load(index_token, user);
+
+        /* a failed merge only skips the token search; the grams loaded
+         * above are still released at the bottom of this iteration. */
+        const bool merged = merge_single_gram
+            (&m_merged_single_gram, system, user);
+
+        for (size_t n = 0; merged && n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+            GArray * array = tokens[n];
+            if (NULL == array)
+                continue;
+
+            /* just skip the loop when the length is zero. */
+            for (size_t k = 0; k < array->len; ++k) {
+                phrase_token_t token =
+                    g_array_index(array, phrase_token_t, k);
+
+                guint32 freq = 0;
+                if (m_merged_single_gram.get_freq(token, freq)) {
+                    guint32 total_freq = 0;
+                    m_merged_single_gram.get_total_freq(total_freq);
+
+                    gfloat bigram_poss = freq / (gfloat) total_freq;
+                    found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
+                }
+            }
+        }
+
+        /* delete on a null pointer is a no-op, so no checks are needed. */
+        delete system;
+        delete user;
+    }
+
+    return found;
+}
+
 bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
 phrase_token_t token){
 
diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h
index c2537ef..65bd2cd 100644
--- a/src/lookup/phrase_lookup.h
+++ b/src/lookup/phrase_lookup.h
@@ -69,8 +69,8 @@ protected:
     /* Explicitly search the next phrase,
      *  to avoid double phrase lookup as the next token has only one.
      */
-    bool search_unigram(int nstep, phrase_token_t token);
-    bool search_bigram(int nstep, phrase_token_t token);
+    bool search_unigram2(int nstep, PhraseTokens tokens);
+    bool search_bigram2(int nstep, PhraseTokens tokens);
 
     bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
     bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);
author	Peng Wu <alexepico@gmail.com>	2012-10-10 13:55:07 +0800
committer	Peng Wu <alexepico@gmail.com>	2012-10-10 13:55:07 +0800
commit	aced528d37c803442cd1f39e51e36e96c59d94f6 (patch)
tree	801b0a828d28b90469fd2d026c37d2a10a0add48
parent	1f6d0a9f5155f1e95d1717625fe0a5b4a679aac2 (diff)
download	libpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.tar.gz libpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.tar.xz libpinyin-aced528d37c803442cd1f39e51e36e96c59d94f6.zip