Support prefix candidate

author: Peng Wu <alexepico@gmail.com> 2022-08-26 17:19:57 +0800
committer: Peng Wu <alexepico@gmail.com> 2022-08-26 17:28:13 +0800
commit: 3e857c0a978133ad1a59eec1788f5fd6f1e7f187 (patch)
tree: 63aa5d3643c95ab2e727b01b50868a4cab43533d /src
parent: 96aeb47a5e9b0f7069975ee2507d1e695fc3f4cd (diff)
download: libpinyin-3e857c0a978133ad1a59eec1788f5fd6f1e7f187.tar.gz
libpinyin-3e857c0a978133ad1a59eec1788f5fd6f1e7f187.tar.xz
libpinyin-3e857c0a978133ad1a59eec1788f5fd6f1e7f187.zip
2 files changed, 81 insertions, 15 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 8b0fed6..1446977 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -71,6 +71,8 @@ struct _pinyin_instance_t{
     /* pointer of pinyin_context_t. */
     pinyin_context_t * m_context;
 
+    ucs4_t * m_prefix_ucs4;
+    glong m_prefix_len;
     /* the tokens of phrases before the user input. */
     TokenVector m_prefixes;
 
@@ -1116,6 +1118,8 @@ pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
     pinyin_instance_t * instance = new pinyin_instance_t;
     instance->m_context = context;
 
+    instance->m_prefix_ucs4 = NULL;
+    instance->m_prefix_len = 0;
     instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
 
     instance->m_parsed_len = 0;
@@ -1144,6 +1148,7 @@ static bool _free_candidates(CandidateVector candidates) {
 }
 
 void pinyin_free_instance(pinyin_instance_t * instance){
+    g_free(instance->m_prefix_ucs4);
     g_array_free(instance->m_prefixes, TRUE);
     delete instance->m_constraints;
     g_array_free(instance->m_phrase_result, TRUE);
@@ -1189,17 +1194,22 @@ static void _compute_prefixes(pinyin_instance_t * instance,
     pinyin_context_t * & context = instance->m_context;
     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
 
-    glong len_str = 0;
-    ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL);
     GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
 
+    g_free (instance->m_prefix_ucs4);
+    instance->m_prefix_ucs4 = g_utf8_to_ucs4(prefix, -1, NULL,
+                                             &(instance->m_prefix_len), NULL);
+
+    const ucs4_t * ucs4_str = instance->m_prefix_ucs4;
+    const glong len_str = instance->m_prefix_len;
+
     if (ucs4_str && len_str) {
         /* add prefixes. */
         for (ssize_t i = 1; i <= len_str; ++i) {
             if (i > MAX_PHRASE_LENGTH)
                 break;
 
-            ucs4_t * start = ucs4_str + len_str - i;
+            const ucs4_t * start = ucs4_str + len_str - i;
 
             PhraseTokens tokens;
             memset(tokens, 0, sizeof(tokens));
@@ -1214,7 +1224,6 @@ static void _compute_prefixes(pinyin_instance_t * instance,
         }
     }
     g_array_free(tokenarray, TRUE);
-    g_free(ucs4_str);
 }
 
 bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
@@ -1417,6 +1426,7 @@ bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
 
 static bool _token_get_phrase(FacadePhraseIndex * phrase_index,
                               phrase_token_t token,
+                              guint begin,
                               guint * len,
                               gchar ** utf8_str) {
     PhraseItem item;
@@ -1431,7 +1441,7 @@ static bool _token_get_phrase(FacadePhraseIndex * phrase_index,
     if (len)
         *len = length;
     if (utf8_str)
-        *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
+        *utf8_str = g_ucs4_to_utf8(buffer + begin, length, NULL, NULL, NULL);
     return true;
 }
 
@@ -1589,7 +1599,23 @@ static void _compute_frequency_of_items(pinyin_context_t * context,
 
         gfloat lambda = context->m_system_table_info.get_lambda();
 
-        /* handle addon candidates first. */
+        /* handle prefix candidates. */
+        if (PREDICTED_PREFIX_CANDIDATE == item->m_candidate_type) {
+            total_freq = context->m_phrase_index->
+                get_phrase_index_total_freq();
+
+            context->m_phrase_index->get_phrase_item
+                (token, cached_item);
+
+            /* Note: possibility value <= 1.0. */
+            guint32 freq = ((1 - lambda) *
+                            cached_item.get_unigram_frequency() /
+                            (gfloat) total_freq) * 256 * 256 * 256;
+            item->m_freq = freq;
+            continue;
+        }
+
+        /* handle addon candidates. */
         if (ADDON_CANDIDATE == item->m_candidate_type) {
             total_freq = context->m_phrase_index->
                 get_phrase_index_total_freq();
@@ -1666,11 +1692,17 @@ static bool _compute_phrase_length(pinyin_context_t * context,
         case NBEST_MATCH_CANDIDATE:
             abort();
         case NORMAL_CANDIDATE:
-        case PREDICTED_CANDIDATE: {
+        case PREDICTED_BIGRAM_CANDIDATE: {
             phrase_index->get_phrase_item(candidate->m_token, item);
             candidate->m_phrase_length = item.get_phrase_length();
             break;
         }
+        case PREDICTED_PREFIX_CANDIDATE: {
+            phrase_index->get_phrase_item(candidate->m_token, item);
+            candidate->m_phrase_length =
+                item.get_phrase_length() - candidate->m_begin;
+            break;
+        }
         case ADDON_CANDIDATE: {
             addon_phrase_index->get_phrase_item(candidate->m_token, item);
             candidate->m_phrase_length = item.get_phrase_length();
@@ -1700,16 +1732,22 @@ static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
             break;
         }
         case NORMAL_CANDIDATE:
-        case PREDICTED_CANDIDATE:
+        case PREDICTED_BIGRAM_CANDIDATE:
+            _token_get_phrase
+                (instance->m_context->m_phrase_index,
+                 candidate->m_token, 0, NULL,
+                 &(candidate->m_phrase_string));
+            break;
+        case PREDICTED_PREFIX_CANDIDATE:
             _token_get_phrase
                 (instance->m_context->m_phrase_index,
-                 candidate->m_token, NULL,
+                 candidate->m_token, candidate->m_begin, NULL,
                  &(candidate->m_phrase_string));
             break;
         case ADDON_CANDIDATE:
             _token_get_phrase
                 (instance->m_context->m_addon_phrase_index,
-                 candidate->m_token, NULL,
+                 candidate->m_token, 0, NULL,
                  &(candidate->m_phrase_string));
             break;
         case ZOMBIE_CANDIDATE:
@@ -1986,6 +2024,7 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
 
     _free_candidates(candidates);
 
+    /* search bigram candidate. */
     g_array_set_size(instance->m_prefixes, 0);
     _compute_prefixes(instance, prefix);
 
@@ -2036,13 +2075,39 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
                 continue;
 
             lookup_candidate_t item;
-            item.m_candidate_type = PREDICTED_CANDIDATE;
+            item.m_candidate_type = PREDICTED_BIGRAM_CANDIDATE;
             item.m_token = phrase_item->m_token;
             g_array_append_val(candidates, item);
         }
 
     }
 
+    /* search prefix candidate. */
+    GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    PhraseTokens phrase_tokens;
+    memset(phrase_tokens, 0, sizeof(phrase_tokens));
+    phrase_index->prepare_tokens(phrase_tokens);
+    int result = context->m_phrase_table->search_suggestion
+        (instance->m_prefix_len, instance->m_prefix_ucs4, phrase_tokens);
+    int num = reduce_tokens(phrase_tokens, tokenarray);
+    phrase_index->destroy_tokens(phrase_tokens);
+
+    for (size_t i = 0; i < tokenarray->len; ++i) {
+        phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i);
+
+        lookup_candidate_t template_item;
+        template_item.m_candidate_type = PREDICTED_PREFIX_CANDIDATE;
+        template_item.m_token = candidate;
+        template_item.m_begin = instance->m_prefix_len;
+        /* The prefix candidate only uses the m_begin variable. */
+        template_item.m_end = 0;
+
+        g_array_append_val(candidates, template_item);
+    }
+
+    g_array_free(tokenarray, TRUE);
+
     /* post process to sort the candidates */
 
     _compute_phrase_length(context, candidates);
@@ -2064,7 +2129,7 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
 int pinyin_choose_candidate(pinyin_instance_t * instance,
                             size_t offset,
                             lookup_candidate_t * candidate){
-    assert(PREDICTED_CANDIDATE != candidate->m_candidate_type);
+    assert(PREDICTED_BIGRAM_CANDIDATE != candidate->m_candidate_type);
 
     pinyin_context_t * context = instance->m_context;
     PhoneticKeyMatrix & matrix = instance->m_matrix;
@@ -2125,7 +2190,7 @@ int pinyin_choose_candidate(pinyin_instance_t * instance,
 
 bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance,
                                        lookup_candidate_t * candidate){
-    assert(PREDICTED_CANDIDATE == candidate->m_candidate_type);
+    assert(PREDICTED_BIGRAM_CANDIDATE == candidate->m_candidate_type);
 
     const guint32 initial_seed = 23 * 3;
     const guint32 unigram_factor = 7;
@@ -2305,7 +2370,7 @@ bool pinyin_token_get_phrase(pinyin_instance_t * instance,
     pinyin_context_t * & context = instance->m_context;
 
     return _token_get_phrase(context->m_phrase_index,
-                             token, len, utf8_str);
+                             token, 0, len, utf8_str);
 }
 
 bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance,
diff --git a/src/pinyin.h b/src/pinyin.h
index 7cab515..4b631fe 100644
--- a/src/pinyin.h
+++ b/src/pinyin.h
@@ -43,7 +43,8 @@ typedef enum _lookup_candidate_type_t{
     NBEST_MATCH_CANDIDATE = 1,
     NORMAL_CANDIDATE,
     ZOMBIE_CANDIDATE,
-    PREDICTED_CANDIDATE,
+    PREDICTED_BIGRAM_CANDIDATE,
+    PREDICTED_PREFIX_CANDIDATE,
     ADDON_CANDIDATE,
 } lookup_candidate_type_t;
author	Peng Wu <alexepico@gmail.com>	2022-08-26 17:19:57 +0800
committer	Peng Wu <alexepico@gmail.com>	2022-08-26 17:28:13 +0800
commit	3e857c0a978133ad1a59eec1788f5fd6f1e7f187 (patch)
tree	63aa5d3643c95ab2e727b01b50868a4cab43533d /src
parent	96aeb47a5e9b0f7069975ee2507d1e695fc3f4cd (diff)
download	libpinyin-3e857c0a978133ad1a59eec1788f5fd6f1e7f187.tar.gz libpinyin-3e857c0a978133ad1a59eec1788f5fd6f1e7f187.tar.xz libpinyin-3e857c0a978133ad1a59eec1788f5fd6f1e7f187.zip