Fix export bigram phrase (export-bigram)

author: Peng Wu <alexepico@gmail.com> 2024-08-07 12:58:43 +0800
committer: Peng Wu <alexepico@gmail.com> 2024-08-07 12:58:43 +0800
commit: ed3ecdc3c69473e708e7d42f28ae68d80c5ff28e (patch)
tree: 27647454d173eef5b166d751bf9c63fc4f552e09
parent: b56d3375921238ac990b5251b220f292e6dfdd61 (diff)
download: libpinyin-export-bigram.tar.gz
libpinyin-export-bigram.tar.xz
libpinyin-export-bigram.zip
1 file changed, 24 insertions, 23 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index b5a0466..a8fda16 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -140,6 +140,8 @@ struct _bigram_export_iterator_t{
     /* The pinyins for the current phrase. */
     GPtrArray * m_pinyins;
     size_t m_pinyin_index;
+    /* Cache the count. */
+    gint m_count;
 };
 
 static bool _clean_user_files(const char * user_dir,
@@ -771,30 +773,31 @@ bigram_export_iterator_t * pinyin_begin_get_bigram_phrases(pinyin_context_t * co
     iter->m_phrase = NULL;
     iter->m_pinyins = g_ptr_array_new();
     iter->m_pinyin_index = 0;
+    iter->m_count = 0;
     return iter;
 }
 
 bool pinyin_bigram_iterator_has_next_phrase(bigram_export_iterator_t * iter){
     /* pre-check the bigram sequence has been used at least twice. */
     const guint32 initial_seed = 23 * 3;
-    const guint32 expand_factor = 2;
-    const guint32 threshold = initial_seed * expand_factor - 1;
+    const guint32 threshold = initial_seed - 1;
+    bool retval = false;
 
     if (iter->m_phrase && iter->m_pinyin_index < iter->m_pinyins->len)
         return true;
 
+    /* clean up old values. */
+    iter->m_pinyin_index = 0;
+    g_ptr_array_free(iter->m_pinyins, TRUE);
+    iter->m_pinyins = g_ptr_array_new();
+
     do {
-        if (iter->m_index_token) {
-            while (iter->m_phrase_tokens->len) {
+        if (iter->m_index_token != null_token && iter->m_index_token != sentence_start) {
+            while (iter->m_phrase_tokens->len > 0) {
                 BigramPhraseItemWithCount * item = &g_array_index
                     (iter->m_phrase_tokens, BigramPhraseItemWithCount, 0);
                 /* find the next item. */
                 if (item->m_count > threshold) {
-                    /* clean up old values. */
-                    iter->m_pinyin_index = 0;
-                    g_ptr_array_free(iter->m_pinyins, TRUE);
-                    iter->m_pinyins = g_ptr_array_new();
-
                     /* list all the pinyins here. */
                     PhraseItem first_item, second_item;
                     iter->m_context->m_phrase_index->get_phrase_item
@@ -818,6 +821,7 @@ bool pinyin_bigram_iterator_has_next_phrase(bigram_export_iterator_t * iter){
                     gchar * cur_phrase = g_strconcat(first_phrase, second_phrase, NULL);
                     g_free(iter->m_phrase);
                     iter->m_phrase = cur_phrase;
+                    iter->m_count = item->m_count;
 
                     g_free(second_phrase);
                     g_free(first_phrase);
@@ -858,13 +862,15 @@ bool pinyin_bigram_iterator_has_next_phrase(bigram_export_iterator_t * iter){
                         g_free(first_pinyin);
                     }
 
-                    return true;
+                    g_array_remove_index (iter->m_phrase_tokens, 0);
+                    retval = true;
+                    break;
                 }
                 g_array_remove_index (iter->m_phrase_tokens, 0);
             }
         }
 
-        if (iter->m_items->len == 0)
+        if (retval || iter->m_items->len == 0)
             break;
 
         iter->m_index_token = g_array_index(iter->m_items, phrase_token_t, 0);
@@ -875,7 +881,7 @@ bool pinyin_bigram_iterator_has_next_phrase(bigram_export_iterator_t * iter){
         delete user_gram;
     } while (iter->m_items->len);
 
-    return false;
+    return retval;
 }
 
 bool pinyin_bigram_iterator_get_next_phrase(bigram_export_iterator_t * iter,
@@ -883,18 +889,12 @@ bool pinyin_bigram_iterator_get_next_phrase(bigram_export_iterator_t * iter,
                                             gchar ** pinyin,
                                             gint * count){
     /* just get the first phrase as the phrase is pre-checked by has_next_phrase. */
-    const guint32 initial_seed = 23 * 3;
-    const guint32 expand_factor = 2;
-    const guint32 threshold = initial_seed * expand_factor - 1;
-    const guint32 unigram_factor = 7;
-    assert(iter->m_index_token != null_token);
-    BigramPhraseItemWithCount item = g_array_index
-        (iter->m_phrase_tokens, BigramPhraseItemWithCount, 0);
-    assert(item.m_count > threshold);
+    const guint32 unigram_factor = 2;
+    assert(iter->m_index_token != null_token && iter->m_index_token != sentence_start);
 
-    *phrase = iter->m_phrase;
-    *pinyin = (gchar *) g_ptr_array_index(iter->m_pinyins, iter->m_pinyin_index);
-    *count = item.m_count * unigram_factor;
+    *phrase = g_strdup(iter->m_phrase);
+    *pinyin = g_strdup((gchar *) g_ptr_array_index(iter->m_pinyins, iter->m_pinyin_index));
+    *count = iter->m_count * unigram_factor;
 
     ++(iter->m_pinyin_index);
 
@@ -906,6 +906,7 @@ void pinyin_end_get_bigram_phrases(bigram_export_iterator_t * iter){
     g_array_free(iter->m_items, TRUE);
     g_ptr_array_free(iter->m_pinyins, TRUE);
     iter->m_pinyin_index = 0;
+    iter->m_count = 0;
     delete iter;
 }
author	Peng Wu <alexepico@gmail.com>	2024-08-07 12:58:43 +0800
committer	Peng Wu <alexepico@gmail.com>	2024-08-07 12:58:43 +0800
commit	ed3ecdc3c69473e708e7d42f28ae68d80c5ff28e (patch)
tree	27647454d173eef5b166d751bf9c63fc4f552e09
parent	b56d3375921238ac990b5251b220f292e6dfdd61 (diff)
download	libpinyin-export-bigram.tar.gz libpinyin-export-bigram.tar.xz libpinyin-export-bigram.zip