update spseg.cpp

author: Peng Wu <alexepico@gmail.com> 2013-04-15 13:45:30 +0800
committer: Peng Wu <alexepico@gmail.com> 2013-04-15 13:48:10 +0800
commit: fc4779bafe885a7b43dc82cba3543b8449a8d5ff (patch)
tree: cc0ce47b5e3a83497147b101fd7309e17a6bffdf /utils/segment
parent: 196601b5a3b8a2db0c520b03e6cde66904476bb4 (diff)
download: libpinyin-fc4779bafe885a7b43dc82cba3543b8449a8d5ff.tar.gz
libpinyin-fc4779bafe885a7b43dc82cba3543b8449a8d5ff.tar.xz
libpinyin-fc4779bafe885a7b43dc82cba3543b8449a8d5ff.zip
2 files changed, 106 insertions, 19 deletions
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
index 5fb2a42..fee7449 100644
--- a/utils/segment/ngseg.cpp
+++ b/utils/segment/ngseg.cpp
@@ -246,6 +246,8 @@ int main(int argc, char * argv[]){
         /* print extra enter */
         if ( gen_extra_enter )
             fprintf(output, "%d \n", null_token);
+
+        g_free(sentence);
     }
     phrase_index.destroy_tokens(tokens);
 
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
index d37ddc7..1978c97 100644
--- a/utils/segment/spseg.cpp
+++ b/utils/segment/spseg.cpp
@@ -2,7 +2,7 @@
  *  libpinyin
  *  Library to deal with pinyin.
  *  
- *  Copyright (C) 2010 Peng Wu
+ *  Copyright (C) 2010,2013 Peng Wu
  *  
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -50,6 +50,12 @@ static GOptionEntry entries[] =
  * which contains non-ucs4 characters.
  */
 
+enum CONTEXT_STATE{  /* kind of the character run main() is currently collecting. */
+    CONTEXT_INIT,         /* assigned at the start of each input line, before any lookup. */
+    CONTEXT_SEGMENTABLE,  /* single-character phrase table search reported SEARCH_OK. */
+    CONTEXT_UNKNOWN       /* single-character phrase table search did not report SEARCH_OK. */
+};
+
 struct SegmentStep{
     phrase_token_t m_handle;
     ucs4_t * m_phrase;
@@ -70,12 +76,14 @@ public:
 
 bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
 
-//Note: do not free phrase, as it is used by strings (array of segment).
+/* Note: do not free phrase, as it is used by strings (array of segment). */
 bool segment(FacadePhraseTable2 * phrase_table,
              FacadePhraseIndex * phrase_index,
-             ucs4_t * phrase,
-             glong phrase_len,
-             GArray * strings /* Array of Segment *. */){
+             GArray * current_ucs4,
+             GArray * strings /* Array of SegmentStep. */){
+    ucs4_t * phrase = (ucs4_t *)current_ucs4->data;
+    guint phrase_len = current_ucs4->len;
+
     /* Prepare for shortest path segment dynamic programming. */
     GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
     SegmentStep step;
@@ -126,18 +134,18 @@ bool segment(FacadePhraseTable2 * phrase_table,
 }
 
 bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
-    //backtracing to get the result.
+    /* backtracing to get the result. */
     size_t cur_step = phrase_len;
     g_array_set_size(strings, 0);
     while ( cur_step ){
         SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step);
         g_array_append_val(strings, *step);
         cur_step = cur_step + step->m_backward_nstep;
-        //intended to avoid leaking internal informations
+        /* intended to avoid leaking internal information. */
         step->m_nword = 0; step->m_backward_nstep = 0;
     }
 
-    //reverse the strings
+    /* reverse the strings. */
     for ( size_t i = 0; i < strings->len / 2; ++i ) {
         SegmentStep * head, * tail;
         head = &g_array_index(strings, SegmentStep, i);
@@ -152,6 +160,35 @@ bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
     return true;
 }
 
+/* Segment the run in current_ucs4 with segment() and write each piece to
+ * output as "<token> <utf8 phrase>\n"; always returns true. */
+bool deal_with_segmentable(FacadePhraseTable2 * phrase_table,
+                           FacadePhraseIndex * phrase_index,
+                           GArray * current_ucs4, FILE * output){
+    /* split the run; segment() fills strings with SegmentStep entries. */
+    GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    segment(phrase_table, phrase_index, current_ucs4, strings);
+    /* print out the split phrase. */
+    for ( glong i = 0; i < strings->len; ++i ) {
+        SegmentStep * step = &g_array_index(strings, SegmentStep, i);
+        char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
+        fprintf(output, "%d %s\n", step->m_handle, string);
+        g_free(string);
+    }
+    g_array_free(strings, TRUE);
+    return true; /* the function is declared bool; falling off the end is undefined. */
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+    /* emit the whole collected run as a single entry tagged with null_token. */
+    ucs4_t * chars = (ucs4_t *) current_ucs4->data;
+    char * utf8 = g_ucs4_to_utf8(chars, current_ucs4->len, NULL, NULL, NULL);
+    fprintf(output, "%d %s\n", null_token, utf8);
+    g_free(utf8);
+    return true;
+}
+
+
 int main(int argc, char * argv[]){
     FILE * input = stdin;
     FILE * output = stdout;
@@ -212,13 +249,20 @@ int main(int argc, char * argv[]){
     if (!load_phrase_index(phrase_files, &phrase_index))
         exit(ENOENT);
 
+    CONTEXT_STATE state, next_state;
+    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index.prepare_tokens(tokens);
+
     char * linebuf = NULL; size_t size = 0; ssize_t read;
     while( (read = getline(&linebuf, &size, input)) != -1 ){
         if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
             linebuf[strlen(linebuf) - 1] = '\0';
         }
 
-        //check non-ucs4 characters
+        /* check non-ucs4 characters. */
         const glong num_of_chars = g_utf8_strlen(linebuf, -1);
         glong len = 0;
         ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
@@ -228,28 +272,69 @@ int main(int argc, char * argv[]){
             continue;
         }
 
-        //do segment stuff
-        GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
-        segment(&phrase_table, &phrase_index, sentence, len, strings);
+        /* only new-line persists. */
+        if ( 0  == num_of_chars ) {
+            fprintf(output, "%d \n", null_token);
+            continue;
+        }
+
+        state = CONTEXT_INIT;
+        int result = phrase_table.search( 1, sentence, tokens);
+        g_array_append_val( current_ucs4, sentence[0]);
+        if ( result & SEARCH_OK )
+            state = CONTEXT_SEGMENTABLE;
+        else
+            state = CONTEXT_UNKNOWN;
+
+        for ( int i = 1; i < num_of_chars; ++i) {
+            int result = phrase_table.search( 1, sentence + i, tokens);
+            if ( result & SEARCH_OK )
+                next_state = CONTEXT_SEGMENTABLE;
+            else
+                next_state = CONTEXT_UNKNOWN;
+
+            if ( state == next_state ){
+                g_array_append_val(current_ucs4, sentence[i]);
+                continue;
+            }
+
+            assert ( state != next_state );
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_table, &phrase_index,
+                                      current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+
+            /* save the current character */
+            g_array_set_size(current_ucs4, 0);
+            g_array_append_val(current_ucs4, sentence[i]);
+            state = next_state;
+        }
+
+        if ( current_ucs4->len ) {
+            /* this seems always true. */
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_table, &phrase_index,
+                                      current_ucs4, output);
 
-        //print out the split phrase
-        for ( glong i = 0; i < strings->len; ++i ) {
-            SegmentStep * step = &g_array_index(strings, SegmentStep, i);
-            char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
-            fprintf(output, "%d %s\n", step->m_handle, string);
-            g_free(string);
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+            g_array_set_size(current_ucs4, 0);
         }
 
         /* print extra enter */
         if ( gen_extra_enter )
             fprintf(output, "%d \n", null_token);
 
-        g_array_free(strings, TRUE);
         g_free(sentence);
     }
+    phrase_index.destroy_tokens(tokens);
 
     /* print enter at file tail */
     fprintf(output, "%d \n", null_token);
+    g_array_free(current_ucs4, TRUE);
+    free(linebuf);
     fclose(input);
     fclose(output);
     return 0;
author	Peng Wu <alexepico@gmail.com>	2013-04-15 13:45:30 +0800
committer	Peng Wu <alexepico@gmail.com>	2013-04-15 13:48:10 +0800
commit	fc4779bafe885a7b43dc82cba3543b8449a8d5ff (patch)
tree	cc0ce47b5e3a83497147b101fd7309e17a6bffdf /utils/segment
parent	196601b5a3b8a2db0c520b03e6cde66904476bb4 (diff)
download	libpinyin-fc4779bafe885a7b43dc82cba3543b8449a8d5ff.tar.gz libpinyin-fc4779bafe885a7b43dc82cba3543b8449a8d5ff.tar.xz libpinyin-fc4779bafe885a7b43dc82cba3543b8449a8d5ff.zip