write the shortest path segment

author: Peng Wu <alexepico@gmail.com> 2010-09-10 14:39:40 +0800
committer: Peng Wu <alexepico@gmail.com> 2010-09-10 14:39:40 +0800
commit: 598490883a327c0ab0c414e5851d8e749403492e (patch)
tree: 182654ca1729fe761115c51222c6bf716aebd3f3 /src/segment/spseg.cpp
parent: c8fbacc76996e693d734d0d5c94f63a7d6fe198a (diff)
download: libpinyin-598490883a327c0ab0c414e5851d8e749403492e.tar.gz
libpinyin-598490883a327c0ab0c414e5851d8e749403492e.tar.xz
libpinyin-598490883a327c0ab0c414e5851d8e749403492e.zip
1 files changed, 108 insertions, 2 deletions
diff --git a/src/segment/spseg.cpp b/src/segment/spseg.cpp
index 6522b2e..238c732 100644
--- a/src/segment/spseg.cpp
+++ b/src/segment/spseg.cpp
@@ -26,6 +26,8 @@
 #include "novel_types.h"
 #include "phrase_large_table.h"
 
+/* graph shortest path sentence segment. */
+
 /* Note:
  * Currently libpinyin only supports ucs2 characters, as this is a
  * pre-processor tool for raw corpus, it will skip all sentences
@@ -41,9 +43,87 @@ struct SegmentStep{
     //use formula W = number of words. Zero handle means one word.
     size_t m_nword;
     //backtrace information, -1 one step backward.
-    gint m_backword_nstep;
+    gint m_backward_nstep;
+public:
+    SegmentStep(){
+        m_handle = 0;
+        m_phrase = NULL;
+        m_phrase_len = 0;
+        m_nword = UINT_MAX;
+        m_backward_nstep = -0;
+    }
 };
 
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
+
+//Note: do not free phrase, as it is used by strings (array of segment).
+bool segment(PhraseLargeTable * phrases, //Lookup Phrase
+             utf16_t * phrase,
+             glong phrase_len,
+             GArray * strings /* Array of Segment *. */){
+    /* Prepare for shortest path segment dynamic programming. */
+    GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    SegmentStep step;
+    for ( glong i = 0; i < phrase_len + 1; ++i ){
+        g_array_append_val(steps, step);
+    }
+
+    for ( glong i = 0; i < phrase_len + 1; ++i ) {
+        SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+        size_t nword = step_begin->m_nword;
+        for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
+            size_t len = k - i;
+            utf16_t * cur_phrase = phrase + i;
+
+            phrase_token_t token = 0;
+            int result = g_phrases->search(len, cur_phrase, token);
+            if ( !(result & SEARCH_OK) ){
+                token = 0;
+                if ( 1 != len )
+                    continue;
+            }
+            ++nword;
+            SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+            if ( nword < step_end->m_nword ) {
+                step_end->m_handle = token;
+                step_end->m_phrase = cur_phrase;
+                step_end->m_phrase_len = len;
+                step_end->m_nword = nword;
+                step_end->m_backward_nstep = i - k;
+            }
+            if ( !(result & SEARCH_CONTINUED) )
+                break;
+        }
+    }
+    return backtrace(steps, phrase_len, strings);
+}
+
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
+    //backtracing to get the result.
+    size_t cur_step = phrase_len;
+    g_array_set_size(strings, 0);
+    while ( cur_step ){
+        SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step);
+        step->m_nword = 0; step->m_backward_nstep = 0;
+        g_array_append_val(strings, *step);
+        cur_step = cur_step + step->m_backward_nstep;
+    }
+
+    //reverse the strings
+    for ( size_t i = 0; i < strings->len / 2; ++i ) {
+        SegmentStep * head, * tail;
+        head = &g_array_index(strings, SegmentStep, i);
+        tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i );
+        SegmentStep tmp;
+        tmp = *head;
+        *head = *tail;
+        *tail = tmp;
+    }
+
+    g_array_free(steps, TRUE);
+    return true;
+}
+
 void print_help(){
     printf("Usage: mmseg [--generate-extra-enter]\n");
     exit(1);
@@ -93,7 +173,33 @@ int main(int argc, char * argv[]){
             break;
         linebuf[strlen(linebuf) - 1] = '\0';
 
-        
+        //check non-ucs2 characters
+        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+        glong len = 0;
+        utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
+        if ( len != num_of_chars ) {
+            fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf);
+            continue;
+        }
+
+        //do segment stuff
+        GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+        segment(g_phrases, sentence, len, strings);
+
+        //print out the split phrase
+        for ( glong i = 0; i < strings->len; ++i ) {
+            SegmentStep * step = &g_array_index(strings, SegmentStep, i);
+            char * string = g_utf16_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
+            printf("%s\n", string);
+            g_free(string);
+        }
+
+        //print extra enter
+        if ( gen_extra_enter )
+            printf("\n");
+
+        g_array_free(strings, TRUE);
+        g_free(sentence);
     }
     return 0;
 }
author	Peng Wu <alexepico@gmail.com>	2010-09-10 14:39:40 +0800
committer	Peng Wu <alexepico@gmail.com>	2010-09-10 14:39:40 +0800
commit	598490883a327c0ab0c414e5851d8e749403492e (patch)
tree	182654ca1729fe761115c51222c6bf716aebd3f3 /src/segment/spseg.cpp
parent	c8fbacc76996e693d734d0d5c94f63a7d6fe198a (diff)
download	libpinyin-598490883a327c0ab0c414e5851d8e749403492e.tar.gz libpinyin-598490883a327c0ab0c414e5851d8e749403492e.tar.xz libpinyin-598490883a327c0ab0c414e5851d8e749403492e.zip