From 0ed214b64542bf59948cd2423c6a31d7d1de6dde Mon Sep 17 00:00:00 2001
From: Peng Wu <alexepico@gmail.com>
Date: Wed, 22 Feb 2012 14:36:54 +0800
Subject: update utils/segment to use ucs4_t instead of utf16_t

---
 utils/segment/ngseg.cpp | 42 +++++++++++++++++++++---------------------
 utils/segment/spseg.cpp | 10 +++++-----
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
index 56afd30..6acde10 100644
--- a/utils/segment/ngseg.cpp
+++ b/utils/segment/ngseg.cpp
@@ -49,10 +49,10 @@ void print_help(){
     printf("Usage: ngseg [--generate-extra-enter]\n");
 }
 
-bool deal_with_segmentable(GArray * current_utf16){
+bool deal_with_segmentable(GArray * current_ucs4){
     char * result_string = NULL;
     MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
-    g_phrase_lookup->get_best_match(current_utf16->len, (utf16_t *) current_utf16->data, results);
+    g_phrase_lookup->get_best_match(current_ucs4->len, (ucs4_t *) current_ucs4->data, results);
 #if 0
     for ( size_t i = 0; i < results->len; ++i) {
         phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
@@ -66,8 +66,8 @@ bool deal_with_segmentable(GArray * current_utf16){
     if (result_string) {
         printf("%s\n", result_string);
     } else {
-        char * tmp_string = g_utf16_to_utf8
-            ( (utf16_t *) current_utf16->data, current_utf16->len,
+        char * tmp_string = g_ucs4_to_utf8
+            ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
               NULL, NULL, NULL);
         fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
                 tmp_string);
@@ -79,9 +79,9 @@ bool deal_with_segmentable(GArray * current_utf16){
     return true;
 }
 
-bool deal_with_unknown(GArray * current_utf16){
-    char * result_string = g_utf16_to_utf8
-        ( (utf16_t *) current_utf16->data, current_utf16->len,
+bool deal_with_unknown(GArray * current_ucs4){
+    char * result_string = g_ucs4_to_utf8
+        ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
           NULL, NULL, NULL);
     printf("%s\n", result_string);
     g_free(result_string);
@@ -134,7 +134,7 @@ int main(int argc, char * argv[]){
 
 
     CONTEXT_STATE state, next_state;
-    GArray * current_utf16 = g_array_new(TRUE, TRUE, sizeof(utf16_t));
+    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
     phrase_token_t token = null_token;
 
     //split the sentence
@@ -149,9 +149,9 @@ int main(int argc, char * argv[]){
         //check non-ucs2 characters
         const glong num_of_chars = g_utf8_strlen(linebuf, -1);
         glong len = 0;
-        utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
+        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
         if ( len != num_of_chars ) {
-            fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf);
+            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
             printf("\n");
             continue;
         }
@@ -164,7 +164,7 @@ int main(int argc, char * argv[]){
 
         state = CONTEXT_INIT;
         bool result = phrase_table.search( 1, sentence, token);
-        g_array_append_val( current_utf16, sentence[0]);
+        g_array_append_val( current_ucs4, sentence[0]);
         if ( result & SEARCH_OK )
             state = CONTEXT_SEGMENTABLE;
         else
@@ -178,31 +178,31 @@ int main(int argc, char * argv[]){
                 next_state = CONTEXT_UNKNOWN;
 
             if ( state == next_state ){
-                g_array_append_val(current_utf16, sentence[i]);
+                g_array_append_val(current_ucs4, sentence[i]);
                 continue;
             }
 
             assert ( state != next_state );
             if ( state == CONTEXT_SEGMENTABLE )
-                deal_with_segmentable(current_utf16);
+                deal_with_segmentable(current_ucs4);
 
             if ( state == CONTEXT_UNKNOWN )
-                deal_with_unknown(current_utf16);
+                deal_with_unknown(current_ucs4);
 
             /* save the current character */
-            g_array_set_size(current_utf16, 0);
-            g_array_append_val(current_utf16, sentence[i]);
+            g_array_set_size(current_ucs4, 0);
+            g_array_append_val(current_ucs4, sentence[i]);
             state = next_state;
         }
 
-        if ( current_utf16->len ) {
+        if ( current_ucs4->len ) {
             /* this seems always true. */
             if ( state == CONTEXT_SEGMENTABLE )
-                deal_with_segmentable(current_utf16);
+                deal_with_segmentable(current_ucs4);
 
             if ( state == CONTEXT_UNKNOWN )
-                deal_with_unknown(current_utf16);
-            g_array_set_size(current_utf16, 0);
+                deal_with_unknown(current_ucs4);
+            g_array_set_size(current_ucs4, 0);
         }
 
         /* print extra enter */
@@ -214,7 +214,7 @@ int main(int argc, char * argv[]){
     g_phrase_lookup = NULL;
     /* print enter at file tail */
     printf("\n");
-    g_array_free(current_utf16, TRUE);
+    g_array_free(current_ucs4, TRUE);
     free(linebuf);
     return 0;
 }
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
index b51982e..448ce33 100644
--- a/utils/segment/spseg.cpp
+++ b/utils/segment/spseg.cpp
@@ -35,7 +35,7 @@
 
 struct SegmentStep{
     phrase_token_t m_handle;
-    utf16_t * m_phrase;
+    ucs4_t * m_phrase;
     size_t m_phrase_len;
     //use formula W = number of words. Zero handle means one word.
     guint m_nword;
@@ -55,7 +55,7 @@ bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
 
 //Note: do not free phrase, as it is used by strings (array of segment).
 bool segment(PhraseLargeTable * phrases, //Lookup Phrase
-             utf16_t * phrase,
+             ucs4_t * phrase,
              glong phrase_len,
              GArray * strings /* Array of Segment *. */){
     /* Prepare for shortest path segment dynamic programming. */
@@ -73,7 +73,7 @@ bool segment(PhraseLargeTable * phrases, //Lookup Phrase
         size_t nword = step_begin->m_nword;
         for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
             size_t len = k - i;
-            utf16_t * cur_phrase = phrase + i;
+            ucs4_t * cur_phrase = phrase + i;
 
             phrase_token_t token = 0;
             int result = phrases->search(len, cur_phrase, token);
@@ -165,7 +165,7 @@ int main(int argc, char * argv[]){
         //check non-ucs2 characters
         const glong num_of_chars = g_utf8_strlen(linebuf, -1);
         glong len = 0;
-        utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
+        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
         if ( len != num_of_chars ) {
             fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf);
             printf("\n");
@@ -179,7 +179,7 @@ int main(int argc, char * argv[]){
         //print out the split phrase
         for ( glong i = 0; i < strings->len; ++i ) {
             SegmentStep * step = &g_array_index(strings, SegmentStep, i);
-            char * string = g_utf16_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
+            char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
             printf("%s\n", string);
             g_free(string);
         }
-- 
cgit