From 0ed214b64542bf59948cd2423c6a31d7d1de6dde Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Wed, 22 Feb 2012 14:36:54 +0800 Subject: update utils/segment --- utils/segment/ngseg.cpp | 42 +++++++++++++++++++++--------------------- utils/segment/spseg.cpp | 10 +++++----- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp index 56afd30..6acde10 100644 --- a/utils/segment/ngseg.cpp +++ b/utils/segment/ngseg.cpp @@ -49,10 +49,10 @@ void print_help(){ printf("Usage: ngseg [--generate-extra-enter]\n"); } -bool deal_with_segmentable(GArray * current_utf16){ +bool deal_with_segmentable(GArray * current_ucs4){ char * result_string = NULL; MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - g_phrase_lookup->get_best_match(current_utf16->len, (utf16_t *) current_utf16->data, results); + g_phrase_lookup->get_best_match(current_ucs4->len, (ucs4_t *) current_ucs4->data, results); #if 0 for ( size_t i = 0; i < results->len; ++i) { phrase_token_t * token = &g_array_index(results, phrase_token_t, i); @@ -66,8 +66,8 @@ bool deal_with_segmentable(GArray * current_utf16){ if (result_string) { printf("%s\n", result_string); } else { - char * tmp_string = g_utf16_to_utf8 - ( (utf16_t *) current_utf16->data, current_utf16->len, + char * tmp_string = g_ucs4_to_utf8 + ( (ucs4_t *) current_ucs4->data, current_ucs4->len, NULL, NULL, NULL); fprintf(stderr, "Un-segmentable sentence encountered:%s\n", tmp_string); @@ -79,9 +79,9 @@ bool deal_with_segmentable(GArray * current_utf16){ return true; } -bool deal_with_unknown(GArray * current_utf16){ - char * result_string = g_utf16_to_utf8 - ( (utf16_t *) current_utf16->data, current_utf16->len, +bool deal_with_unknown(GArray * current_ucs4){ + char * result_string = g_ucs4_to_utf8 + ( (ucs4_t *) current_ucs4->data, current_ucs4->len, NULL, NULL, NULL); printf("%s\n", result_string); g_free(result_string); @@ -134,7 +134,7 @@ int main(int argc, char * argv[]){ CONTEXT_STATE state, next_state; - GArray * current_utf16 = g_array_new(TRUE, TRUE, sizeof(utf16_t)); + GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); phrase_token_t token = null_token; //split the sentence @@ -149,9 +149,9 @@ int main(int argc, char * argv[]){ //check non-ucs2 characters const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; - utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL); + ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { - fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf); + fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); printf("\n"); continue; } @@ -164,7 +164,7 @@ int main(int argc, char * argv[]){ state = CONTEXT_INIT; bool result = phrase_table.search( 1, sentence, token); - g_array_append_val( current_utf16, sentence[0]); + g_array_append_val( current_ucs4, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; else @@ -178,31 +178,31 @@ int main(int argc, char * argv[]){ next_state = CONTEXT_UNKNOWN; if ( state == next_state ){ - g_array_append_val(current_utf16, sentence[i]); + g_array_append_val(current_ucs4, sentence[i]); continue; } assert ( state != next_state ); if ( state == CONTEXT_SEGMENTABLE ) - deal_with_segmentable(current_utf16); + deal_with_segmentable(current_ucs4); if ( state == CONTEXT_UNKNOWN ) - deal_with_unknown(current_utf16); + deal_with_unknown(current_ucs4); /* save the current character */ - g_array_set_size(current_utf16, 0); - g_array_append_val(current_utf16, sentence[i]); + g_array_set_size(current_ucs4, 0); + g_array_append_val(current_ucs4, sentence[i]); state = next_state; } - if ( current_utf16->len ) { + if ( current_ucs4->len ) { /* this seems always true. */ if ( state == CONTEXT_SEGMENTABLE ) - deal_with_segmentable(current_utf16); + deal_with_segmentable(current_ucs4); if ( state == CONTEXT_UNKNOWN ) - deal_with_unknown(current_utf16); - g_array_set_size(current_utf16, 0); + deal_with_unknown(current_ucs4); + g_array_set_size(current_ucs4, 0); } /* print extra enter */ @@ -214,7 +214,7 @@ int main(int argc, char * argv[]){ g_phrase_lookup = NULL; /* print enter at file tail */ printf("\n"); - g_array_free(current_utf16, TRUE); + g_array_free(current_ucs4, TRUE); free(linebuf); return 0; } diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp index b51982e..448ce33 100644 --- a/utils/segment/spseg.cpp +++ b/utils/segment/spseg.cpp @@ -35,7 +35,7 @@ struct SegmentStep{ phrase_token_t m_handle; - utf16_t * m_phrase; + ucs4_t * m_phrase; size_t m_phrase_len; //use formula W = number of words. Zero handle means one word. guint m_nword; @@ -55,7 +55,7 @@ bool backtrace(GArray * steps, glong phrase_len, GArray * strings); //Note: do not free phrase, as it is used by strings (array of segment). bool segment(PhraseLargeTable * phrases, //Lookup Phrase - utf16_t * phrase, + ucs4_t * phrase, glong phrase_len, GArray * strings /* Array of Segment *. */){ /* Prepare for shortest path segment dynamic programming. */ @@ -73,7 +73,7 @@ bool segment(PhraseLargeTable * phrases, //Lookup Phrase size_t nword = step_begin->m_nword; for ( glong k = i + 1; k < phrase_len + 1; ++k ) { size_t len = k - i; - utf16_t * cur_phrase = phrase + i; + ucs4_t * cur_phrase = phrase + i; phrase_token_t token = 0; int result = phrases->search(len, cur_phrase, token); @@ -165,7 +165,7 @@ int main(int argc, char * argv[]){ //check non-ucs2 characters const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; - utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL); + ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf); printf("\n"); @@ -179,7 +179,7 @@ int main(int argc, char * argv[]){ //print out the split phrase for ( glong i = 0; i < strings->len; ++i ) { SegmentStep * step = &g_array_index(strings, SegmentStep, i); - char * string = g_utf16_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL); + char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL); printf("%s\n", string); g_free(string); } -- cgit