summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2012-02-22 14:36:54 +0800
committer: Peng Wu <alexepico@gmail.com> 2012-02-22 14:36:54 +0800
commit: 0ed214b64542bf59948cd2423c6a31d7d1de6dde (patch)
tree: 863664bc3bc64e7f43da820c6e04ee28faba59c3
parent: cae862b94fc2805e2b44d839bfec5960acf6e8e1 (diff)
download: libpinyin-0ed214b64542bf59948cd2423c6a31d7d1de6dde.zip
libpinyin-0ed214b64542bf59948cd2423c6a31d7d1de6dde.tar.gz
libpinyin-0ed214b64542bf59948cd2423c6a31d7d1de6dde.tar.xz
update utils/segment
-rw-r--r-- utils/segment/ngseg.cpp 42
-rw-r--r-- utils/segment/spseg.cpp 10
2 files changed, 26 insertions, 26 deletions
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
index 56afd30..6acde10 100644
--- a/utils/segment/ngseg.cpp
+++ b/utils/segment/ngseg.cpp
@@ -49,10 +49,10 @@ void print_help(){
printf("Usage: ngseg [--generate-extra-enter]\n");
}
-bool deal_with_segmentable(GArray * current_utf16){
+bool deal_with_segmentable(GArray * current_ucs4){
char * result_string = NULL;
MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
- g_phrase_lookup->get_best_match(current_utf16->len, (utf16_t *) current_utf16->data, results);
+ g_phrase_lookup->get_best_match(current_ucs4->len, (ucs4_t *) current_ucs4->data, results);
#if 0
for ( size_t i = 0; i < results->len; ++i) {
phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
@@ -66,8 +66,8 @@ bool deal_with_segmentable(GArray * current_utf16){
if (result_string) {
printf("%s\n", result_string);
} else {
- char * tmp_string = g_utf16_to_utf8
- ( (utf16_t *) current_utf16->data, current_utf16->len,
+ char * tmp_string = g_ucs4_to_utf8
+ ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
NULL, NULL, NULL);
fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
tmp_string);
@@ -79,9 +79,9 @@ bool deal_with_segmentable(GArray * current_utf16){
return true;
}
-bool deal_with_unknown(GArray * current_utf16){
- char * result_string = g_utf16_to_utf8
- ( (utf16_t *) current_utf16->data, current_utf16->len,
+bool deal_with_unknown(GArray * current_ucs4){
+ char * result_string = g_ucs4_to_utf8
+ ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
NULL, NULL, NULL);
printf("%s\n", result_string);
g_free(result_string);
@@ -134,7 +134,7 @@ int main(int argc, char * argv[]){
CONTEXT_STATE state, next_state;
- GArray * current_utf16 = g_array_new(TRUE, TRUE, sizeof(utf16_t));
+ GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
phrase_token_t token = null_token;
//split the sentence
@@ -149,9 +149,9 @@ int main(int argc, char * argv[]){
//check non-ucs2 characters
const glong num_of_chars = g_utf8_strlen(linebuf, -1);
glong len = 0;
- utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
+ ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
if ( len != num_of_chars ) {
- fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf);
+ fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
printf("\n");
continue;
}
@@ -164,7 +164,7 @@ int main(int argc, char * argv[]){
state = CONTEXT_INIT;
bool result = phrase_table.search( 1, sentence, token);
- g_array_append_val( current_utf16, sentence[0]);
+ g_array_append_val( current_ucs4, sentence[0]);
if ( result & SEARCH_OK )
state = CONTEXT_SEGMENTABLE;
else
@@ -178,31 +178,31 @@ int main(int argc, char * argv[]){
next_state = CONTEXT_UNKNOWN;
if ( state == next_state ){
- g_array_append_val(current_utf16, sentence[i]);
+ g_array_append_val(current_ucs4, sentence[i]);
continue;
}
assert ( state != next_state );
if ( state == CONTEXT_SEGMENTABLE )
- deal_with_segmentable(current_utf16);
+ deal_with_segmentable(current_ucs4);
if ( state == CONTEXT_UNKNOWN )
- deal_with_unknown(current_utf16);
+ deal_with_unknown(current_ucs4);
/* save the current character */
- g_array_set_size(current_utf16, 0);
- g_array_append_val(current_utf16, sentence[i]);
+ g_array_set_size(current_ucs4, 0);
+ g_array_append_val(current_ucs4, sentence[i]);
state = next_state;
}
- if ( current_utf16->len ) {
+ if ( current_ucs4->len ) {
/* this seems always true. */
if ( state == CONTEXT_SEGMENTABLE )
- deal_with_segmentable(current_utf16);
+ deal_with_segmentable(current_ucs4);
if ( state == CONTEXT_UNKNOWN )
- deal_with_unknown(current_utf16);
- g_array_set_size(current_utf16, 0);
+ deal_with_unknown(current_ucs4);
+ g_array_set_size(current_ucs4, 0);
}
/* print extra enter */
@@ -214,7 +214,7 @@ int main(int argc, char * argv[]){
g_phrase_lookup = NULL;
/* print enter at file tail */
printf("\n");
- g_array_free(current_utf16, TRUE);
+ g_array_free(current_ucs4, TRUE);
free(linebuf);
return 0;
}
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
index b51982e..448ce33 100644
--- a/utils/segment/spseg.cpp
+++ b/utils/segment/spseg.cpp
@@ -35,7 +35,7 @@
struct SegmentStep{
phrase_token_t m_handle;
- utf16_t * m_phrase;
+ ucs4_t * m_phrase;
size_t m_phrase_len;
//use formula W = number of words. Zero handle means one word.
guint m_nword;
@@ -55,7 +55,7 @@ bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
//Note: do not free phrase, as it is used by strings (array of segment).
bool segment(PhraseLargeTable * phrases, //Lookup Phrase
- utf16_t * phrase,
+ ucs4_t * phrase,
glong phrase_len,
GArray * strings /* Array of Segment *. */){
/* Prepare for shortest path segment dynamic programming. */
@@ -73,7 +73,7 @@ bool segment(PhraseLargeTable * phrases, //Lookup Phrase
size_t nword = step_begin->m_nword;
for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
size_t len = k - i;
- utf16_t * cur_phrase = phrase + i;
+ ucs4_t * cur_phrase = phrase + i;
phrase_token_t token = 0;
int result = phrases->search(len, cur_phrase, token);
@@ -165,7 +165,7 @@ int main(int argc, char * argv[]){
//check non-ucs2 characters
const glong num_of_chars = g_utf8_strlen(linebuf, -1);
glong len = 0;
- utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
+ ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
if ( len != num_of_chars ) {
fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf);
printf("\n");
@@ -179,7 +179,7 @@ int main(int argc, char * argv[]){
//print out the split phrase
for ( glong i = 0; i < strings->len; ++i ) {
SegmentStep * step = &g_array_index(strings, SegmentStep, i);
- char * string = g_utf16_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
+ char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
printf("%s\n", string);
g_free(string);
}