summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2010-11-08 16:44:32 +0800
committerPeng Wu <alexepico@gmail.com>2010-11-08 16:44:32 +0800
commitf5f8fe94ffcda84c5a905fcc4a9dd452d8a12bbf (patch)
tree26571cf7df8e7135eb3254afd516043cc8cb37c9
parentd7b59e84a518e06cfe04f21bba90dddcb4121ba2 (diff)
downloadlibpinyin-f5f8fe94ffcda84c5a905fcc4a9dd452d8a12bbf.tar.gz
libpinyin-f5f8fe94ffcda84c5a905fcc4a9dd452d8a12bbf.tar.xz
libpinyin-f5f8fe94ffcda84c5a905fcc4a9dd452d8a12bbf.zip
write n-gram segment.
-rw-r--r--utils/segment/ngseg.cpp103
1 files changed, 103 insertions, 0 deletions
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
index 0ccc21a..dd8a704 100644
--- a/utils/segment/ngseg.cpp
+++ b/utils/segment/ngseg.cpp
@@ -42,11 +42,38 @@ FacadePhraseIndex * g_phrase_index = NULL;
Bigram * g_bigram = NULL;
PhraseLookup * g_phrase_lookup = NULL;
+enum CONTEXT_STATE{ /* classification of the run of characters main() is currently accumulating */
+ CONTEXT_INIT, /* value assigned at the start of every input line, before the first character is looked up */
+ CONTEXT_SEGMENTABLE, /* phrase-table lookup of the character reported SEARCH_OK; run goes to deal_with_segmentable() */
+ CONTEXT_UNKNOWN /* lookup did not report SEARCH_OK; run goes to deal_with_unknown() */
+};
+
/* Write the one-line usage summary to stdout, then terminate the process
 * with exit status 1. Never returns. */
void print_help(){
    fputs("Usage: ngseg [--generate-extra-enter]\n", stdout);
    exit(1);
}
+/* Handle one run of characters that were all found in the phrase table.
+ *
+ * Passes the UTF-16 code units stored in current_utf16 to
+ * g_phrase_lookup->get_best_match(), converts the resulting tokens with
+ * g_phrase_lookup->convert_to_utf8() (the "\n" argument is presumably the
+ * separator placed between phrases -- confirm against PhraseLookup), and
+ * prints the converted text followed by a new-line on stdout.
+ *
+ * The temporary token array and the converted string are released before
+ * returning. The return values of both lookup calls are ignored and the
+ * function always returns true.
+ */
+bool deal_with_segmentable(GArray * current_utf16){
+    utf16_t * code_units = (utf16_t *) current_utf16->data;
+    MatchResults tokens = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    char * utf8_text = NULL;
+
+    g_phrase_lookup->get_best_match(current_utf16->len, code_units, tokens);
+    g_phrase_lookup->convert_to_utf8(tokens, "\n", utf8_text);
+    printf("%s\n", utf8_text);
+
+    g_free(utf8_text);
+    g_array_free(tokens, TRUE);
+    return true;
+}
+
+/* Handle one run of characters that were not found in the phrase table.
+ *
+ * Converts the UTF-16 code units stored in current_utf16 back to UTF-8 and
+ * prints them unchanged, followed by a new-line, on stdout.
+ *
+ * Returns true when the run was printed. Returns false, after reporting on
+ * stderr, when g_utf16_to_utf8() fails: it returns NULL in that case, and
+ * handing NULL to printf("%s") would be undefined behaviour.
+ */
+bool deal_with_unknown(GArray * current_utf16){
+    char * result_string = g_utf16_to_utf8
+        ( (utf16_t *) current_utf16->data, current_utf16->len,
+          NULL, NULL, NULL);
+    if ( NULL == result_string ) {
+        fprintf(stderr, "utf16 to utf8 conversion failed.\n");
+        return false;
+    }
+
+    printf("%s\n", result_string);
+    g_free(result_string);
+    return true;
+}
+
+
int main(int argc, char * argv[]){
int i = 1;
bool gen_extra_enter = false;
@@ -86,5 +113,81 @@ int main(int argc, char * argv[]){
g_bigram);
+    CONTEXT_STATE state, next_state;
+    /* accumulates the UTF-16 code units of the run currently being built */
+    GArray * current_utf16 = g_array_new(TRUE, TRUE, sizeof(utf16_t));
+    phrase_token_t token = null_token;
+
+    /* Read stdin line by line. Every line is cut into maximal runs of
+     * characters that share the same lookup outcome (SEARCH_OK or not);
+     * each finished run is handed to deal_with_segmentable() or
+     * deal_with_unknown(). */
+    char * linebuf = NULL;
+    size_t size = 0;
+    ssize_t read;
+    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
+        /* Strip the line terminator only when it is really there: the last
+         * line of the input may end without '\n', and blindly cutting the
+         * final byte would drop a real character. */
+        if ( read > 0 && '\n' == linebuf[read - 1] )
+            linebuf[read - 1] = '\0';
+
+        //check non-ucs2 characters
+        /* A character outside the BMP needs two UTF-16 units, so the unit
+         * count then differs from the character count. */
+        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+        glong len = 0;
+        utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
+        if ( NULL == sentence || len != num_of_chars ) {
+            fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf);
+            g_free(sentence); /* g_free(NULL) is a no-op */
+            continue;
+        }
+
+        /* only new-line persists. */
+        if ( 0 == num_of_chars ) {
+            printf("\n");
+            g_free(sentence);
+            continue;
+        }
+
+        state = CONTEXT_INIT;
+        /* Keep the lookup result as an int: storing it in a bool would
+         * collapse every non-zero flag to 1 before the SEARCH_OK bit is
+         * tested. */
+        int result = g_phrase_table->search( 1, sentence, token);
+        g_array_append_val( current_utf16, sentence[0]);
+        if ( result & SEARCH_OK )
+            state = CONTEXT_SEGMENTABLE;
+        else
+            state = CONTEXT_UNKNOWN;
+
+        for ( glong pos = 1; pos < num_of_chars; ++pos) {
+            result = g_phrase_table->search( 1, sentence + pos, token);
+            if ( result & SEARCH_OK )
+                next_state = CONTEXT_SEGMENTABLE;
+            else
+                next_state = CONTEXT_UNKNOWN;
+
+            /* same kind of character: extend the current run */
+            if ( state == next_state ){
+                g_array_append_val(current_utf16, sentence[pos]);
+                continue;
+            }
+
+            /* the kind changed: emit the finished run */
+            assert ( state != next_state );
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(current_utf16);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_utf16);
+
+            /* save the current character */
+            g_array_set_size(current_utf16, 0);
+            g_array_append_val(current_utf16, sentence[pos]);
+            state = next_state;
+        }
+
+        /* emit the last run of the line */
+        if ( current_utf16->len ) {
+            /* this seems always true. */
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(current_utf16);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_utf16);
+            g_array_set_size(current_utf16, 0);
+        }
+
+        /* print extra enter */
+        if ( gen_extra_enter )
+            printf("\n");
+
+        /* g_utf8_to_utf16() returns a newly allocated buffer per line */
+        g_free(sentence);
+    }
+
+    g_array_free(current_utf16, TRUE);
+    free(linebuf);
return 0;
}