summary | refs | log | tree | commit | diff | stats
path: root/utils
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com>, 2012-10-19 11:50:19 +0800
committer: Peng Wu <alexepico@gmail.com>, 2012-10-19 11:50:19 +0800
commit: f1e74f78a7569f5fa566ddae861624026db86563 (patch)
tree: 92df950d5b93bfcf1879e21e9e097db2134fedde /utils
parent: fe5b03bb049d1eda7b447a433060203bb239bba6 (diff)
download: libpinyin-f1e74f78a7569f5fa566ddae861624026db86563.tar.gz
libpinyin-f1e74f78a7569f5fa566ddae861624026db86563.tar.xz
libpinyin-f1e74f78a7569f5fa566ddae861624026db86563.zip
update gen ngram
Diffstat (limited to 'utils')
-rw-r--r-- utils/training/gen_ngram.cpp 19
1 files changed, 5 insertions, 14 deletions
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 983f967..cc68d9e 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -80,22 +80,13 @@ int main(int argc, char * argv[]){
while( getline(&linebuf, &size, stdin) ){
if ( feof(stdin) )
break;
- linebuf[strlen(linebuf)-1] = '\0';
-
- glong phrase_len = 0;
- ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);
-
- phrase_token_t token = null_token;
- if ( 0 != phrase_len ) {
- phrase_index.clear_tokens(tokens);
- int result = phrase_table.search(phrase_len, phrase, tokens);
- int num = get_first_token(tokens, token);
- if ( !(result & SEARCH_OK) )
- token = null_token;
- g_free(phrase);
- phrase = NULL;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
}
+ TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
last_token = cur_token;
cur_token = token;