diff options
author | Peng Wu <alexepico@gmail.com> | 2012-10-19 11:50:19 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-10-19 11:50:19 +0800 |
commit | f1e74f78a7569f5fa566ddae861624026db86563 (patch) | |
tree | 92df950d5b93bfcf1879e21e9e097db2134fedde /utils/training | |
parent | fe5b03bb049d1eda7b447a433060203bb239bba6 (diff) | |
download | libpinyin-f1e74f78a7569f5fa566ddae861624026db86563.tar.gz libpinyin-f1e74f78a7569f5fa566ddae861624026db86563.tar.xz libpinyin-f1e74f78a7569f5fa566ddae861624026db86563.zip |
update gen ngram
Diffstat (limited to 'utils/training')
-rw-r--r-- | utils/training/gen_ngram.cpp | 19 |
1 files changed, 5 insertions, 14 deletions
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 983f967..cc68d9e 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -80,22 +80,13 @@ int main(int argc, char * argv[]){
     while( getline(&linebuf, &size, stdin) ){
         if ( feof(stdin) )
             break;
-        linebuf[strlen(linebuf)-1] = '\0';
-
-        glong phrase_len = 0;
-        ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);
-
-        phrase_token_t token = null_token;
-        if ( 0 != phrase_len ) {
-            phrase_index.clear_tokens(tokens);
-            int result = phrase_table.search(phrase_len, phrase, tokens);
-            int num = get_first_token(tokens, token);
-            if ( !(result & SEARCH_OK) )
-                token = null_token;
-            g_free(phrase);
-            phrase = NULL;
+
+        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+            linebuf[strlen(linebuf) - 1] = '\0';
         }
 
+        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
         last_token = cur_token;
         cur_token = token;
 