From f1e74f78a7569f5fa566ddae861624026db86563 Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Fri, 19 Oct 2012 11:50:19 +0800
Subject: update gen ngram

---
 utils/training/gen_ngram.cpp | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'utils/training')

diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 983f967..cc68d9e 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -80,22 +80,13 @@ int main(int argc, char * argv[]){
     while( getline(&linebuf, &size, stdin) ){
         if ( feof(stdin) )
             break;
-        linebuf[strlen(linebuf)-1] = '\0';
-
-        glong phrase_len = 0;
-        ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);
-
-        phrase_token_t token = null_token;
-        if ( 0 != phrase_len ) {
-            phrase_index.clear_tokens(tokens);
-            int result = phrase_table.search(phrase_len, phrase, tokens);
-            int num = get_first_token(tokens, token);
-            if ( !(result & SEARCH_OK) )
-                token = null_token;
-            g_free(phrase);
-            phrase = NULL;
+
+        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+            linebuf[strlen(linebuf) - 1] = '\0';
         }
 
+        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
         last_token = cur_token;
         cur_token = token;
 
-- cgit