From f1e74f78a7569f5fa566ddae861624026db86563 Mon Sep 17 00:00:00 2001
From: Peng Wu <alexepico@gmail.com>
Date: Fri, 19 Oct 2012 11:50:19 +0800
Subject: update gen ngram

---
 utils/training/gen_ngram.cpp | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 983f967..cc68d9e 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -80,22 +80,13 @@ int main(int argc, char * argv[]){
     while( getline(&linebuf, &size, stdin) ){
 	if ( feof(stdin) )
 	    break;
-        linebuf[strlen(linebuf)-1] = '\0';
-
-        glong phrase_len = 0;
-        ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);
-
-	phrase_token_t token = null_token;
-        if ( 0 != phrase_len ) {
-            phrase_index.clear_tokens(tokens);
-            int result = phrase_table.search(phrase_len, phrase, tokens);
-            int num = get_first_token(tokens, token);
-            if ( !(result & SEARCH_OK) )
-                token = null_token;
-            g_free(phrase);
-            phrase = NULL;
+
+        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+            linebuf[strlen(linebuf) - 1] = '\0';
         }
 
+        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
 	last_token = cur_token;
 	cur_token = token;
 
-- 
cgit