From 2dad7c6847f49db991f092fa814f1d0660ba1d25 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Fri, 19 Oct 2012 12:01:01 +0800 Subject: update gen k mixture model --- utils/training/gen_k_mixture_model.cpp | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp index c8a8b38..eae75c4 100644 --- a/utils/training/gen_k_mixture_model.cpp +++ b/utils/training/gen_k_mixture_model.cpp @@ -62,26 +62,13 @@ bool read_document(PhraseLargeTable2 * phrase_table, while ( getline(&linebuf, &size, document) ){ if ( feof(document) ) break; - /* Note: check '\n' here? */ - linebuf[strlen(linebuf) - 1] = '\0'; - glong phrase_len = 0; - ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); - - phrase_token_t token = null_token; - if ( 0 != phrase_len ) { - phrase_index->clear_tokens(tokens); - int search_result = phrase_table->search - (phrase_len, phrase, tokens); - int num = get_first_token(tokens, token); - - if ( !(search_result & SEARCH_OK) ) - token = null_token; - - g_free(phrase); - phrase = NULL; + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; } + TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); + last_token = cur_token; cur_token = token; -- cgit