diff options
Diffstat (limited to 'utils')
-rw-r--r-- | utils/training/gen_deleted_ngram.cpp | 21 |
1 files changed, 5 insertions, 16 deletions
diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp
index fe0a9d9..55679ed 100644
--- a/utils/training/gen_deleted_ngram.cpp
+++ b/utils/training/gen_deleted_ngram.cpp
@@ -80,24 +80,13 @@ int main(int argc, char * argv[]){
     while( getline(&linebuf, &size, stdin) ){
         if ( feof(stdin) )
             break;
-        if ( '\n' == linebuf[strlen(linebuf)-1] )
-            linebuf[strlen(linebuf)-1] = '\0';
-
-        glong phrase_len = 0;
-        ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);
-
-        phrase_token_t token = null_token;
-        if ( 0 != phrase_len ) {
-            phrase_index.clear_tokens(tokens);
-            int result = phrase_table.search(phrase_len, phrase, tokens);
-            int num = get_first_token(tokens, token);
-
-            if ( !(result & SEARCH_OK) )
-                token = null_token;
-            g_free(phrase);
-            phrase = NULL;
+
+        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+            linebuf[strlen(linebuf) - 1] = '\0';
         }
+        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
         last_token = cur_token;
         cur_token = token;