From aeadca38bc3187b99f106656f2f48481851368ff Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Mon, 3 Sep 2012 17:21:33 +0800 Subject: update gen_k_mixture_model.cpp --- utils/training/gen_k_mixture_model.cpp | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp index 01b9246..c8bc5f2 100644 --- a/utils/training/gen_k_mixture_model.cpp +++ b/utils/training/gen_k_mixture_model.cpp @@ -23,6 +23,7 @@ #include #include #include "pinyin_internal.h" +#include "utils_helper.h" #include "k_mixture_model.h" /* Hash token of Hash token of word count. */ @@ -45,9 +46,15 @@ void print_help(){ } -bool read_document(PhraseLargeTable * phrases, FILE * document, +bool read_document(PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + FILE * document, HashofDocument hash_of_document, HashofUnigram hash_of_unigram){ + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + char * linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; @@ -61,11 +68,15 @@ bool read_document(PhraseLargeTable * phrases, FILE * document, glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); - phrase_token_t token = 0; + phrase_token_t token = null_token; if ( 0 != phrase_len ) { - int search_result = phrases->search( phrase_len, phrase, token ); - if ( ! (search_result & SEARCH_OK) ) - token = 0; + int search_result = phrase_table->search + (phrase_len, phrase, tokens); + int num = get_first_token(tokens, token); + + if ( !(search_result & SEARCH_OK) ) + token = null_token; + g_free(phrase); phrase = NULL; } @@ -129,6 +140,8 @@ bool read_document(PhraseLargeTable * phrases, FILE * document, free(linebuf); + phrase_index->destroy_tokens(tokens); + return true; } @@ -343,10 +356,14 @@ int main(int argc, char * argv[]){ ++i; } - PhraseLargeTable phrases; + PhraseLargeTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); - phrases.load(chunk); + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + if (!load_phrase_index(&phrase_index)) + exit(ENOENT); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); @@ -366,7 +383,7 @@ int main(int argc, char * argv[]){ HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal); - assert(read_document(&phrases, document, + assert(read_document(&phrase_table, &phrase_index, document, hash_of_document, hash_of_unigram)); fclose(document); document = NULL; -- cgit