diff options
author | Peng Wu <alexepico@gmail.com> | 2012-09-03 17:21:33 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-09-03 17:21:33 +0800 |
commit | aeadca38bc3187b99f106656f2f48481851368ff (patch) | |
tree | fa84097d49536ef1ff4e193ab5d227388b2527b8 | |
parent | 3c430d73871ad46a9994316d579202945e323a26 (diff) | |
download | libpinyin-aeadca38bc3187b99f106656f2f48481851368ff.tar.gz libpinyin-aeadca38bc3187b99f106656f2f48481851368ff.tar.xz libpinyin-aeadca38bc3187b99f106656f2f48481851368ff.zip |
update gen_k_mixture_model.cpp
-rw-r--r-- | utils/training/gen_k_mixture_model.cpp | 33 |
1 files changed, 25 insertions, 8 deletions
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp index 01b9246..c8bc5f2 100644 --- a/utils/training/gen_k_mixture_model.cpp +++ b/utils/training/gen_k_mixture_model.cpp @@ -23,6 +23,7 @@ #include <glib.h> #include <locale.h> #include "pinyin_internal.h" +#include "utils_helper.h" #include "k_mixture_model.h" /* Hash token of Hash token of word count. */ @@ -45,9 +46,15 @@ void print_help(){ } -bool read_document(PhraseLargeTable * phrases, FILE * document, +bool read_document(PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + FILE * document, HashofDocument hash_of_document, HashofUnigram hash_of_unigram){ + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + char * linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; @@ -61,11 +68,15 @@ bool read_document(PhraseLargeTable * phrases, FILE * document, glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); - phrase_token_t token = 0; + phrase_token_t token = null_token; if ( 0 != phrase_len ) { - int search_result = phrases->search( phrase_len, phrase, token ); - if ( ! (search_result & SEARCH_OK) ) - token = 0; + int search_result = phrase_table->search + (phrase_len, phrase, tokens); + int num = get_first_token(tokens, token); + + if ( !(search_result & SEARCH_OK) ) + token = null_token; + g_free(phrase); phrase = NULL; } @@ -129,6 +140,8 @@ bool read_document(PhraseLargeTable * phrases, FILE * document, free(linebuf); + phrase_index->destroy_tokens(tokens); + return true; } @@ -343,10 +356,14 @@ int main(int argc, char * argv[]){ ++i; } - PhraseLargeTable phrases; + PhraseLargeTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); - phrases.load(chunk); + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + if (!load_phrase_index(&phrase_index)) + exit(ENOENT); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); @@ -366,7 +383,7 @@ int main(int argc, char * argv[]){ HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal); - assert(read_document(&phrases, document, + assert(read_document(&phrase_table, &phrase_index, document, hash_of_document, hash_of_unigram)); fclose(document); document = NULL; |