From 4797c0419273b71e5fa64ba7a1ee233de7c0ac48 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 17 May 2012 15:01:11 +0800 Subject: update utils/training --- utils/training/estimate_interpolation.cpp | 28 ++++++++++++++--------- utils/training/eval_correction_rate.cpp | 35 +++++++++++++++++++---------- utils/training/export_k_mixture_model.cpp | 26 ++++++++++++++-------- utils/training/gen_ngram.cpp | 37 ++++++++++++++++++------------- utils/training/gen_unigram.cpp | 34 +++++++++++++--------------- 5 files changed, 95 insertions(+), 65 deletions(-) (limited to 'utils/training') diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp index e62e8c0..5f5abae 100644 --- a/utils/training/estimate_interpolation.cpp +++ b/utils/training/estimate_interpolation.cpp @@ -90,16 +90,24 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram, int main(int argc, char * argv[]){ FacadePhraseIndex phrase_index; - - //gb_char binary file - MemoryChunk * chunk = new MemoryChunk; - chunk->load("gb_char.bin"); - phrase_index.load(1, chunk); - - //gbk_char binary file - chunk = new MemoryChunk; - chunk->load("gbk_char.bin"); - phrase_index.load(2, chunk); + MemoryChunk * chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + g_free(filename); + } Bigram bigram; bigram.attach("bigram.db", ATTACH_READONLY); diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp index 5a0ec77..55fc3b0 100644 --- a/utils/training/eval_correction_rate.cpp +++ b/utils/training/eval_correction_rate.cpp @@ -118,22 +118,33 @@ int main(int argc, char * argv[]){ pinyin_option_t options = USE_TONE; FacadeChewingTable largetable; - MemoryChunk * new_chunk = new MemoryChunk; - new_chunk->load("pinyin_index.bin"); - largetable.load(options, new_chunk, NULL); + MemoryChunk * chunk = new MemoryChunk; + chunk->load("pinyin_index.bin"); + largetable.load(options, chunk, NULL); FacadePhraseIndex phrase_index; - new_chunk = new MemoryChunk; - new_chunk->load("gb_char.bin"); - phrase_index.load(1, new_chunk); - new_chunk = new MemoryChunk; - new_chunk->load("gbk_char.bin"); - phrase_index.load(2, new_chunk); + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + g_free(filename); + } FacadePhraseTable phrases; - new_chunk = new MemoryChunk; - new_chunk->load("phrase_index.bin"); - phrases.load(new_chunk, NULL); + chunk = new MemoryChunk; + chunk->load("phrase_index.bin"); + phrases.load(chunk, NULL); Bigram system_bigram; system_bigram.attach("bigram.db", ATTACH_READONLY); diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp index af4116d..fd20a84 100644 --- a/utils/training/export_k_mixture_model.cpp +++ b/utils/training/export_k_mixture_model.cpp @@ -125,16 +125,24 @@ int main(int argc, char * argv[]){ } FacadePhraseIndex phrase_index; + MemoryChunk * chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } - //gb_char binary file - MemoryChunk * chunk = new MemoryChunk; - chunk->load("gb_char.bin"); - phrase_index.load(1, chunk); - - //gbk_char binary file - chunk = new MemoryChunk; - chunk->load("gbk_char.bin"); - phrase_index.load(2, chunk); + phrase_index.load(i, chunk); + g_free(filename); + } KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READONLY); diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp index be3364d..8c2a31c 100644 --- a/utils/training/gen_ngram.cpp +++ b/utils/training/gen_ngram.cpp @@ -26,8 +26,6 @@ #include #include "pinyin_internal.h" -static PhraseLargeTable * g_phrases = NULL; - void print_help(){ printf("Usage: gen_ngram [--skip-pi-gram-training]\n"); printf(" [--bigram-file ]\n"); @@ -58,23 +56,30 @@ int main(int argc, char * argv[]){ ++i; } - g_phrases = new PhraseLargeTable; - //init phrase lookup + PhraseLargeTable phrases; + /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); - g_phrases->load(chunk); + phrases.load(chunk); FacadePhraseIndex phrase_index; - - //gb_char binary file - chunk = new MemoryChunk; - chunk->load("gb_char.bin"); - phrase_index.load(1, chunk); - - //gbk_char binary file - chunk = new MemoryChunk; - chunk->load("gbk_char.bin"); - phrase_index.load(2, chunk); + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + g_free(filename); + } Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); @@ -93,7 +98,7 @@ int main(int argc, char * argv[]){ phrase_token_t token = 0; if ( 0 != phrase_len ) { - int result = g_phrases->search( phrase_len, phrase, token); + int result = phrases.search( phrase_len, phrase, token); if ( ! (result & SEARCH_OK) ) token = 0; g_free(phrase); diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp index cd938f6..2656647 100644 --- a/utils/training/gen_unigram.cpp +++ b/utils/training/gen_unigram.cpp @@ -26,26 +26,24 @@ /* increase all unigram frequency by a constant. */ int main(int argc, char * argv[]){ - FacadePhraseIndex phrase_index; - - /* gb_char binary file */ - MemoryChunk * chunk = new MemoryChunk; - bool retval = chunk->load("gb_char.bin"); - if (!retval) { - fprintf(stderr, "open gb_char.bin failed!\n"); - exit(ENOENT); - } - phrase_index.load(1, chunk); - - /* gbk_char binary file */ - chunk = new MemoryChunk; - retval = chunk->load("gbk_char.bin"); - if (!retval) { - fprintf(stderr, "open gbk_char.bin failed!\n"); - exit(ENOENT); + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + g_free(filename); } - phrase_index.load(2, chunk); /* Note: please increase the value when corpus size becomes larger. * To avoid zero value when computing unigram frequency in float format. -- cgit