diff options
author | Peng Wu <alexepico@gmail.com> | 2012-05-17 15:01:11 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-05-17 15:01:11 +0800 |
commit | 4797c0419273b71e5fa64ba7a1ee233de7c0ac48 (patch) | |
tree | 222c7be46f7e2bd64bd767273d8b62b98153e041 /utils/training | |
parent | 41286716a6b90e78eb3abe3aa5f1620bc5f0f605 (diff) | |
download | libpinyin-4797c0419273b71e5fa64ba7a1ee233de7c0ac48.tar.gz libpinyin-4797c0419273b71e5fa64ba7a1ee233de7c0ac48.tar.xz libpinyin-4797c0419273b71e5fa64ba7a1ee233de7c0ac48.zip |
update utils/training
Diffstat (limited to 'utils/training')
-rw-r--r-- | utils/training/estimate_interpolation.cpp | 28 | ||||
-rw-r--r-- | utils/training/eval_correction_rate.cpp | 35 | ||||
-rw-r--r-- | utils/training/export_k_mixture_model.cpp | 26 | ||||
-rw-r--r-- | utils/training/gen_ngram.cpp | 37 | ||||
-rw-r--r-- | utils/training/gen_unigram.cpp | 34 |
5 files changed, 95 insertions, 65 deletions
diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp index e62e8c0..5f5abae 100644 --- a/utils/training/estimate_interpolation.cpp +++ b/utils/training/estimate_interpolation.cpp @@ -90,16 +90,24 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram, int main(int argc, char * argv[]){ FacadePhraseIndex phrase_index; - - //gb_char binary file - MemoryChunk * chunk = new MemoryChunk; - chunk->load("gb_char.bin"); - phrase_index.load(1, chunk); - - //gbk_char binary file - chunk = new MemoryChunk; - chunk->load("gbk_char.bin"); - phrase_index.load(2, chunk); + MemoryChunk * chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + g_free(filename); + } Bigram bigram; bigram.attach("bigram.db", ATTACH_READONLY); diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp index 5a0ec77..55fc3b0 100644 --- a/utils/training/eval_correction_rate.cpp +++ b/utils/training/eval_correction_rate.cpp @@ -118,22 +118,33 @@ int main(int argc, char * argv[]){ pinyin_option_t options = USE_TONE; FacadeChewingTable largetable; - MemoryChunk * new_chunk = new MemoryChunk; - new_chunk->load("pinyin_index.bin"); - largetable.load(options, new_chunk, NULL); + MemoryChunk * chunk = new MemoryChunk; + chunk->load("pinyin_index.bin"); + largetable.load(options, chunk, NULL); FacadePhraseIndex phrase_index; - new_chunk = new MemoryChunk; - new_chunk->load("gb_char.bin"); - phrase_index.load(1, new_chunk); - new_chunk = new MemoryChunk; - new_chunk->load("gbk_char.bin"); - phrase_index.load(2, new_chunk); + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + g_free(filename); + } FacadePhraseTable phrases; - new_chunk = new MemoryChunk; - new_chunk->load("phrase_index.bin"); - phrases.load(new_chunk, NULL); + chunk = new MemoryChunk; + chunk->load("phrase_index.bin"); + phrases.load(chunk, NULL); Bigram system_bigram; system_bigram.attach("bigram.db", ATTACH_READONLY); diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp index af4116d..fd20a84 100644 --- a/utils/training/export_k_mixture_model.cpp +++ b/utils/training/export_k_mixture_model.cpp @@ -125,16 +125,24 @@ int main(int argc, char * argv[]){ } FacadePhraseIndex phrase_index; + MemoryChunk * chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } - //gb_char binary file - MemoryChunk * chunk = new MemoryChunk; - chunk->load("gb_char.bin"); - phrase_index.load(1, chunk); - - //gbk_char binary file - chunk = new MemoryChunk; - chunk->load("gbk_char.bin"); - phrase_index.load(2, chunk); + phrase_index.load(i, chunk); + g_free(filename); + } KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READONLY); diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp index be3364d..8c2a31c 100644 --- a/utils/training/gen_ngram.cpp +++ b/utils/training/gen_ngram.cpp @@ -26,8 +26,6 @@ #include <glib.h> #include "pinyin_internal.h" -static PhraseLargeTable * g_phrases = NULL; - void print_help(){ printf("Usage: gen_ngram [--skip-pi-gram-training]\n"); printf(" [--bigram-file <FILENAME>]\n"); @@ -58,23 +56,30 @@ int main(int argc, char * argv[]){ ++i; } - g_phrases = new PhraseLargeTable; - //init phrase lookup + PhraseLargeTable phrases; + /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); - g_phrases->load(chunk); + phrases.load(chunk); FacadePhraseIndex phrase_index; - - //gb_char binary file - chunk = new MemoryChunk; - chunk->load("gb_char.bin"); - phrase_index.load(1, chunk); - - //gbk_char binary file - chunk = new MemoryChunk; - chunk->load("gbk_char.bin"); - phrase_index.load(2, chunk); + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + g_free(filename); + } Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); @@ -93,7 +98,7 @@ int main(int argc, char * argv[]){ phrase_token_t token = 0; if ( 0 != phrase_len ) { - int result = g_phrases->search( phrase_len, phrase, token); + int result = phrases.search( phrase_len, phrase, token); if ( ! (result & SEARCH_OK) ) token = 0; g_free(phrase); diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp index cd938f6..2656647 100644 --- a/utils/training/gen_unigram.cpp +++ b/utils/training/gen_unigram.cpp @@ -26,26 +26,24 @@ /* increase all unigram frequency by a constant. */ int main(int argc, char * argv[]){ - FacadePhraseIndex phrase_index; - - /* gb_char binary file */ - MemoryChunk * chunk = new MemoryChunk; - bool retval = chunk->load("gb_char.bin"); - if (!retval) { - fprintf(stderr, "open gb_char.bin failed!\n"); - exit(ENOENT); - } - phrase_index.load(1, chunk); - - /* gbk_char binary file */ - chunk = new MemoryChunk; - retval = chunk->load("gbk_char.bin"); - if (!retval) { - fprintf(stderr, "open gbk_char.bin failed!\n"); - exit(ENOENT); + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + gchar * filename = g_build_filename("..", "..", "data", + bin_file, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + g_free(filename); } - phrase_index.load(2, chunk); /* Note: please increase the value when corpus size becomes larger. * To avoid zero value when computing unigram frequency in float format. |