summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2012-05-17 15:01:11 +0800
committer Peng Wu <alexepico@gmail.com> 2012-05-17 15:01:11 +0800
commit 4797c0419273b71e5fa64ba7a1ee233de7c0ac48 (patch)
tree 222c7be46f7e2bd64bd767273d8b62b98153e041
parent 41286716a6b90e78eb3abe3aa5f1620bc5f0f605 (diff)
download libpinyin-4797c0419273b71e5fa64ba7a1ee233de7c0ac48.tar.gz
libpinyin-4797c0419273b71e5fa64ba7a1ee233de7c0ac48.tar.xz
libpinyin-4797c0419273b71e5fa64ba7a1ee233de7c0ac48.zip
update utils/training
-rw-r--r-- utils/training/estimate_interpolation.cpp 28
-rw-r--r-- utils/training/eval_correction_rate.cpp 35
-rw-r--r-- utils/training/export_k_mixture_model.cpp 26
-rw-r--r-- utils/training/gen_ngram.cpp 37
-rw-r--r-- utils/training/gen_unigram.cpp 34
5 files changed, 95 insertions, 65 deletions
diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp
index e62e8c0..5f5abae 100644
--- a/utils/training/estimate_interpolation.cpp
+++ b/utils/training/estimate_interpolation.cpp
@@ -90,16 +90,24 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram,
int main(int argc, char * argv[]){
FacadePhraseIndex phrase_index;
-
- //gb_char binary file
- MemoryChunk * chunk = new MemoryChunk;
- chunk->load("gb_char.bin");
- phrase_index.load(1, chunk);
-
- //gbk_char binary file
- chunk = new MemoryChunk;
- chunk->load("gbk_char.bin");
- phrase_index.load(2, chunk);
+ MemoryChunk * chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const char * bin_file = pinyin_phrase_files[i];
+ if (NULL == bin_file)
+ continue;
+
+ gchar * filename = g_build_filename("..", "..", "data",
+ bin_file, NULL);
+ chunk = new MemoryChunk;
+ bool retval = chunk->load(filename);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", bin_file);
+ exit(ENOENT);
+ }
+
+ phrase_index.load(i, chunk);
+ g_free(filename);
+ }
Bigram bigram;
bigram.attach("bigram.db", ATTACH_READONLY);
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp
index 5a0ec77..55fc3b0 100644
--- a/utils/training/eval_correction_rate.cpp
+++ b/utils/training/eval_correction_rate.cpp
@@ -118,22 +118,33 @@ int main(int argc, char * argv[]){
pinyin_option_t options = USE_TONE;
FacadeChewingTable largetable;
- MemoryChunk * new_chunk = new MemoryChunk;
- new_chunk->load("pinyin_index.bin");
- largetable.load(options, new_chunk, NULL);
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load("pinyin_index.bin");
+ largetable.load(options, chunk, NULL);
FacadePhraseIndex phrase_index;
- new_chunk = new MemoryChunk;
- new_chunk->load("gb_char.bin");
- phrase_index.load(1, new_chunk);
- new_chunk = new MemoryChunk;
- new_chunk->load("gbk_char.bin");
- phrase_index.load(2, new_chunk);
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const char * bin_file = pinyin_phrase_files[i];
+ if (NULL == bin_file)
+ continue;
+
+ gchar * filename = g_build_filename("..", "..", "data",
+ bin_file, NULL);
+ chunk = new MemoryChunk;
+ bool retval = chunk->load(filename);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", bin_file);
+ exit(ENOENT);
+ }
+
+ phrase_index.load(i, chunk);
+ g_free(filename);
+ }
FacadePhraseTable phrases;
- new_chunk = new MemoryChunk;
- new_chunk->load("phrase_index.bin");
- phrases.load(new_chunk, NULL);
+ chunk = new MemoryChunk;
+ chunk->load("phrase_index.bin");
+ phrases.load(chunk, NULL);
Bigram system_bigram;
system_bigram.attach("bigram.db", ATTACH_READONLY);
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
index af4116d..fd20a84 100644
--- a/utils/training/export_k_mixture_model.cpp
+++ b/utils/training/export_k_mixture_model.cpp
@@ -125,16 +125,24 @@ int main(int argc, char * argv[]){
}
FacadePhraseIndex phrase_index;
+ MemoryChunk * chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const char * bin_file = pinyin_phrase_files[i];
+ if (NULL == bin_file)
+ continue;
+
+ gchar * filename = g_build_filename("..", "..", "data",
+ bin_file, NULL);
+ chunk = new MemoryChunk;
+ bool retval = chunk->load(filename);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", bin_file);
+ exit(ENOENT);
+ }
- //gb_char binary file
- MemoryChunk * chunk = new MemoryChunk;
- chunk->load("gb_char.bin");
- phrase_index.load(1, chunk);
-
- //gbk_char binary file
- chunk = new MemoryChunk;
- chunk->load("gbk_char.bin");
- phrase_index.load(2, chunk);
+ phrase_index.load(i, chunk);
+ g_free(filename);
+ }
KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index be3364d..8c2a31c 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -26,8 +26,6 @@
#include <glib.h>
#include "pinyin_internal.h"
-static PhraseLargeTable * g_phrases = NULL;
-
void print_help(){
printf("Usage: gen_ngram [--skip-pi-gram-training]\n");
printf(" [--bigram-file <FILENAME>]\n");
@@ -58,23 +56,30 @@ int main(int argc, char * argv[]){
++i;
}
- g_phrases = new PhraseLargeTable;
- //init phrase lookup
+ PhraseLargeTable phrases;
+ /* init phrase table */
MemoryChunk * chunk = new MemoryChunk;
chunk->load("phrase_index.bin");
- g_phrases->load(chunk);
+ phrases.load(chunk);
FacadePhraseIndex phrase_index;
-
- //gb_char binary file
- chunk = new MemoryChunk;
- chunk->load("gb_char.bin");
- phrase_index.load(1, chunk);
-
- //gbk_char binary file
- chunk = new MemoryChunk;
- chunk->load("gbk_char.bin");
- phrase_index.load(2, chunk);
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const char * bin_file = pinyin_phrase_files[i];
+ if (NULL == bin_file)
+ continue;
+
+ gchar * filename = g_build_filename("..", "..", "data",
+ bin_file, NULL);
+ chunk = new MemoryChunk;
+ bool retval = chunk->load(filename);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", bin_file);
+ exit(ENOENT);
+ }
+
+ phrase_index.load(i, chunk);
+ g_free(filename);
+ }
Bigram bigram;
bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
@@ -93,7 +98,7 @@ int main(int argc, char * argv[]){
phrase_token_t token = 0;
if ( 0 != phrase_len ) {
- int result = g_phrases->search( phrase_len, phrase, token);
+ int result = phrases.search( phrase_len, phrase, token);
if ( ! (result & SEARCH_OK) )
token = 0;
g_free(phrase);
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
index cd938f6..2656647 100644
--- a/utils/training/gen_unigram.cpp
+++ b/utils/training/gen_unigram.cpp
@@ -26,26 +26,24 @@
/* increase all unigram frequency by a constant. */
int main(int argc, char * argv[]){
-
FacadePhraseIndex phrase_index;
-
- /* gb_char binary file */
- MemoryChunk * chunk = new MemoryChunk;
- bool retval = chunk->load("gb_char.bin");
- if (!retval) {
- fprintf(stderr, "open gb_char.bin failed!\n");
- exit(ENOENT);
- }
- phrase_index.load(1, chunk);
-
- /* gbk_char binary file */
- chunk = new MemoryChunk;
- retval = chunk->load("gbk_char.bin");
- if (!retval) {
- fprintf(stderr, "open gbk_char.bin failed!\n");
- exit(ENOENT);
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const char * bin_file = pinyin_phrase_files[i];
+ if (NULL == bin_file)
+ continue;
+
+ gchar * filename = g_build_filename("..", "..", "data",
+ bin_file, NULL);
+ chunk = new MemoryChunk;
+ bool retval = chunk->load(filename);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", bin_file);
+ exit(ENOENT);
+ }
+
+ phrase_index.load(i, chunk);
+ g_free(filename);
}
- phrase_index.load(2, chunk);
/* Note: please increase the value when corpus size becomes larger.
* To avoid zero value when computing unigram frequency in float format.