diff options
author | Peng Wu <alexepico@gmail.com> | 2011-05-10 15:05:16 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-05-10 15:12:36 +0800 |
commit | e101e93196ba381850c5e33f9f9151616bb7c87d (patch) | |
tree | 0e6cbbd7382a297cfafe94d02f719d12ea4dbc94 | |
parent | 8f0f9f33716e9c81a74f2ff8bb77eb32ec8cd41a (diff) | |
download | libpinyin-e101e93196ba381850c5e33f9f9151616bb7c87d.tar.gz libpinyin-e101e93196ba381850c5e33f9f9151616bb7c87d.tar.xz libpinyin-e101e93196ba381850c5e33f9f9151616bb7c87d.zip |
add cmd line options to gen k mixture model
-rw-r--r-- | utils/training/gen_deleted_ngram.cpp | 6 | ||||
-rw-r--r-- | utils/training/gen_k_mixture_model.cpp | 80 | ||||
-rw-r--r-- | utils/training/prune_k_mixture_model.cpp | 1 |
3 files changed, 70 insertions, 17 deletions
diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp
index cb1c4a0..1f098c8 100644
--- a/utils/training/gen_deleted_ngram.cpp
+++ b/utils/training/gen_deleted_ngram.cpp
@@ -43,15 +43,15 @@ int main(int argc, char * argv[]){
         if ( strcmp("--help", argv[i]) == 0){
             print_help();
             exit(0);
-        }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
+        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
             train_pi_gram = false;
-        }else if ( strcmp("--deleted-bigram-file", argv[i]) == 0){
+        } else if ( strcmp("--deleted-bigram-file", argv[i]) == 0){
             if ( ++i >= argc ) {
                 print_help();
                 exit(EINVAL);
             }
             bigram_filename = argv[i];
-        }else{
+        } else {
             print_help();
             exit(EINVAL);
         }
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
index 3d5d8c4..d0b327a 100644
--- a/utils/training/gen_k_mixture_model.cpp
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -20,9 +20,9 @@
  */
 
 
-
-#include <glib.h>
 #include "pinyin.h"
+#include <glib.h>
+#include <locale.h>
 #include "k_mixture_model.h"
 
 typedef GHashTable * HashofWordPair;
@@ -34,11 +34,11 @@ static PhraseLargeTable * g_phrases = NULL;
 static KMixtureModelBigram * g_k_mixture_model = NULL;
 static guint32 g_maximum_occurs = 20;
 static parameter_t g_maximum_increase_rates = 3.;
+static bool g_train_pi_gram = true;
+
 
 void print_help(){
     printf("gen_k_mixture_model [--skip-pi-gram-training]\n");
-    printf(" [--skip-bi-gram-training]\n");
-    printf(" [--skip-k-mixture-model-training]\n");
     printf(" [--maximum-ocurrs-allowed <INT>]\n");
     printf(" [--maximum-increase-rates-allowed <FLOAT>]\n");
     printf(" [--k-mixture-model-file <FILENAME>]\n");
@@ -178,6 +178,11 @@ static void train_single_gram_wrapper(gpointer key, gpointer value,
     phrase_token_t token = GPOINTER_TO_UINT(key);
     guint32 delta = 0;
 
+    if ( null_token == token ){
+        if ( !g_train_pi_gram )
+            return;
+    }
+
     KMixtureModelSingleGram * single_gram = NULL;
     bool exists = g_k_mixture_model->load(token, single_gram);
     if ( exists ){
@@ -202,16 +207,40 @@ static void train_single_gram_wrapper(gpointer key, gpointer value,
     delete single_gram;
 }
 
-bool train_document(){
-    g_hash_table_foreach(g_hash_of_document, train_single_gram_wrapper, NULL);
-    return true;
-}
-
 int main(int argc, char * argv[]){
+    int i = 1;
     const char * k_mixture_model_filename = NULL;
 
-    g_hash_of_document = g_hash_table_new_full
-        (g_int_hash, g_int_equal, NULL, (GDestroyNotify)g_hash_table_unref);
+    setlocale(LC_ALL, "");
+    while ( i < argc ){
+        if ( strcmp("--help", argv[i]) == 0 ){
+            print_help();
+            exit(0);
+        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
+            g_train_pi_gram = false;
+        } else if ( strcmp("--maximum-ocurrs-allowed", argv[i]) == 0 ){
+            if ( ++i >= argc ){
+                print_help();
+                exit(EINVAL);
+            }
+            g_maximum_occurs = atoi(argv[i]);
+        } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){
+            if ( ++i >= argc ){
+                print_help();
+                exit(EINVAL);
+            }
+            g_maximum_increase_rates = atof(argv[i]);
+        } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){
+            if ( ++i >= argc ){
+                print_help();
+                exit(EINVAL);
+            }
+            k_mixture_model_filename = argv[i];
+        } else {
+            break;
+        }
+        ++i;
+    }
 
     g_phrases = new PhraseLargeTable;
     MemoryChunk * chunk = new MemoryChunk;
@@ -221,10 +250,33 @@ int main(int argc, char * argv[]){
     g_k_mixture_model = new KMixtureModelBigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
     g_k_mixture_model->attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
 
-    assert(convert_document_to_hash(stdin));
-    assert(train_document());
+    while ( i < argc ){
+        const char * filename = argv[i];
+        FILE * document = fopen(filename, "r");
+        if ( NULL == document ){
+            int err_saved = errno;
+            fprintf(stderr, "can't open file: %s.\n", filename);
+            fprintf(stderr, "error:%s.\n", strerror(err_saved));
+            exit(err_saved);
+        }
+
+        g_hash_of_document = g_hash_table_new_full
+            (g_int_hash, g_int_equal, NULL,
+             (GDestroyNotify)g_hash_table_unref);
+
+        assert(convert_document_to_hash(document));
+        fclose(document);
+
+        /* train the document, and convert it to k mixture model. */
+        g_hash_table_foreach(g_hash_of_document,
+                             train_single_gram_wrapper, NULL);
+
+        g_hash_table_unref(g_hash_of_document);
+        g_hash_of_document = NULL;
+
+        ++i;
+    }
 
-    g_hash_table_unref(g_hash_of_document);
     delete g_phrases;
     delete g_k_mixture_model;
 
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp
index 8845648..64c7f65 100644
--- a/utils/training/prune_k_mixture_model.cpp
+++ b/utils/training/prune_k_mixture_model.cpp
@@ -101,6 +101,7 @@ int main(int argc, char * argv[]){
         } else {
             bigram_filename = argv[i];
         }
+        ++i;
     }
 
     /* TODO: magic header signature check here. */
|