summaryrefslogtreecommitdiffstats
path: root/utils
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-05-10 15:05:16 +0800
committerPeng Wu <alexepico@gmail.com>2011-05-10 15:12:36 +0800
commite101e93196ba381850c5e33f9f9151616bb7c87d (patch)
tree0e6cbbd7382a297cfafe94d02f719d12ea4dbc94 /utils
parent8f0f9f33716e9c81a74f2ff8bb77eb32ec8cd41a (diff)
downloadlibpinyin-e101e93196ba381850c5e33f9f9151616bb7c87d.tar.gz
libpinyin-e101e93196ba381850c5e33f9f9151616bb7c87d.tar.xz
libpinyin-e101e93196ba381850c5e33f9f9151616bb7c87d.zip
add cmd line options to gen k mixture model
Diffstat (limited to 'utils')
-rw-r--r--utils/training/gen_deleted_ngram.cpp6
-rw-r--r--utils/training/gen_k_mixture_model.cpp80
-rw-r--r--utils/training/prune_k_mixture_model.cpp1
3 files changed, 70 insertions, 17 deletions
diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp
index cb1c4a0..1f098c8 100644
--- a/utils/training/gen_deleted_ngram.cpp
+++ b/utils/training/gen_deleted_ngram.cpp
@@ -43,15 +43,15 @@ int main(int argc, char * argv[]){
if ( strcmp("--help", argv[i]) == 0){
print_help();
exit(0);
- }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
+ } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
train_pi_gram = false;
- }else if ( strcmp("--deleted-bigram-file", argv[i]) == 0){
+ } else if ( strcmp("--deleted-bigram-file", argv[i]) == 0){
if ( ++i >= argc ) {
print_help();
exit(EINVAL);
}
bigram_filename = argv[i];
- }else{
+ } else {
print_help();
exit(EINVAL);
}
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
index 3d5d8c4..d0b327a 100644
--- a/utils/training/gen_k_mixture_model.cpp
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -20,9 +20,9 @@
*/
-
-#include <glib.h>
#include "pinyin.h"
+#include <glib.h>
+#include <locale.h>
#include "k_mixture_model.h"
typedef GHashTable * HashofWordPair;
@@ -34,11 +34,11 @@ static PhraseLargeTable * g_phrases = NULL;
static KMixtureModelBigram * g_k_mixture_model = NULL;
static guint32 g_maximum_occurs = 20;
static parameter_t g_maximum_increase_rates = 3.;
+static bool g_train_pi_gram = true;
+
void print_help(){
printf("gen_k_mixture_model [--skip-pi-gram-training]\n");
- printf(" [--skip-bi-gram-training]\n");
- printf(" [--skip-k-mixture-model-training]\n");
printf(" [--maximum-ocurrs-allowed <INT>]\n");
printf(" [--maximum-increase-rates-allowed <FLOAT>]\n");
printf(" [--k-mixture-model-file <FILENAME>]\n");
@@ -178,6 +178,11 @@ static void train_single_gram_wrapper(gpointer key, gpointer value,
phrase_token_t token = GPOINTER_TO_UINT(key);
guint32 delta = 0;
+ if ( null_token == token ){
+ if ( !g_train_pi_gram )
+ return;
+ }
+
KMixtureModelSingleGram * single_gram = NULL;
bool exists = g_k_mixture_model->load(token, single_gram);
if ( exists ){
@@ -202,16 +207,40 @@ static void train_single_gram_wrapper(gpointer key, gpointer value,
delete single_gram;
}
-bool train_document(){
- g_hash_table_foreach(g_hash_of_document, train_single_gram_wrapper, NULL);
- return true;
-}
-
int main(int argc, char * argv[]){
+ int i = 1;
const char * k_mixture_model_filename = NULL;
- g_hash_of_document = g_hash_table_new_full
- (g_int_hash, g_int_equal, NULL, (GDestroyNotify)g_hash_table_unref);
+ setlocale(LC_ALL, "");
+ while ( i < argc ){
+ if ( strcmp("--help", argv[i]) == 0 ){
+ print_help();
+ exit(0);
+ } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
+ g_train_pi_gram = false;
+ } else if ( strcmp("--maximum-ocurrs-allowed", argv[i]) == 0 ){
+ if ( ++i >= argc ){
+ print_help();
+ exit(EINVAL);
+ }
+ g_maximum_occurs = atoi(argv[i]);
+ } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){
+ if ( ++i >= argc ){
+ print_help();
+ exit(EINVAL);
+ }
+ g_maximum_increase_rates = atof(argv[i]);
+ } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){
+ if ( ++i >= argc ){
+ print_help();
+ exit(EINVAL);
+ }
+ k_mixture_model_filename = argv[i];
+ } else {
+ break;
+ }
+ ++i;
+ }
g_phrases = new PhraseLargeTable;
MemoryChunk * chunk = new MemoryChunk;
@@ -221,10 +250,33 @@ int main(int argc, char * argv[]){
g_k_mixture_model = new KMixtureModelBigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
g_k_mixture_model->attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
- assert(convert_document_to_hash(stdin));
- assert(train_document());
+ while ( i < argc ){
+ const char * filename = argv[i];
+ FILE * document = fopen(filename, "r");
+ if ( NULL == document ){
+ int err_saved = errno;
+ fprintf(stderr, "can't open file: %s.\n", filename);
+ fprintf(stderr, "error:%s.\n", strerror(err_saved));
+ exit(err_saved);
+ }
+
+ g_hash_of_document = g_hash_table_new_full
+ (g_int_hash, g_int_equal, NULL,
+ (GDestroyNotify)g_hash_table_unref);
+
+ assert(convert_document_to_hash(document));
+ fclose(document);
+
+ /* train the document, and convert it to k mixture model. */
+ g_hash_table_foreach(g_hash_of_document,
+ train_single_gram_wrapper, NULL);
+
+ g_hash_table_unref(g_hash_of_document);
+ g_hash_of_document = NULL;
+
+ ++i;
+ }
- g_hash_table_unref(g_hash_of_document);
delete g_phrases;
delete g_k_mixture_model;
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp
index 8845648..64c7f65 100644
--- a/utils/training/prune_k_mixture_model.cpp
+++ b/utils/training/prune_k_mixture_model.cpp
@@ -101,6 +101,7 @@ int main(int argc, char * argv[]){
} else {
bigram_filename = argv[i];
}
+ ++i;
}
/* TODO: magic header signature check here. */