summary | refs | log | tree | commit | diff | stats
path: root/utils/training/gen_k_mixture_model.cpp
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-10 14:03:31 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-10 14:05:09 +0800
commit 8f0f9f33716e9c81a74f2ff8bb77eb32ec8cd41a (patch)
tree d9d64591ea81368c5a3dc10c43a36f17ee1cd66c /utils/training/gen_k_mixture_model.cpp
parent c3ab43532529b45048a7a711e94d0a37f102d603 (diff)
download libpinyin-8f0f9f33716e9c81a74f2ff8bb77eb32ec8cd41a.tar.gz
libpinyin-8f0f9f33716e9c81a74f2ff8bb77eb32ec8cd41a.tar.xz
libpinyin-8f0f9f33716e9c81a74f2ff8bb77eb32ec8cd41a.zip
write main for gen k mixture model
Diffstat (limited to 'utils/training/gen_k_mixture_model.cpp')
-rw-r--r-- utils/training/gen_k_mixture_model.cpp 26
1 files changed, 21 insertions, 5 deletions
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
index ba0936f..3d5d8c4 100644
--- a/utils/training/gen_k_mixture_model.cpp
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -31,7 +31,7 @@ typedef GHashTable * HashofSecondWord;
/* Hash token of Hash token of word count. */
static HashofWordPair g_hash_of_document = NULL;
static PhraseLargeTable * g_phrases = NULL;
-static KMixtureModelBigram * g_bigram = NULL;
+static KMixtureModelBigram * g_k_mixture_model = NULL;
static guint32 g_maximum_occurs = 20;
static parameter_t g_maximum_increase_rates = 3.;
@@ -179,7 +179,7 @@ static void train_single_gram_wrapper(gpointer key, gpointer value,
guint32 delta = 0;
KMixtureModelSingleGram * single_gram = NULL;
- bool exists = g_bigram->load(token, single_gram);
+ bool exists = g_k_mixture_model->load(token, single_gram);
if ( exists ){
train_single_gram(token, single_gram, delta);
} else { /* item doesn't exist. */
@@ -188,17 +188,17 @@ static void train_single_gram_wrapper(gpointer key, gpointer value,
}
KMixtureModelMagicHeader magic_header;
- assert(g_bigram->get_magic_header(magic_header));
+ assert(g_k_mixture_model->get_magic_header(magic_header));
if ( magic_header.m_WC + delta < magic_header.m_WC ){
fprintf(stderr, "the m_WC integer in magic header overflows.\n");
return;
}
magic_header.m_WC += delta;
magic_header.m_N ++;
- assert(g_bigram->set_magic_header(magic_header));
+ assert(g_k_mixture_model->set_magic_header(magic_header));
/* save the single gram. */
- assert(g_bigram->store(token, single_gram));
+ assert(g_k_mixture_model->store(token, single_gram));
delete single_gram;
}
@@ -208,9 +208,25 @@ bool train_document(){
}
int main(int argc, char * argv[]){
+ const char * k_mixture_model_filename = NULL;
+
g_hash_of_document = g_hash_table_new_full
(g_int_hash, g_int_equal, NULL, (GDestroyNotify)g_hash_table_unref);
+ g_phrases = new PhraseLargeTable;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load("../../data/phrase_index.bin");
+ g_phrases->load(chunk);
+
+ g_k_mixture_model = new KMixtureModelBigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ g_k_mixture_model->attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+ assert(convert_document_to_hash(stdin));
+ assert(train_document());
+
+ g_hash_table_unref(g_hash_of_document);
+ delete g_phrases;
+ delete g_k_mixture_model;
return 0;
}