remove some global varibles in gen k mixture model

author: Peng Wu <alexepico@gmail.com> 2011-05-24 11:08:02 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-05-24 11:08:02 +0800
commit: 47089bf0d8767dcca81fd6ca5f680bb651b356ec (patch)
tree: d40bd387756624e7f544ae7e4e784a1d9de7a7f5 /utils
parent: 674d1bf7611715babce989d150348e7085eaec6b (diff)
download: libpinyin-47089bf0d8767dcca81fd6ca5f680bb651b356ec.tar.gz
libpinyin-47089bf0d8767dcca81fd6ca5f680bb651b356ec.tar.xz
libpinyin-47089bf0d8767dcca81fd6ca5f680bb651b356ec.zip
1 files changed, 23 insertions, 26 deletions
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
index 9821961..0d7204f 100644
--- a/utils/training/gen_k_mixture_model.cpp
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -25,13 +25,10 @@
 #include <locale.h>
 #include "k_mixture_model.h"
 
-typedef GHashTable * HashofWordPair;
+typedef GHashTable * HashofDocument;
 typedef GHashTable * HashofSecondWord;
 
 /* Hash token of Hash token of word count. */
-static HashofWordPair g_hash_of_document = NULL;
-static PhraseLargeTable * g_phrases = NULL;
-static KMixtureModelBigram * g_k_mixture_model = NULL;
 static guint32 g_maximum_occurs = 20;
 static parameter_t g_maximum_increase_rates = 3.;
 static bool g_train_pi_gram = true;
@@ -46,7 +43,8 @@ void print_help(){
 }
 
 
-bool read_document(FILE * document){
+bool read_document(PhraseLargeTable * phrases, FILE * document,
+                   HashofDocument hash_of_document){
     char * linebuf = NULL;
     size_t size = 0;
     phrase_token_t last_token, cur_token = last_token = 0;
@@ -64,7 +62,7 @@ bool read_document(FILE * document){
             continue;
 
         phrase_token_t token = 0;
-        int search_result = g_phrases->search( phrase_len, phrase, token );
+        int search_result = phrases->search( phrase_len, phrase, token );
         if ( ! (search_result & SEARCH_OK) )
             token = 0;
 
@@ -89,7 +87,7 @@ bool read_document(FILE * document){
         gpointer value = NULL;
         HashofSecondWord hash_of_second_word = NULL;
         gboolean lookup_result = g_hash_table_lookup_extended
-            (g_hash_of_document, GUINT_TO_POINTER(last_token),
+            (hash_of_document, GUINT_TO_POINTER(last_token),
              NULL, &value);
         if ( !lookup_result ){
             hash_of_second_word = g_hash_table_new
@@ -110,7 +108,7 @@ bool read_document(FILE * document){
         g_hash_table_insert(hash_of_second_word,
                             GUINT_TO_POINTER(cur_token),
                             GUINT_TO_POINTER(count));
-        g_hash_table_insert(g_hash_of_document,
+        g_hash_table_insert(hash_of_document,
                             GUINT_TO_POINTER(last_token),
                             hash_of_second_word);
     }
@@ -164,8 +162,9 @@ static void train_word_pair(KMixtureModelSingleGram * single_gram,
     single_gram->set_array_header(array_header);
 }
 
-bool train_single_gram(phrase_token_t token,
+bool train_single_gram(HashofDocument hash_of_document,
                        KMixtureModelSingleGram * single_gram,
+                       phrase_token_t token,
                        guint32 & delta){
     assert(NULL != single_gram);
     delta = 0; /* delta in WC of single_gram. */
@@ -176,7 +175,7 @@ bool train_single_gram(phrase_token_t token,
     HashofSecondWord hash_of_second_word = NULL;
     gpointer key, value = NULL;
     assert(g_hash_table_lookup_extended
-           (g_hash_of_document, GUINT_TO_POINTER(token),
+           (hash_of_document, GUINT_TO_POINTER(token),
             NULL, &value));
     hash_of_second_word = (HashofSecondWord) value;
     assert(NULL != hash_of_second_word);
@@ -195,7 +194,8 @@ bool train_single_gram(phrase_token_t token,
     return true;
 }
 
-static bool train_second_word(KMixtureModelBigram * bigram, 
+static bool train_second_word(HashofDocument hash_of_document,
+                              KMixtureModelBigram * bigram,
                               phrase_token_t token){
     guint32 delta = 0;
 
@@ -203,7 +203,7 @@ static bool train_second_word(KMixtureModelBigram * bigram,
     bool exists = bigram->load(token, single_gram);
     if ( !exists )
         single_gram = new KMixtureModelSingleGram;
-    train_single_gram(token, single_gram, delta);
+    train_single_gram(hash_of_document, single_gram, token, delta);
 
     KMixtureModelMagicHeader magic_header;
     if (!bigram->get_magic_header(magic_header)){
@@ -260,13 +260,13 @@ int main(int argc, char * argv[]){
         ++i;
     }
 
-    g_phrases = new PhraseLargeTable;
+    PhraseLargeTable phrases;
     MemoryChunk * chunk = new MemoryChunk;
     chunk->load("../../data/phrase_index.bin");
-    g_phrases->load(chunk);
+    phrases.load(chunk);
 
-    g_k_mixture_model = new KMixtureModelBigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
-    g_k_mixture_model->attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
 
     while ( i < argc ){
         const char * filename = argv[i];
@@ -278,37 +278,34 @@ int main(int argc, char * argv[]){
             exit(err_saved);
         }
 
-        g_hash_of_document = g_hash_table_new
+        HashofDocument hash_of_document = g_hash_table_new
             (g_direct_hash, g_direct_equal);
 
-        assert(read_document(document));
+        assert(read_document(&phrases, document, hash_of_document));
         fclose(document);
 
         GHashTableIter iter;
         gpointer key, value;
 
         /* train the document, and convert it to k mixture model. */
-        g_hash_table_iter_init(&iter, g_hash_of_document);
+        g_hash_table_iter_init(&iter, hash_of_document);
         while (g_hash_table_iter_next(&iter, &key, &value)) {
             phrase_token_t token = GPOINTER_TO_UINT(key);
-            train_second_word(g_k_mixture_model, token);
+            train_second_word(hash_of_document, &bigram, token);
         }
 
         /* free resources of g_hash_of_document */
-        g_hash_table_iter_init(&iter, g_hash_of_document);
+        g_hash_table_iter_init(&iter, hash_of_document);
         while (g_hash_table_iter_next(&iter, &key, &value)) {
             HashofSecondWord second_word = (HashofSecondWord) value;
             g_hash_table_iter_steal(&iter);
             g_hash_table_unref(second_word);
         }
-        g_hash_table_unref(g_hash_of_document);
-        g_hash_of_document = NULL;
+        g_hash_table_unref(hash_of_document);
+        hash_of_document = NULL;
 
         ++i;
     }
 
-    delete g_phrases;
-    delete g_k_mixture_model;
-
     return 0;
 }
author	Peng Wu <alexepico@gmail.com>	2011-05-24 11:08:02 +0800
committer	Peng Wu <alexepico@gmail.com>	2011-05-24 11:08:02 +0800
commit	47089bf0d8767dcca81fd6ca5f680bb651b356ec (patch)
tree	d40bd387756624e7f544ae7e4e784a1d9de7a7f5 /utils
parent	674d1bf7611715babce989d150348e7085eaec6b (diff)
download	libpinyin-47089bf0d8767dcca81fd6ca5f680bb651b356ec.tar.gz libpinyin-47089bf0d8767dcca81fd6ca5f680bb651b356ec.tar.xz libpinyin-47089bf0d8767dcca81fd6ca5f680bb651b356ec.zip