summary | refs | log | tree | commit | diff | stats
path: root/utils/training/gen_k_mixture_model.cpp
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-12 10:19:36 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-12 11:48:40 +0800
commit f110c5501299adb2e809a678001257f455407e3b (patch)
tree 248bdf1b1be650c503f7b71d2612dac7642e0859 /utils/training/gen_k_mixture_model.cpp
parent 9ebfcbba3dc5d069dd8bf89fcb69b2d388aa3289 (diff)
download: libpinyin-f110c5501299adb2e809a678001257f455407e3b.tar.gz
libpinyin-f110c5501299adb2e809a678001257f455407e3b.tar.xz
libpinyin-f110c5501299adb2e809a678001257f455407e3b.zip
refine gen k mixture model
Diffstat (limited to 'utils/training/gen_k_mixture_model.cpp')
-rw-r--r-- utils/training/gen_k_mixture_model.cpp 29
1 files changed, 19 insertions, 10 deletions
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
index 8fc6946..a2731e9 100644
--- a/utils/training/gen_k_mixture_model.cpp
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -46,7 +46,7 @@ void print_help(){
}
-bool convert_document_to_hash(FILE * document){
+bool read_document(FILE * document){
char * linebuf = NULL;
size_t size = 0;
phrase_token_t last_token, cur_token = last_token = 0;
@@ -68,9 +68,22 @@ bool convert_document_to_hash(FILE * document){
if ( ! (search_result & SEARCH_OK) )
token = 0;
+ g_free(phrase);
+ phrase = NULL;
+
last_token = cur_token;
cur_token = token;
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !g_train_pi_gram )
+ continue;
+ }
+
/* remember the (last_token, cur_token) word pair. */
gpointer value = NULL;
HashofSecondWord hash_of_second_word = NULL;
@@ -100,6 +113,8 @@ bool convert_document_to_hash(FILE * document){
hash_of_second_word);
}
+ free(linebuf);
+
return true;
}
@@ -173,16 +188,10 @@ bool train_single_gram(phrase_token_t token,
return true;
}
-static void train_single_gram_wrapper(gpointer key, gpointer value,
- gpointer user_data){
+static void hash_of_document_train_wrapper(gpointer key, gpointer value, gpointer user_data){
phrase_token_t token = GPOINTER_TO_UINT(key);
guint32 delta = 0;
- if ( null_token == token ){
- if ( !g_train_pi_gram )
- return;
- }
-
KMixtureModelSingleGram * single_gram = NULL;
bool exists = g_k_mixture_model->load(token, single_gram);
if ( !exists )
@@ -261,12 +270,12 @@ int main(int argc, char * argv[]){
(g_int_hash, g_int_equal, NULL,
(GDestroyNotify)g_hash_table_unref);
- assert(convert_document_to_hash(document));
+ assert(read_document(document));
fclose(document);
/* train the document, and convert it to k mixture model. */
g_hash_table_foreach(g_hash_of_document,
- train_single_gram_wrapper, NULL);
+ hash_of_document_train_wrapper, NULL);
g_hash_table_unref(g_hash_of_document);
g_hash_of_document = NULL;