diff options
author | Peng Wu <alexepico@gmail.com> | 2011-05-12 10:19:36 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-05-12 11:48:40 +0800 |
commit | f110c5501299adb2e809a678001257f455407e3b (patch) | |
tree | 248bdf1b1be650c503f7b71d2612dac7642e0859 /utils/training | |
parent | 9ebfcbba3dc5d069dd8bf89fcb69b2d388aa3289 (diff) | |
download | libpinyin-f110c5501299adb2e809a678001257f455407e3b.tar.gz libpinyin-f110c5501299adb2e809a678001257f455407e3b.tar.xz libpinyin-f110c5501299adb2e809a678001257f455407e3b.zip |
refine gen k mixture model
Diffstat (limited to 'utils/training')
-rw-r--r-- | utils/training/gen_k_mixture_model.cpp | 29 |
1 files changed, 19 insertions, 10 deletions
```diff
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
index 8fc6946..a2731e9 100644
--- a/utils/training/gen_k_mixture_model.cpp
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -46,7 +46,7 @@ void print_help(){
 }
 
 
-bool convert_document_to_hash(FILE * document){
+bool read_document(FILE * document){
     char * linebuf = NULL;
     size_t size = 0;
     phrase_token_t last_token, cur_token = last_token = 0;
@@ -68,9 +68,22 @@ bool convert_document_to_hash(FILE * document){
         if ( ! (search_result & SEARCH_OK) )
             token = 0;
 
+        g_free(phrase);
+        phrase = NULL;
+
         last_token = cur_token;
         cur_token = token;
 
+        /* skip null_token in second word. */
+        if ( null_token == cur_token )
+            continue;
+
+        /* skip pi-gram training. */
+        if ( null_token == last_token ){
+            if ( !g_train_pi_gram )
+                continue;
+        }
+
         /* remember the (last_token, cur_token) word pair. */
         gpointer value = NULL;
         HashofSecondWord hash_of_second_word = NULL;
@@ -100,6 +113,8 @@ bool convert_document_to_hash(FILE * document){
                             hash_of_second_word);
     }
 
+    free(linebuf);
+
     return true;
 }
 
@@ -173,16 +188,10 @@ bool train_single_gram(phrase_token_t token,
     return true;
 }
 
-static void train_single_gram_wrapper(gpointer key, gpointer value,
-                                      gpointer user_data){
+static void hash_of_document_train_wrapper(gpointer key, gpointer value, gpointer user_data){
     phrase_token_t token = GPOINTER_TO_UINT(key);
     guint32 delta = 0;
 
-    if ( null_token == token ){
-        if ( !g_train_pi_gram )
-            return;
-    }
-
     KMixtureModelSingleGram * single_gram = NULL;
     bool exists = g_k_mixture_model->load(token, single_gram);
     if ( !exists )
@@ -261,12 +270,12 @@ int main(int argc, char * argv[]){
         (g_int_hash, g_int_equal, NULL,
          (GDestroyNotify)g_hash_table_unref);
 
-    assert(convert_document_to_hash(document));
+    assert(read_document(document));
     fclose(document);
 
     /* train the document, and convert it to k mixture model. */
     g_hash_table_foreach(g_hash_of_document,
-                         train_single_gram_wrapper, NULL);
+                         hash_of_document_train_wrapper, NULL);
 
     g_hash_table_unref(g_hash_of_document);
     g_hash_of_document = NULL;
```