summary refs log tree commit diff stats
path: root/utils
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-23 16:23:25 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-23 16:29:22 +0800
commit 77e4f599b2e7123887660ca13ba7d7929bd04407 (patch)
tree 16c8099eff583650aa0983924c5b02d9f1f730c1 /utils
parent 2edbe14f3e66f9b2b25cd3e1afea23f602c16f41 (diff)
download libpinyin-77e4f599b2e7123887660ca13ba7d7929bd04407.tar.gz
libpinyin-77e4f599b2e7123887660ca13ba7d7929bd04407.tar.xz
libpinyin-77e4f599b2e7123887660ca13ba7d7929bd04407.zip
fixes gen k mixture model
Diffstat (limited to 'utils')
-rw-r--r-- utils/training/gen_k_mixture_model.cpp 27
1 files changed, 22 insertions, 5 deletions
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
index e9fb51e..608cde4 100644
--- a/utils/training/gen_k_mixture_model.cpp
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -82,6 +82,7 @@ bool read_document(FILE * document){
if ( null_token == last_token ){
if ( !g_train_pi_gram )
continue;
+ last_token = sentence_start;
}
/* remember the (last_token, cur_token) word pair. */
@@ -91,7 +92,8 @@ bool read_document(FILE * document){
(g_hash_of_document, GUINT_TO_POINTER(last_token),
NULL, &value);
if ( !lookup_result ){
- hash_of_second_word = g_hash_table_new(g_int_hash, g_int_equal);
+ hash_of_second_word = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
} else {
hash_of_second_word = (HashofSecondWord) value;
}
@@ -144,6 +146,7 @@ static void train_word_pair(gpointer key, gpointer value,
array_item.m_n_1 ++;
array_item.m_Mr = std_lite::max(array_item.m_Mr, count);
delta = count;
+ assert(single_gram->set_array_item(token, array_item));
} else { /* item doesn't exist. */
/* the same as above. */
if ( count > g_maximum_occurs )
@@ -156,6 +159,7 @@ static void train_word_pair(gpointer key, gpointer value,
array_item.m_n_1 = 1;
array_item.m_Mr = count;
delta = count;
+ assert(single_gram->insert_array_item(token, array_item));
}
/* save delta in the array header. */
KMixtureModelArrayHeader array_header;
@@ -199,7 +203,11 @@ static void hash_of_document_train_wrapper(gpointer key, gpointer value, gpointe
train_single_gram(token, single_gram, delta);
KMixtureModelMagicHeader magic_header;
- assert(g_k_mixture_model->get_magic_header(magic_header));
+ if (!g_k_mixture_model->get_magic_header(magic_header)){
+ /* the first time to access the new k mixture model file. */
+ memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader));
+ }
+
if ( magic_header.m_WC + delta < magic_header.m_WC ){
fprintf(stderr, "the m_WC integer in magic header overflows.\n");
return;
@@ -213,6 +221,13 @@ static void hash_of_document_train_wrapper(gpointer key, gpointer value, gpointe
delete single_gram;
}
+static gboolean hash_of_document_free_wrapper(gpointer key, gpointer value, gpointer user_data){
+ phrase_token_t token = GPOINTER_TO_UINT(key);
+ HashofSecondWord second_word = (HashofSecondWord) value;
+ g_hash_table_unref(second_word);
+ return TRUE;
+}
+
int main(int argc, char * argv[]){
int i = 1;
const char * k_mixture_model_filename = NULL;
@@ -266,9 +281,8 @@ int main(int argc, char * argv[]){
exit(err_saved);
}
- g_hash_of_document = g_hash_table_new_full
- (g_int_hash, g_int_equal, NULL,
- (GDestroyNotify)g_hash_table_unref);
+ g_hash_of_document = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
assert(read_document(document));
fclose(document);
@@ -277,6 +291,9 @@ int main(int argc, char * argv[]){
g_hash_table_foreach(g_hash_of_document,
hash_of_document_train_wrapper, NULL);
+ /* free resources of g_hash_of_document */
+ g_hash_table_foreach_steal(g_hash_of_document,
+ hash_of_document_free_wrapper, NULL);
g_hash_table_unref(g_hash_of_document);
g_hash_of_document = NULL;