diff options
author | Peng Wu <alexepico@gmail.com> | 2011-07-27 23:23:57 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-07-27 23:25:51 +0800 |
commit | 6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab (patch) | |
tree | 3defac378ae9be83a4c7e0783e1931c32a7eb3e0 /utils | |
parent | a4fddd81d5ba9638ded837fd3006b99184a9b6b4 (diff) | |
download | libpinyin-6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab.tar.gz libpinyin-6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab.tar.xz libpinyin-6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab.zip |
improve prune k mixture model
Diffstat (limited to 'utils')
-rw-r--r-- | utils/training/k_mixture_model.h | 1 | ||||
-rw-r--r-- | utils/training/prune_k_mixture_model.cpp | 20 |
2 files changed, 20 insertions, 1 deletions
```diff
diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h
index 50218c2..2d9816b 100644
--- a/utils/training/k_mixture_model.h
+++ b/utils/training/k_mixture_model.h
@@ -54,6 +54,7 @@ static inline parameter_t compute_B(corpus_count_t N,
                                     corpus_count_t T,
                                     corpus_count_t n_0,
                                     corpus_count_t n_1){
+    /* Note: re-check this, to see if we can remove if statement. */
     /* Please consider B_2 is no less than 2 in paper. */
     if ( 0 == T - n_1 && 0 == N - n_0 - n_1 )
         return 2;
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp
index 8ae6992..f4bad65 100644
--- a/utils/training/prune_k_mixture_model.cpp
+++ b/utils/training/prune_k_mixture_model.cpp
@@ -52,7 +52,16 @@ bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header,
                  item->m_item.m_n_1);
         }

-        assert(remained_poss >= 0);
+        /* wrong remained possibility. */
+        if (remained_poss < 0) {
+            fprintf(stderr, "wrong remained possibility is found.\n");
+            fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n",
+                    g_prune_k, magic_header->m_N, item->m_item.m_WC,
+                    magic_header->m_N - item->m_item.m_N_n_0,
+                    item->m_item.m_n_1);
+            exit(EDOM);
+        }
+
         if ( remained_poss < g_prune_poss ) {
             /* prune this word or phrase. */
             KMixtureModelArrayItem removed_item;
@@ -114,7 +123,14 @@ int main(int argc, char * argv[]){
     GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
     bigram.get_all_items(items);

+    /* print prune progress */
+    size_t progress = 0; size_t onestep = items->len / 20;
     for ( size_t i = 0; i < items->len; ++i ){
+        if ( progress >= onestep ) {
+            progress = 0; printf("*");
+        }
+        progress ++;
+
         phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
         KMixtureModelSingleGram * single_gram = NULL;
         bigram.load(*token, single_gram);
@@ -142,6 +158,8 @@ int main(int argc, char * argv[]){
         removed_array = NULL;
     }

+    printf("\n");
+
     bigram.set_magic_header(magic_header);

     /* post processing clean up zero items */
```