From 6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Wed, 27 Jul 2011 23:23:57 +0800 Subject: improve prune k mixture model --- utils/training/k_mixture_model.h | 1 + utils/training/prune_k_mixture_model.cpp | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h index 50218c2..2d9816b 100644 --- a/utils/training/k_mixture_model.h +++ b/utils/training/k_mixture_model.h @@ -54,6 +54,7 @@ static inline parameter_t compute_B(corpus_count_t N, corpus_count_t T, corpus_count_t n_0, corpus_count_t n_1){ + /* Note: re-check this, to see if we can remove if statement. */ /* Please consider B_2 is no less than 2 in paper. */ if ( 0 == T - n_1 && 0 == N - n_0 - n_1 ) return 2; diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp index 8ae6992..f4bad65 100644 --- a/utils/training/prune_k_mixture_model.cpp +++ b/utils/training/prune_k_mixture_model.cpp @@ -52,7 +52,16 @@ bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header, item->m_item.m_n_1); } - assert(remained_poss >= 0); + /* wrong remained possibility. */ + if (remained_poss < 0) { + fprintf(stderr, "wrong remained possibility is found.\n"); + fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n", + g_prune_k, magic_header->m_N, item->m_item.m_WC, + magic_header->m_N - item->m_item.m_N_n_0, + item->m_item.m_n_1); + exit(EDOM); + } + if ( remained_poss < g_prune_poss ) { /* prune this word or phrase. */ KMixtureModelArrayItem removed_item; @@ -114,7 +123,14 @@ int main(int argc, char * argv[]){ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); bigram.get_all_items(items); + /* print prune progress */ + size_t progress = 0; size_t onestep = items->len / 20; for ( size_t i = 0; i < items->len; ++i ){ + if ( progress >= onestep ) { + progress = 0; printf("*"); + } + progress ++; + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); KMixtureModelSingleGram * single_gram = NULL; bigram.load(*token, single_gram); @@ -142,6 +158,8 @@ int main(int argc, char * argv[]){ removed_array = NULL; } + printf("\n"); + bigram.set_magic_header(magic_header); /* post processing clean up zero items */ -- cgit