summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-07-27 23:23:57 +0800
committer Peng Wu <alexepico@gmail.com> 2011-07-27 23:25:51 +0800
commit 6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab (patch)
tree 3defac378ae9be83a4c7e0783e1931c32a7eb3e0
parent a4fddd81d5ba9638ded837fd3006b99184a9b6b4 (diff)
download libpinyin-6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab.tar.gz
libpinyin-6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab.tar.xz
libpinyin-6a7d4e5f6f8d629fbc36325fd816fde66e5d38ab.zip
improve prune k mixture model
-rw-r--r-- utils/training/k_mixture_model.h 1
-rw-r--r-- utils/training/prune_k_mixture_model.cpp 20
2 files changed, 20 insertions, 1 deletion
diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h
index 50218c2..2d9816b 100644
--- a/utils/training/k_mixture_model.h
+++ b/utils/training/k_mixture_model.h
@@ -54,6 +54,7 @@ static inline parameter_t compute_B(corpus_count_t N,
corpus_count_t T,
corpus_count_t n_0,
corpus_count_t n_1){
+ /* Note: re-check this, to see if we can remove if statement. */
/* Please consider B_2 is no less than 2 in paper. */
if ( 0 == T - n_1 && 0 == N - n_0 - n_1 )
return 2;
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp
index 8ae6992..f4bad65 100644
--- a/utils/training/prune_k_mixture_model.cpp
+++ b/utils/training/prune_k_mixture_model.cpp
@@ -52,7 +52,16 @@ bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header,
item->m_item.m_n_1);
}
- assert(remained_poss >= 0);
+ /* wrong remained possibility. */
+ if (remained_poss < 0) {
+ fprintf(stderr, "wrong remained possibility is found.\n");
+ fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n",
+ g_prune_k, magic_header->m_N, item->m_item.m_WC,
+ magic_header->m_N - item->m_item.m_N_n_0,
+ item->m_item.m_n_1);
+ exit(EDOM);
+ }
+
if ( remained_poss < g_prune_poss ) {
/* prune this word or phrase. */
KMixtureModelArrayItem removed_item;
@@ -114,7 +123,14 @@ int main(int argc, char * argv[]){
GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
bigram.get_all_items(items);
+ /* print prune progress */
+ size_t progress = 0; size_t onestep = items->len / 20;
for ( size_t i = 0; i < items->len; ++i ){
+ if ( progress >= onestep ) {
+ progress = 0; printf("*");
+ }
+ progress ++;
+
phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
KMixtureModelSingleGram * single_gram = NULL;
bigram.load(*token, single_gram);
@@ -142,6 +158,8 @@ int main(int argc, char * argv[]){
removed_array = NULL;
}
+ printf("\n");
+
bigram.set_magic_header(magic_header);
/* post processing clean up zero items */