diff options
-rw-r--r-- | utils/training/prune_k_mixture_model.cpp | 105 |
1 files changed, 105 insertions, 0 deletions
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp index f6f0bcc..7a724a9 100644 --- a/utils/training/prune_k_mixture_model.cpp +++ b/utils/training/prune_k_mixture_model.cpp @@ -1,9 +1,114 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + + +#include <errno.h> +#include <locale.h> #include "pinyin.h" #include "k_mixture_model.h" static guint32 g_prune_k = 3; static parameter_t g_prune_poss = 0.99; +void print_help(){ + printf("prune_k_mixture_model <FILENAME>\n"); +} + +bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header, + KMixtureModelSingleGram * & bigram){ + bool success; + + FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + bigram->retrieve_all(array); + for ( size_t i = 0; i < array->len; ++i) { + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i); + phrase_token_t token = item->m_token; + parameter_t remained_poss = 1; + for ( size_t k = 0; k < g_prune_k; ++k){ + remained_poss -= compute_Pr_G_3_with_count + (k, magic_header->m_N, item->m_item.m_WC, + magic_header->m_N - item->m_item.m_N_n_0, + item->m_item.m_n_1); + } + + assert(remained_poss >= 0); + if ( remained_poss < g_prune_poss ) { + /* prune this word or phrase. */ + KMixtureModelArrayItem removed_item; + bigram->remove_array_item(token, removed_item); + assert( memcmp(&removed_item, &(item->m_item), + sizeof(KMixtureModelArrayItem)) == 0 ); + KMixtureModelArrayHeader header; + bigram->get_array_header(header); + guint32 removed_count = removed_item.m_WC; + header.m_WC -= removed_count; + bigram->set_array_header(header); + magic_header->m_WC -= removed_count; + } + } + + KMixtureModelArrayHeader header; + bigram->get_array_header(header); + + if ( 0 == header.m_WC ){ + delete bigram; + bigram = NULL; + } + + return true; +} + int main(int argc, char * argv[]){ + const char * bigram_filename = NULL; + + setlocale(LC_ALL, ""); + if ( 2 != argc ){ + print_help(); + exit(EINVAL); + } else { + bigram_filename = argv[1]; + } + + /* TODO: magic header signature check here. */ + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(bigram_filename); + + KMixtureModelMagicHeader magic_header; + bigram.get_magic_header(magic_header); + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); + + for ( size_t i; i < items->len; ++i ){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + prune_k_mixture_model(&magic_header, single_gram); + + if ( NULL == single_gram ) + bigram.remove(*token); + else bigram.store(*token, single_gram); + } + + bigram.set_magic_header(magic_header); return 0; } |