diff options
Diffstat (limited to 'utils/training/prune_k_mixture_model.cpp')
-rw-r--r-- | utils/training/prune_k_mixture_model.cpp | 192 |
1 files changed, 192 insertions, 0 deletions
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp new file mode 100644 index 0000000..40dfb87 --- /dev/null +++ b/utils/training/prune_k_mixture_model.cpp @@ -0,0 +1,192 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + + +#include <errno.h> +#include <locale.h> +#include <limits.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" + + +void print_help(){ + printf("Usage: prune_k_mixture_model -k <INT> --CDF <DOUBLE> <FILENAME>\n"); +} + +static gint g_prune_k = 3; +static parameter_t g_prune_poss = 0.99; + +static GOptionEntry entries[] = +{ + {"pruneK", 'k', 0, G_OPTION_ARG_INT, &g_prune_k, "k parameter", NULL}, + {"CDF", 0, 0, G_OPTION_ARG_DOUBLE, &g_prune_poss, "CDF parameter", NULL}, + {NULL} +}; + + +bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header, + KMixtureModelSingleGram * & bigram, + FlexibleBigramPhraseArray removed_array){ + bool success; + + FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + bigram->retrieve_all(array); + + for ( size_t i = 0; i < array->len; ++i) { + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i); + phrase_token_t token = item->m_token; + parameter_t remained_poss = 1; parameter_t one_poss = 0; + bool errors = false; + for ( size_t k = 0; k < g_prune_k; ++k){ + one_poss = compute_Pr_G_3_with_count + (k, magic_header->m_N, item->m_item.m_WC, + magic_header->m_N - item->m_item.m_N_n_0, + item->m_item.m_n_1); + if ( !(0 <= one_poss && one_poss <= 1) ) + errors = true; + remained_poss -= one_poss; + } + + if ( fabs(remained_poss) < DBL_EPSILON ) + remained_poss = 0.; + + /* some wrong possibility. */ + if ( errors || !(0 <= remained_poss && remained_poss <= 1) ) { + fprintf(stderr, "some wrong possibility is encountered:%f.\n", + remained_poss); + fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n", + g_prune_k, magic_header->m_N, item->m_item.m_WC, + magic_header->m_N - item->m_item.m_N_n_0, + item->m_item.m_n_1); + exit(EDOM); + } + + if ( remained_poss < g_prune_poss ) { + /* prune this word or phrase. */ + KMixtureModelArrayItem removed_item; + bigram->remove_array_item(token, removed_item); + assert( memcmp(&removed_item, &(item->m_item), + sizeof(KMixtureModelArrayItem)) == 0 ); + + KMixtureModelArrayItemWithToken removed_item_with_token; + removed_item_with_token.m_token = token; + removed_item_with_token.m_item = removed_item; + g_array_append_val(removed_array, removed_item_with_token); + + KMixtureModelArrayHeader array_header; + bigram->get_array_header(array_header); + guint32 removed_count = removed_item.m_WC; + array_header.m_WC -= removed_count; + bigram->set_array_header(array_header); + magic_header->m_WC -= removed_count; + magic_header->m_total_freq -= removed_count; + } + } + + return true; +} + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- prune k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (2 != argc) { + fprintf(stderr, "wrong arguments.\n"); + exit(EINVAL); + } + + const gchar * bigram_filename = argv[1]; + + /* TODO: magic header signature check here. */ + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(bigram_filename, ATTACH_READWRITE); + + KMixtureModelMagicHeader magic_header; + if (!bigram.get_magic_header(magic_header)) { + fprintf(stderr, "no magic header in k mixture model.\n"); + exit(ENODATA); + } + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); + + /* print prune progress */ + size_t progress = 0; size_t onestep = items->len / 20; + for ( size_t i = 0; i < items->len; ++i ){ + if ( progress >= onestep ) { + progress = 0; fprintf(stderr, "*"); + } + progress ++; + + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + FlexibleBigramPhraseArray removed_array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + + prune_k_mixture_model(&magic_header, single_gram, removed_array); + bigram.store(*token, single_gram); + + delete single_gram; + + /* post processing for unigram reduce */ + for (size_t m = 0; m < removed_array->len; ++m ){ + KMixtureModelArrayItemWithToken * item = + &g_array_index(removed_array, + KMixtureModelArrayItemWithToken, m); + KMixtureModelArrayHeader array_header; + assert(bigram.get_array_header(item->m_token, array_header)); + array_header.m_freq -= item->m_item.m_WC; + assert(array_header.m_freq >= 0); + assert(bigram.set_array_header(item->m_token, array_header)); + } + + g_array_free(removed_array, TRUE); + removed_array = NULL; + } + + fprintf(stderr, "\n"); + + bigram.set_magic_header(magic_header); + + /* post processing clean up zero items */ + KMixtureModelArrayHeader array_header; + for ( size_t i = 0; i < items->len; ++i ){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + assert(bigram.get_array_header(*token, array_header)); + if ( 0 == array_header.m_WC && 0 == array_header.m_freq ) + assert(bigram.remove(*token)); + } + + g_array_free(items, TRUE); + + return 0; +} |