diff options
Diffstat (limited to 'utils/training/merge_k_mixture_model.cpp')
-rw-r--r-- | utils/training/merge_k_mixture_model.cpp | 239 |
1 files changed, 239 insertions, 0 deletions
diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp new file mode 100644 index 0000000..ab08010 --- /dev/null +++ b/utils/training/merge_k_mixture_model.cpp @@ -0,0 +1,239 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <locale.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" + +void print_help(){ + printf("Usage: merge_k_mixture_model [--result-file <RESULT_FILENAME>]\n"); + printf(" {<SOURCE_FILENAME>}+\n"); +} + +static const gchar * result_filename = NULL; + +static GOptionEntry entries[] = +{ + {"result-file", 0, 0, G_OPTION_ARG_FILENAME, &result_filename, "merged result file", NULL}, + {NULL} +}; + +static bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first, + /* in */ FlexibleBigramPhraseArray second, + /* out */ FlexibleBigramPhraseArray & merged ){ + /* avoid to do empty merge. */ + assert( NULL != first && NULL != second && NULL != merged ); + + /* merge two arrays. */ + guint first_index, second_index = first_index = 0; + KMixtureModelArrayItemWithToken * first_item, + * second_item = first_item = NULL; + while ( first_index < first->len && second_index < second->len ){ + first_item = &g_array_index(first, KMixtureModelArrayItemWithToken, + first_index); + second_item = &g_array_index(second, KMixtureModelArrayItemWithToken, + second_index); + if ( first_item->m_token > second_item->m_token ) { + g_array_append_val(merged, *second_item); + second_index ++; + } else if ( first_item->m_token < second_item->m_token ) { + g_array_append_val(merged, *first_item); + first_index ++; + } else /* first_item->m_token == second_item->m_token */ { + KMixtureModelArrayItemWithToken merged_item; + memset(&merged_item, 0, sizeof(KMixtureModelArrayItemWithToken)); + merged_item.m_token = first_item->m_token;/* same as second_item */ + merged_item.m_item.m_WC = first_item->m_item.m_WC + + second_item->m_item.m_WC; + /* merged_item.m_item.m_T = first_item->m_item.m_T + + second_item->m_item.m_T; */ + merged_item.m_item.m_N_n_0 = first_item->m_item.m_N_n_0 + + second_item->m_item.m_N_n_0; + merged_item.m_item.m_n_1 = first_item->m_item.m_n_1 + + second_item->m_item.m_n_1; + merged_item.m_item.m_Mr = std_lite::max(first_item->m_item.m_Mr, + second_item->m_item.m_Mr); + g_array_append_val(merged, merged_item); + first_index ++; second_index ++; + } + } + + /* add remained items. */ + while ( first_index < first->len ){ + first_item = &g_array_index(first, KMixtureModelArrayItemWithToken, + first_index); + g_array_append_val(merged, *first_item); + first_index++; + } + + while ( second_index < second->len ){ + second_item = &g_array_index(second, KMixtureModelArrayItemWithToken, + second_index); + g_array_append_val(merged, *second_item); + second_index++; + } + + return true; +} + +static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one ){ + + KMixtureModelMagicHeader target_magic_header; + KMixtureModelMagicHeader new_magic_header; + KMixtureModelMagicHeader merged_magic_header; + + memset(&merged_magic_header, 0, sizeof(KMixtureModelMagicHeader)); + if (!target->get_magic_header(target_magic_header)) { + memset(&target_magic_header, 0, sizeof(KMixtureModelMagicHeader)); + } + assert(new_one->get_magic_header(new_magic_header)); + if ( target_magic_header.m_WC + new_magic_header.m_WC < + std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){ + fprintf(stderr, "the m_WC integer in magic header overflows.\n"); + return false; + } + if ( target_magic_header.m_total_freq + new_magic_header.m_total_freq < + std_lite::max( target_magic_header.m_total_freq, + new_magic_header.m_total_freq ) ){ + fprintf(stderr, "the m_total_freq in magic header overflows.\n"); + return false; + } + + merged_magic_header.m_WC = target_magic_header.m_WC + + new_magic_header.m_WC; + merged_magic_header.m_N = target_magic_header.m_N + + new_magic_header.m_N; + merged_magic_header.m_total_freq = target_magic_header.m_total_freq + + new_magic_header.m_total_freq; + + assert(target->set_magic_header(merged_magic_header)); + return true; +} + +static bool merge_array_items( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one ){ + + GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + new_one->get_all_items(new_items); + + for ( size_t i = 0; i < new_items->len; ++i ){ + phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i); + KMixtureModelSingleGram * target_single_gram = NULL; + KMixtureModelSingleGram * new_single_gram = NULL; + + assert(new_one->load(*token, new_single_gram)); + bool exists_in_target = target->load(*token, target_single_gram); + if ( !exists_in_target ){ + target->store(*token, new_single_gram); + delete new_single_gram; + continue; + } + + /* word count in array header in parallel with array items */ + KMixtureModelArrayHeader target_array_header; + KMixtureModelArrayHeader new_array_header; + KMixtureModelArrayHeader merged_array_header; + + assert(new_one->get_array_header(*token, new_array_header)); + assert(target->get_array_header(*token, target_array_header)); + memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader)); + + merged_array_header.m_WC = target_array_header.m_WC + + new_array_header.m_WC; + merged_array_header.m_freq = target_array_header.m_freq + + new_array_header.m_freq; + /* end of word count in array header computing. */ + + assert(NULL != target_single_gram); + KMixtureModelSingleGram * merged_single_gram = + new KMixtureModelSingleGram; + + FlexibleBigramPhraseArray target_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + target_single_gram->retrieve_all(target_array); + + FlexibleBigramPhraseArray new_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + new_single_gram->retrieve_all(new_array); + FlexibleBigramPhraseArray merged_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + + assert(merge_two_phrase_array(target_array, new_array, merged_array)); + + g_array_free(target_array, TRUE); + g_array_free(new_array, TRUE); + delete target_single_gram; delete new_single_gram; + + for ( size_t m = 0; m < merged_array->len; ++m ){ + KMixtureModelArrayItemWithToken * item = + &g_array_index(merged_array, + KMixtureModelArrayItemWithToken, m); + merged_single_gram->insert_array_item(item->m_token, item->m_item); + } + + assert(merged_single_gram->set_array_header(merged_array_header)); + assert(target->store(*token, merged_single_gram)); + delete merged_single_gram; + g_array_free(merged_array, TRUE); + } + + g_array_free(new_items, TRUE); + return true; +} + +bool merge_two_k_mixture_model( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one ){ + assert(NULL != target); + assert(NULL != new_one); + return merge_array_items(target, new_one) && + merge_magic_header(target, new_one); +} + +int main(int argc, char * argv[]){ + int i = 1; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- merge k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + KMixtureModelBigram target(K_MIXTURE_MODEL_MAGIC_NUMBER); + target.attach(result_filename, ATTACH_READWRITE|ATTACH_CREATE); + + while (i < argc){ + const char * new_filename = argv[i]; + KMixtureModelBigram new_one(K_MIXTURE_MODEL_MAGIC_NUMBER); + new_one.attach(new_filename, ATTACH_READONLY); + if ( !merge_two_k_mixture_model(&target, &new_one) ) + exit(EOVERFLOW); + ++i; + } + + return 0; +} |