diff options
-rw-r--r-- | utils/training/merge_k_mixture_model.cpp | 161 |
1 files changed, 145 insertions, 16 deletions
diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp
index adce191..c4df992 100644
--- a/utils/training/merge_k_mixture_model.cpp
+++ b/utils/training/merge_k_mixture_model.cpp
@@ -27,22 +27,10 @@ void print_help(){
 
 }
 
-bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
+static bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
                              /* in */ FlexibleBigramPhraseArray second,
                              /* out */ FlexibleBigramPhraseArray & merged ){
-    merged = NULL;
-    /* both arrays are empty. */
-    if ( !(first || second) )
-        return false;
-
-    /* only one array contains items. */
-    if ( !first || !second ){
-        if ( first )
-            merged = first;
-        if (second )
-            merged = second;
-        return true;
-    }
-
-    merged = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+    /* avoid to do empty merge: the caller passes both inputs and an already
+     * allocated output array, which is filled in place. */
+    assert( NULL != first && NULL != second && NULL != merged );
 
@@ -76,6 +64,7 @@ bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
             merged_item.m_item.m_Mr = std_lite::max(first_item->m_item.m_Mr,
                                                     second_item->m_item.m_Mr);
             g_array_append_val(merged, merged_item);
+            first_index ++;
             second_index ++;
         }
     }
@@ -97,6 +86,146 @@ bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
 
     return true;
 }
+
+/* Takes the result of a call that is required to succeed.  Because the
+ * call is evaluated as an argument rather than inside assert(), it is still
+ * executed when NDEBUG compiles the assertion away. */
+static inline void must_succeed( bool succeeded ){
+    assert(succeeded);
+    (void) succeeded; /* keeps NDEBUG builds free of an unused warning. */
+}
+
+/* Adds up the m_WC and m_N fields of both magic headers and stores the
+ * result into target; every other field of the stored header is zeroed.
+ * Returns false, before target is written, when the m_WC sum wraps. */
+static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target,
+                                /* in */ KMixtureModelBigram * new_one ){
+
+    KMixtureModelMagicHeader target_magic_header;
+    KMixtureModelMagicHeader new_magic_header;
+    KMixtureModelMagicHeader merged_magic_header;
+
+    memset(&merged_magic_header, 0, sizeof(KMixtureModelMagicHeader));
+    must_succeed(target->get_magic_header(target_magic_header));
+    must_succeed(new_one->get_magic_header(new_magic_header));
+    /* for unsigned m_WC a wrapped sum is smaller than the larger operand. */
+    if ( target_magic_header.m_WC + new_magic_header.m_WC <
+         std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){
+        fprintf(stderr, "the m_WC integer in magic header overflows.\n");
+        return false;
+    }
+
+    merged_magic_header.m_WC = target_magic_header.m_WC +
+        new_magic_header.m_WC;
+    merged_magic_header.m_N = target_magic_header.m_N +
+        new_magic_header.m_N;
+
+    must_succeed(target->set_magic_header(merged_magic_header));
+    return true;
+}
+
+/* Note: must be called after the merge array items method.
+ * For every token listed by new_one, rewrites target's array header so that
+ * its m_WC is the sum of both models' m_WC; other header fields are zeroed.
+ * NOTE(review): merge_array_items stores a gram that is missing from target
+ * as is; if a stored gram carries its array header, m_WC of such tokens is
+ * counted twice here -- confirm against KMixtureModelBigram::store(). */
+static bool merge_array_headers( /* in & out */ KMixtureModelBigram * target,
+                                 /* in */ KMixtureModelBigram * new_one){
+
+    GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    new_one->get_all_items(new_items);
+
+    for ( size_t i = 0; i < new_items->len; ++i ){
+        phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i);
+        KMixtureModelArrayHeader target_array_header;
+        KMixtureModelArrayHeader new_array_header;
+        KMixtureModelArrayHeader merged_array_header;
+
+        memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader));
+        must_succeed(new_one->get_array_header(*token, new_array_header));
+        must_succeed(target->get_array_header(*token, target_array_header));
+        merged_array_header.m_WC = target_array_header.m_WC +
+            new_array_header.m_WC;
+        must_succeed(target->set_array_header(*token, merged_array_header));
+    }
+
+    /* release the token list allocated above. */
+    g_array_free(new_items, TRUE);
+    return true;
+}
+
+/* For every token listed by new_one, merges its single gram into target: a
+ * gram missing from target is stored as is, otherwise both item arrays are
+ * combined by merge_two_phrase_array and the result is stored back. */
+static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
+                               /* in */ KMixtureModelBigram * new_one ){
+
+    GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    new_one->get_all_items(new_items);
+
+    for ( size_t i = 0; i < new_items->len; ++i ){
+        phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i);
+        KMixtureModelSingleGram * target_single_gram = NULL;
+        KMixtureModelSingleGram * new_single_gram = NULL;
+
+        must_succeed(new_one->load(*token, new_single_gram));
+        bool exists_in_target = target->load(*token, target_single_gram);
+        if ( !exists_in_target ){
+            must_succeed(target->store(*token, new_single_gram));
+            delete new_single_gram;
+            continue;
+        }
+
+        assert(NULL != target_single_gram);
+        KMixtureModelSingleGram * merged_single_gram =
+            new KMixtureModelSingleGram;
+
+        FlexibleBigramPhraseArray target_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        target_single_gram->retrieve_all(target_array);
+
+        FlexibleBigramPhraseArray new_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        new_single_gram->retrieve_all(new_array);
+        FlexibleBigramPhraseArray merged_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+
+        /* merged_array is allocated here and filled in place. */
+        must_succeed(merge_two_phrase_array(target_array, new_array,
+                                            merged_array));
+
+        g_array_free(target_array, TRUE);
+        g_array_free(new_array, TRUE);
+        delete target_single_gram;
+        delete new_single_gram;
+
+        for ( size_t m = 0; m < merged_array->len; ++m ){
+            KMixtureModelArrayItemWithToken * item =
+                &g_array_index(merged_array,
+                               KMixtureModelArrayItemWithToken, m);
+            merged_single_gram->insert_array_item(item->m_token, item->m_item);
+        }
+
+        must_succeed(target->store(*token, merged_single_gram));
+        delete merged_single_gram;
+        g_array_free(merged_array, TRUE);
+    }
+
+    g_array_free(new_items, TRUE);
+    return true;
+}
+
+/* Merges new_one into target: array items first, then array headers, then
+ * the magic header; a step that returns false skips the remaining ones. */
+bool merge_two_k_mixture_model( /* in & out */ KMixtureModelBigram * target,
+                                /* in */ KMixtureModelBigram * new_one ){
+    assert(NULL != target);
+    assert(NULL != new_one);
+    return merge_array_items(target, new_one) &&
+        merge_array_headers(target, new_one) &&
+        merge_magic_header(target, new_one);
+}
 
 int main(int argc, char * argv[]){
     const char * result_filename = NULL;