summaryrefslogtreecommitdiffstats
path: root/utils
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-12 14:56:00 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-12 14:56:00 +0800
commit 1b5ea010d98d272deb845965cef1511ccf3076de (patch)
tree 74e730d6da4378d91c477eed266f1e7daed709a9 /utils
parent 8156def6af7977a59ca44d1a26c53e588535d914 (diff)
downloadlibpinyin-1b5ea010d98d272deb845965cef1511ccf3076de.tar.gz
libpinyin-1b5ea010d98d272deb845965cef1511ccf3076de.tar.xz
libpinyin-1b5ea010d98d272deb845965cef1511ccf3076de.zip
write merge k mixture model in progress
Diffstat (limited to 'utils')
-rw-r--r--utils/training/merge_k_mixture_model.cpp129
1 files changed, 116 insertions, 13 deletions
diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp
index adce191..c4df992 100644
--- a/utils/training/merge_k_mixture_model.cpp
+++ b/utils/training/merge_k_mixture_model.cpp
@@ -27,22 +27,12 @@ void print_help(){
}
-bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
+static bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
/* in */ FlexibleBigramPhraseArray second,
/* out */ FlexibleBigramPhraseArray & merged ){
merged = NULL;
- /* both arrays are empty. */
- if ( !(first || second) )
- return false;
-
- /* only one array contains items. */
- if ( !first || !second ){
- if ( first )
- merged = first;
- if (second )
- merged = second;
- return true;
- }
+ /* avoid to do empty merge. */
+ assert( NULL != first && NULL != second && NULL != merged );
merged = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
@@ -76,6 +66,7 @@ bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
merged_item.m_item.m_Mr = std_lite::max(first_item->m_item.m_Mr,
second_item->m_item.m_Mr);
g_array_append_val(merged, merged_item);
+ first_index ++; second_index ++;
}
}
@@ -97,6 +88,118 @@ bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
return true;
}
+/* Merge the magic header of new_one into the magic header of target.
+ *
+ * Reads both magic headers, adds their m_WC and m_N fields and writes
+ * the sums back to target.  Every other field of the stored header is
+ * zero, because the merged header starts out memset to 0.
+ *
+ * Returns true once the merged header has been stored.  Returns false,
+ * after printing a message to stderr, when a header can not be read or
+ * written, or when one of the sums wraps around.
+ */
+static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target,
+                                /* in */ KMixtureModelBigram * new_one ){
+
+    KMixtureModelMagicHeader target_magic_header;
+    KMixtureModelMagicHeader new_magic_header;
+    KMixtureModelMagicHeader merged_magic_header;
+
+    memset(&merged_magic_header, 0, sizeof(KMixtureModelMagicHeader));
+
+    /* The header accessors are called outside of assert(): when NDEBUG
+     * is defined the assert argument is not evaluated at all, which
+     * would leave both headers uninitialized.
+     */
+    if ( !target->get_magic_header(target_magic_header) ){
+        fprintf(stderr, "can't read the magic header of the target model.\n");
+        return false;
+    }
+    if ( !new_one->get_magic_header(new_magic_header) ){
+        fprintf(stderr, "can't read the magic header of the new model.\n");
+        return false;
+    }
+
+    /* A wrapped sum is smaller than its larger operand.
+     * NOTE(review): this test assumes m_WC and m_N are unsigned
+     * integers -- TODO confirm against the header declaration.
+     */
+    if ( target_magic_header.m_WC + new_magic_header.m_WC <
+         std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){
+        fprintf(stderr, "the m_WC integer in magic header overflows.\n");
+        return false;
+    }
+    if ( target_magic_header.m_N + new_magic_header.m_N <
+         std_lite::max( target_magic_header.m_N, new_magic_header.m_N ) ){
+        fprintf(stderr, "the m_N integer in magic header overflows.\n");
+        return false;
+    }
+
+    merged_magic_header.m_WC = target_magic_header.m_WC +
+        new_magic_header.m_WC;
+    merged_magic_header.m_N = target_magic_header.m_N +
+        new_magic_header.m_N;
+
+    if ( !target->set_magic_header(merged_magic_header) ){
+        fprintf(stderr, "can't write the merged magic header.\n");
+        return false;
+    }
+    return true;
+}
+
+/* Add the array header counts of new_one to those of target.
+ *
+ * For every token returned by new_one->get_all_items(), the m_WC fields
+ * of the two array headers are summed and the result is stored in
+ * target.  Every other field of the stored header is zero, because the
+ * merged header starts out memset to 0.
+ *
+ * Note: must be called after the merge array items method, so that
+ * every token of new_one already has an array header in target.
+ *
+ * NOTE(review): merge_array_items() stores the single gram of new_one
+ * unchanged for tokens that target did not have.  If the array header
+ * is kept inside the single gram, adding the header of new_one again
+ * here would count it twice -- confirm.
+ *
+ * Returns true when all headers were merged.  Returns false, after
+ * printing a message to stderr, as soon as a header can not be read or
+ * written.
+ */
+static bool merge_array_headers( /* in & out */ KMixtureModelBigram * target,
+                                 /* in */ KMixtureModelBigram * new_one){
+
+    GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    new_one->get_all_items(new_items);
+
+    bool success = true;
+    for ( size_t i = 0; i < new_items->len; ++i ){
+        phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i);
+        KMixtureModelArrayHeader target_array_header;
+        KMixtureModelArrayHeader new_array_header;
+        KMixtureModelArrayHeader merged_array_header;
+
+        memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader));
+
+        /* The accessors are called outside of assert(): when NDEBUG is
+         * defined the assert argument is not evaluated at all.
+         */
+        if ( !new_one->get_array_header(*token, new_array_header) ||
+             !target->get_array_header(*token, target_array_header) ){
+            fprintf(stderr, "can't read the array header to merge.\n");
+            success = false;
+            break;
+        }
+
+        merged_array_header.m_WC = target_array_header.m_WC +
+            new_array_header.m_WC;
+
+        if ( !target->set_array_header(*token, merged_array_header) ){
+            fprintf(stderr, "can't write the merged array header.\n");
+            success = false;
+            break;
+        }
+    }
+
+    /* the token list is owned by this function on every path. */
+    g_array_free(new_items, TRUE);
+    return success;
+}
+
+/* Merge the single grams of new_one into target, token by token.
+ *
+ * For every token returned by new_one->get_all_items():
+ *   - when target has no single gram for the token, the single gram of
+ *     new_one is stored in target unchanged;
+ *   - otherwise the items of both single grams are combined with
+ *     merge_two_phrase_array() and the result replaces the single gram
+ *     stored in target.
+ *
+ * Returns true when every token was processed.  Returns false, after
+ * printing a message to stderr, as soon as a load, merge or store step
+ * fails.
+ */
+static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
+                               /* in */ KMixtureModelBigram * new_one ){
+
+    GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    new_one->get_all_items(new_items);
+
+    bool success = true;
+    for ( size_t i = 0; success && i < new_items->len; ++i ){
+        phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i);
+        KMixtureModelSingleGram * target_single_gram = NULL;
+        KMixtureModelSingleGram * new_single_gram = NULL;
+
+        /* Calls with side effects are kept outside of assert(): when
+         * NDEBUG is defined the assert argument is not evaluated at all.
+         */
+        if ( !new_one->load(*token, new_single_gram) ){
+            fprintf(stderr, "can't load a single gram of the new model.\n");
+            success = false;
+            break;
+        }
+
+        bool exists_in_target = target->load(*token, target_single_gram);
+        if ( !exists_in_target ){
+            /* only new_one knows this token: copy its single gram. */
+            success = target->store(*token, new_single_gram);
+            if ( !success )
+                fprintf(stderr, "can't store the copied single gram.\n");
+            delete new_single_gram;
+            continue;
+        }
+
+        assert(NULL != target_single_gram);
+
+        FlexibleBigramPhraseArray target_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        target_single_gram->retrieve_all(target_array);
+
+        FlexibleBigramPhraseArray new_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        new_single_gram->retrieve_all(new_array);
+
+        /* NOTE(review): the visible part of merge_two_phrase_array()
+         * resets `merged` to NULL and then allocates a fresh array,
+         * which would orphan the array allocated here -- confirm who
+         * is meant to own the output array.
+         */
+        FlexibleBigramPhraseArray merged_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+
+        success = merge_two_phrase_array(target_array, new_array, merged_array);
+        if ( !success )
+            fprintf(stderr, "can't merge the two phrase arrays.\n");
+
+        /* the inputs are no longer needed, whatever the outcome. */
+        g_array_free(target_array, TRUE);
+        g_array_free(new_array, TRUE);
+        delete target_single_gram; delete new_single_gram;
+
+        if ( success ){
+            KMixtureModelSingleGram * merged_single_gram =
+                new KMixtureModelSingleGram;
+
+            for ( size_t m = 0; m < merged_array->len; ++m ){
+                KMixtureModelArrayItemWithToken * item =
+                    &g_array_index(merged_array,
+                                   KMixtureModelArrayItemWithToken, m);
+                merged_single_gram->insert_array_item(item->m_token,
+                                                      item->m_item);
+            }
+
+            success = target->store(*token, merged_single_gram);
+            if ( !success )
+                fprintf(stderr, "can't store the merged single gram.\n");
+            delete merged_single_gram;
+        }
+
+        /* g_array_free() must not be handed a NULL array. */
+        if ( NULL != merged_array )
+            g_array_free(merged_array, TRUE);
+    }
+
+    g_array_free(new_items, TRUE);
+    return success;
+}
+
+/* Merge the k mixture model new_one into target.
+ *
+ * Runs the three merge passes in a fixed order: array items, then
+ * array headers, then the magic header.  The first pass that reports
+ * failure stops the sequence and the later passes are not run.
+ *
+ * Both model pointers must be non-NULL (checked with assert).
+ * Returns true only when all three passes succeed.
+ */
+bool merge_two_k_mixture_model( /* in & out */ KMixtureModelBigram * target,
+                                /* in */ KMixtureModelBigram * new_one ){
+    assert(NULL != target);
+    assert(NULL != new_one);
+
+    if ( !merge_array_items(target, new_one) ){
+        return false;
+    }
+
+    if ( !merge_array_headers(target, new_one) ){
+        return false;
+    }
+
+    return merge_magic_header(target, new_one);
+}
int main(int argc, char * argv[]){
const char * result_filename = NULL;