diff options
author | Peng Wu <alexepico@gmail.com> | 2011-06-07 15:05:14 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-06-07 15:05:14 +0800 |
commit | 00b8f056a10af903b15b29c6c782b3f74a6147aa (patch) | |
tree | f99b19e2eeb478a691e545ddbd6ead39589fcb2c /utils/training | |
parent | f1b4b7024df6aa8a279e7ac8fbaf9a5701e446c4 (diff) | |
download | libpinyin-00b8f056a10af903b15b29c6c782b3f74a6147aa.tar.gz libpinyin-00b8f056a10af903b15b29c6c782b3f74a6147aa.tar.xz libpinyin-00b8f056a10af903b15b29c6c782b3f74a6147aa.zip |
add unigram to validate k mixture model
Diffstat (limited to 'utils/training')
-rw-r--r-- | utils/training/validate_k_mixture_model.cpp | 33 |
1 files changed, 29 insertions, 4 deletions
```diff
diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp
index d2e3a7b..20c5636 100644
--- a/utils/training/validate_k_mixture_model.cpp
+++ b/utils/training/validate_k_mixture_model.cpp
@@ -38,16 +38,23 @@ bool validate_unigram(KMixtureModelBigram * bigram){
         fprintf(stderr, "word count in magic header is unexpected zero.\n");
         return false;
     }
+    guint32 expected_total_freq = magic_header.m_total_freq;
+    if ( 0 == expected_total_freq ){
+        fprintf(stderr, "total freq in magic header is unexpected zero.\n");
+        return false;
+    }
+
 
     GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
     bigram->get_all_items(items);
 
-    guint32 sum = 0;
+    guint32 sum = 0; guint32 total_freq = 0;
     for (size_t i = 0; i < items->len; ++i) {
         phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
         KMixtureModelArrayHeader array_header;
         assert(bigram->get_array_header(*token, array_header));
         sum += array_header.m_WC;
+        total_freq += array_header.m_freq;
     }
 
     if ( sum != expected_sum ){
@@ -56,6 +63,13 @@ bool validate_unigram(KMixtureModelBigram * bigram){
         fprintf(stderr, "the sum differs from word count.\n");
         return false;
     }
+    if ( total_freq != expected_total_freq ){
+        fprintf(stderr, "total freq in magic header:%d\n",
+                expected_total_freq);
+        fprintf(stderr, "sum of freqs in array headers:%d\n", total_freq);
+        fprintf(stderr, "the total freq differs from sum of freqs.\n");
+        return false;
+    }
 
     g_array_free(items, TRUE);
     return true;
@@ -79,10 +93,21 @@ bool validate_bigram(KMixtureModelBigram * bigram){
         assert(single_gram->get_array_header(array_header));
 
         guint32 expected_sum = array_header.m_WC;
+        guint32 freq = array_header.m_freq;
         if ( 0 == expected_sum ){
-            fprintf(stderr, "in the array header of token %d:\n", *token);
-            fprintf(stderr, "word count is unexpected zero.\n");
-            result = false;
+            if ( 0 != array->len ){
+                fprintf(stderr, "in the array header of token %d:\n", *token);
+                fprintf(stderr, "word count is zero but has array items.\n");
+                result = false;
+            }
+            if ( 0 != freq ){
+                continue;
+            } else {
+                fprintf(stderr, "in the array header of token %d:\n", *token);
+                fprintf(stderr, "both word count and freq are "
+                        "unexpected zero.\n");
+                result = false;
+            }
         }
 
         guint32 sum = 0;
```