summary refs log tree commit diff stats
path: root/utils
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-06-07 15:05:14 +0800
committer Peng Wu <alexepico@gmail.com> 2011-06-07 15:05:14 +0800
commit 00b8f056a10af903b15b29c6c782b3f74a6147aa (patch)
tree f99b19e2eeb478a691e545ddbd6ead39589fcb2c /utils
parent f1b4b7024df6aa8a279e7ac8fbaf9a5701e446c4 (diff)
download libpinyin-00b8f056a10af903b15b29c6c782b3f74a6147aa.tar.gz
libpinyin-00b8f056a10af903b15b29c6c782b3f74a6147aa.tar.xz
libpinyin-00b8f056a10af903b15b29c6c782b3f74a6147aa.zip
add unigram to validate k mixture model
Diffstat (limited to 'utils')
-rw-r--r-- utils/training/validate_k_mixture_model.cpp 33
1 files changed, 29 insertions, 4 deletions
diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp
index d2e3a7b..20c5636 100644
--- a/utils/training/validate_k_mixture_model.cpp
+++ b/utils/training/validate_k_mixture_model.cpp
@@ -38,16 +38,23 @@ bool validate_unigram(KMixtureModelBigram * bigram){
fprintf(stderr, "word count in magic header is unexpected zero.\n");
return false;
}
+ guint32 expected_total_freq = magic_header.m_total_freq;
+ if ( 0 == expected_total_freq ){
+ fprintf(stderr, "total freq in magic header is unexpected zero.\n");
+ return false;
+ }
+
GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
bigram->get_all_items(items);
- guint32 sum = 0;
+ guint32 sum = 0; guint32 total_freq = 0;
for (size_t i = 0; i < items->len; ++i) {
phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
KMixtureModelArrayHeader array_header;
assert(bigram->get_array_header(*token, array_header));
sum += array_header.m_WC;
+ total_freq += array_header.m_freq;
}
if ( sum != expected_sum ){
@@ -56,6 +63,13 @@ bool validate_unigram(KMixtureModelBigram * bigram){
fprintf(stderr, "the sum differs from word count.\n");
return false;
}
+ if ( total_freq != expected_total_freq ){
+ fprintf(stderr, "total freq in magic header:%d\n",
+ expected_total_freq);
+ fprintf(stderr, "sum of freqs in array headers:%d\n", total_freq);
+ fprintf(stderr, "the total freq differs from sum of freqs.\n");
+ return false;
+ }
g_array_free(items, TRUE);
return true;
@@ -79,10 +93,21 @@ bool validate_bigram(KMixtureModelBigram * bigram){
assert(single_gram->get_array_header(array_header));
guint32 expected_sum = array_header.m_WC;
+ guint32 freq = array_header.m_freq;
if ( 0 == expected_sum ){
- fprintf(stderr, "in the array header of token %d:\n", *token);
- fprintf(stderr, "word count is unexpected zero.\n");
- result = false;
+ if ( 0 != array->len ){
+ fprintf(stderr, "in the array header of token %d:\n", *token);
+ fprintf(stderr, "word count is zero but has array items.\n");
+ result = false;
+ }
+ if ( 0 != freq ){
+ continue;
+ } else {
+ fprintf(stderr, "in the array header of token %d:\n", *token);
+ fprintf(stderr, "both word count and freq are "
+ "unexpected zero.\n");
+ result = false;
+ }
}
guint32 sum = 0;