summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-23 11:38:14 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-23 11:38:14 +0800
commit b47878f90385acd42588658a6dcf2c0edfae419f (patch)
tree b4476926862caceb1e53fd3ababa9df5bbd96f9a
parent 5a6ac44fa3d1acb7cd6ff62cb1e71d6c9d8a8fed (diff)
download libpinyin-b47878f90385acd42588658a6dcf2c0edfae419f.tar.gz
libpinyin-b47878f90385acd42588658a6dcf2c0edfae419f.tar.xz
libpinyin-b47878f90385acd42588658a6dcf2c0edfae419f.zip
write validate k mixture model tool
-rw-r--r-- utils/training/Makefile.am 9
-rw-r--r-- utils/training/export_k_mixture_model.cpp 2
-rw-r--r-- utils/training/validate_k_mixture_model.cpp 134
3 files changed, 142 insertions, 3 deletions
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
index 0915479..a8b5478 100644
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -35,7 +35,8 @@ noinst_PROGRAMS = gen_ngram \
prune_k_mixture_model \
import_k_mixture_model \
export_k_mixture_model \
- k_mixture_model_to_interpolation
+ k_mixture_model_to_interpolation \
+ validate_k_mixture_model
gen_ngram_SOURCES = gen_ngram.cpp
@@ -79,4 +80,8 @@ export_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp
-k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
\ No newline at end of file
+k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
+
+validate_k_mixture_model_SOURCES = validate_k_mixture_model.cpp
+
+validate_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
index f42df96..b10ee43 100644
--- a/utils/training/export_k_mixture_model.cpp
+++ b/utils/training/export_k_mixture_model.cpp
@@ -48,7 +48,7 @@ bool print_k_mixture_model_array_headers(FILE * output,
for (size_t i = 0; i < items->len; ++i) {
phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
KMixtureModelArrayHeader array_header;
- bigram->get_array_header(*token, array_header);
+ assert(bigram->get_array_header(*token, array_header));
char * phrase = taglib_token_to_string(phrase_index, *token);
if ( phrase )
fprintf(output, "\\item %s count %d\n", phrase, array_header.m_WC);
diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp
new file mode 100644
index 0000000..3e5458d
--- /dev/null
+++ b/utils/training/validate_k_mixture_model.cpp
@@ -0,0 +1,134 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "pinyin.h"
+#include "k_mixture_model.h"
+
+/* Write the one-line command usage summary to standard output. */
+void print_help()
+{
+    fputs("Usage: validate_k_mixture_model <FILENAME>\n", stdout);
+}
+
+/*
+ * Check the unigram part of a k mixture model: the word count stored in
+ * the magic header must be non-zero and equal to the sum of the word
+ * counts stored in the array headers of all items.
+ *
+ * bigram: the k mixture model to check.
+ *
+ * Returns true when the counts agree; otherwise writes a diagnostic to
+ * stderr and returns false.  A token whose array header cannot be read is
+ * reported as a validation failure instead of aborting the process.
+ */
+bool validate_unigram(KMixtureModelBigram * bigram){
+    KMixtureModelMagicHeader magic_header;
+    if( !bigram->get_magic_header(magic_header) ){
+        fprintf(stderr, "no magic header in k mixture model.\n");
+        return false;
+    }
+
+    guint32 expected_sum = magic_header.m_WC;
+    if ( 0 == expected_sum ){
+        fprintf(stderr, "word count in magic header is unexpected zero.\n");
+        return false;
+    }
+
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    bool result = true;
+    guint32 sum = 0;
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        KMixtureModelArrayHeader array_header;
+        /* The lookup stays outside assert(): inside it, the call would be
+           compiled out under NDEBUG and the header left uninitialized. */
+        if ( !bigram->get_array_header(*token, array_header) ){
+            fprintf(stderr, "no array header for token %u.\n",
+                    (unsigned int) *token);
+            result = false;
+            break;
+        }
+        sum += array_header.m_WC;
+    }
+
+    /* Both counts are unsigned, so print them with the guint32 format. */
+    if ( result && sum != expected_sum ){
+        fprintf(stderr, "word count in magic header:%" G_GUINT32_FORMAT "\n",
+                expected_sum);
+        fprintf(stderr, "sum of word count in array headers:%" G_GUINT32_FORMAT "\n",
+                sum);
+        fprintf(stderr, "the sum differs from word count.\n");
+        result = false;
+    }
+
+    /* Single exit after the array is created, so it is freed on every path. */
+    g_array_free(items, TRUE);
+    return result;
+}
+
+/*
+ * Check every single gram of a k mixture model: the word count in its
+ * array header must be non-zero and equal to the sum of the word counts
+ * of its array items.
+ *
+ * bigram: the k mixture model to check.
+ *
+ * All items are visited even after a failure, and each problem is written
+ * to stderr.  Returns true only when every single gram passes.  A single
+ * gram that cannot be loaded, or whose array header cannot be read, is
+ * reported as a validation failure instead of aborting the process.
+ */
+bool validate_bigram(KMixtureModelBigram * bigram){
+    bool result = true;
+
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+
+        /* The calls below stay outside assert() so that they still run,
+           and their results are still checked, when NDEBUG is defined. */
+        KMixtureModelSingleGram * single_gram = NULL;
+        if ( !bigram->load(*token, single_gram) || NULL == single_gram ){
+            fprintf(stderr, "failed to load the single gram of token %u.\n",
+                    (unsigned int) *token);
+            result = false;
+            continue;
+        }
+
+        FlexibleBigramPhraseArray array = g_array_new
+            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        single_gram->retrieve_all(array);
+
+        KMixtureModelArrayHeader array_header;
+        if ( !single_gram->get_array_header(array_header) ){
+            fprintf(stderr, "no array header in the single gram of token %u.\n",
+                    (unsigned int) *token);
+            result = false;
+        } else {
+            guint32 expected_sum = array_header.m_WC;
+            if ( 0 == expected_sum ){
+                fprintf(stderr, "in the array header of token %u:\n",
+                        (unsigned int) *token);
+                fprintf(stderr, "word count is unexpected zero.\n");
+                result = false;
+            }
+
+            guint32 sum = 0;
+            for (size_t m = 0; m < array->len; ++m){
+                KMixtureModelArrayItemWithToken * item = &g_array_index
+                    (array, KMixtureModelArrayItemWithToken, m);
+                sum += item->m_item.m_WC;
+            }
+
+            if ( sum != expected_sum ){
+                fprintf(stderr, "word count in array header:%" G_GUINT32_FORMAT "\n",
+                        expected_sum);
+                fprintf(stderr, "sum of word count in array items:%" G_GUINT32_FORMAT "\n",
+                        sum);
+                fprintf(stderr, "the sum differs from word count.\n");
+                result = false;
+            }
+        }
+
+        /* Release the per-token scratch array created above. */
+        g_array_free(array, TRUE);
+        /* NOTE(review): assumes load() hands the caller ownership of a
+           new-allocated single gram; confirm against
+           KMixtureModelBigram::load before relying on this delete. */
+        delete single_gram;
+    }
+
+    g_array_free(items, TRUE);
+    return result;
+}
+
+/*
+ * Entry point: validate the k mixture model file named on the command line.
+ *
+ * "--help" prints the usage text and exits with status 0.  When several
+ * file names are given, the last one is used.  With no file name the usage
+ * text is printed and the process exits with EINVAL.  A failed validation
+ * exits with ENODATA; success returns 0.
+ */
+int main(int argc, char * argv[]){
+    const char * k_mixture_model_filename = NULL;
+
+    /* Advance through every argument; the index must move on each pass or
+       the scan never terminates. */
+    for (int i = 1; i < argc; ++i){
+        if ( strcmp ("--help", argv[i]) == 0 ){
+            print_help();
+            exit(0);
+        }
+        k_mixture_model_filename = argv[i];
+    }
+
+    /* Without a file name there is nothing to attach to. */
+    if ( NULL == k_mixture_model_filename ){
+        print_help();
+        exit(EINVAL);
+    }
+
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    /* NOTE(review): the result of attach() is not checked here because its
+       return type is not visible in this file; confirm and handle failure. */
+    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
+
+    if (!validate_unigram(&bigram)) {
+        fprintf(stderr, "k mixture model validation failed.\n");
+        exit(ENODATA);
+    }
+
+    if (!validate_bigram(&bigram)) {
+        fprintf(stderr, "k mixture model validation failed.\n");
+        exit(ENODATA);
+    }
+
+    return 0;
+}