From 5bf8ae9ab5f227ec14d0d7cd671c491189733a94 Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Mon, 16 May 2011 14:57:18 +0800
Subject: wrote export k mixture model

---
Review notes (below the "---" marker, so not part of the commit message):
the line structure was rebuilt from a whitespace-collapsed copy, so
indentation and blank context lines are a best-effort reconstruction.
Compared with that copy, export_k_mixture_model.cpp now advances the
argument index, rejects a trailing --k-mixture-model-file, calls load()
outside assert(), prints m_T for "T", frees the 1-gram token list, emits
the "\end" line and names the operand in the usage text.  The diffstat and
hunk counts were recomputed; the index hashes are unchanged and therefore
no longer describe the revised blob.

 utils/storage/export_interpolation.cpp    |   8 +--
 utils/training/Makefile.am                |  12 +++-
 utils/training/export_k_mixture_model.cpp | 130 +++++++++++++++++++++++++++--
 3 files changed, 138 insertions(+), 12 deletions(-)

diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
index bcc90b8..23e4d4a 100644
--- a/utils/storage/export_interpolation.cpp
+++ b/utils/storage/export_interpolation.cpp
@@ -30,12 +30,12 @@ void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
 
 void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
 
-void begin_data(FILE * file){
-    fprintf(file, "\\data model interpolation\n");
+void begin_data(FILE * output){
+    fprintf(output, "\\data model interpolation\n");
 }
 
-void end_data(FILE * file){
-    fprintf(file, "\\end\n");
+void end_data(FILE * output){
+    fprintf(output, "\\end\n");
 }
 
 int main(int argc, char * argv[]){
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
index 44e4dad..adc0c30 100644
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -20,7 +20,8 @@ MAINTAINERCLEANFILES = Makefile.in
 INCLUDES = -I$(top_srcdir)/src \
            -I$(top_srcdir)/src/include \
            -I$(top_srcdir)/src/storage \
-           -I$(top_srcdir)/src/lookup  \
+           -I$(top_srcdir)/src/lookup \
+           -I$(top_srcdir)/utils/storage \
            @GLIB2_CPPFLAGS@
 
 noinst_HEADERS = k_mixture_model.h
@@ -32,7 +33,8 @@ noinst_PROGRAMS = gen_ngram \
                   estimate_interpolation \
                   estimate_k_mixture_model \
                   merge_k_mixture_model \
-                  prune_k_mixture_model
+                  prune_k_mixture_model \
+                  export_k_mixture_model
 
 gen_ngram_SOURCES = gen_ngram.cpp
 
@@ -64,4 +66,8 @@ merge_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
 
 prune_k_mixture_model_SOURCES = prune_k_mixture_model.cpp
 
-prune_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
\ No newline at end of file
+prune_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
+
+export_k_mixture_model_SOURCES = export_k_mixture_model.cpp
+
+export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
\ No newline at end of file
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
index b167394..7802e4d 100644
--- a/utils/training/export_k_mixture_model.cpp
+++ b/utils/training/export_k_mixture_model.cpp
@@ -20,20 +20,34 @@
  */
 
 #include "pinyin.h"
+#include "tag_utility.h"
+#include "k_mixture_model.h"
 
-bool print_k_mixture_model_magic_header(KMixtureModelBigram * bigram){
+/* Print the command line usage summary to stdout. */
+void print_help(){
+    printf("export_k_mixture_model [--k-mixture-model-file <FILENAME>]\n");
+}
+
+/* Write the "\data" banner line, filled from the model's magic header, to
+ * output.  Exits with ENODATA when the model has no magic header. */
+bool print_k_mixture_model_magic_header(FILE * output,
+                                        KMixtureModelBigram * bigram){
     KMixtureModelMagicHeader magic_header;
     if ( !bigram->get_magic_header(magic_header) ){
         fprintf(stderr, "no magic header in k mixture model.\n");
         exit(ENODATA);
     }
-    printf("\\data model \"k mixture model\" count %d N %d\n",
+    fprintf(output, "\\data model \"k mixture model\" count %d N %d\n",
            magic_header.m_WC, magic_header.m_N);
     return true;
 }
 
-bool print_k_mixture_model_array_header(KMixtureModelBigram * bigram){
-    printf("\1-gram\n");
+/* Write the "\1-gram" section: one "\item" line per token returned by
+ * get_all_items() whose phrase string can be resolved. */
+bool print_k_mixture_model_array_headers(FILE * output,
+                                         KMixtureModelBigram * bigram,
+                                         FacadePhraseIndex * phrase_index){
+    fprintf(output, "\\1-gram\n");
     GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
     bigram->get_all_items(items);
 
@@ -41,11 +55,117 @@ bool print_k_mixture_model_array_header(KMixtureModelBigram * bigram){
         phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
         KMixtureModelArrayHeader array_header;
         bigram->get_array_header(*token, array_header);
-
+        char * phrase = taglib_token_to_string(phrase_index, *token);
+        if ( phrase )
+            fprintf(output, "\\item %s count %d\n", phrase, array_header.m_WC);
+
+        g_free(phrase);
     }
+    /* the token list belongs to this function: release it before returning. */
+    g_array_free(items, TRUE);
+    return true;
+}
+
+/* Write the "\2-gram" section: for each token from get_all_items(), print
+ * one "\item" line per entry of its single gram, skipping pairs whose
+ * phrase strings cannot be resolved. */
+bool print_k_mixture_model_array_items(FILE * output,
+                                       KMixtureModelBigram * bigram,
+                                       FacadePhraseIndex * phrase_index){
+    fprintf(output, "\\2-gram\n");
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        KMixtureModelSingleGram * single_gram = NULL;
+        /* keep load() out of assert(): NDEBUG removes the whole assert
+         * expression, and with it the call that fills single_gram. */
+        bool loaded = bigram->load(*token, single_gram);
+        assert(loaded);
+        if ( !loaded )
+            continue;
+
+        FlexibleBigramPhraseArray array = g_array_new
+            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        single_gram->retrieve_all(array);
+
+        for (size_t m = 0; m < array->len; ++m){
+            KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m);
+            char * word1 = taglib_token_to_string(phrase_index, *token);
+            char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+
+            /* NOTE(review): "T" used to repeat m_WC; m_T is presumably the
+             * intended field -- confirm against KMixtureModelArrayItem. */
+            if (word1 && word2)
+                fprintf(output, "\\item %s %s count %d T %d N_n_0 %d Mr %d\n",
+                        word1, word2, item->m_item.m_WC, item->m_item.m_T,
+                        item->m_item.m_N_n_0, item->m_item.m_Mr);
+
+            g_free(word1); g_free(word2);
+        }
+        g_array_free(array, TRUE);
+        delete single_gram;
+    }
+
+    g_array_free(items, TRUE);
+    return true;
+}
+
+/* Write the "\end" terminator line to output. */
+bool end_data(FILE * output){
+    fprintf(output, "\\end\n");
 
     return true;
 }
 
 int main(int argc, char * argv[]){
+    int i = 1;
+    const char * k_mixture_model_filename = NULL;
+    FILE * output = stdout;
+
+    while ( i < argc ){
+        if ( strcmp ("--help", argv[i]) == 0 ){
+            print_help();
+            exit(0);
+        } else if ( strcmp ("--k-mixture-model-file", argv[i]) == 0 ){
+            /* the option needs a value; argv[argc] is only the NULL sentinel. */
+            if ( ++i >= argc ){
+                print_help();
+                exit(EINVAL);
+            }
+            k_mixture_model_filename = argv[i];
+        } else {
+            print_help();
+            exit(EINVAL);
+        }
+        /* step past the argument just handled, otherwise the file name
+         * would be parsed again as an unknown option. */
+        ++i;
+    }
+
+    FacadePhraseIndex phrase_index;
+
+    //gb_char binary file
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, chunk);
+
+    //gbk_char binary file
+    chunk = new MemoryChunk;
+    chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, chunk);
+
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    /* NOTE(review): the file name is still NULL when the option was not
+     * given; confirm attach() handles that. */
+    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
+
+    print_k_mixture_model_magic_header(output, &bigram);
+    print_k_mixture_model_array_headers(output, &bigram, &phrase_index);
+    print_k_mixture_model_array_items(output, &bigram, &phrase_index);
+    /* close the "\data" block opened by the magic header line. */
+    end_data(output);
+
+    return 0;
 }
-- 
cgit