summary refs log tree commit diff stats
path: root/utils
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-16 14:57:18 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-16 14:57:18 +0800
commit 5bf8ae9ab5f227ec14d0d7cd671c491189733a94 (patch)
tree 4b9e54be68bec38f6745ef6fde2e9b491c1d64fe /utils
parent ec8c2e7e553ca4faf51c4372ee96f7b1ac34886c (diff)
download libpinyin-5bf8ae9ab5f227ec14d0d7cd671c491189733a94.tar.gz
libpinyin-5bf8ae9ab5f227ec14d0d7cd671c491189733a94.tar.xz
libpinyin-5bf8ae9ab5f227ec14d0d7cd671c491189733a94.zip
wrote export k mixture model
Diffstat (limited to 'utils')
-rw-r--r-- utils/storage/export_interpolation.cpp 8
-rw-r--r-- utils/training/Makefile.am 12
-rw-r--r-- utils/training/export_k_mixture_model.cpp 103
3 files changed, 111 insertions, 12 deletions
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
index bcc90b8..23e4d4a 100644
--- a/utils/storage/export_interpolation.cpp
+++ b/utils/storage/export_interpolation.cpp
@@ -30,12 +30,12 @@
void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
-void begin_data(FILE * file){
- fprintf(file, "\\data model interpolation\n");
+void begin_data(FILE * output){
+ fprintf(output, "\\data model interpolation\n");
}
-void end_data(FILE * file){
- fprintf(file, "\\end\n");
+void end_data(FILE * output){
+ fprintf(output, "\\end\n");
}
int main(int argc, char * argv[]){
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
index 44e4dad..adc0c30 100644
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -20,7 +20,8 @@ MAINTAINERCLEANFILES = Makefile.in
INCLUDES = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
- -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils/storage \
@GLIB2_CPPFLAGS@
noinst_HEADERS = k_mixture_model.h
@@ -32,7 +33,8 @@ noinst_PROGRAMS = gen_ngram \
estimate_interpolation \
estimate_k_mixture_model \
merge_k_mixture_model \
- prune_k_mixture_model
+ prune_k_mixture_model \
+ export_k_mixture_model
gen_ngram_SOURCES = gen_ngram.cpp
@@ -64,4 +66,8 @@ merge_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
prune_k_mixture_model_SOURCES = prune_k_mixture_model.cpp
-prune_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ \ No newline at end of file
+prune_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
+
+export_k_mixture_model_SOURCES = export_k_mixture_model.cpp
+
+export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@ \ No newline at end of file
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
index b167394..7802e4d 100644
--- a/utils/training/export_k_mixture_model.cpp
+++ b/utils/training/export_k_mixture_model.cpp
@@ -20,20 +20,29 @@
*/
#include "pinyin.h"
+#include "tag_utility.h"
+#include "k_mixture_model.h"
-bool print_k_mixture_model_magic_header(KMixtureModelBigram * bigram){
+/* Print the one-line command usage to standard output. */
+void print_help(){
+    static const char usage[] =
+        "export_k_mixture_model [--k-mixture-model-file <FILENAME>]\n";
+    fputs(usage, stdout);
+}
+
+bool print_k_mixture_model_magic_header(FILE * output,
+ KMixtureModelBigram * bigram){
KMixtureModelMagicHeader magic_header;
if ( !bigram->get_magic_header(magic_header) ){
fprintf(stderr, "no magic header in k mixture model.\n");
exit(ENODATA);
}
- printf("\\data model \"k mixture model\" count %d N %d\n",
+ fprintf(output, "\\data model \"k mixture model\" count %d N %d\n",
magic_header.m_WC, magic_header.m_N);
return true;
}
-bool print_k_mixture_model_array_header(KMixtureModelBigram * bigram){
- printf("\1-gram\n");
+bool print_k_mixture_model_array_headers(FILE * output,
+ KMixtureModelBigram * bigram,
+ FacadePhraseIndex * phrase_index){
+ fprintf(output, "\\1-gram\n");
GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
bigram->get_all_items(items);
@@ -41,11 +50,95 @@ bool print_k_mixture_model_array_header(KMixtureModelBigram * bigram){
phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
KMixtureModelArrayHeader array_header;
bigram->get_array_header(*token, array_header);
-
+ char * phrase = taglib_token_to_string(phrase_index, *token);
+ if ( phrase )
+ fprintf(output, "\\item %s count %d\n", phrase, array_header.m_WC);
+
+ g_free(phrase);
}
+ return true;
+}
+
+/*
+ * Write the "\2-gram" section of the export to `output`.
+ *
+ * Every token reported by bigram->get_all_items() is loaded as a single
+ * gram, and each of its array items is printed as one "\item" line
+ * holding both words and the item's counters.  Pairs for which either
+ * token cannot be converted to a string are skipped.
+ *
+ * Always returns true; a failed load trips the assert below.
+ */
+bool print_k_mixture_model_array_items(FILE * output,
+                                       KMixtureModelBigram * bigram,
+                                       FacadePhraseIndex * phrase_index){
+    fprintf(output, "\\2-gram\n");
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        KMixtureModelSingleGram * single_gram = NULL;
+
+        /* The load must not live inside assert(): when NDEBUG is defined
+         * the assert expression is not evaluated, so the call would vanish
+         * and single_gram would stay NULL. */
+        bool loaded = bigram->load(*token, single_gram);
+        assert(loaded);
+        (void) loaded; /* silences the unused warning in NDEBUG builds */
+
+        FlexibleBigramPhraseArray array = g_array_new
+            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        single_gram->retrieve_all(array);
+
+        /* The first word depends only on the outer token, so convert it
+         * once per single gram instead of once per array item. */
+        char * word1 = taglib_token_to_string(phrase_index, *token);
+
+        for (size_t m = 0; m < array->len; ++m){
+            KMixtureModelArrayItemWithToken * item =
+                &g_array_index(array, KMixtureModelArrayItemWithToken, m);
+            char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+
+            /* NOTE(review): "count" and "T" are both written from m_WC,
+             * exactly as before -- confirm this is the intended format. */
+            if (word1 && word2)
+                fprintf(output, "\\item %s %s count %d T %d N_n_0 %d Mr %d\n",
+                        word1, word2, item->m_item.m_WC, item->m_item.m_WC,
+                        item->m_item.m_N_n_0, item->m_item.m_Mr);
+
+            g_free(word2);
+        }
+
+        g_free(word1);
+        g_array_free(array, TRUE);
+        delete single_gram;
+    }
+
+    g_array_free(items, TRUE);
+    return true;
+}
+
+/* Write the closing "\end" marker line to `output`; always returns true. */
+bool end_data(FILE * output){
+    fputs("\\end\n", output);
return true;
}
int main(int argc, char * argv[]){
+ int i = 1;
+ const char * k_mixture_model_filename = NULL;
+ FILE * output = stdout;
+
+ while ( i < argc ){
+ if ( strcmp ("--help", argv[i]) == 0 ){
+ print_help();
+ exit(0);
+ } else if ( strcmp ("--k-mixture-model-file", argv[i]) == 0 ){
+ if ( ++i > argc ){
+ print_help();
+ exit(EINVAL);
+ }
+ k_mixture_model_filename = argv[i];
+ } else {
+ print_help();
+ exit(EINVAL);
+ }
+ }
+
+ FacadePhraseIndex phrase_index;
+
+ //gb_char binary file
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ phrase_index.load(1, chunk);
+
+ //gbk_char binary file
+ chunk = new MemoryChunk;
+ chunk->load("../../data/gbk_char.bin");
+ phrase_index.load(2, chunk);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
+
+ print_k_mixture_model_magic_header(output, &bigram);
+ print_k_mixture_model_array_headers(output, &bigram, &phrase_index);
+ print_k_mixture_model_array_items(output, &bigram, &phrase_index);
+
+ return 0;
}