From 5150341809f92fb2179decdfdd6ec1477d988461 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 19 May 2011 13:36:10 +0800 Subject: write k mixture model to interpolation conversion tool --- utils/training/Makefile.am | 9 +- utils/training/import_k_mixture_model.cpp | 2 +- .../training/k_mixture_model_to_interpolation.cpp | 197 +++++++++++++++++++++ 3 files changed, 205 insertions(+), 3 deletions(-) create mode 100644 utils/training/k_mixture_model_to_interpolation.cpp (limited to 'utils') diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am index 3a3d6b9..7411e78 100644 --- a/utils/training/Makefile.am +++ b/utils/training/Makefile.am @@ -35,7 +35,8 @@ noinst_PROGRAMS = gen_ngram \ merge_k_mixture_model \ prune_k_mixture_model \ import_k_mixture_model \ - export_k_mixture_model + export_k_mixture_model \ + k_mixture_model_to_interpolation gen_ngram_SOURCES = gen_ngram.cpp @@ -75,4 +76,8 @@ import_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la export_k_mixture_model_SOURCES = export_k_mixture_model.cpp -export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@ \ No newline at end of file +export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@ + +k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp + +k_mixture_model_to_interpolation_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@ \ No newline at end of file diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp index a19f1cf..aa6e9ca 100644 --- a/utils/training/import_k_mixture_model.cpp +++ b/utils/training/import_k_mixture_model.cpp @@ -241,7 +241,7 @@ int main(int argc, char * argv[]){ phrases.load(chunk); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); - bigram.attach(k_mixture_model_filename, ATTACH_READONLY); + bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); 
taglib_init();
diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp
new file mode 100644
index 0000000..2daa0f2
--- /dev/null
+++ b/utils/training/k_mixture_model_to_interpolation.cpp
@@ -0,0 +1,262 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "pinyin.h"
+#include "tag_utility.h"
+
+/*
+ * Like assert(), but EXPR is evaluated even when NDEBUG is defined.
+ * The checked calls deliver their results through output arguments
+ * (line_type, value), so they must never be compiled out.
+ */
+#define REQUIRE(expr) \
+    do { \
+        if ( !(expr) ) { \
+            fprintf(stderr, "%s:%d: requirement failed: %s\n", \
+                    __FILE__, __LINE__, #expr); \
+            abort(); \
+        } \
+    } while (0)
+
+/* Kinds of lines recognized in the model text. */
+enum LINE_TYPE{
+    BEGIN_LINE = 1,
+    END_LINE,
+    GRAM_1_LINE,
+    GRAM_2_LINE,
+    GRAM_1_ITEM_LINE,
+    GRAM_2_ITEM_LINE
+};
+
+/* results of the most recent taglib_read() call. */
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_unigram(FILE * input, FILE * output);
+
+bool parse_bigram(FILE * input, FILE * output);
+
+/*
+ * Read the next input line into linebuf and strip its trailing newline.
+ * Returns what getline() returned: -1 on end of file or error.
+ */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    if ( result == -1 )
+        return result;
+
+    /* The last line of the input may lack the newline: only strip a
+     * newline that is really there, and never write before linebuf. */
+    if ( result > 0 && linebuf[result - 1] == '\n' )
+        linebuf[result - 1] = '\0';
+    return result;
+}
+
+/*
+ * Copy the section lines ("\1-gram", "\2-gram", "\end") to output and
+ * dispatch the items of every section to its parser.  On entry linebuf
+ * holds the first line after "\data".
+ * Returns true when the "\end" line was reached, false when the input
+ * ended before it.
+ */
+bool parse_body(FILE * input, FILE * output){
+    bool complete = false;
+    bool more = true;
+
+    taglib_push_state();
+
+    REQUIRE(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+    REQUIRE(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+    REQUIRE(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
+    while ( more ) {
+        REQUIRE(taglib_read(linebuf, line_type, values, required));
+        switch (line_type) {
+        case END_LINE:
+            fprintf(output, "\\end\n");
+            complete = true;
+            more = false;
+            break;
+        case GRAM_1_LINE:
+            fprintf(output, "\\1-gram\n");
+            /* stop at end of input instead of re-reading a stale line. */
+            more = my_getline(input) != -1 &&
+                parse_unigram(input, output);
+            break;
+        case GRAM_2_LINE:
+            fprintf(output, "\\2-gram\n");
+            more = my_getline(input) != -1 &&
+                parse_bigram(input, output);
+            break;
+        default:
+            REQUIRE(false);
+        }
+    }
+
+    taglib_pop_state();
+    return complete;
+}
+
+/*
+ * Copy the "\item" lines of a "\1-gram" section to output.  On entry
+ * linebuf holds the first line of the section body.
+ * Returns true when a section line ("\end", "\1-gram", "\2-gram") stopped
+ * the parse; that line is left in linebuf for the caller.  Returns false
+ * when the input ended first.
+ */
+bool parse_unigram(FILE * input, FILE * output){
+    bool more = false;
+
+    taglib_push_state();
+
+    REQUIRE(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 1, "count", ""));
+
+    do {
+        REQUIRE(taglib_read(linebuf, line_type, values, required));
+        switch(line_type) {
+        case GRAM_1_ITEM_LINE: {
+            /* handle \item in \1-gram */
+            const char * string = (const char *) g_ptr_array_index(values, 0);
+            gpointer value = NULL;
+            REQUIRE(g_hash_table_lookup_extended(required, "count",
+                                                 NULL, &value));
+            const char * count = (const char *) value;
+            fprintf(output, "\\item %s count %s\n", string, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            more = true;
+            goto end;
+        default:
+            REQUIRE(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return more;
+}
+
+/*
+ * Copy the "\item" lines of a "\2-gram" section to output; same contract
+ * as parse_unigram(), with two strings per item.
+ */
+bool parse_bigram(FILE * input, FILE * output){
+    bool more = false;
+
+    taglib_push_state();
+
+    REQUIRE(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2,
+                           "count", "T:N_n_0:n_1:Mr"));
+
+    do {
+        REQUIRE(taglib_read(linebuf, line_type, values, required));
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE:{
+            /* handle \item in \2-gram */
+            /* two strings */
+            const char * string1 = (const char *) g_ptr_array_index(values, 0);
+            const char * string2 = (const char *) g_ptr_array_index(values, 1);
+
+            gpointer value = NULL;
+            /* tag: count */
+            REQUIRE(g_hash_table_lookup_extended(required, "count", NULL, &value));
+            const char * count = (const char *)value;
+            /* every item goes on its own line, as in the \1-gram section. */
+            fprintf(output, "\\item %s %s count %s\n", string1, string2, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            more = true;
+            goto end;
+        default:
+            REQUIRE(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return more;
+}
+
+/*
+ * Read a k mixture model from stdin and write the interpolation model to
+ * stdout.  Exits with ENODATA when the input is empty or is not a k mixture
+ * model, and returns ENODATA when the input ends before "\end".
+ */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+    int retval = 0;
+
+    taglib_init();
+
+    values = g_ptr_array_new();
+    required = g_hash_table_new(g_str_hash, g_str_equal);
+
+    //enter "\data" line
+    REQUIRE(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "count:N"));
+    ssize_t result = my_getline(input);
+    if ( result == -1 ) {
+        fprintf(stderr, "empty file input.\n");
+        exit(ENODATA);
+    }
+
+    //read "\data" line
+    if ( !taglib_read(linebuf, line_type, values, required) ) {
+        fprintf(stderr, "error: k mixture model expected.\n");
+        exit(ENODATA);
+    }
+
+    REQUIRE(line_type == BEGIN_LINE);
+    gpointer value = NULL;
+    REQUIRE(g_hash_table_lookup_extended(required, "model", NULL, &value));
+    const char * model = (const char *) value;
+    if ( !( strcmp("k mixture model", model) == 0 ) ){
+        fprintf(stderr, "error: k mixture model expected.\n");
+        exit(ENODATA);
+    }
+
+    fprintf(output, "\\data model interpolation\n");
+
+    result = my_getline(input);
+    if ( result != -1 && !parse_body(input, output) ) {
+        fprintf(stderr, "error: unexpected end of input.\n");
+        retval = ENODATA;
+    }
+
+    taglib_fini();
+
+    /* release the line buffer and the taglib_read() result containers. */
+    free(linebuf);
+    g_ptr_array_free(values, TRUE);
+    g_hash_table_unref(required);
+
+    return retval;
+}
-- cgit