summaryrefslogtreecommitdiffstats
path: root/utils
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-19 13:36:10 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-19 13:36:10 +0800
commit 5150341809f92fb2179decdfdd6ec1477d988461 (patch)
tree 8d06aeffb594349c8359929aa5dd807837adf647 /utils
parent 2988e1a53748d31c6dae10f909465d52ab82e6bd (diff)
download libpinyin-5150341809f92fb2179decdfdd6ec1477d988461.tar.gz
libpinyin-5150341809f92fb2179decdfdd6ec1477d988461.tar.xz
libpinyin-5150341809f92fb2179decdfdd6ec1477d988461.zip
write k mixture model to interpolation conversion tool
Diffstat (limited to 'utils')
-rw-r--r-- utils/training/Makefile.am 9
-rw-r--r-- utils/training/import_k_mixture_model.cpp 2
-rw-r--r-- utils/training/k_mixture_model_to_interpolation.cpp 197
3 files changed, 205 insertions, 3 deletions
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
index 3a3d6b9..7411e78 100644
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -35,7 +35,8 @@ noinst_PROGRAMS = gen_ngram \
merge_k_mixture_model \
prune_k_mixture_model \
import_k_mixture_model \
- export_k_mixture_model
+ export_k_mixture_model \
+ k_mixture_model_to_interpolation
gen_ngram_SOURCES = gen_ngram.cpp
@@ -75,4 +76,8 @@ import_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la
export_k_mixture_model_SOURCES = export_k_mixture_model.cpp
-export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
\ No newline at end of file
+export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
+
+k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp
+
+k_mixture_model_to_interpolation_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
\ No newline at end of file
diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp
index a19f1cf..aa6e9ca 100644
--- a/utils/training/import_k_mixture_model.cpp
+++ b/utils/training/import_k_mixture_model.cpp
@@ -241,7 +241,7 @@ int main(int argc, char * argv[]){
phrases.load(chunk);
KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
- bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
+ bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
taglib_init();
diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp
new file mode 100644
index 0000000..2daa0f2
--- /dev/null
+++ b/utils/training/k_mixture_model_to_interpolation.cpp
@@ -0,0 +1,197 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "pinyin.h"
+#include "tag_utility.h"
+
+enum LINE_TYPE{ /* ids registered with taglib_add_tag() and compared against line_type after taglib_read() */
+ BEGIN_LINE = 1, /* "\data" header line */
+ END_LINE, /* "\end" line */
+ GRAM_1_LINE, /* "\1-gram" section header */
+ GRAM_2_LINE, /* "\2-gram" section header */
+ GRAM_1_ITEM_LINE, /* "\item" line registered by parse_unigram() */
+ GRAM_2_ITEM_LINE /* "\item" line registered by parse_bigram() */
+};
+
+static int line_type = 0; /* passed to every taglib_read() call and switched on afterwards; 0 is not a LINE_TYPE value */
+static GPtrArray * values = NULL; /* created in main(); the parsers read an item's positional strings from it */
+static GHashTable * required = NULL; /* created in main(); the parsers look up tags such as "count" and "model" in it */
+/* variables for line buffer. */
+static char * linebuf = NULL; /* buffer handed to getline() by my_getline() */
+static size_t len = 0; /* size argument handed to getline() together with linebuf */
+
+bool parse_unigram(FILE * input, FILE * output); /* forward declaration: parse_body() calls it before its definition */
+
+bool parse_bigram(FILE * input, FILE * output); /* forward declaration: parse_body() calls it before its definition */
+
+/*
+ * Read the next line of input into the shared line buffer (linebuf/len)
+ * and strip its trailing newline, if it has one.
+ *
+ * Returns whatever getline() returned: the number of characters read
+ * (the stripped newline included), or -1 when no line could be read.
+ */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    if ( result == -1 )
+        return result;
+
+    /* The last line of a file may lack a newline; only chop the final
+       character when it really is one, and index by the length getline()
+       reported so an empty string cannot underflow the subscript. */
+    if ( result > 0 && linebuf[result - 1] == '\n' )
+        linebuf[result - 1] = '\0';
+    return result;
+}
+
+/*
+ * Convert the model body that follows the "\data" header.
+ *
+ * Expects linebuf to already hold the first body line.  Each "\1-gram"
+ * or "\2-gram" header is echoed to output and the section is handed to
+ * parse_unigram() or parse_bigram(); "\end" is echoed and ends the body.
+ *
+ * Returns false when the input stops right after a section header,
+ * true otherwise.  Exits with ENODATA on a line taglib_read() rejects.
+ */
+bool parse_body(FILE * input, FILE * output){
+    bool complete = true;
+
+    taglib_push_state();
+
+    /* Register the tags outside assert() so the calls still happen when
+       NDEBUG compiles the assertion away. */
+    bool registered =
+        taglib_add_tag(END_LINE, "\\end", 0, "", "") &&
+        taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "") &&
+        taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "");
+    assert(registered);
+    (void) registered;
+
+    do {
+    retry:
+        if ( !taglib_read(linebuf, line_type, values, required) ) {
+            fprintf(stderr, "error: unexpected line in model body: %s\n",
+                    linebuf);
+            exit(ENODATA);
+        }
+        switch(line_type) {
+        case END_LINE:
+            fprintf(output, "\\end\n");
+            goto end;
+        case GRAM_1_LINE:
+            fprintf(output, "\\1-gram\n");
+            /* Without a fresh line the sub-parser would hand this same
+               header straight back and the retry loop would never end. */
+            if ( my_getline(input) == -1 ) {
+                complete = false;
+                goto end;
+            }
+            parse_unigram(input, output);
+            /* The sub-parser stops on the line it did not consume, so
+               examine linebuf again instead of reading a new line. */
+            goto retry;
+        case GRAM_2_LINE:
+            fprintf(output, "\\2-gram\n");
+            if ( my_getline(input) == -1 ) {
+                complete = false;
+                goto end;
+            }
+            parse_bigram(input, output);
+            goto retry;
+        default:
+            assert(false);
+            break;
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return complete;
+}
+
+/*
+ * Copy the "\item" lines of a \1-gram section from input to output.
+ *
+ * Starts with the line already held in linebuf and writes every item as
+ * "\item <token> count <count>".  Stops, leaving that line in linebuf,
+ * on "\end", "\1-gram" or "\2-gram", or when no further line can be read.
+ *
+ * Always returns true.  Exits with ENODATA on a line taglib_read()
+ * rejects or on an item without a "count" value.
+ */
+bool parse_unigram(FILE * input, FILE * output){
+    taglib_push_state();
+
+    /* Keep the call outside assert() so it survives an NDEBUG build. */
+    bool registered =
+        taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 1, "count", "");
+    assert(registered);
+    (void) registered;
+
+    do {
+        if ( !taglib_read(linebuf, line_type, values, required) ) {
+            fprintf(stderr, "error: malformed \\1-gram line: %s\n", linebuf);
+            exit(ENODATA);
+        }
+        switch(line_type) {
+        case GRAM_1_ITEM_LINE: {
+            /* handle \item in \1-gram */
+            const char * string = (const char *) g_ptr_array_index(values, 0);
+            gpointer value = NULL;
+            /* A missing count must not reach fprintf() as a NULL "%s". */
+            if ( !g_hash_table_lookup_extended(required, "count",
+                                               NULL, &value) ||
+                 value == NULL ) {
+                fprintf(stderr, "error: \\1-gram item without count: %s\n",
+                        linebuf);
+                exit(ENODATA);
+            }
+            const char * count = (const char *) value;
+            fprintf(output, "\\item %s count %s\n", string, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+            break;
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return true;
+}
+
+/*
+ * Copy the "\item" lines of a \2-gram section from input to output.
+ *
+ * Starts with the line already held in linebuf and writes every item as
+ * "\item <token1> <token2> count <count>", one per line.  The other tags
+ * registered for the item ("T", "N_n_0", "n_1", "Mr") are not written.
+ * Stops, leaving that line in linebuf, on "\end", "\1-gram" or
+ * "\2-gram", or when no further line can be read.
+ *
+ * Always returns true.  Exits with ENODATA on a line taglib_read()
+ * rejects or on an item without a "count" value.
+ */
+bool parse_bigram(FILE * input, FILE * output){
+    taglib_push_state();
+
+    /* Keep the call outside assert() so it survives an NDEBUG build. */
+    bool registered = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2,
+                                     "count", "T:N_n_0:n_1:Mr");
+    assert(registered);
+    (void) registered;
+
+    do {
+        if ( !taglib_read(linebuf, line_type, values, required) ) {
+            fprintf(stderr, "error: malformed \\2-gram line: %s\n", linebuf);
+            exit(ENODATA);
+        }
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE:{
+            /* handle \item in \2-gram */
+            /* two strings */
+            const char * string1 = (const char *) g_ptr_array_index(values, 0);
+            const char * string2 = (const char *) g_ptr_array_index(values, 1);
+
+            gpointer value = NULL;
+            /* tag: count */
+            if ( !g_hash_table_lookup_extended(required, "count",
+                                               NULL, &value) ||
+                 value == NULL ) {
+                fprintf(stderr, "error: \\2-gram item without count: %s\n",
+                        linebuf);
+                exit(ENODATA);
+            }
+            const char * count = (const char *)value;
+            /* Terminate the record with a newline, as parse_unigram()
+               does; otherwise all bigram items run together on one line. */
+            fprintf(output, "\\item %s %s count %s\n", string1, string2, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+            break;
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return true;
+}
+
+/*
+ * Read a k mixture model in text form from stdin and write it to stdout
+ * with the header rewritten to "\data model interpolation".
+ *
+ * Exits with ENODATA when the input is empty or its first line is not a
+ * "\data" line whose model is "k mixture model"; returns ENODATA when
+ * parse_body() reports failure, 0 otherwise.
+ */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+    int status = 0;
+
+    taglib_init();
+
+    values = g_ptr_array_new();
+    required = g_hash_table_new(g_str_hash, g_str_equal);
+
+    //enter "\data" line
+    /* Keep the call outside assert() so it survives an NDEBUG build. */
+    bool registered =
+        taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "count:N");
+    assert(registered);
+    (void) registered;
+
+    ssize_t result = my_getline(input);
+    if ( result == -1 ) {
+        fprintf(stderr, "empty file input.\n");
+        exit(ENODATA);
+    }
+
+    //read "\data" line
+    if ( !taglib_read(linebuf, line_type, values, required) ) {
+        fprintf(stderr, "error: k mixture model expected.\n");
+        exit(ENODATA);
+    }
+
+    assert(line_type == BEGIN_LINE);
+    gpointer value = NULL;
+    /* Do the lookup unconditionally and treat a missing model like a
+       wrong one, so strcmp() is never handed a NULL pointer. */
+    bool found = g_hash_table_lookup_extended(required, "model",
+                                              NULL, &value);
+    const char * model = (const char *) value;
+    if ( !found || model == NULL || strcmp("k mixture model", model) != 0 ){
+        fprintf(stderr, "error: k mixture model expected.\n");
+        exit(ENODATA);
+    }
+
+    fprintf(output, "\\data model interpolation\n");
+
+    result = my_getline(input);
+    if ( result != -1 && !parse_body(input, output) ) {
+        fprintf(stderr, "error: incomplete k mixture model.\n");
+        status = ENODATA;
+    }
+
+    taglib_fini();
+
+    return status;
+}