summary | refs | log | tree | commit | diff | stats
path: root/utils/training/k_mixture_model_to_interpolation.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'utils/training/k_mixture_model_to_interpolation.cpp')
-rw-r--r-- utils/training/k_mixture_model_to_interpolation.cpp 214
1 files changed, 214 insertions, 0 deletions
diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp
new file mode 100644
index 0000000..c5a66ec
--- /dev/null
+++ b/utils/training/k_mixture_model_to_interpolation.cpp
@@ -0,0 +1,214 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+enum LINE_TYPE{ /* ids of the line kinds this tool registers via taglib_add_tag() */
+ BEGIN_LINE = 1, /* "\data" header line */
+ END_LINE, /* "\end" line */
+ GRAM_1_LINE, /* "\1-gram" section heading */
+ GRAM_2_LINE, /* "\2-gram" section heading */
+ GRAM_1_ITEM_LINE, /* "\item" line registered while parsing the 1-gram section */
+ GRAM_2_ITEM_LINE /* "\item" line registered while parsing the 2-gram section */
+};
+
+static int line_type = 0; /* handed to taglib_read() and switched on afterwards; 0 matches no LINE_TYPE value */
+static GPtrArray * values = NULL; /* handed to taglib_read() for every line; allocated in main() */
+static GHashTable * required = NULL; /* handed to taglib_read() for every line; allocated in main() */
+/* Line buffer shared by all parsers: getline() allocates and resizes it. */
+static char * linebuf = NULL; /* current input line, filled by my_getline() */
+static size_t len = 0; /* capacity of linebuf as maintained by getline() */
+
+bool parse_headline(FILE * input, FILE * output); /* checks the "\data" header line held in linebuf */
+
+bool parse_unigram(FILE * input, FILE * output); /* converts the "\item" lines of the "\1-gram" section */
+
+bool parse_bigram(FILE * input, FILE * output); /* converts the "\item" lines of the "\2-gram" section */
+
+/**
+ * Read the next line of @input into the shared buffer `linebuf`.
+ *
+ * getline() grows `linebuf`/`len` as needed.  A single trailing '\n' is
+ * stripped when present; a last line that lacks a newline terminator is
+ * kept intact, and an empty string is never indexed at position -1.
+ *
+ * Returns the value reported by getline(): the number of characters read
+ * (the stripped newline included), or -1 on end of file or read error.
+ */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    if ( result == -1 )
+        return result;
+
+    /* Only drop the last character when it really is the line terminator. */
+    size_t length = strlen(linebuf);
+    if ( length > 0 && '\n' == linebuf[length - 1] )
+        linebuf[length - 1] = '\0';
+
+    return result;
+}
+
+/**
+ * Validate the "\data" header held in `linebuf` and write the output header.
+ *
+ * The caller must already have loaded the first input line with
+ * my_getline(); @input itself is not read here.  On success the line
+ * "\data model interpolation" is written to @output.
+ *
+ * Returns true when the header names the "k mixture model"; otherwise an
+ * error is printed to stderr and false is returned.
+ */
+bool parse_headline(FILE * input, FILE * output) {
+    /* enter "\data" line */
+    /* The registration is kept outside assert() so that it still happens
+     * when assertions are compiled out with NDEBUG. */
+    if ( !taglib_add_tag(BEGIN_LINE, "\\data", 0, "model",
+                         "count:N:total_freq") ) {
+        fprintf(stderr, "error: unable to register the \\data tag.\n");
+        return false;
+    }
+
+    /* read "\data" line */
+    if ( !taglib_read(linebuf, line_type, values, required) ) {
+        fprintf(stderr, "error: k mixture model expected.\n");
+        return false;
+    }
+
+    /* Only BEGIN_LINE is registered at this point. */
+    assert(line_type == BEGIN_LINE);
+    TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+    if ( 0 != strcmp("k mixture model", model) ){
+        fprintf(stderr, "error: k mixture model expected.\n");
+        return false;
+    }
+
+    /* print header */
+    fprintf(output, "\\data model interpolation\n");
+
+    return true;
+}
+
+/**
+ * Convert everything after the header: the "\1-gram" and "\2-gram"
+ * sections up to the "\end" line.
+ *
+ * On entry `linebuf` must hold the first line after the header.  Each
+ * section heading is echoed to @output, the following line is loaded and
+ * the matching section parser is run; that parser returns with the line
+ * that ended its section still in `linebuf`, which is then examined here.
+ *
+ * Returns true once "\end" has been written.  Returns false, after
+ * printing an error to stderr, when a line is not recognised, when the
+ * input stops right after a section heading, or when a section parser
+ * reports a failure.
+ */
+bool parse_body(FILE * input, FILE * output){
+    bool ok = true;
+
+    taglib_push_state();
+
+    /* Registration and reading are done outside assert() so that they are
+     * not compiled out together with the assertion under NDEBUG. */
+    if ( !taglib_add_tag(END_LINE, "\\end", 0, "", "") ||
+         !taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "") ||
+         !taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "") ) {
+        fprintf(stderr, "error: unable to register the section tags.\n");
+        ok = false;
+        goto end;
+    }
+
+    for (;;) {
+        if ( !taglib_read(linebuf, line_type, values, required) ) {
+            fprintf(stderr, "error: unexpected line: %s\n", linebuf);
+            ok = false;
+            goto end;
+        }
+
+        switch (line_type) {
+        case END_LINE:
+            fprintf(output, "\\end\n");
+            goto end;
+        case GRAM_1_LINE:
+            fprintf(output, "\\1-gram\n");
+            /* Without a fresh line the stale heading would be parsed
+             * again and again, so a truncated file is reported instead. */
+            if ( my_getline(input) == -1 ) {
+                fprintf(stderr, "error: unexpected end of input after \\1-gram.\n");
+                ok = false;
+                goto end;
+            }
+            if ( !parse_unigram(input, output) ) {
+                ok = false;
+                goto end;
+            }
+            break;
+        case GRAM_2_LINE:
+            fprintf(output, "\\2-gram\n");
+            if ( my_getline(input) == -1 ) {
+                fprintf(stderr, "error: unexpected end of input after \\2-gram.\n");
+                ok = false;
+                goto end;
+            }
+            if ( !parse_bigram(input, output) ) {
+                ok = false;
+                goto end;
+            }
+            break;
+        default:
+            fprintf(stderr, "error: unexpected line: %s\n", linebuf);
+            ok = false;
+            goto end;
+        }
+    }
+
+ end:
+    taglib_pop_state();
+    return ok;
+}
+
+/**
+ * Convert the "\item" lines of the "\1-gram" section.
+ *
+ * On entry `linebuf` must hold the first line after the "\1-gram" heading.
+ * Every item is rewritten to @output as "\item <token> <word> count <freq>",
+ * except the sentence_start token and items whose freq is zero, which are
+ * dropped.  Parsing stops at a "\end", "\1-gram" or "\2-gram" line, which is
+ * left in `linebuf` for the caller, or when @input is exhausted.
+ *
+ * Returns true on success; returns false, after printing an error to
+ * stderr, when a line cannot be parsed.
+ */
+bool parse_unigram(FILE * input, FILE * output){
+    bool ok = true;
+
+    taglib_push_state();
+
+    /* Registration and reading are done outside assert() so that they are
+     * not compiled out together with the assertion under NDEBUG. */
+    if ( !taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count") ) {
+        fprintf(stderr, "error: unable to register the 1-gram \\item tag.\n");
+        ok = false;
+        goto end;
+    }
+
+    do {
+        if ( !taglib_read(linebuf, line_type, values, required) ) {
+            fprintf(stderr, "error: malformed 1-gram line: %s\n", linebuf);
+            ok = false;
+            goto end;
+        }
+
+        switch (line_type) {
+        case GRAM_1_ITEM_LINE: {
+            /* handle \item in \1-gram */
+            TAGLIB_GET_TOKEN(token, 0);
+            TAGLIB_GET_PHRASE_STRING(word, 1);
+
+            /* remove the "<start>" in the uni-gram of interpolation model */
+            if ( sentence_start == token )
+                break;
+
+            TAGLIB_GET_TAGVALUE(glong, freq, atol);
+
+            /* ignore zero unigram freq item */
+            if ( 0 != freq )
+                fprintf(output, "\\item %d %s count %ld\n", token, word, freq);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            /* Section finished: the caller inspects this line next. */
+            goto end;
+        default:
+            fprintf(stderr, "error: unexpected 1-gram line: %s\n", linebuf);
+            ok = false;
+            goto end;
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return ok;
+}
+
+/**
+ * Convert the "\item" lines of the "\2-gram" section.
+ *
+ * On entry `linebuf` must hold the first line after the "\2-gram" heading.
+ * Every item is rewritten to @output as
+ * "\item <token1> <word1> <token2> <word2> count <count>".  Parsing stops at
+ * a "\end", "\1-gram" or "\2-gram" line, which is left in `linebuf` for the
+ * caller, or when @input is exhausted.
+ *
+ * Returns true on success; returns false, after printing an error to
+ * stderr, when a line cannot be parsed.
+ */
+bool parse_bigram(FILE * input, FILE * output){
+    bool ok = true;
+
+    taglib_push_state();
+
+    /* Registration and reading are done outside assert() so that they are
+     * not compiled out together with the assertion under NDEBUG. */
+    if ( !taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+                         "count", "T:N_n_0:n_1:Mr") ) {
+        fprintf(stderr, "error: unable to register the 2-gram \\item tag.\n");
+        ok = false;
+        goto end;
+    }
+
+    do {
+        if ( !taglib_read(linebuf, line_type, values, required) ) {
+            fprintf(stderr, "error: malformed 2-gram line: %s\n", linebuf);
+            ok = false;
+            goto end;
+        }
+
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE: {
+            /* handle \item in \2-gram */
+            /* two strings */
+            TAGLIB_GET_TOKEN(token1, 0);
+            TAGLIB_GET_PHRASE_STRING(word1, 1);
+
+            TAGLIB_GET_TOKEN(token2, 2);
+            TAGLIB_GET_PHRASE_STRING(word2, 3);
+
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+            fprintf(output, "\\item %d %s %d %s count %ld\n",
+                    token1, word1, token2, word2, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            /* Section finished: the caller inspects this line next. */
+            goto end;
+        default:
+            fprintf(stderr, "error: unexpected 2-gram line: %s\n", linebuf);
+            ok = false;
+            goto end;
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return ok;
+}
+
+/**
+ * Read a k mixture model from stdin and write the corresponding
+ * interpolation model text to stdout.
+ *
+ * Exits with ENODATA when the input is empty or the header is rejected,
+ * returns EXIT_FAILURE when the body cannot be converted, and returns 0
+ * otherwise.  Command line arguments are not used.
+ */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+    int status = 0;
+
+    taglib_init();
+
+    values = g_ptr_array_new();
+    required = g_hash_table_new(g_str_hash, g_str_equal);
+
+    if ( my_getline(input) == -1 ) {
+        fprintf(stderr, "empty file input.\n");
+        exit(ENODATA);
+    }
+
+    if (!parse_headline(input, output))
+        exit(ENODATA);
+
+    /* Input that stops after the header is accepted: only the header is
+     * written.  A body that fails to parse is reported to the caller
+     * through the exit status. */
+    if ( my_getline(input) != -1 && !parse_body(input, output) )
+        status = EXIT_FAILURE;
+
+    taglib_fini();
+
+    /* Release the containers themselves and the getline() buffer. */
+    g_ptr_array_free(values, TRUE);
+    values = NULL;
+    g_hash_table_unref(required);
+    required = NULL;
+    free(linebuf);
+    linebuf = NULL;
+    len = 0;
+
+    return status;
+}