summary refs log tree commit diff stats
path: root/utils/storage
diff options
context:
space:
mode:
Diffstat (limited to 'utils/storage')
-rw-r--r-- utils/storage/CMakeLists.txt 29
-rw-r--r-- utils/storage/Makefile.am 45
-rw-r--r-- utils/storage/export_interpolation.cpp 144
-rw-r--r-- utils/storage/gen_binary_files.cpp 115
-rw-r--r-- utils/storage/gen_pinyin_table.cpp 330
-rw-r--r-- utils/storage/import_interpolation.cpp 313
6 files changed, 976 insertions, 0 deletions
diff --git a/utils/storage/CMakeLists.txt b/utils/storage/CMakeLists.txt
new file mode 100644
index 0000000..63cabcd
--- /dev/null
+++ b/utils/storage/CMakeLists.txt
@@ -0,0 +1,29 @@
+# gen_binary_files: built from gen_binary_files.cpp and linked with libpinyin.
+add_executable(
+ gen_binary_files
+ gen_binary_files.cpp
+)
+
+target_link_libraries(
+ gen_binary_files
+ libpinyin
+)
+
+# import_interpolation: built from import_interpolation.cpp and linked with libpinyin.
+add_executable(
+ import_interpolation
+ import_interpolation.cpp
+)
+
+target_link_libraries(
+ import_interpolation
+ libpinyin
+)
+
+# export_interpolation: built from export_interpolation.cpp and linked with libpinyin.
+add_executable(
+ export_interpolation
+ export_interpolation.cpp
+)
+
+target_link_libraries(
+ export_interpolation
+ libpinyin
+)
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
new file mode 100644
index 0000000..db63488
--- /dev/null
+++ b/utils/storage/Makefile.am
@@ -0,0 +1,45 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
+
+bin_PROGRAMS = gen_binary_files \
+ import_interpolation
+
+noinst_PROGRAMS = export_interpolation \
+ gen_pinyin_table
+
+gen_binary_files_SOURCES = gen_binary_files.cpp
+
+gen_binary_files_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+import_interpolation_SOURCES = import_interpolation.cpp
+
+import_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+export_interpolation_SOURCES = export_interpolation.cpp
+
+export_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_pinyin_table_SOURCES = gen_pinyin_table.cpp
+
+gen_pinyin_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
new file mode 100644
index 0000000..c43eefb
--- /dev/null
+++ b/utils/storage/export_interpolation.cpp
@@ -0,0 +1,144 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <assert.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* export interpolation model as textual format */
+
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
+
+/* Write the header line that opens a textual interpolation model dump.
+ * Always returns true. */
+bool begin_data(FILE * output)
+{
+    fputs("\\data model interpolation\n", output);
+    return true;
+}
+
+/* Write the trailer line that closes a textual interpolation model dump.
+ * Always returns true. */
+bool end_data(FILE * output)
+{
+    fputs("\\end\n", output);
+    return true;
+}
+
+/**
+ * Export the interpolation model to stdout in textual form.
+ *
+ * Loads the table description (SYSTEM_TABLE_INFO) and the phrase indices,
+ * attaches the bigram file (SYSTEM_BIGRAM) read-only, then prints the
+ * "\data" header, the 1-gram and 2-gram sections and the "\end" trailer.
+ *
+ * Exits with ENOENT when the table description, the phrase indices or the
+ * bigram file cannot be loaded; returns 0 on success.
+ */
+int main(int argc, char * argv[]){
+    FILE * output = stdout;
+    const char * bigram_filename = SYSTEM_BIGRAM;
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    Bigram bigram;
+    /* Fail loudly instead of silently exporting an empty 2-gram section
+     * when the bigram file cannot be attached. */
+    retval = bigram.attach(bigram_filename, ATTACH_READONLY);
+    if (!retval) {
+        fprintf(stderr, "open %s failed!\n", bigram_filename);
+        exit(ENOENT);
+    }
+
+    begin_data(output);
+
+    gen_unigram(output, &phrase_index);
+    gen_bigram(output, &phrase_index, &bigram);
+
+    end_data(output);
+    return 0;
+}
+
+/**
+ * Write the "\1-gram" section: one "\item <token> <phrase> count <freq>"
+ * line for every phrase that has a non-zero unigram frequency.
+ *
+ * Sub-indices whose range cannot be obtained are skipped, as are tokens
+ * without a phrase item and tokens whose phrase string cannot be produced.
+ * Always returns true.
+ */
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
+    fprintf(output, "\\1-gram\n");
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {
+
+        PhraseIndexRange range;
+        if (ERROR_OK != phrase_index->get_range(i, range))
+            continue;
+
+        PhraseItem item;
+        for (phrase_token_t token = range.m_range_begin;
+             token < range.m_range_end; token++) {
+            int result = phrase_index->get_phrase_item(token, item);
+
+            if (result == ERROR_NO_ITEM)
+                continue;
+            assert(result == ERROR_OK);
+
+            size_t freq = item.get_unigram_frequency();
+            if (0 == freq)
+                continue;
+
+            char * phrase = taglib_token_to_string(phrase_index, token);
+            /* Conversion specifiers must match the argument types exactly:
+             * print the unsigned values through explicit casts. */
+            if (phrase)
+                fprintf(output, "\\item %u %s count %lu\n",
+                        (unsigned int) token, phrase, (unsigned long) freq);
+
+            g_free(phrase);
+        }
+    }
+    return true;
+}
+
+/**
+ * Write the "\2-gram" section: one
+ * "\item <token1> <word1> <token2> <word2> count <freq>" line per bigram.
+ *
+ * The first-word tokens come from Bigram::get_all_items(); for each of
+ * them the SingleGram is loaded and every (token, count) pair it holds is
+ * printed.  Pairs for which either phrase string cannot be produced are
+ * skipped.  Always returns true.
+ */
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){
+    fprintf(output, "\\2-gram\n");
+
+    /* Retrieve every token that has an entry in the bigram. */
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; i++) {
+        phrase_token_t token = g_array_index(items, phrase_token_t, i);
+        SingleGram * single_gram = NULL;
+        bigram->load(token, single_gram);
+
+        /* load() may leave the pointer NULL; never dereference it then. */
+        if (NULL == single_gram)
+            continue;
+
+        BigramPhraseWithCountArray array =
+            g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+        single_gram->retrieve_all(array);
+
+        /* The first word is the same for the whole inner loop. */
+        char * word1 = taglib_token_to_string(phrase_index, token);
+
+        for (size_t j = 0; j < array->len; j++) {
+            BigramPhraseItemWithCount * item =
+                &g_array_index(array, BigramPhraseItemWithCount, j);
+
+            char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+            guint32 freq = item->m_count;
+
+            if (word1 && word2)
+                fprintf(output, "\\item %u %s %u %s count %u\n",
+                        (unsigned int) token, word1,
+                        (unsigned int) item->m_token, word2,
+                        (unsigned int) freq);
+
+            g_free(word2);
+        }
+
+        g_free(word1);
+        g_array_free(array, TRUE);
+        delete single_gram;
+    }
+
+    g_array_free(items, TRUE);
+    return true;
+}
diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp
new file mode 100644
index 0000000..4216b44
--- /dev/null
+++ b/utils/storage/gen_binary_files.cpp
@@ -0,0 +1,115 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* Directory that table.conf and the textual tables are read from;
+ * defaults to the current directory and is overridable with --table-dir. */
+static const gchar * table_dir = ".";
+
+/* Command-line options registered with the GLib option parser in main(). */
+static GOptionEntry entries[] =
+{
+ {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+ {NULL}
+};
+
+/*
+ * Build the binary tables from the textual tables found in table_dir.
+ *
+ * Steps: parse options, load the table description, feed every
+ * SYSTEM_FILE / DICTIONARY table into the chewing table, the phrase table
+ * and the phrase index, then save the two large tables, compact the phrase
+ * index and save the phrase index and dictionary files.
+ *
+ * Exits with EINVAL on bad options and ENOENT when a file cannot be
+ * loaded, opened or saved; returns 0 on success.
+ */
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate binary files");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ /* the table description is looked up inside table_dir */
+ gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ bool retval = system_table_info.load(filename);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+ g_free(filename);
+
+ /* generate pinyin index*/
+ pinyin_option_t options = USE_TONE;
+ ChewingLargeTable chewing_table(options);
+ PhraseLargeTable2 phrase_table;
+
+ /* generate phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+ assert(table_info->m_dict_index == i);
+
+ /* only SYSTEM_FILE and DICTIONARY entries are read from text */
+ if (SYSTEM_FILE != table_info->m_file_type &&
+ DICTIONARY != table_info->m_file_type)
+ continue;
+
+ const char * tablename = table_info->m_table_filename;
+
+ filename = g_build_filename(table_dir, tablename, NULL);
+ FILE * tablefile = fopen(filename, "r");
+
+ if (NULL == tablefile) {
+ fprintf(stderr, "open %s failed!\n", tablename);
+ exit(ENOENT);
+ }
+
+ /* the same text file is read three times, rewinding in between */
+ chewing_table.load_text(tablefile);
+ fseek(tablefile, 0L, SEEK_SET);
+ phrase_table.load_text(tablefile);
+ fseek(tablefile, 0L, SEEK_SET);
+ phrase_index.load_text(i, tablefile);
+ fclose(tablefile);
+ g_free(filename);
+ }
+
+ /* serialize the chewing table and write it to SYSTEM_PINYIN_INDEX */
+ MemoryChunk * new_chunk = new MemoryChunk;
+ chewing_table.store(new_chunk);
+ new_chunk->save(SYSTEM_PINYIN_INDEX);
+ /* NOTE(review): presumably load() takes ownership of the chunk, which is
+ * why it is never deleted here -- confirm against MemoryChunk/load(). */
+ chewing_table.load(new_chunk);
+
+ /* same sequence for the phrase table, written to SYSTEM_PHRASE_INDEX */
+ new_chunk = new MemoryChunk;
+ phrase_table.store(new_chunk);
+ new_chunk->save(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(new_chunk);
+
+ phrase_index.compact();
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ if (!save_dictionary(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp
new file mode 100644
index 0000000..3b541d1
--- /dev/null
+++ b/utils/storage/gen_pinyin_table.cpp
@@ -0,0 +1,330 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+
+
+/* Print the command-line usage summary to stdout. */
+void print_help()
+{
+    static const char usage[] =
+        "Usage: gen_pinyin_table -t <PHRASE_INDEX> \n"
+        "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
+        "<OUTPUTFILE> the result output file\n"
+        "<FILEi> input pinyin files\n"
+        "<PHRASE_INDEX> phrase index identifier\n";
+
+    fputs(usage, stdout);
+}
+
+
+/* Phrase index identifier passed to gen_phrase_file(); set with -t / --phraseindex. */
+static gint phrase_index = 0;
+/* Name of the generated table; set with -o / --outputfile. */
+static const gchar * outputfile = "temp.out";
+
+/* Command-line options registered with the GLib option parser in main(). */
+static GOptionEntry entries[] =
+{
+ {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
+ {NULL}
+};
+
+
+using namespace pinyin;
+
+/* map from phrase_item to GArray of chewing_and_freq_item */
+GTree * g_chewing_tree;
+/* Array of GArray of phrase_and_array_item */
+/* Indexed by phrase length; slot 0 is set to NULL in main(). */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+/* A phrase as UCS-4 code points; used as the key of g_chewing_tree. */
+struct phrase_item{
+ size_t length; /* number of characters (from g_utf8_strlen) */
+ gunichar * uniphrase; /* code points produced by g_utf8_to_ucs4 */
+};
+
+/* One pronunciation of a phrase together with its accumulated frequency. */
+struct chewing_and_freq_item{
+ ChewingKeyVector keys;
+ ChewingKeyRestVector key_rests;
+ guint32 freq;
+};
+
+/* A (key, value) pair copied out of g_chewing_tree by store_one_item(). */
+struct phrase_and_array_item{
+ phrase_item phrase; /* the key of g_chewing_tree */
+ /* Array of chewing_and_freq_item */
+ GArray * chewing_and_freq_array; /* the value of g_chewing_tree */
+};
+
+
+void feed_file(const char * filename);
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data);
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+ gpointer userdata);
+
+void gen_phrase_file(const char * outputfile, int phrase_index);
+
+
+/* Ordering function for g_chewing_tree keys: phrases of different length
+ * compare by their length difference, phrases of equal length by a memcmp
+ * over their code point buffers. */
+gint phrase_item_compare(gconstpointer a, gconstpointer b)
+{
+    const phrase_item * lhs = (const phrase_item *) a;
+    const phrase_item * rhs = (const phrase_item *) b;
+
+    if (lhs->length == rhs->length)
+        return memcmp(lhs->uniphrase, rhs->uniphrase,
+                      sizeof(gunichar) * lhs->length);
+
+    return lhs->length - rhs->length;
+}
+
+
+/*
+ * Merge the pinyin files named on the command line into one phrase table.
+ *
+ * Every input file is fed into g_chewing_tree, the tree is flattened into
+ * the per-length arrays of g_item_array, each array is sorted with
+ * phrase_array_compare and the result is written by gen_phrase_file().
+ *
+ * Exits with EINVAL when option parsing fails; returns 0 otherwise.
+ */
+int main(int argc, char * argv[]){
+ int i;
+
+ g_chewing_tree = g_tree_new(phrase_item_compare);
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate pinyin table");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ /* the arguments left after option parsing are the input files */
+ for (i = 1; i < argc; ++i) {
+ feed_file(argv[i]);
+ }
+
+ printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
+
+ /* store in item array */
+ g_item_array[0] = NULL;
+ for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+ g_item_array[i] = g_array_new
+ (FALSE, TRUE, sizeof(phrase_and_array_item));
+ }
+ g_tree_foreach(g_chewing_tree, store_one_item, NULL);
+
+ /* sort item array */
+ /* the loop-local i shadows the outer one; its address is handed to the
+ * comparator as the phrase length of the array being sorted */
+ for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+ g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i);
+ }
+
+ gen_phrase_file(outputfile, phrase_index);
+
+ return 0;
+}
+
+/**
+ * Read "<phrase> <pinyin> <freq>" records from a file and pass each
+ * complete record to feed_line().
+ *
+ * Records that do not convert fully are skipped.  The two string fields
+ * are bounded to the buffer size, and the last record is accepted even
+ * when the file does not end with a newline.
+ *
+ * Exits with ENOENT when the file cannot be opened.
+ */
+void feed_file ( const char * filename){
+    char phrase[1024], pinyin[1024];
+    guint32 freq;
+
+    FILE * infile = fopen(filename, "r");
+    if ( NULL == infile ){
+        fprintf(stderr, "Can't open file %s.\n", filename);
+        exit(ENOENT);
+    }
+
+    for (;;) {
+        /* field widths leave room for the terminating NUL */
+        int num = fscanf(infile, "%1023s %1023s %u",
+                         phrase, pinyin, &freq);
+
+        /* EOF (or a read error) before anything was converted */
+        if (EOF == num)
+            break;
+
+        /* incomplete record: the consumed fields are dropped */
+        if (3 != num)
+            continue;
+
+        feed_line(phrase, pinyin, freq);
+    }
+
+    fclose(infile);
+}
+
+/**
+ * Add one (phrase, pinyin, freq) record to g_chewing_tree.
+ *
+ * The record is rejected with a message on stderr when the phrase is too
+ * long, is not valid UTF-8, or when the pinyin does not parse into exactly
+ * one key per character.  A record whose phrase and pinyin are already
+ * present has its frequency added to the existing entry.
+ *
+ * Ownership: the tree takes the phrase_item key and the value array only
+ * when the phrase is seen for the first time; on every other path all
+ * temporaries created here are released before returning.
+ */
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
+    phrase_item * item = new phrase_item;
+    item->length = g_utf8_strlen(phrase, -1);
+    item->uniphrase = NULL;
+
+    /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
+     * where is the code which I don't want to touch. :-)
+     */
+
+    if (item->length >= MAX_PHRASE_LENGTH) {
+        fprintf(stderr, "Too long phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+    if (NULL == item->uniphrase) {
+        /* conversion fails on invalid UTF-8; the tree comparator must
+         * never see a NULL buffer */
+        fprintf(stderr, "Invalid UTF-8 phrase:%s\t%s\t%u\n",
+                phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    FullPinyinParser2 parser;
+    ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+    ChewingKeyRestVector key_rests = g_array_new
+        (FALSE, FALSE, sizeof(ChewingKeyRest));
+
+    pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
+    parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+    assert(keys->len == key_rests->len);
+
+    if (keys->len != item->length) {
+        fprintf(stderr, "Invalid pinyin:%s\t%s\t%u\n", phrase, pinyin, freq);
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+        g_free(item->uniphrase);
+        delete item;
+        return;
+    }
+
+    GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
+
+    chewing_and_freq_item value_item;
+    value_item.keys = keys; value_item.key_rests = key_rests;
+    value_item.freq = freq;
+
+    assert(item->length == value_item.keys->len);
+    if (NULL == array) {
+        /* first time this phrase is seen: the tree keeps item and array */
+        array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
+        g_array_append_val(array, value_item);
+        g_tree_insert(g_chewing_tree, item, array);
+        return;
+    }
+
+    bool found = false;
+    for (size_t i = 0; i < array->len; ++i) {
+        chewing_and_freq_item * cur_item =
+            &g_array_index(array, chewing_and_freq_item, i);
+        int result = pinyin_exact_compare2
+            ((ChewingKey *) value_item.keys->data,
+             (ChewingKey *) cur_item->keys->data,
+             value_item.keys->len);
+
+        if (0 == result) {
+            fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+                    phrase, pinyin, freq);
+            cur_item->freq += freq;
+            found = true;
+        }
+    }
+
+    if (!found) {
+        /* Appending grows the array in place; the GArray pointer already
+         * stored in the tree stays valid, so no re-insert is needed. */
+        g_array_append_val(array, value_item);
+    } else {
+        /* clean up */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+    }
+
+    /* The tree still holds the key inserted earlier for this phrase; this
+     * lookup key was never handed over, so release it completely. */
+    g_free(item->uniphrase);
+    delete item;
+}
+
+
+/* g_tree_foreach() callback: copy one (phrase, pronunciation array) pair
+ * of g_chewing_tree into the g_item_array slot for its phrase length. */
+gboolean store_one_item(gpointer key, gpointer value, gpointer data)
+{
+    const phrase_item * phrase = (const phrase_item *) key;
+
+    phrase_and_array_item entry;
+    entry.phrase = *phrase;
+    entry.chewing_and_freq_array = (GArray *) value;
+
+    int slot = entry.phrase.length;
+    g_array_append_val(g_item_array[slot], entry);
+
+    /* FALSE lets the traversal continue with the next node */
+    return FALSE;
+}
+
+
+/* Sort comparator for the per-length item arrays: compares two entries by
+ * the keys of their first pronunciation.  userdata points to an int that
+ * holds the phrase length, i.e. the number of keys to compare. */
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata)
+{
+    const int phrase_length = *((int *) userdata);
+
+    phrase_and_array_item * left = (phrase_and_array_item *) lhs;
+    phrase_and_array_item * right = (phrase_and_array_item *) rhs;
+
+    chewing_and_freq_item * first_left = &g_array_index
+        (left->chewing_and_freq_array, chewing_and_freq_item, 0);
+    chewing_and_freq_item * first_right = &g_array_index
+        (right->chewing_and_freq_array, chewing_and_freq_item, 0);
+
+    ChewingKey * left_keys = (ChewingKey *) first_left->keys->data;
+    ChewingKey * right_keys = (ChewingKey *) first_right->keys->data;
+
+    return pinyin_exact_compare2(left_keys, right_keys, phrase_length);
+}
+
+
+/**
+ * Write the sorted phrase arrays to a text table.
+ *
+ * Each output line is "<pinyin>\t<phrase>\t<token>\t<freq>", with the
+ * syllables of the pinyin joined by "'".  Tokens are numbered from 1 in
+ * output order and combined with phrase_index by PHRASE_INDEX_MAKE_TOKEN;
+ * all pronunciations of one phrase share the same token.  Frequencies
+ * below 3 are raised to 3.
+ *
+ * Exits with ENOENT when the output file cannot be opened.
+ */
+void gen_phrase_file(const char * outputfile, int phrase_index){
+    FILE * outfile = fopen(outputfile, "w");
+    if (NULL == outfile ) {
+        fprintf(stderr, "Can't write file %s.\n", outputfile);
+        exit(ENOENT);
+    }
+
+    phrase_token_t token = 1;
+
+    /* phrase length index */
+    for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+        GArray * item_array = g_item_array[i];
+
+        /* item array index */
+        for (size_t m = 0; m < item_array->len; ++m) {
+            phrase_and_array_item * item = &g_array_index
+                (item_array, phrase_and_array_item, m);
+            phrase_item phrase = item->phrase;
+            GArray * chewing_and_freqs = item->chewing_and_freq_array;
+
+            gchar * phrase_str = g_ucs4_to_utf8
+                (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
+
+            /* iterate each pinyin */
+            for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
+                chewing_and_freq_item * chewing_and_freq =
+                    &g_array_index
+                    (chewing_and_freqs, chewing_and_freq_item, n);
+
+                ChewingKeyVector keys = chewing_and_freq->keys;
+
+                /* zero-terminated so the data can be passed to g_strjoinv */
+                GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
+
+                size_t k;
+                for (k = 0; k < keys->len; ++k) {
+                    ChewingKey key = g_array_index(keys, ChewingKey, k);
+
+                    //assert (CHEWING_ZERO_TONE != key.m_tone);
+                    gchar * pinyin = key.get_pinyin_string();
+                    g_array_append_val(pinyins, pinyin);
+                }
+                gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
+
+                for (k = 0; k < pinyins->len; ++k) {
+                    g_free(g_array_index(pinyins, gchar *, k));
+                }
+                g_array_free(pinyins, TRUE);
+
+                guint32 freq = chewing_and_freq->freq;
+
+                /* avoid zero freq */
+                if (freq < 3) freq = 3;
+
+                /* both numbers are unsigned: print them as such */
+                fprintf(outfile, "%s\t%s\t%u\t%u\n",
+                        pinyin_str, phrase_str,
+                        (unsigned int) PHRASE_INDEX_MAKE_TOKEN(phrase_index, token),
+                        (unsigned int) freq);
+
+                g_free(pinyin_str);
+            }
+            g_free(phrase_str);
+            token++;
+        }
+    }
+
+    fclose(outfile);
+}
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
new file mode 100644
index 0000000..205a27a
--- /dev/null
+++ b/utils/storage/import_interpolation.cpp
@@ -0,0 +1,313 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+/* Directory that table.conf is read from; overridable with --table-dir. */
+static const gchar * table_dir = ".";
+
+/* Command-line options registered with the GLib option parser in main(). */
+static GOptionEntry entries[] =
+{
+ {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+ {NULL}
+};
+
+
+/* Line kinds registered with taglib_add_tag() and reported by taglib_read(). */
+enum LINE_TYPE{
+ BEGIN_LINE = 1, /* "\data" */
+ END_LINE, /* "\end" */
+ GRAM_1_LINE, /* "\1-gram" */
+ GRAM_2_LINE, /* "\2-gram" */
+ GRAM_1_ITEM_LINE, /* "\item" inside the 1-gram section */
+ GRAM_2_ITEM_LINE /* "\item" inside the 2-gram section */
+};
+
+/* Results of the most recent taglib_read() call, shared by all parsers. */
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+/* linebuf and len are the buffer and capacity maintained by getline() */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline();
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index);
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * bigram);
+
+/**
+ * Read the next line of input into the shared linebuf and strip the
+ * trailing newline, if there is one.
+ *
+ * Returns what getline() returned: the number of bytes read (counting the
+ * stripped newline) or -1 on end of file or error.
+ */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    if ( result == -1 )
+        return result;
+
+    /* Index with the length getline() reported instead of strlen(): a
+     * line starting with a NUL byte would otherwise address linebuf[-1]. */
+    if ( result > 0 && '\n' == linebuf[result - 1] )
+        linebuf[result - 1] = '\0';
+
+    return result;
+}
+
+/**
+ * Check the header line currently held in linebuf.
+ *
+ * Registers the "\data" tag, parses the line and verifies that its
+ * "model" value is "interpolation".
+ *
+ * Returns true for a valid header; prints an error and returns false
+ * otherwise.
+ */
+bool parse_headline(){
+    /* enter "\data" line */
+    /* The call is kept out of assert() so that the tag is still
+     * registered when the program is built with NDEBUG. */
+    bool added = taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "");
+    assert(added);
+    (void) added;
+
+    /* read "\data" line */
+    if ( !taglib_read(linebuf, line_type, values, required) ) {
+        fprintf(stderr, "error: interpolation model expected.\n");
+        return false;
+    }
+
+    assert(line_type == BEGIN_LINE);
+    /* check header */
+    TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+    if ( strcmp("interpolation", model) != 0 ) {
+        fprintf(stderr, "error: interpolation model expected.\n");
+        return false;
+    }
+    return true;
+}
+
+/**
+ * Parse everything after the header line, starting with the line that is
+ * currently held in linebuf.
+ *
+ * "\1-gram" and "\2-gram" lines hand control to parse_unigram() and
+ * parse_bigram(); those return with the line that ended their section
+ * still in linebuf, which is then examined again here.  Parsing stops at
+ * "\end" or when the input is exhausted.  Always returns true.
+ */
+bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table,
+                FacadePhraseIndex * phrase_index,
+                Bigram * bigram){
+    taglib_push_state();
+
+    /* The calls are kept out of assert() so that they still run when the
+     * program is built with NDEBUG. */
+    bool ok = taglib_add_tag(END_LINE, "\\end", 0, "", "");
+    ok = taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "") && ok;
+    ok = taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "") && ok;
+    assert(ok);
+
+    do {
+    retry:
+        ok = taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        switch(line_type) {
+        case END_LINE:
+            goto end;
+        case GRAM_1_LINE:
+            /* nothing follows the section header: stop instead of
+             * re-reading the same header line forever */
+            if (my_getline(input) == -1)
+                goto end;
+            parse_unigram(input, phrase_table, phrase_index);
+            /* the section parser ran out of input: linebuf holds no
+             * fresh line to examine */
+            if (feof(input) || ferror(input))
+                goto end;
+            goto retry;
+        case GRAM_2_LINE:
+            if (my_getline(input) == -1)
+                goto end;
+            parse_bigram(input, phrase_table, phrase_index, bigram);
+            if (feof(input) || ferror(input))
+                goto end;
+            goto retry;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1) ;
+
+ end:
+    (void) ok;
+    taglib_pop_state();
+    return true;
+}
+
+/**
+ * Parse the "\item" lines of the 1-gram section, starting with the line
+ * currently held in linebuf.
+ *
+ * For every item the token is validated against its phrase string and the
+ * count is added to the unigram frequency of that token.  Parsing stops
+ * at the next "\end", "\1-gram" or "\2-gram" line (left in linebuf for
+ * the caller) or at end of input.  Always returns true.
+ */
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                   FacadePhraseIndex * phrase_index){
+    taglib_push_state();
+
+    /* The calls are kept out of assert() so that they still run when the
+     * program is built with NDEBUG. */
+    bool ok = taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", "");
+    assert(ok);
+
+    do {
+        ok = taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        switch (line_type) {
+        case GRAM_1_ITEM_LINE:{
+            /* handle \item in \1-gram */
+            TAGLIB_GET_TOKEN(token, 0);
+            TAGLIB_GET_PHRASE_STRING(word, 1);
+            assert(taglib_validate_token_with_string
+                   (phrase_index, token, word));
+
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+            phrase_index->add_unigram_frequency(token, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    (void) ok;
+    taglib_pop_state();
+    return true;
+}
+
+/**
+ * Parse the "\item" lines of the 2-gram section, starting with the line
+ * currently held in linebuf.
+ *
+ * Items that share the same first token are collected in one SingleGram,
+ * which is loaded from the bigram (or created) when that token first
+ * appears and stored back when the first token changes or the section
+ * ends.  Every item inserts the count for the second token and adds it to
+ * the total frequency of the SingleGram.
+ *
+ * Parsing stops at the next "\end", "\1-gram" or "\2-gram" line (left in
+ * linebuf for the caller) or at end of input.  Always returns true.
+ */
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                  FacadePhraseIndex * phrase_index,
+                  Bigram * bigram){
+    taglib_push_state();
+
+    /* Every call with a side effect is kept out of assert() so that it
+     * still runs when the program is built with NDEBUG. */
+    bool ok = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", "");
+    assert(ok);
+
+    phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL;
+    do {
+        ok = taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE:{
+            /* handle \item in \2-gram */
+            /* two tokens */
+            TAGLIB_GET_TOKEN(token1, 0);
+            TAGLIB_GET_PHRASE_STRING(word1, 1);
+            assert(taglib_validate_token_with_string
+                   (phrase_index, token1, word1));
+
+            TAGLIB_GET_TOKEN(token2, 2);
+            TAGLIB_GET_PHRASE_STRING(word2, 3);
+            assert(taglib_validate_token_with_string
+                   (phrase_index, token2, word2));
+
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+
+            if ( last_token != token1 ) {
+                /* first token changed: flush the previous SingleGram */
+                if ( last_token && last_single_gram ) {
+                    bigram->store(last_token, last_single_gram);
+                    delete last_single_gram;
+
+                    /* safe guard */
+                    last_token = null_token;
+                    last_single_gram = NULL;
+                }
+                SingleGram * single_gram = NULL;
+                bigram->load(token1, single_gram);
+
+                /* create the new single gram */
+                if ( single_gram == NULL )
+                    single_gram = new SingleGram;
+                last_token = token1;
+                last_single_gram = single_gram;
+            }
+
+            /* save the freq */
+            assert(NULL != last_single_gram);
+            guint32 total_freq = 0;
+            ok = last_single_gram->get_total_freq(total_freq);
+            assert(ok);
+            ok = last_single_gram->insert_freq(token2, count);
+            assert(ok);
+            total_freq += count;
+            ok = last_single_gram->set_total_freq(total_freq);
+            assert(ok);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    (void) ok;
+    /* flush the SingleGram that was still being filled */
+    if ( last_token && last_single_gram ) {
+        bigram->store(last_token, last_single_gram);
+        delete last_single_gram;
+        //safe guard
+        last_token = 0;
+        last_single_gram = NULL;
+    }
+
+    taglib_pop_state();
+    return true;
+}
+
+/*
+ * Import a textual interpolation model from stdin.
+ *
+ * Loads the table description from table_dir, the phrase table
+ * (SYSTEM_PHRASE_INDEX) and the phrase indices, attaches the bigram file
+ * (SYSTEM_BIGRAM) for read/write (creating it if needed), parses the
+ * model from stdin and finally saves the phrase indices.
+ *
+ * Exits with EINVAL on bad options, ENOENT when a file cannot be loaded,
+ * attached or saved, and ENODATA on empty input or a bad header line;
+ * returns 0 on success.
+ */
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ const char * bigram_filename = SYSTEM_BIGRAM;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- import interpolation model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ /* the table description is looked up inside table_dir */
+ gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ bool retval = system_table_info.load(filename);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+ g_free(filename);
+
+ PhraseLargeTable2 phrase_table;
+
+ /* NOTE(review): presumably phrase_table.load() takes ownership of the
+ * chunk, which is why it is never deleted here -- confirm. */
+ MemoryChunk * chunk = new MemoryChunk;
+ retval = chunk->load(SYSTEM_PHRASE_INDEX);
+ if (!retval) {
+ fprintf(stderr, "open phrase_index.bin failed!\n");
+ exit(ENOENT);
+ }
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", bigram_filename);
+ exit(ENOENT);
+ }
+
+ taglib_init();
+
+ /* containers that taglib_read() is given for each parsed line */
+ values = g_ptr_array_new();
+ required = g_hash_table_new(g_str_hash, g_str_equal);
+
+ /* read first line */
+ ssize_t result = my_getline(input);
+ if ( result == -1 ) {
+ fprintf(stderr, "empty file input.\n");
+ exit(ENODATA);
+ }
+
+ if (!parse_headline())
+ exit(ENODATA);
+
+ /* a model that consists of the header line only has no body to parse */
+ result = my_getline(input);
+ if ( result != -1 )
+ parse_body(input, &phrase_table, &phrase_index, &bigram);
+
+ taglib_fini();
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}