import libpinyin code

author: Peng Wu <alexepico@gmail.com> 2013-07-22 11:37:11 +0800
committer: Peng Wu <alexepico@gmail.com> 2013-07-22 11:37:11 +0800
commit: b78429d78df745dd327b6dada6b9bd71ea5df84e (patch)
tree: 82c4625db8674c66d69fd566fce8efc347e3cb3a /utils
download: libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.gz
libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.xz
libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.zip
30 files changed, 5088 insertions, 0 deletions
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
new file mode 100644
index 0000000..dbd7855
--- /dev/null
+++ b/utils/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(segment)
+add_subdirectory(storage)
+add_subdirectory(training)
+\ No newline at end of file
diff --git a/utils/Makefile.am b/utils/Makefile.am
new file mode 100644
index 0000000..bc0f3e5
--- /dev/null
+++ b/utils/Makefile.am
@@ -0,0 +1,27 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+AUTOMAKE_OPTIONS 	= gnu
+SUBDIRS 		= storage segment training
+
+MAINTAINERCLEANFILES 	= Makefile.in 
+
+CLEANFILES		= *.bak
+
+ACLOCAL			= aclocal -I $(ac_aux_dir)
+
+noinst_HEADERS          = utils_helper.h
diff --git a/utils/segment/CMakeLists.txt b/utils/segment/CMakeLists.txt
new file mode 100644
index 0000000..82e4deb
--- /dev/null
+++ b/utils/segment/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_executable(
+    spseg
+    spseg.cpp
+)
+
+target_link_libraries(
+    spseg
+    libpinyin
+)
+
+add_executable(
+    ngseg
+    ngseg.cpp
+)
+
+target_link_libraries(
+    ngseg
+    libpinyin
+)
+\ No newline at end of file
diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am
new file mode 100644
index 0000000..579d6e4
--- /dev/null
+++ b/utils/segment/Makefile.am
@@ -0,0 +1,39 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+MAINTAINERCLEANFILES    = Makefile.in
+
+INCLUDES		= -I$(top_srcdir)/src \
+			  -I$(top_srcdir)/src/include \
+			  -I$(top_srcdir)/src/storage \
+			  -I$(top_srcdir)/src/lookup \
+			  -I$(top_srcdir)/utils \
+			  @GLIB2_CFLAGS@
+
+noinst_PROGRAMS		= spseg ngseg mergeseq
+
+spseg_SOURCES		= spseg.cpp
+
+spseg_LDADD		= ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+ngseg_SOURCES		= ngseg.cpp
+
+ngseg_LDADD		= ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+mergeseq_SOURCES	= mergeseq.cpp
+
+mergeseq_LDADD		= ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp
new file mode 100644
index 0000000..1a26064
--- /dev/null
+++ b/utils/segment/mergeseq.cpp
@@ -0,0 +1,278 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2013 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <string.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){ /* write the one-line usage summary to stdout. */
+    fputs("Usage: mergeseq [-o outputfile] [inputfile]\n", stdout);
+}
+
+
+static gboolean gen_extra_enter = FALSE; /* NOTE(review): no entry below sets this flag — confirm it is still needed. */
+static gchar * outputfile = NULL; /* filled in by -o/--outputfile. */
+/* command line options understood by mergeseq. */
+static GOptionEntry entries[] =
+{
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+    {NULL} /* terminating entry. */
+};
+
+
+/* One queued phrase: its token and its length in ucs4 characters. */
+typedef struct{
+    phrase_token_t m_token; /* token printed when the entry is popped. */
+    gint m_token_len; /* number of characters this entry owns in the character queue. */
+} TokenInfo;
+
+
+/* GArray of ucs4 characters: the phrases of all queued tokens, back to back. */
+typedef GArray * UnicodeCharVector;
+/* GArray of TokenInfo, in input order. */
+typedef GArray * TokenInfoVector;
+
+gint calculate_sequence_length(TokenInfoVector tokeninfos) {
+    /* Sum the phrase lengths of every TokenInfo queued in tokeninfos. */
+    gint total = 0;
+    guint pos = tokeninfos->len;
+
+    while (pos > 0) {
+        --pos;
+        total += g_array_index(tokeninfos, TokenInfo, pos).m_token_len;
+    }
+    return total;
+}
+
+/* Try to merge the longest run of leading tokens that the phrase table
+ *   knows as one phrase; if found, replace that run in tokeninfos by a
+ * single merged entry.  Returns whether a phrase was found; nothing is printed.
+ */
+bool merge_sequence(FacadePhraseTable2 * phrase_table,
+                    FacadePhraseIndex * phrase_index,
+                    UnicodeCharVector unichars,
+                    TokenInfoVector tokeninfos) {
+    assert(tokeninfos->len > 0);
+
+    bool found = false;
+    TokenInfo * token_info = NULL;
+    phrase_token_t token = null_token;
+
+    ucs4_t * ucs4_str = (ucs4_t *) unichars->data; /* unichars is only read here. */
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index->prepare_tokens(tokens);
+
+    /* search the merge sequence, starting with all queued tokens. */
+    size_t index = tokeninfos->len; /* number of leading tokens in the candidate. */
+    gint seq_len = calculate_sequence_length(tokeninfos); /* candidate length in characters. */
+    while (seq_len > 0) {
+        /* do phrase table search. */
+        int retval = phrase_table->search(seq_len, ucs4_str, tokens);
+
+        if (retval & SEARCH_OK) {
+            int num = get_first_token(tokens, token);
+            found = true;
+            break;
+        }
+        /* not a phrase: drop the last token of the candidate and retry. */
+        --index;
+        token_info = &g_array_index(tokeninfos, TokenInfo, index);
+        seq_len -= token_info->m_token_len;
+    }
+
+    phrase_index->destroy_tokens(tokens);
+
+    /* push the merged sequence back. */
+    if (found) {
+        /* pop up the origin sequence. */
+        g_array_remove_range(tokeninfos, 0, index);
+        /* the merged entry covers the same seq_len characters of unichars. */
+        TokenInfo info;
+        info.m_token = token;
+        info.m_token_len = seq_len;
+        g_array_prepend_val(tokeninfos, info);
+    }
+
+    return found;
+}
+
+bool pop_first_token(UnicodeCharVector unichars,
+                     TokenInfoVector tokeninfos,
+                     FILE * output) {
+    /* Print the first queued token as "<token> <phrase>" and drop it,
+     * together with its characters, from both queues. */
+    const TokenInfo head = g_array_index(tokeninfos, TokenInfo, 0);
+    const ucs4_t * chars = (const ucs4_t *) unichars->data;
+
+    /* the phrase is the first head.m_token_len characters of unichars. */
+    glong consumed = 0;
+    gchar * phrase = g_ucs4_to_utf8(chars, head.m_token_len,
+                                    &consumed, NULL, NULL);
+    assert(consumed == head.m_token_len);
+    fprintf(output, "%d %s\n", head.m_token, phrase);
+    g_free(phrase);
+
+    /* pop it. */
+    g_array_remove_range(unichars, 0, head.m_token_len);
+    g_array_remove_index(tokeninfos, 0);
+    return true;
+}
+
+bool feed_line(FacadePhraseTable2 * phrase_table,
+               FacadePhraseIndex * phrase_index,
+               UnicodeCharVector unichars,
+               TokenInfoVector tokeninfos,
+               const char * linebuf,
+               FILE * output) {
+    /* Queue the token found on linebuf; a null token line flushes the queue. */
+    TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); /* NOTE(review): macro not visible here; presumably declares `token` parsed from linebuf — confirm. */
+
+    if (null_token == token) {
+        /* empty the queue. */
+        while (0 != tokeninfos->len) {
+            merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+            pop_first_token(unichars, tokeninfos, output);
+        }
+
+        assert(0 == unichars->len);
+        assert(0 == tokeninfos->len);
+
+        /* restore the null token line. */
+        fprintf(output, "%s\n", linebuf);
+
+        return false; /* false: nothing was queued for this line. */
+    }
+
+    PhraseItem item;
+    phrase_index->get_phrase_item(token, item);
+    gint len = item.get_phrase_length();
+
+    TokenInfo info;
+    info.m_token = token;
+    info.m_token_len = len;
+    g_array_append_val(tokeninfos, info);
+
+    ucs4_t buffer[MAX_PHRASE_LENGTH];
+    item.get_phrase_string(buffer);
+    g_array_append_vals(unichars, buffer, len);
+
+    /* keep the queue below MAX_PHRASE_LENGTH characters: merge and pop from the front. */
+    len = calculate_sequence_length(tokeninfos);
+    while (len >= MAX_PHRASE_LENGTH) {
+        merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+        pop_first_token(unichars, tokeninfos, output);
+        len = calculate_sequence_length(tokeninfos);
+    }
+
+    return true;
+}
+
+
+int main(int argc, char * argv[]){
+    /* mergeseq: re-merge adjacent tokens of a segmented corpus into phrases. */
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context = g_option_context_new("- merge word sequence");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    FacadePhraseTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+    GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo));
+
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        size_t linelen = strlen(linebuf); /* may be 0 if the line starts with NUL. */
+        if ( linelen > 0 && '\n' == linebuf[linelen - 1] )
+            linebuf[--linelen] = '\0';
+        if (0 == linelen)
+            continue;
+        feed_line(&phrase_table, &phrase_index,
+                  unichars, tokeninfos,
+                  linebuf, output);
+    }
+
+    /* flush tokens still queued at EOF (input without a final null token). */
+    while (0 != tokeninfos->len) {
+        merge_sequence(&phrase_table, &phrase_index, unichars, tokeninfos);
+        pop_first_token(unichars, tokeninfos, output);
+    }
+    g_array_free(unichars, TRUE);
+    g_array_free(tokeninfos, TRUE);
+    free(linebuf);
+    fclose(input);
+    fclose(output);
+    return 0;
+}
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
new file mode 100644
index 0000000..03fe5b4
--- /dev/null
+++ b/utils/segment/ngseg.cpp
@@ -0,0 +1,261 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){ /* write the one-line usage summary to stdout. */
+    fputs("Usage: ngseg [--generate-extra-enter]  [-o outputfile] [inputfile]\n", stdout);
+}
+
+
+static gboolean gen_extra_enter = FALSE; /* set by --generate-extra-enter. */
+static gchar * outputfile = NULL; /* filled in by -o/--outputfile. */
+/* command line options understood by ngseg. */
+static GOptionEntry entries[] =
+{
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+    {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate an extra null token line after every input line", NULL},
+    {NULL} /* terminating entry. */
+};
+
+
+/* n-gram based sentence segment. */
+
+/* Note:
+ * Currently libpinyin supports ucs4 characters.
+ * This is a pre-processor tool for raw corpus,
+ * and skips non-Chinese characters.
+ */
+
+/* TODO:
+ * Try to add punctuation mark and english support,
+ * such as ',', '.', '?', '!', <english>, and other punctuations.
+ */
+
+enum CONTEXT_STATE{
+    CONTEXT_INIT,        /* set at the start of each line, before the first character is classified. */
+    CONTEXT_SEGMENTABLE, /* current run: characters found in the phrase table. */
+    CONTEXT_UNKNOWN      /* current run: characters not found in the phrase table. */
+};
+
+bool deal_with_segmentable(PhraseLookup * phrase_lookup,
+                           GArray * current_ucs4,
+                           FILE * output){
+    /* Segment current_ucs4 with phrase_lookup and print the result to output;
+     * returns false (after a note on stderr) when no result string is produced. */
+    ucs4_t * sentence = (ucs4_t *) current_ucs4->data;
+    char * result_string = NULL;
+    MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    phrase_lookup->get_best_match(current_ucs4->len, sentence, results);
+    phrase_lookup->convert_to_utf8(results, result_string);
+
+    if (NULL == result_string) {
+        char * tmp_string = g_ucs4_to_utf8
+            (sentence, current_ucs4->len, NULL, NULL, NULL);
+        fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
+                tmp_string);
+        g_free(tmp_string); /* previously leaked on this path. */
+        g_array_free(results, TRUE);
+        return false;
+    }
+    fprintf(output, "%s\n", result_string);
+    g_array_free(results, TRUE);
+    g_free(result_string);
+    return true;
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+    /* Emit the whole unknown run as one line tagged with the null token. */
+    const ucs4_t * chars = (const ucs4_t *) current_ucs4->data;
+    char * utf8 = g_ucs4_to_utf8(chars, current_ucs4->len, NULL, NULL, NULL);
+    fprintf(output, "%d %s\n", null_token, utf8);
+    g_free(utf8);
+    return true;
+}
+
+
+int main(int argc, char * argv[]){
+    /* ngseg: split each line into known/unknown runs, segment the known ones. */
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context = g_option_context_new("- n-gram segment");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    FacadePhraseTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    /* init bi-gram */
+    Bigram system_bigram;
+    system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
+    /* the user bi-gram is handed to the lookup without being attached. */
+    Bigram user_bigram;
+
+    gfloat lambda = system_table_info.get_lambda();
+
+    /* init phrase lookup */
+    PhraseLookup phrase_lookup(lambda,
+                               &phrase_table, &phrase_index,
+                               &system_bigram, &user_bigram);
+
+
+    CONTEXT_STATE state, next_state;
+    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index.prepare_tokens(tokens);
+
+    /* split the sentence */
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        size_t linelen = strlen(linebuf); /* 0 when the line starts with NUL. */
+        if ( linelen > 0 && '\n' == linebuf[linelen - 1] )
+            linebuf[linelen - 1] = '\0';
+        /* check non-ucs4 characters */
+        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+        glong len = 0;
+        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+        if ( len != num_of_chars ) {
+            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+            fprintf(output, "%d \n", null_token);
+            g_free(sentence); /* NULL-safe; was leaked on this path. */
+            continue;
+        }
+        /* only new-line persists. */
+        if ( 0  == num_of_chars ) {
+            fprintf(output, "%d \n", null_token);
+            g_free(sentence); /* was leaked on this path. */
+            continue;
+        }
+
+        state = CONTEXT_INIT;
+        int result = phrase_table.search( 1, sentence, tokens);
+        g_array_append_val( current_ucs4, sentence[0]);
+        if ( result & SEARCH_OK )
+            state = CONTEXT_SEGMENTABLE;
+        else
+            state = CONTEXT_UNKNOWN;
+
+        for ( int i = 1; i < num_of_chars; ++i) {
+            int result = phrase_table.search( 1, sentence + i, tokens);
+            if ( result & SEARCH_OK )
+                next_state = CONTEXT_SEGMENTABLE;
+            else
+                next_state = CONTEXT_UNKNOWN;
+
+            if ( state == next_state ){
+                g_array_append_val(current_ucs4, sentence[i]);
+                continue;
+            }
+
+            assert ( state != next_state );
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_lookup, current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+
+            /* save the current character */
+            g_array_set_size(current_ucs4, 0);
+            g_array_append_val(current_ucs4, sentence[i]);
+            state = next_state;
+        }
+
+        if ( current_ucs4->len ) {
+            /* this seems always true. */
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_lookup, current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+            g_array_set_size(current_ucs4, 0);
+        }
+
+        /* print extra enter */
+        if ( gen_extra_enter )
+            fprintf(output, "%d \n", null_token);
+
+        g_free(sentence);
+    }
+    phrase_index.destroy_tokens(tokens);
+
+    /* print enter at file tail */
+    fprintf(output, "%d \n", null_token);
+    g_array_free(current_ucs4, TRUE);
+    free(linebuf);
+    fclose(input);
+    fclose(output);
+    return 0;
+}
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
new file mode 100644
index 0000000..b543cc5
--- /dev/null
+++ b/utils/segment/spseg.cpp
@@ -0,0 +1,343 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010,2013 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){ /* write the one-line usage summary to stdout. */
+    fputs("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n", stdout);
+}
+
+static gboolean gen_extra_enter = FALSE; /* set by --generate-extra-enter. */
+static gchar * outputfile = NULL; /* filled in by -o/--outputfile. */
+/* command line options understood by spseg. */
+static GOptionEntry entries[] =
+{
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+    {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate an extra null token line after every input line", NULL},
+    {NULL} /* terminating entry. */
+};
+
+
+/* graph shortest path sentence segment. */
+
+/* Note:
+ * Currently libpinyin only supports ucs4 characters, as this is a
+ * pre-processor tool for raw corpus, it will skip all sentences
+ * which contains non-ucs4 characters.
+ */
+
+enum CONTEXT_STATE{
+    CONTEXT_INIT,        /* set at the start of each line, before the first character is classified. */
+    CONTEXT_SEGMENTABLE, /* current run: characters found in the phrase table. */
+    CONTEXT_UNKNOWN      /* current run: characters not found in the phrase table. */
+};
+
+struct SegmentStep{
+    /* token of the phrase ending at this step (null_token if unknown). */
+    phrase_token_t m_handle;
+    /* start and length of that phrase inside the caller's ucs4 buffer. */
+    ucs4_t * m_phrase;
+    size_t m_phrase_len;
+    /* W = number of words on the best path so far; starts at UINT_MAX. */
+    guint m_nword;
+    /* backtrace information (negative offset to the previous step). */
+    gint m_backward_nstep;
+
+    /* every step starts out with no phrase and no backward link. */
+    SegmentStep()
+        : m_handle(null_token), m_phrase(NULL), m_phrase_len(0),
+          m_nword(UINT_MAX), m_backward_nstep(0)
+    {}
+};
+
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
+
+/* Shortest-path (fewest words) segmentation of current_ucs4 into strings.
+ * Note: do not free phrase, as it is used by strings (array of segment). */
+bool segment(FacadePhraseTable2 * phrase_table,
+             FacadePhraseIndex * phrase_index,
+             GArray * current_ucs4,
+             GArray * strings /* Array of SegmentStep. */){
+    ucs4_t * phrase = (ucs4_t *)current_ucs4->data;
+    guint phrase_len = current_ucs4->len;
+
+    /* Prepare for shortest path segment dynamic programming. */
+    GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    SegmentStep step;
+    for ( glong i = 0; i < phrase_len + 1; ++i )
+        g_array_append_val(steps, step);
+
+    SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0);
+    first_step->m_nword = 0;
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index->prepare_tokens(tokens);
+
+    for ( glong i = 0; i < phrase_len + 1; ++i ) {
+        SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+        /* every word starting at i costs exactly one more word than step i;
+         * the count must not accumulate across the candidate lengths. */
+        const size_t nword = (size_t) step_begin->m_nword + 1;
+        for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
+            size_t len = k - i;
+            ucs4_t * cur_phrase = phrase + i;
+            phrase_token_t token = null_token;
+            int result = phrase_table->search(len, cur_phrase, tokens);
+            get_first_token(tokens, token);
+            /* an unknown single character still forms a (null token) word. */
+            if ( !(result & SEARCH_OK) ){
+                token = null_token;
+                if ( 1 != len )
+                    continue;
+            }
+
+            SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+            if ( nword < step_end->m_nword ) {
+                step_end->m_handle = token;
+                step_end->m_phrase = cur_phrase;
+                step_end->m_phrase_len = len;
+                step_end->m_nword = nword;
+                step_end->m_backward_nstep = i - k;
+            }
+            if ( !(result & SEARCH_CONTINUED) )
+                break;
+        }
+    }
+    phrase_index->destroy_tokens(tokens);
+
+    return backtrace(steps, phrase_len, strings);
+}
+
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
+    /* Follow the backward links from the last step; frees steps when done. */
+    size_t cur_step = phrase_len;
+    g_array_set_size(strings, 0);
+    while ( cur_step ){
+        SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step);
+        assert(step->m_backward_nstep < 0); /* no link: loop would never end. */
+        cur_step = cur_step + step->m_backward_nstep;
+        /* avoid leaking internal information: clear it on the copy handed out. */
+        SegmentStep result = *step;
+        result.m_nword = 0; result.m_backward_nstep = 0;
+        g_array_append_val(strings, result);
+    }
+
+    /* reverse the strings. */
+    for ( size_t i = 0; i < strings->len / 2; ++i ) {
+        SegmentStep * head = &g_array_index(strings, SegmentStep, i);
+        SegmentStep * tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i );
+        SegmentStep tmp = *head;
+        *head = *tail;
+        *tail = tmp;
+    }
+
+    g_array_free(steps, TRUE);
+    return true;
+}
+
+bool deal_with_segmentable(FacadePhraseTable2 * phrase_table,
+                           FacadePhraseIndex * phrase_index,
+                           GArray * current_ucs4,
+                           FILE * output){
+    /* Segment current_ucs4 and write one "<token> <phrase>" line per word. */
+    GArray * words = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    segment(phrase_table, phrase_index, current_ucs4, words);
+
+    const SegmentStep * word = (const SegmentStep *) words->data;
+    for ( guint left = words->len; left > 0; --left, ++word ) {
+        char * utf8 = g_ucs4_to_utf8
+            ( word->m_phrase, word->m_phrase_len, NULL, NULL, NULL);
+        fprintf(output, "%d %s\n", word->m_handle, utf8);
+        g_free(utf8);
+    }
+
+    /* the phrases point into current_ucs4, so only the array is released. */
+    g_array_free(words, TRUE);
+    return true;
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+    /* Print the unknown run on a single line, prefixed by the null token. */
+    const ucs4_t * run = (const ucs4_t *) current_ucs4->data;
+    char * text = g_ucs4_to_utf8(run, current_ucs4->len, NULL, NULL, NULL);
+    fprintf(output, "%d %s\n", null_token, text);
+    g_free(text);
+    return true;
+}
+
+
+int main(int argc, char * argv[]){
+    /* spseg: split each line into known/unknown runs, segment the known ones. */
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context = g_option_context_new("- shortest path segment");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    FacadePhraseTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    CONTEXT_STATE state, next_state;
+    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index.prepare_tokens(tokens);
+
+    /* split the sentence */
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        size_t linelen = strlen(linebuf); /* 0 when the line starts with NUL. */
+        if ( linelen > 0 && '\n' == linebuf[linelen - 1] )
+            linebuf[linelen - 1] = '\0';
+        /* check non-ucs4 characters. */
+        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+        glong len = 0;
+        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+        if ( len != num_of_chars ) {
+            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+            fprintf(output, "%d \n", null_token);
+            g_free(sentence); /* NULL-safe; was leaked on this path. */
+            continue;
+        }
+        /* only new-line persists. */
+        if ( 0  == num_of_chars ) {
+            fprintf(output, "%d \n", null_token);
+            g_free(sentence); /* was leaked on this path. */
+            continue;
+        }
+
+        state = CONTEXT_INIT;
+        int result = phrase_table.search( 1, sentence, tokens);
+        g_array_append_val( current_ucs4, sentence[0]);
+        if ( result & SEARCH_OK )
+            state = CONTEXT_SEGMENTABLE;
+        else
+            state = CONTEXT_UNKNOWN;
+
+        for ( int i = 1; i < num_of_chars; ++i) {
+            int result = phrase_table.search( 1, sentence + i, tokens);
+            if ( result & SEARCH_OK )
+                next_state = CONTEXT_SEGMENTABLE;
+            else
+                next_state = CONTEXT_UNKNOWN;
+
+            if ( state == next_state ){
+                g_array_append_val(current_ucs4, sentence[i]);
+                continue;
+            }
+
+            assert ( state != next_state );
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_table, &phrase_index,
+                                      current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+
+            /* save the current character */
+            g_array_set_size(current_ucs4, 0);
+            g_array_append_val(current_ucs4, sentence[i]);
+            state = next_state;
+        }
+
+        if ( current_ucs4->len ) {
+            /* this seems always true. */
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_table, &phrase_index,
+                                      current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+            g_array_set_size(current_ucs4, 0);
+        }
+
+        /* print extra enter */
+        if ( gen_extra_enter )
+            fprintf(output, "%d \n", null_token);
+
+        g_free(sentence);
+    }
+    phrase_index.destroy_tokens(tokens);
+
+    /* print enter at file tail */
+    fprintf(output, "%d \n", null_token);
+    g_array_free(current_ucs4, TRUE);
+    free(linebuf);
+    fclose(input);
+    fclose(output);
+    return 0;
+}
diff --git a/utils/storage/CMakeLists.txt b/utils/storage/CMakeLists.txt
new file mode 100644
index 0000000..63cabcd
--- /dev/null
+++ b/utils/storage/CMakeLists.txt
@@ -0,0 +1,29 @@
+add_executable(
+    gen_binary_files
+    gen_binary_files.cpp
+)
+
+target_link_libraries(
+    gen_binary_files
+    libpinyin
+)
+
+add_executable(
+    import_interpolation
+    import_interpolation.cpp
+)
+
+target_link_libraries(
+    import_interpolation
+    libpinyin
+)
+
+add_executable(
+    export_interpolation
+    export_interpolation.cpp
+)
+
+target_link_libraries(
+    export_interpolation
+    libpinyin
+)
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
new file mode 100644
index 0000000..db63488
--- /dev/null
+++ b/utils/storage/Makefile.am
@@ -0,0 +1,45 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+INCLUDES                = -I$(top_srcdir)/src \
+                          -I$(top_srcdir)/src/include \
+                          -I$(top_srcdir)/src/storage \
+                          -I$(top_srcdir)/src/lookup \
+			  -I$(top_srcdir)/utils \
+                          @GLIB2_CFLAGS@
+
+bin_PROGRAMS		 = gen_binary_files \
+			   import_interpolation
+
+noinst_PROGRAMS          = export_interpolation \
+			   gen_pinyin_table
+
+gen_binary_files_SOURCES    = gen_binary_files.cpp
+
+gen_binary_files_LDADD      = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+import_interpolation_SOURCES = import_interpolation.cpp
+
+import_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+export_interpolation_SOURCES = export_interpolation.cpp
+
+export_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_pinyin_table_SOURCES    = gen_pinyin_table.cpp
+
+gen_pinyin_table_LDADD      = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
new file mode 100644
index 0000000..c43eefb
--- /dev/null
+++ b/utils/storage/export_interpolation.cpp
@@ -0,0 +1,144 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <assert.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* export interpolation model as textual format */
+
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
+
+/* Emit the line that opens a textual interpolation model. */
+bool begin_data(FILE * output){
+    fputs("\\data model interpolation\n", output); return true;
+}
+
+/* Emit the line that closes a textual interpolation model. */
+bool end_data(FILE * output){
+    fputs("\\end\n", output); return true;
+}
+
+int main(int argc, char * argv[]){
+    /* the model always goes to stdout and is read from the system
+     * bigram file */
+    FILE * const out = stdout;
+    const char * const bigram_path = SYSTEM_BIGRAM;
+
+    /* table description; the error text names it table.conf */
+    SystemTableInfo tables;
+    if (!tables.load(SYSTEM_TABLE_INFO)) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* phrase index for the tables described above */
+    FacadePhraseIndex phrases;
+    const pinyin_table_info_t * const table_files = tables.get_table_info();
+    bool loaded = load_phrase_index(table_files, &phrases);
+    if (!loaded)
+        exit(ENOENT);
+
+    Bigram bigram;
+    /* NOTE(review): the result of attach() is not checked */
+    bigram.attach(bigram_path, ATTACH_READONLY);
+
+    /* header, both sections, trailer */
+    begin_data(out);
+    gen_unigram(out, &phrases);
+    gen_bigram(out, &phrases, &bigram);
+    end_data(out);
+    return 0;
+}
+
+/* Write the "\1-gram" section: one "\item <token> <phrase> count <freq>"
+ * line for each phrase whose unigram frequency is not zero.  Returns true. */
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
+    fprintf(output, "\\1-gram\n");
+    for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {
+        PhraseIndexRange range;
+        /* skip libraries for which no token range can be obtained */
+        if (ERROR_OK != phrase_index->get_range(i, range))
+            continue;
+        PhraseItem item;
+        for (phrase_token_t token = range.m_range_begin;
+              token < range.m_range_end; token++) {
+            /* tokens without an item are skipped, other errors assert */
+            int result = phrase_index->get_phrase_item(token, item);
+            if ( result == ERROR_NO_ITEM )
+                continue;
+            assert( result == ERROR_OK);
+            size_t freq = item.get_unigram_frequency();
+            if ( 0 == freq )
+                continue;
+            char * phrase = taglib_token_to_string(phrase_index, token);
+            /* the casts make the arguments match %u and %lu exactly */
+            if ( phrase )
+                fprintf(output, "\\item %u %s count %lu\n",
+                        (unsigned int) token, phrase, (unsigned long) freq);
+            g_free(phrase);
+        }
+    }
+    return true;
+}
+
+/* Write the "\2-gram" section: one "\item <token1> <word1> <token2> <word2>
+ * count <freq>" line per pair stored in the bigram.  Pairs whose tokens
+ * cannot be converted to strings are omitted.  Always returns true. */
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){
+    fprintf(output, "\\2-gram\n");
+
+    /* Retrieve every token that has an entry in the bigram. */
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    for(size_t i = 0; i < items->len; i++){
+        phrase_token_t token = g_array_index(items, phrase_token_t, i);
+        SingleGram * single_gram = NULL;
+        bigram->load(token, single_gram);
+        if ( NULL == single_gram )   /* nothing was loaded: skip the token */
+            continue;
+
+        /* the first word is the same for every pair of this single gram */
+        char * word1 = taglib_token_to_string(phrase_index, token);
+        BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+        single_gram->retrieve_all(array);
+        for(size_t j = 0; j < array->len; j++) {
+            BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
+            char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+            if ( word1 && word2)
+                fprintf(output, "\\item %u %s %u %s count %u\n",
+                        (unsigned int) token, word1, (unsigned int) item->m_token,
+                        word2, (unsigned int) item->m_count);
+            g_free(word2);
+        }
+
+        g_free(word1);
+        g_array_free(array, TRUE);
+        delete single_gram;
+    }
+    g_array_free(items, TRUE);
+    return true;
+}
diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp
new file mode 100644
index 0000000..4216b44
--- /dev/null
+++ b/utils/storage/gen_binary_files.cpp
@@ -0,0 +1,115 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+    {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+    {NULL}
+};
+
+int main(int argc, char * argv[]){
+    setlocale(LC_ALL, "");
+    /* Convert the text tables found in --table-dir into the binary files. */
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- generate binary files");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    SystemTableInfo system_table_info;
+    /* the table description is looked up inside --table-dir ("." by default) */
+    gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+    bool retval = system_table_info.load(filename);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+    g_free(filename);
+
+    /* generate pinyin index*/
+    pinyin_option_t options = USE_TONE;
+    ChewingLargeTable chewing_table(options);
+    PhraseLargeTable2 phrase_table;
+
+    /* generate phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+    /* feed every SYSTEM_FILE / DICTIONARY table into all three structures */
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+        const pinyin_table_info_t * table_info = phrase_files + i;
+        assert(table_info->m_dict_index == i);
+
+        if (SYSTEM_FILE != table_info->m_file_type &&
+            DICTIONARY != table_info->m_file_type)
+            continue;
+
+        const char * tablename = table_info->m_table_filename;
+
+        filename = g_build_filename(table_dir, tablename, NULL);
+        FILE * tablefile = fopen(filename, "r");
+
+        if (NULL == tablefile) {
+            fprintf(stderr, "open %s failed!\n", tablename);
+            exit(ENOENT);
+        }
+        /* the same text file is read three times, rewinding in between */
+        chewing_table.load_text(tablefile);
+        fseek(tablefile, 0L, SEEK_SET);
+        phrase_table.load_text(tablefile);
+        fseek(tablefile, 0L, SEEK_SET);
+        phrase_index.load_text(i, tablefile);
+        fclose(tablefile);
+        g_free(filename);
+    }
+    /* store the pinyin index in a chunk, save it, then load the table from it */
+    MemoryChunk * new_chunk = new MemoryChunk;
+    chewing_table.store(new_chunk);
+    new_chunk->save(SYSTEM_PINYIN_INDEX);
+    chewing_table.load(new_chunk);
+    /* the same three steps for the phrase table */
+    new_chunk = new MemoryChunk;
+    phrase_table.store(new_chunk);
+    new_chunk->save(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(new_chunk);
+
+    phrase_index.compact();
+
+    if (!save_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    if (!save_dictionary(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    return 0;
+}
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp
new file mode 100644
index 0000000..3b541d1
--- /dev/null
+++ b/utils/storage/gen_pinyin_table.cpp
@@ -0,0 +1,330 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+
+
+/* Print the command line synopsis on stdout. */
+void print_help(){
+    fputs("Usage: gen_pinyin_table -t <PHRASE_INDEX> \n"
+          "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
+          "<OUTPUTFILE> the result output file\n<FILEi> input pinyin files\n"
+          "<PHRASE_INDEX> phrase index identifier\n", stdout);
+}
+
+
+static gint phrase_index = 0;
+static const gchar * outputfile = "temp.out";
+
+static GOptionEntry entries[] =
+{
+    {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
+    {NULL}
+};
+
+
+using namespace pinyin;
+
+/* map from phrase_item to GArray of chewing_and_freq_item */
+GTree  * g_chewing_tree;
+/* Array of GArray of phrase_and_array_item */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+struct phrase_item{                /* a phrase as UCS-4 text; key type of g_chewing_tree */
+    size_t length;                 /* character count, set from g_utf8_strlen() in feed_line */
+    gunichar * uniphrase;          /* UCS-4 text, set from g_utf8_to_ucs4() in feed_line */
+};
+
+struct chewing_and_freq_item{      /* one pinyin of a phrase together with its frequency */
+    ChewingKeyVector keys;         /* keys filled in by the pinyin parser in feed_line */
+    ChewingKeyRestVector key_rests; /* second parser output, same length as keys */
+    guint32 freq;                  /* frequency; duplicate input lines are added up */
+};
+
+struct phrase_and_array_item{              /* copy of one g_chewing_tree entry */
+    phrase_item phrase;                    /* the key of g_chewing_tree */
+    /* Array of chewing_and_freq_item */
+    GArray * chewing_and_freq_array;       /* the value of g_chewing_tree */
+};
+
+
+void feed_file(const char * filename);
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data);
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata);
+
+void gen_phrase_file(const char * outputfile, int phrase_index);
+
+
+/* GTree key order: by length first, then by the raw bytes of the UCS-4 text. */
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+    const phrase_item * lhs = (const phrase_item *) a;
+    const phrase_item * rhs = (const phrase_item *) b;
+    if ( lhs->length == rhs->length )
+        return memcmp(lhs->uniphrase, rhs->uniphrase,
+                      sizeof(gunichar) * lhs->length);
+    return lhs->length - rhs->length;
+}
+
+
+int main(int argc, char * argv[]){
+    /* phrases read from the input files are collected in this tree */
+    g_chewing_tree = g_tree_new(phrase_item_compare);
+
+    /* parse -t and -o; the arguments that remain in argv afterwards are
+     * treated as input files */
+    GError * parse_error = NULL;
+    GOptionContext * options = g_option_context_new("- generate pinyin table");
+    g_option_context_add_main_entries(options, entries, NULL);
+    gboolean parsed = g_option_context_parse(options, &argc, &argv, &parse_error);
+    if (!parsed) {
+        g_print("option parsing failed:%s\n", parse_error->message);
+        exit(EINVAL);
+    }
+
+    for (int arg = 1; arg < argc; ++arg)
+        feed_file(argv[arg]);
+
+    printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
+
+    /* one array per phrase length; slot 0 stays unused */
+    g_item_array[0] = NULL;
+    for (int len = 1; len <= MAX_PHRASE_LENGTH; ++len)
+        g_item_array[len] = g_array_new
+            (FALSE, TRUE, sizeof(phrase_and_array_item));
+
+    /* distribute the tree entries over the arrays by phrase length */
+    g_tree_foreach(g_chewing_tree, store_one_item, NULL);
+
+    /* sort each array; the comparator is handed the phrase length */
+    for (int len = 1; len <= MAX_PHRASE_LENGTH; ++len)
+        g_array_sort_with_data(g_item_array[len], phrase_array_compare, &len);
+
+    /* write the table */
+    gen_phrase_file(outputfile, phrase_index);
+
+    return 0;
+}
+
+/* Read whitespace separated "<phrase> <pinyin> <freq>" records from
+ * filename and pass each complete record to feed_line().  Exits with
+ * ENOENT when the file cannot be opened. */
+void feed_file ( const char * filename){
+    char phrase[1024], pinyin[1024];
+    guint32 freq;
+
+    FILE * infile = fopen(filename, "r");
+    if ( NULL == infile ){
+        fprintf(stderr, "Can't open file %s.\n", filename);
+        exit(ENOENT);
+    }
+
+    /* The field widths keep fscanf inside the 1024 byte buffers.  Looping
+     * on the return value instead of feof() keeps the final record when the
+     * file does not end with a newline; incomplete records are skipped. */
+    int num;
+    while ( EOF != (num = fscanf(infile, "%1023s %1023s %u",
+                                 phrase, pinyin, &freq)) ){
+        if (3 == num)
+            feed_line(phrase, pinyin, freq);
+    }
+
+    fclose(infile);
+}
+
+/* Record one "<phrase> <pinyin> <freq>" entry in g_chewing_tree.  Phrases
+ * that are too long or whose pinyin does not yield one key per character are
+ * reported and dropped; a repeated pinyin adds freq to the stored entry. */
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
+    phrase_item * item = new phrase_item;
+    item->length = g_utf8_strlen(phrase, -1);
+    /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
+     *	where is the code which I don't want to touch. :-)
+     */
+    if (item->length >= MAX_PHRASE_LENGTH) {
+        fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+    FullPinyinParser2 parser;
+    ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+    ChewingKeyRestVector key_rests = g_array_new
+        (FALSE, FALSE, sizeof(ChewingKeyRest));
+    pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
+    parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+    assert(keys->len == key_rests->len);
+
+    if (keys->len != item->length) {
+        fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq);
+        /* nothing was stored: release everything allocated above */
+        g_array_free(keys, TRUE); g_array_free(key_rests, TRUE);
+        g_free(item->uniphrase);
+        delete item;
+        return;
+    }
+
+    GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
+
+    chewing_and_freq_item value_item;
+    value_item.keys = keys; value_item.key_rests = key_rests;
+    value_item.freq = freq;
+
+    if (NULL == array) {
+        /* first pinyin for this phrase: the tree keeps item as its key */
+        array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
+        g_array_append_val(array, value_item);
+        g_tree_insert(g_chewing_tree, item, array);
+        return;
+    }
+
+    bool found = false;
+    for (size_t i = 0; i < array->len; ++i) {
+        chewing_and_freq_item * cur_item =
+            &g_array_index(array, chewing_and_freq_item, i);
+        int result = pinyin_exact_compare2
+            ((ChewingKey *) value_item.keys->data,
+             (ChewingKey *) cur_item->keys->data,
+             value_item.keys->len);
+        if (0 == result) {
+            fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+                    phrase, pinyin, freq);
+            cur_item->freq += freq;
+            found = true;
+        }
+    }
+
+    if (!found) {
+        /* an equal key already maps to this array: appending is enough */
+        g_array_append_val(array, value_item);
+    } else {
+        g_array_free(keys, TRUE); g_array_free(key_rests, TRUE);
+    }
+    /* the tree kept its own key, so this duplicate and its text can go */
+    g_free(item->uniphrase);
+    delete item;
+}
+
+
+/* g_tree_foreach callback: file a tree entry under its phrase length. */
+gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
+    phrase_and_array_item entry;
+    entry.phrase = *((phrase_item *) key);
+    entry.chewing_and_freq_array = (GArray *) value;
+    g_array_append_val(g_item_array[entry.phrase.length], entry);
+    return FALSE;   /* FALSE keeps the traversal going */
+}
+
+
+/* Sort order inside one length bucket: compare the keys of the first
+ * pinyin of each phrase; userdata points to the common phrase length. */
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata) {
+    const phrase_and_array_item * left = (const phrase_and_array_item *) lhs;
+    const phrase_and_array_item * right = (const phrase_and_array_item *) rhs;
+    const chewing_and_freq_item * a = &g_array_index
+        (left->chewing_and_freq_array, chewing_and_freq_item, 0);
+    const chewing_and_freq_item * b = &g_array_index
+        (right->chewing_and_freq_array, chewing_and_freq_item, 0);
+    return pinyin_exact_compare2((ChewingKey *) a->keys->data,
+        (ChewingKey *) b->keys->data, *((int *) userdata));
+}
+
+
+/* Write the collected phrases to outputfile, one
+ * "<pinyin>\t<phrase>\t<token>\t<freq>" line per pinyin of each phrase.
+ * Tokens start at 1; exits with ENOENT if the file cannot be opened. */
+void gen_phrase_file(const char * outputfile, int phrase_index){
+    FILE * outfile = fopen(outputfile, "w");
+    if (NULL == outfile ) {
+        fprintf(stderr, "Can't write file %s.\n", outputfile);
+        exit(ENOENT);
+    }
+
+    phrase_token_t token = 1;
+
+    /* phrase length index */
+    for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+        GArray * item_array = g_item_array[i];
+
+        /* item array index */
+        for (size_t m = 0; m < item_array->len; ++m) {
+            phrase_and_array_item * item = &g_array_index
+                (item_array, phrase_and_array_item, m);
+            phrase_item phrase = item->phrase;
+            GArray * chewing_and_freqs = item->chewing_and_freq_array;
+
+            gchar * phrase_str = g_ucs4_to_utf8
+                (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
+
+            /* iterate each pinyin */
+            for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
+                chewing_and_freq_item * chewing_and_freq =
+                    &g_array_index
+                    (chewing_and_freqs, chewing_and_freq_item, n);
+
+                ChewingKeyVector keys = chewing_and_freq->keys;
+                /* zero terminated so that it can be handed to g_strjoinv */
+                GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
+                gchar * pinyin = NULL;
+
+                size_t k;
+                for (k = 0; k < keys->len; ++k) {
+                    ChewingKey key = g_array_index(keys, ChewingKey, k);
+                    //assert (CHEWING_ZERO_TONE != key.m_tone);
+                    pinyin = key.get_pinyin_string();
+                    g_array_append_val(pinyins, pinyin);
+                }
+                gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
+
+                for (k = 0; k < pinyins->len; ++k) {
+                    g_free(g_array_index(pinyins, gchar *, k));
+                }
+                g_array_free(pinyins, TRUE);
+
+                guint32 freq = chewing_and_freq->freq;
+                /* avoid zero freq */
+                if (freq < 3) freq = 3;
+
+                /* token and freq are unsigned: print them as such */
+                fprintf(outfile, "%s\t%s\t%u\t%u\n",
+                        pinyin_str, phrase_str,
+                        (unsigned int) PHRASE_INDEX_MAKE_TOKEN(phrase_index, token),
+                        (unsigned int) freq);
+
+                g_free(pinyin_str);
+            }
+            g_free(phrase_str);
+            token++;
+        }
+    }
+
+    fclose(outfile);
+}
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
new file mode 100644
index 0000000..205a27a
--- /dev/null
+++ b/utils/storage/import_interpolation.cpp
@@ -0,0 +1,313 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+    {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+    {NULL}
+};
+
+
+enum LINE_TYPE{
+    BEGIN_LINE = 1,
+    END_LINE,
+    GRAM_1_LINE,
+    GRAM_2_LINE,
+    GRAM_1_ITEM_LINE,
+    GRAM_2_ITEM_LINE
+};
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline();
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                   FacadePhraseIndex * phrase_index);
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                  FacadePhraseIndex * phrase_index,
+                  Bigram * bigram);
+
+/* Read the next line of input into the shared linebuf and drop its trailing
+ * newline.  Returns what getline() returned, i.e. -1 when no line was read. */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    /* Index with the length getline() reported: strlen() is 0 for a line
+     * that starts with a NUL byte and would underflow the index. */
+    if ( result > 0 && '\n' == linebuf[result - 1] )
+        linebuf[result - 1] = '\0';
+    return result;
+}
+
+/* Validate the "\data model interpolation" header held in linebuf. */
+bool parse_headline(){
+    /* kept outside assert(): the tag must be registered under NDEBUG too */
+    bool ok = taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "");
+    assert(ok);
+    /* read "\data" line */
+    if ( !taglib_read(linebuf, line_type, values, required) ) {
+        fprintf(stderr, "error: interpolation model expected.\n");
+        return false;
+    }
+    assert(line_type == BEGIN_LINE);
+    /* check header */
+    TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+    if ( strcmp("interpolation", model) != 0 ) {
+        fprintf(stderr, "error: interpolation model expected.\n");
+        return false;
+    }
+    return true;
+}
+
+/* Parse the sections after the header, starting with the line in linebuf. */
+bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table,
+                FacadePhraseIndex * phrase_index,
+                Bigram * bigram){
+    taglib_push_state();
+    /* side effects are kept outside assert() so that NDEBUG keeps them */
+    bool ok = taglib_add_tag(END_LINE, "\\end", 0, "", "");
+    ok = taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "") && ok;
+    ok = taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "") && ok;
+    assert(ok);
+    do {
+    retry:
+        ok = taglib_read(linebuf, line_type, values, required); assert(ok);
+        switch(line_type) {
+        case END_LINE:
+            goto end;
+        case GRAM_1_LINE:
+            /* no more input: do not parse the stale section line again */
+            if (my_getline(input) == -1) goto end;
+            parse_unigram(input, phrase_table, phrase_index);
+            goto retry;
+        case GRAM_2_LINE:
+            if (my_getline(input) == -1) goto end;   /* as above */
+            parse_bigram(input, phrase_table, phrase_index, bigram);
+            goto retry;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1) ;
+ end:
+    taglib_pop_state();
+    return true;
+}
+
+/* Parse the "\item" lines of a "\1-gram" section, beginning with the line in
+ * linebuf, and add each count to the unigram frequency of its token. */
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                   FacadePhraseIndex * phrase_index){
+    taglib_push_state();
+    /* side effects are kept outside assert() so that NDEBUG keeps them */
+    bool ok = taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", "");
+    assert(ok);
+    do {
+        ok = taglib_read(linebuf, line_type, values, required); assert(ok);
+        switch (line_type) {
+        case GRAM_1_ITEM_LINE:{
+            /* handle \item in \1-gram */
+            TAGLIB_GET_TOKEN(token, 0);
+            TAGLIB_GET_PHRASE_STRING(word, 1);
+            assert(taglib_validate_token_with_string
+                   (phrase_index, token, word));
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+            phrase_index->add_unigram_frequency(token, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+ end:
+    taglib_pop_state();
+    return true;
+}
+
+/* Parse the "\item" lines of a "\2-gram" section, beginning with the line in
+ * linebuf, and add every pair's count to the single gram of its first token. */
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                  FacadePhraseIndex * phrase_index,
+                  Bigram * bigram){
+    taglib_push_state();
+    /* side effects are kept outside assert() so that NDEBUG keeps them */
+    bool ok = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", "");
+    assert(ok);
+    phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL;
+    do {
+        ok = taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE:{
+            /* handle \item in \2-gram: two tokens, each with its phrase */
+            TAGLIB_GET_TOKEN(token1, 0);
+            TAGLIB_GET_PHRASE_STRING(word1, 1);
+            assert(taglib_validate_token_with_string
+                   (phrase_index, token1, word1));
+
+            TAGLIB_GET_TOKEN(token2, 2);
+            TAGLIB_GET_PHRASE_STRING(word2, 3);
+            assert(taglib_validate_token_with_string
+                   (phrase_index, token2, word2));
+
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+
+            if ( last_token != token1 ) {
+                if ( last_token && last_single_gram ) {
+                    bigram->store(last_token, last_single_gram);
+                    delete last_single_gram;
+                    /* safe guard */
+                    last_token = null_token;
+                    last_single_gram = NULL;
+                }
+                SingleGram * single_gram = NULL;
+                bigram->load(token1, single_gram);
+
+                /* create the new single gram */
+                if ( single_gram == NULL )
+                    single_gram = new SingleGram;
+                last_token = token1;
+                last_single_gram = single_gram;
+            }
+
+            /* save the freq */
+            assert(NULL != last_single_gram);
+            guint32 total_freq = 0;
+            ok = last_single_gram->get_total_freq(total_freq); assert(ok);
+            ok = last_single_gram->insert_freq(token2, count); assert(ok);
+            total_freq += count;
+            ok = last_single_gram->set_total_freq(total_freq); assert(ok);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+ end:
+    if ( last_token && last_single_gram ) {
+        bigram->store(last_token, last_single_gram);
+        delete last_single_gram;
+        //safe guard
+        last_token = 0;
+        last_single_gram = NULL;
+    }
+
+    taglib_pop_state();
+    return true;
+}
+
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    const char * bigram_filename = SYSTEM_BIGRAM;
+    /* Import a textual interpolation model read from stdin. */
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- import interpolation model");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    SystemTableInfo system_table_info;
+    /* the table description is looked up inside --table-dir ("." by default) */
+    gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+    bool retval = system_table_info.load(filename);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+    g_free(filename);
+    /* NOTE(review): phrase_table is passed down but parse_* never read it */
+    PhraseLargeTable2 phrase_table;
+
+    MemoryChunk * chunk = new MemoryChunk;
+    retval = chunk->load(SYSTEM_PHRASE_INDEX);
+    if (!retval) {
+        fprintf(stderr, "open phrase_index.bin failed!\n");
+        exit(ENOENT);
+    }
+    phrase_table.load(chunk);
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    Bigram bigram;
+    retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+    if (!retval) {
+        fprintf(stderr, "open %s failed!\n", bigram_filename);
+        exit(ENOENT);
+    }
+    /* set up the tag parser and the containers taglib_read() fills in */
+    taglib_init();
+
+    values = g_ptr_array_new();
+    required = g_hash_table_new(g_str_hash, g_str_equal);
+
+    /* read first line */
+    ssize_t result = my_getline(input);
+    if ( result == -1 ) {
+        fprintf(stderr, "empty file input.\n");
+        exit(ENODATA);
+    }
+
+    if (!parse_headline())
+        exit(ENODATA);
+    /* parse_body() expects the first line after the header in linebuf */
+    result = my_getline(input);
+    if ( result != -1 )
+        parse_body(input, &phrase_table, &phrase_index, &bigram);
+    /* parse_bigram() has already passed its single grams to bigram.store() */
+    taglib_fini();
+
+    if (!save_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    return 0;
+}
diff --git a/utils/training/CMakeLists.txt b/utils/training/CMakeLists.txt
new file mode 100644
index 0000000..ee59bcd
--- /dev/null
+++ b/utils/training/CMakeLists.txt
@@ -0,0 +1,129 @@
+add_executable(
+    gen_ngram
+    gen_ngram.cpp
+)
+
+target_link_libraries(
+    gen_ngram
+    libpinyin
+)
+
+add_executable(
+    gen_deleted_ngram
+    gen_deleted_ngram.cpp
+)
+
+target_link_libraries(
+    gen_deleted_ngram
+    libpinyin
+)
+
+add_executable(
+    gen_unigram
+    gen_unigram.cpp
+)
+
+target_link_libraries(
+    gen_unigram
+    libpinyin
+)
+
+add_executable(
+    gen_k_mixture_model
+    gen_k_mixture_model.cpp
+)
+
+target_link_libraries(
+    gen_k_mixture_model
+    libpinyin
+)
+
+add_executable(
+    estimate_interpolation
+    estimate_interpolation.cpp
+)
+
+target_link_libraries(
+    estimate_interpolation
+    libpinyin
+)
+
+add_executable(
+    estimate_k_mixture_model
+    estimate_k_mixture_model.cpp
+)
+
+target_link_libraries(
+    estimate_k_mixture_model
+    libpinyin
+)
+
+add_executable(
+    merge_k_mixture_model
+    merge_k_mixture_model.cpp
+)
+
+target_link_libraries(
+    merge_k_mixture_model
+    libpinyin
+)
+
+add_executable(
+    prune_k_mixture_model
+    prune_k_mixture_model.cpp
+)
+
+target_link_libraries(
+    prune_k_mixture_model
+    libpinyin
+)
+
+add_executable(
+    import_k_mixture_model
+    import_k_mixture_model.cpp
+)
+
+target_link_libraries(
+    import_k_mixture_model
+    libpinyin
+)
+
+add_executable(
+    export_k_mixture_model
+    export_k_mixture_model.cpp
+)
+
+target_link_libraries(
+    export_k_mixture_model
+    libpinyin
+)
+
+add_executable(
+    k_mixture_model_to_interpolation
+    k_mixture_model_to_interpolation.cpp
+)
+
+target_link_libraries(
+    k_mixture_model_to_interpolation
+    libpinyin
+)
+
+add_executable(
+    validate_k_mixture_model
+    validate_k_mixture_model.cpp
+)
+
+target_link_libraries(
+    validate_k_mixture_model
+    libpinyin
+)
+
+add_executable(
+    eval_correction_rate
+    eval_correction_rate.cpp
+)
+
+target_link_libraries(
+    eval_correction_rate
+    libpinyin
+)
+\ No newline at end of file
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
new file mode 100644
index 0000000..dc834ec
--- /dev/null
+++ b/utils/training/Makefile.am
@@ -0,0 +1,97 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+MAINTAINERCLEANFILES    = Makefile.in
+
+INCLUDES		= -I$(top_srcdir)/src \
+			  -I$(top_srcdir)/src/include \
+			  -I$(top_srcdir)/src/storage \
+			  -I$(top_srcdir)/src/lookup \
+			  -I$(top_srcdir)/utils \
+			  @GLIB2_CFLAGS@
+
+noinst_HEADERS		= k_mixture_model.h
+
+bin_PROGRAMS		= gen_unigram
+
+noinst_PROGRAMS		= gen_ngram \
+			  gen_deleted_ngram \
+			  gen_k_mixture_model \
+			  estimate_interpolation \
+			  estimate_k_mixture_model \
+			  merge_k_mixture_model \
+			  prune_k_mixture_model \
+			  import_k_mixture_model \
+			  export_k_mixture_model \
+			  k_mixture_model_to_interpolation \
+			  validate_k_mixture_model \
+			  eval_correction_rate
+
+gen_ngram_SOURCES	= gen_ngram.cpp
+
+gen_ngram_LDADD		= ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_deleted_ngram_SOURCES = gen_deleted_ngram.cpp
+
+gen_deleted_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_unigram_SOURCES     = gen_unigram.cpp
+
+gen_unigram_LDADD       = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_k_mixture_model_SOURCES = gen_k_mixture_model.cpp
+
+gen_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+estimate_interpolation_SOURCES = estimate_interpolation.cpp
+
+estimate_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+estimate_k_mixture_model_SOURCES = estimate_k_mixture_model.cpp
+
+estimate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \
+					@GLIB2_LIBS@
+
+merge_k_mixture_model_SOURCES = merge_k_mixture_model.cpp
+
+merge_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+prune_k_mixture_model_SOURCES = prune_k_mixture_model.cpp
+
+prune_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+import_k_mixture_model_SOURCES = import_k_mixture_model.cpp
+
+import_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+export_k_mixture_model_SOURCES = export_k_mixture_model.cpp
+
+export_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp
+
+k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin_internal.la \
+						@GLIB2_LIBS@
+
+validate_k_mixture_model_SOURCES = validate_k_mixture_model.cpp
+
+validate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \
+					@GLIB2_LIBS@
+
+eval_correction_rate_SOURCES = eval_correction_rate.cpp
+
+eval_correction_rate_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp
new file mode 100644
index 0000000..5cdc680
--- /dev/null
+++ b/utils/training/estimate_interpolation.cpp
@@ -0,0 +1,144 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2006-2008 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <math.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/**
+ * compute_interpolation:
+ * @deleted_bigram: the held-out ("deleted") successor counts of one token.
+ * @unigram: the phrase index which supplies the unigram frequencies.
+ * @bigram: the trained successor counts of the same token, may be NULL.
+ *
+ * Re-estimates lambda until two consecutive values differ by at most
+ * epsilon.  One pass sums, over every successor in @deleted_bigram,
+ *   deleted_count * (lambda * P_bigram) /
+ *                   (lambda * P_bigram + (1 - lambda) * P_unigram)
+ * and divides the sum by the total frequency of @deleted_bigram.
+ * A NULL @bigram contributes P_bigram = 0 for every successor.
+ *
+ * Returns: the last computed lambda.
+ */
+parameter_t compute_interpolation(SingleGram * deleted_bigram,
+                                  FacadePhraseIndex * unigram,
+                                  SingleGram * bigram){
+    parameter_t lambda = 0, next_lambda = 0.6;
+    const parameter_t epsilon = 0.001;
+
+    /* Nothing below writes to deleted_bigram, so its items and its total
+     * frequency are fetched once instead of once per pass. */
+    BigramPhraseWithCountArray array = g_array_new
+        (FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+    deleted_bigram->retrieve_all(array);
+
+    /* The call stays outside assert(): an NDEBUG build drops the whole
+     * assert expression and would never fill table_num. */
+    guint32 table_num = 0;
+    const bool has_table_num = deleted_bigram->get_total_freq(table_num);
+    assert(has_table_num);
+    (void) has_table_num;
+
+    while (fabs(lambda - next_lambda) > epsilon){
+        lambda = next_lambda;
+        next_lambda = 0;
+
+        for (guint i = 0; i < array->len; ++i){
+            BigramPhraseItemWithCount * item = &g_array_index
+                (array, BigramPhraseItemWithCount, i);
+            const phrase_token_t token = item->m_token;
+            const guint32 deleted_count = item->m_count;
+
+            /* lambda * P_bigram(token) */
+            parameter_t numerator = 0;
+            {
+                guint32 freq = 0;
+                parameter_t elem_poss = 0;
+                if (bigram && bigram->get_freq(token, freq)){
+                    guint32 total_freq = 0;
+                    const bool has_total =
+                        bigram->get_total_freq(total_freq);
+                    assert(has_total);
+                    (void) has_total;
+                    assert(0 != total_freq);
+                    elem_poss = freq / (parameter_t) total_freq;
+                }
+                numerator = lambda * elem_poss;
+            }
+
+            /* (1 - lambda) * P_unigram(token) */
+            parameter_t part_of_denominator = 0;
+            {
+                parameter_t elem_poss = 0;
+                PhraseItem phrase_item;
+                /* the item is only read when get_phrase_item() returns
+                 * zero, exactly as before. */
+                if (!unigram->get_phrase_item(token, phrase_item)){
+                    guint32 freq = phrase_item.get_unigram_frequency();
+                    guint32 total_freq =
+                        unigram->get_phrase_index_total_freq();
+                    elem_poss = freq / (parameter_t) total_freq;
+                }
+                part_of_denominator = (1 - lambda) * elem_poss;
+            }
+
+            /* both probabilities are zero: nothing to add. */
+            if (0 == (numerator + part_of_denominator))
+                continue;
+
+            next_lambda += deleted_count *
+                (numerator / (numerator + part_of_denominator));
+        }
+
+        next_lambda /= table_num;
+    }
+
+    g_array_free(array, TRUE);
+    return next_lambda;
+}
+    
+/* Estimates the interpolation lambda of every token stored in the deleted
+ * bi-gram, prints one "token:… lambda:…" line per token and finally the
+ * average of all lambdas.  Exits with ENOENT when a data file can not be
+ * opened and with ENODATA when no lambda could be estimated. */
+int main(int argc, char * argv[]){
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    /* a failed attach() used to go unnoticed and the tool carried on
+     * with an unusable bi-gram. */
+    Bigram bigram;
+    if (!bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed!\n", SYSTEM_BIGRAM);
+        exit(ENOENT);
+    }
+
+    Bigram deleted_bigram;
+    if (!deleted_bigram.attach(DELETED_BIGRAM, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed!\n", DELETED_BIGRAM);
+        exit(ENOENT);
+    }
+
+    GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    deleted_bigram.get_all_items(deleted_items);
+
+    parameter_t lambda_sum = 0;
+    int lambda_count = 0;
+
+    for (guint i = 0; i < deleted_items->len; ++i){
+        phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
+
+        /* compute_interpolation() dereferences the deleted single gram,
+         * so a token that fails to load is skipped. */
+        SingleGram * deleted_single_gram = NULL;
+        deleted_bigram.load(*token, deleted_single_gram);
+        if (NULL == deleted_single_gram)
+            continue;
+
+        /* the trained single gram may legitimately be missing. */
+        SingleGram * single_gram = NULL;
+        bigram.load(*token, single_gram);
+
+        parameter_t lambda = compute_interpolation
+            (deleted_single_gram, &phrase_index, single_gram);
+
+        printf("token:%d lambda:%f\n", *token, lambda);
+
+        lambda_sum += lambda;
+        lambda_count ++;
+
+        /* deleting a NULL pointer is a no-op. */
+        delete single_gram;
+        delete deleted_single_gram;
+    }
+
+    g_array_free(deleted_items, TRUE);
+
+    /* avoid printing the result of 0 / 0. */
+    if (0 == lambda_count) {
+        fprintf(stderr, "no lambda estimated from %s.\n", DELETED_BIGRAM);
+        exit(ENODATA);
+    }
+
+    printf("average lambda:%f\n", (lambda_sum/lambda_count));
+    return 0;
+}
+
diff --git a/utils/training/estimate_k_mixture_model.cpp b/utils/training/estimate_k_mixture_model.cpp
new file mode 100644
index 0000000..c0fa03f
--- /dev/null
+++ b/utils/training/estimate_k_mixture_model.cpp
@@ -0,0 +1,159 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+/* Default model files; each one can be replaced on the command line. */
+static const gchar * bigram_filename =
+    "k_mixture_model_ngram.db";
+static const gchar * deleted_bigram_filename =
+    "k_mixture_model_deleted_ngram.db";
+
+/* Command line options of this tool, terminated by the {NULL} entry. */
+static GOptionEntry entries[] =
+{
+    {
+        "bigram-file", 0, 0, G_OPTION_ARG_FILENAME,
+        &bigram_filename, "the bigram file", NULL
+    },
+    {
+        "deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME,
+        &deleted_bigram_filename, "the deleted bigram file", NULL
+    },
+    {NULL}
+};
+
+
+/**
+ * compute_interpolation:
+ * @deleted_bigram: the held-out ("deleted") successor items of one token.
+ * @unigram: the k mixture model whose array headers and magic header
+ *           supply the unigram frequencies.
+ * @bigram: the trained successor items of the same token, may be NULL.
+ *
+ * Re-estimates lambda until two consecutive values differ by at most
+ * epsilon.  One pass sums, over every successor in @deleted_bigram,
+ *   m_WC * (lambda * P_bigram) /
+ *          (lambda * P_bigram + (1 - lambda) * P_unigram)
+ * and divides the sum by the m_WC of the @deleted_bigram array header.
+ * A NULL @bigram contributes P_bigram = 0 for every successor.
+ *
+ * Returns: the last computed lambda.
+ */
+parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram,
+                                  KMixtureModelBigram * unigram,
+                                  KMixtureModelSingleGram * bigram){
+    parameter_t lambda = 0, next_lambda = 0.6;
+    const parameter_t epsilon = 0.001;
+
+    /* Every lookup stays outside assert(): an NDEBUG build drops the
+     * whole assert expression and would never fill the output. */
+    KMixtureModelMagicHeader magic_header;
+    const bool has_magic_header = unigram->get_magic_header(magic_header);
+    assert(has_magic_header);
+    (void) has_magic_header;
+    assert(0 != magic_header.m_total_freq);
+
+    /* Nothing below writes to deleted_bigram, so its items and its array
+     * header are fetched once instead of once per pass. */
+    FlexibleBigramPhraseArray array = g_array_new
+        (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+    deleted_bigram->retrieve_all(array);
+
+    KMixtureModelArrayHeader deleted_header;
+    const bool has_deleted_header =
+        deleted_bigram->get_array_header(deleted_header);
+    assert(has_deleted_header);
+    (void) has_deleted_header;
+    assert(0 != deleted_header.m_WC);
+
+    while (fabs(lambda - next_lambda) > epsilon){
+        lambda = next_lambda;
+        next_lambda = 0;
+
+        for (size_t i = 0; i < array->len; ++i){
+            KMixtureModelArrayItemWithToken * item = &g_array_index
+                (array, KMixtureModelArrayItemWithToken, i);
+            const phrase_token_t token = item->m_token;
+            const guint32 deleted_count = item->m_item.m_WC;
+
+            /* lambda * P_bigram(token) */
+            parameter_t numerator = 0;
+            {
+                parameter_t elem_poss = 0;
+                KMixtureModelArrayItem array_item;
+                if (bigram && bigram->get_array_item(token, array_item)){
+                    KMixtureModelArrayHeader array_header;
+                    const bool has_header =
+                        bigram->get_array_header(array_header);
+                    assert(has_header);
+                    (void) has_header;
+                    assert(0 != array_header.m_WC);
+                    elem_poss = array_item.m_WC /
+                        (parameter_t) array_header.m_WC;
+                }
+                numerator = lambda * elem_poss;
+            }
+
+            /* (1 - lambda) * P_unigram(token) */
+            parameter_t part_of_denominator = 0;
+            {
+                parameter_t elem_poss = 0;
+                KMixtureModelArrayHeader array_header;
+                if (unigram->get_array_header(token, array_header)){
+                    elem_poss = array_header.m_freq /
+                        (parameter_t) magic_header.m_total_freq;
+                }
+                part_of_denominator = (1 - lambda) * elem_poss;
+            }
+
+            /* both probabilities are zero: nothing to add. */
+            if (0 == (numerator + part_of_denominator))
+                continue;
+
+            next_lambda += deleted_count *
+                (numerator / (numerator + part_of_denominator));
+        }
+
+        next_lambda /= deleted_header.m_WC;
+    }
+
+    g_array_free(array, TRUE);
+    return next_lambda;
+}
+
+/* Estimates the interpolation lambda of every token stored in the deleted
+ * k mixture model, prints one "token:… lambda:…" line per token with a
+ * non-zero deleted count and finally the average of all lambdas.
+ * Exits with EINVAL on bad options, ENOENT when a model can not be opened
+ * and ENODATA when no lambda could be estimated. */
+int main(int argc, char * argv[]){
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- estimate k mixture model");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    g_option_context_free(context);
+
+    /* TODO: magic header signature check here. */
+    /* the unigram data is read from the same file as the bigram data. */
+    KMixtureModelBigram unigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    if (!unigram.attach(bigram_filename, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed.\n", bigram_filename);
+        exit(ENOENT);
+    }
+
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    if (!bigram.attach(bigram_filename, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed.\n", bigram_filename);
+        exit(ENOENT);
+    }
+
+    KMixtureModelBigram deleted_bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    if (!deleted_bigram.attach(deleted_bigram_filename, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed.\n", deleted_bigram_filename);
+        exit(ENOENT);
+    }
+
+    GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    deleted_bigram.get_all_items(deleted_items);
+
+    parameter_t lambda_sum = 0;
+    int lambda_count = 0;
+
+    for (size_t i = 0; i < deleted_items->len; ++i){
+        phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
+
+        /* the deleted single gram is dereferenced below, so a token
+         * that fails to load is skipped. */
+        KMixtureModelSingleGram * deleted_single_gram = NULL;
+        deleted_bigram.load(*token, deleted_single_gram);
+        if (NULL == deleted_single_gram)
+            continue;
+
+        /* the trained single gram may legitimately be missing. */
+        KMixtureModelSingleGram * single_gram = NULL;
+        bigram.load(*token, single_gram);
+
+        /* Header lookups stay outside assert() so that NDEBUG builds
+         * still perform them. */
+        if (single_gram) {
+            KMixtureModelArrayHeader array_header;
+            const bool has_header =
+                single_gram->get_array_header(array_header);
+            assert(has_header);
+            (void) has_header;
+        }
+
+        KMixtureModelArrayHeader deleted_array_header;
+        const bool has_deleted_header =
+            deleted_single_gram->get_array_header(deleted_array_header);
+        assert(has_deleted_header);
+        (void) has_deleted_header;
+
+        /* compute_interpolation() divides by this count. */
+        if ( 0 != deleted_array_header.m_WC ) {
+            parameter_t lambda = compute_interpolation
+                (deleted_single_gram, &unigram, single_gram);
+
+            printf("token:%d lambda:%f\n", *token, lambda);
+
+            lambda_sum += lambda;
+            lambda_count ++;
+        }
+
+        /* deleting a NULL pointer is a no-op. */
+        delete single_gram;
+        delete deleted_single_gram;
+    }
+
+    g_array_free(deleted_items, TRUE);
+
+    /* avoid printing the result of 0 / 0. */
+    if (0 == lambda_count) {
+        fprintf(stderr, "no lambda estimated from %s.\n",
+                deleted_bigram_filename);
+        exit(ENODATA);
+    }
+
+    printf("average lambda:%f\n", (lambda_sum/lambda_count));
+    return 0;
+}
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp
new file mode 100644
index 0000000..b45781d
--- /dev/null
+++ b/utils/training/eval_correction_rate.cpp
@@ -0,0 +1,211 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+/* Writes the one-line usage text to standard output. */
+void print_help(){
+    fputs("Usage: eval_correction_rate\n", stdout);
+}
+
+/**
+ * get_possible_pinyin:
+ * @phrase_index: the phrase index used to look up every token.
+ * @tokens: the tokens of one sentence.
+ * @keys: output, emptied first, then filled with the keys of each token.
+ *
+ * For every token the pronunciation with the highest frequency is picked
+ * (the first one wins a tie, and also when every frequency is zero) and
+ * its keys, one per character of the phrase, are appended to @keys.
+ *
+ * Returns: always true.
+ */
+bool get_possible_pinyin(FacadePhraseIndex * phrase_index,
+                         TokenVector tokens, ChewingKeyVector keys){
+    ChewingKey buffer[MAX_PHRASE_LENGTH];
+    g_array_set_size(keys, 0);
+
+    for (size_t i = 0; i < tokens->len; ++i){
+        phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i);
+        PhraseItem item;
+        phrase_index->get_phrase_item(*token, item);
+
+        /* find the index of the most frequent pronunciation. */
+        size_t key_index = 0;
+        guint32 max_freq = 0;
+        for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) {
+            guint32 freq = 0;
+            /* The call stays outside assert(): an NDEBUG build drops
+             * the whole assert expression and would never read the
+             * frequency. */
+            const bool fetched = item.get_nth_pronunciation(m, buffer, freq);
+            assert(fetched);
+            (void) fetched;
+            if ( freq > max_freq ) {
+                key_index = m;
+                max_freq = freq;
+            }
+        }
+
+        /* reload the winner, the scan left the last candidate in buffer. */
+        guint32 freq = 0;
+        const bool fetched =
+            item.get_nth_pronunciation(key_index, buffer, freq);
+        assert(fetched);
+        (void) fetched;
+        assert(max_freq == freq);
+
+        guint8 len = item.get_phrase_length();
+        g_array_append_vals(keys, buffer, len);
+    }
+    return true;
+}
+
+/* Runs the lookup for keys with the sentence start token as the only
+ * prefix and no constraint on any key; the guessed tokens are stored in
+ * tokens.  Returns what PinyinLookup2::get_best_match() returns. */
+bool get_best_match(PinyinLookup2 * pinyin_lookup,
+                    ChewingKeyVector keys, TokenVector tokens){
+    /* the only prefix is the sentence start token. */
+    TokenVector prefixes =
+        g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    g_array_append_val(prefixes, sentence_start);
+
+    /* one unconstrained slot per key. */
+    CandidateConstraints constraints =
+        g_array_new(TRUE, FALSE, sizeof(lookup_constraint_t));
+    g_array_set_size(constraints, keys->len);
+
+    size_t pos = 0;
+    while (pos < constraints->len) {
+        g_array_index(constraints, lookup_constraint_t, pos).m_type =
+            NO_CONSTRAINT;
+        ++pos;
+    }
+
+    const bool found = pinyin_lookup->get_best_match
+        (prefixes, keys, constraints, tokens);
+
+    g_array_free(constraints, TRUE);
+    g_array_free(prefixes, TRUE);
+    return found;
+}
+
+/**
+ * do_one_test:
+ * @pinyin_lookup: the lookup engine under test.
+ * @phrase_index: the phrase index used to derive the keys of @tokens.
+ * @tokens: the tokens of the reference sentence.
+ *
+ * Converts @tokens to keys, asks @pinyin_lookup for its best guess for
+ * those keys and compares both sentences as UTF-8 strings.  A mismatch
+ * is reported on stderr.
+ *
+ * Returns: true when the guessed sentence equals the reference sentence.
+ */
+bool do_one_test(PinyinLookup2 * pinyin_lookup,
+                 FacadePhraseIndex * phrase_index,
+                 TokenVector tokens){
+    ChewingKeyVector keys = g_array_new(FALSE, TRUE, sizeof(ChewingKey));
+    TokenVector guessed_tokens = g_array_new
+        (FALSE, TRUE, sizeof(phrase_token_t));
+
+    get_possible_pinyin(phrase_index, tokens, keys);
+    get_best_match(pinyin_lookup, keys, guessed_tokens);
+
+    /* compare the results */
+    char * sentence = NULL; char * guessed_sentence = NULL;
+    pinyin_lookup->convert_to_utf8(tokens, sentence);
+    pinyin_lookup->convert_to_utf8
+        (guessed_tokens, guessed_sentence);
+
+    /* Either string is still NULL when its conversion produced nothing;
+     * neither strcmp() nor "%s" may be handed a NULL pointer. */
+    const bool retval = sentence && guessed_sentence &&
+        0 == strcmp(sentence, guessed_sentence);
+
+    if ( !retval ) {
+        fprintf(stderr, "test sentence:%s\n",
+                sentence ? sentence : "(null)");
+        fprintf(stderr, "guessed sentence:%s\n",
+                guessed_sentence ? guessed_sentence : "(null)");
+        fprintf(stderr, "the result mis-matches.\n");
+    }
+
+    g_free(sentence); g_free(guessed_sentence);
+    g_array_free(keys, TRUE);
+    g_array_free(guessed_tokens, TRUE);
+    return retval;
+}
+
+/* Evaluates the correction rate of the lookup on "evals2.text".
+ * Consecutive lines which parse to a token form one test sentence; a
+ * line which parses to null_token ends the sentence and triggers the
+ * test.  Prints "correction rate:…" at the end.
+ * Exits with ENOENT when a data file can not be opened and with ENODATA
+ * when the input contains no test sentence. */
+int main(int argc, char * argv[]){
+    const char * evals_text = "evals2.text";
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    pinyin_option_t options = USE_TONE;
+    FacadeChewingTable largetable;
+
+    /* a chunk that failed to load used to be handed to the table. */
+    MemoryChunk * chunk = new MemoryChunk;
+    if (!chunk->load(SYSTEM_PINYIN_INDEX)) {
+        fprintf(stderr, "open %s failed!\n", SYSTEM_PINYIN_INDEX);
+        exit(ENOENT);
+    }
+    largetable.load(options, chunk, NULL);
+
+    FacadePhraseTable2 phrase_table;
+    chunk = new MemoryChunk;
+    if (!chunk->load(SYSTEM_PHRASE_INDEX)) {
+        fprintf(stderr, "open %s failed!\n", SYSTEM_PHRASE_INDEX);
+        exit(ENOENT);
+    }
+    phrase_table.load(chunk, NULL);
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    Bigram system_bigram;
+    if (!system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed!\n", SYSTEM_BIGRAM);
+        exit(ENOENT);
+    }
+    Bigram user_bigram;
+    if (!user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE)) {
+        fprintf(stderr, "create user bigram failed!\n");
+        exit(ENOENT);
+    }
+
+    gfloat lambda = system_table_info.get_lambda();
+
+    PinyinLookup2 pinyin_lookup(lambda, options,
+                                &largetable, &phrase_index,
+                                &system_bigram, &user_bigram);
+
+    /* open evals text. */
+    FILE * evals_file = fopen(evals_text, "r");
+    if ( NULL == evals_file ) {
+        fprintf(stderr, "Can't open file:%s\n", evals_text);
+        exit(ENOENT);
+    }
+
+    /* Evaluates the correction rate of test text documents. */
+    size_t tested_count = 0; size_t passed_count = 0;
+    char* linebuf = NULL; size_t size = 0;
+    TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t));
+
+    phrase_token_t token = null_token;
+    /* getline() returns -1 on end of file and on error.  Testing that
+     * value keeps a final line without a trailing newline (the old
+     * feof() check dropped it) and ends the loop on a read error. */
+    ssize_t nread = 0;
+    while ( (nread = getline(&linebuf, &size, evals_file)) != -1 ) {
+        if ( nread > 0 && '\n' == linebuf[nread - 1] ) {
+            linebuf[nread - 1] = '\0';
+        }
+
+        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+        if ( null_token == token ) {
+            if ( tokens->len ) { /* one test. */
+                tested_count ++;
+                if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) )
+                    passed_count ++;
+                g_array_set_size(tokens, 0);
+            }
+        } else {
+            g_array_append_val(tokens, token);
+        }
+    }
+
+    /* the last sentence may not be followed by a separator line. */
+    if ( tokens->len ) { /* one test. */
+        tested_count ++;
+        if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) )
+            passed_count ++;
+    }
+
+    g_array_free(tokens, TRUE);
+    fclose(evals_file);
+    free(linebuf);
+
+    /* avoid printing the result of 0 / 0. */
+    if ( 0 == tested_count ) {
+        fprintf(stderr, "no test sentence found in %s\n", evals_text);
+        exit(ENODATA);
+    }
+
+    parameter_t rate = passed_count / (parameter_t) tested_count;
+    printf("correction rate:%f\n", rate);
+
+    return 0;
+}
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
new file mode 100644
index 0000000..e446e79
--- /dev/null
+++ b/utils/training/export_k_mixture_model.cpp
@@ -0,0 +1,156 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+#include "utils_helper.h"
+
+/* The model file to export; stays NULL unless given on the command line. */
+static const gchar * k_mixture_model_filename = NULL;
+
+/* Command line options of this tool, terminated by the {NULL} entry. */
+static GOptionEntry entries[] =
+{
+    {
+        "k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME,
+        &k_mixture_model_filename, "k mixture model file", NULL
+    },
+    {NULL}
+};
+
+
+/* Writes the "\data" line carrying m_WC, m_N and m_total_freq of the
+ * magic header to output.  Exits with ENODATA when the model has no
+ * magic header; otherwise returns true. */
+bool print_k_mixture_model_magic_header(FILE * output,
+                                        KMixtureModelBigram * bigram){
+    KMixtureModelMagicHeader header;
+    const bool found = bigram->get_magic_header(header);
+
+    if ( !found ){
+        fprintf(stderr, "no magic header in k mixture model.\n");
+        exit(ENODATA);
+    }
+
+    fprintf(output,
+            "\\data model \"k mixture model\" "
+            "count %d N %d total_freq %d\n",
+            header.m_WC, header.m_N, header.m_total_freq);
+    return true;
+}
+
+/**
+ * print_k_mixture_model_array_headers:
+ * @output: the stream to write to.
+ * @bigram: the k mixture model to export.
+ * @phrase_index: the phrase index used to turn tokens into strings.
+ *
+ * Writes the "\1-gram" section: one "\item" line with the m_WC and
+ * m_freq of the array header for every token of @bigram.  Tokens which
+ * can not be converted to a string are left out.
+ *
+ * Returns: always true.
+ */
+bool print_k_mixture_model_array_headers(FILE * output,
+                                         KMixtureModelBigram * bigram,
+                                         FacadePhraseIndex * phrase_index){
+    fprintf(output, "\\1-gram\n");
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t token = g_array_index(items, phrase_token_t, i);
+        KMixtureModelArrayHeader array_header;
+        /* The call stays outside assert(): an NDEBUG build drops the
+         * whole assert expression and would never fill the header. */
+        const bool has_header = bigram->get_array_header(token, array_header);
+        assert(has_header);
+        (void) has_header;
+
+        char * phrase = taglib_token_to_string(phrase_index, token);
+        if ( phrase )
+            fprintf(output, "\\item %d %s count %d freq %d\n",
+                    token, phrase, array_header.m_WC, array_header.m_freq);
+
+        g_free(phrase);
+    }
+
+    /* the token array was never released before. */
+    g_array_free(items, TRUE);
+    return true;
+}
+
+/**
+ * print_k_mixture_model_array_items:
+ * @output: the stream to write to.
+ * @bigram: the k mixture model to export.
+ * @phrase_index: the phrase index used to turn tokens into strings.
+ *
+ * Writes the "\2-gram" section: one "\item" line for every pair of a
+ * token of @bigram and one of its successors.  Pairs with a token which
+ * can not be converted to a string are left out.
+ *
+ * Returns: always true.
+ */
+bool print_k_mixture_model_array_items(FILE * output,
+                                       KMixtureModelBigram * bigram,
+                                       FacadePhraseIndex * phrase_index){
+    fprintf(output, "\\2-gram\n");
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t token = g_array_index(items, phrase_token_t, i);
+        KMixtureModelSingleGram * single_gram = NULL;
+        /* The call stays outside assert(): an NDEBUG build drops the
+         * whole assert expression and would leave single_gram NULL. */
+        const bool loaded = bigram->load(token, single_gram);
+        assert(loaded);
+        (void) loaded;
+        if (NULL == single_gram)
+            continue;
+
+        FlexibleBigramPhraseArray array = g_array_new
+            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        single_gram->retrieve_all(array);
+
+        /* the first word is the same for every successor. */
+        char * word1 = taglib_token_to_string(phrase_index, token);
+
+        for (size_t m = 0; m < array->len; ++m){
+            KMixtureModelArrayItemWithToken * item = &g_array_index
+                (array, KMixtureModelArrayItemWithToken, m);
+            char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+
+            /* m_WC is written for both the "count" and the "T" field. */
+            if (word1 && word2)
+                fprintf(output, "\\item %d %s %d %s count %d T %d N_n_0 %d n_1 %d Mr %d\n",
+                        token, word1, item->m_token, word2,
+                        item->m_item.m_WC, item->m_item.m_WC,
+                        item->m_item.m_N_n_0, item->m_item.m_n_1,
+                        item->m_item.m_Mr);
+
+            g_free(word2);
+        }
+
+        g_free(word1);
+        g_array_free(array, TRUE);
+        delete single_gram;
+    }
+
+    g_array_free(items, TRUE);
+    return true;
+}
+
+/* Writes the closing "\end" line to output; always returns true. */
+bool end_data(FILE * output){
+    fputs("\\end\n", output);
+    return true;
+}
+
+/* Exports the k mixture model given by --k-mixture-model-file as text on
+ * standard output: the "\data" line, the "\1-gram" section, the
+ * "\2-gram" section and the closing "\end" line.
+ * Exits with EINVAL on bad or missing options and with ENOENT when a
+ * data file can not be opened. */
+int main(int argc, char * argv[]){
+    FILE * output = stdout;
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- export k mixture model");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    g_option_context_free(context);
+
+    /* The file name has no default; without the option it is still NULL
+     * and must reach neither attach() nor a "%s" conversion. */
+    if (NULL == k_mixture_model_filename) {
+        fprintf(stderr, "please specify --k-mixture-model-file.\n");
+        exit(EINVAL);
+    }
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    if (!bigram.attach(k_mixture_model_filename, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed.\n", k_mixture_model_filename);
+        exit(ENOENT);
+    }
+
+    print_k_mixture_model_magic_header(output, &bigram);
+    print_k_mixture_model_array_headers(output, &bigram, &phrase_index);
+    print_k_mixture_model_array_items(output, &bigram, &phrase_index);
+
+    end_data(output);
+
+    return 0;
+}
diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp
new file mode 100644
index 0000000..b6f96fa
--- /dev/null
+++ b/utils/training/gen_deleted_ngram.cpp
@@ -0,0 +1,128 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2006-2007, 2011 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* Pi-gram training stays enabled unless --skip-pi-gram-training is given. */
+static gboolean train_pi_gram = TRUE;
+/* The bi-gram file to update, DELETED_BIGRAM unless overridden. */
+static const gchar * bigram_filename = DELETED_BIGRAM;
+
+/* Command line options of this tool, terminated by the {NULL} entry. */
+static GOptionEntry entries[] =
+{
+    {
+        "skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE,
+        &train_pi_gram, "skip pi-gram training", NULL
+    },
+    {
+        "deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME,
+        &bigram_filename, "deleted bi-gram file", NULL
+    },
+    {NULL}
+};
+
+
+/* Reads segmented lines from standard input and counts every pair of
+ * consecutive tokens in the deleted bi-gram file.  A line which parses
+ * to null_token separates sentences; the first token of a sentence is
+ * counted after sentence_start unless pi-gram training is skipped.
+ * Exits with EINVAL on bad options, ENOENT when a data file can not be
+ * opened and ENODATA when the phrase index can not be loaded. */
+int main(int argc, char * argv[]){
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- generate deleted n-gram");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    g_option_context_free(context);
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* load phrase table. */
+    PhraseLargeTable2 phrase_table;
+    MemoryChunk * new_chunk = new MemoryChunk;
+    /* a chunk that failed to load used to be handed to the table. */
+    if (!new_chunk->load(SYSTEM_PHRASE_INDEX)) {
+        fprintf(stderr, "open phrase_index.bin failed!\n");
+        exit(ENOENT);
+    }
+    phrase_table.load(new_chunk);
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENODATA);
+
+    Bigram bigram;
+    if (!bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE)) {
+        fprintf(stderr, "open %s failed!\n", bigram_filename);
+        exit(ENOENT);
+    }
+
+    char* linebuf = NULL; size_t size = 0;
+    phrase_token_t last_token, cur_token = last_token = 0;
+    /* getline() returns -1 on end of file and on error.  Testing that
+     * value keeps a final line without a trailing newline (the old
+     * feof() check dropped it) and ends the loop on a read error. */
+    ssize_t nread = 0;
+    while ( (nread = getline(&linebuf, &size, stdin)) != -1 ){
+        if ( nread > 0 && '\n' == linebuf[nread - 1] ) {
+            linebuf[nread - 1] = '\0';
+        }
+
+        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+        last_token = cur_token;
+        cur_token = token;
+
+        /* skip null_token in second word. */
+        if ( null_token == cur_token )
+            continue;
+
+        /* skip pi-gram training. */
+        if ( null_token == last_token ){
+            if ( !train_pi_gram )
+                continue;
+            last_token = sentence_start;
+        }
+
+        /* train bi-gram */
+        SingleGram * single_gram = NULL;
+        bigram.load(last_token, single_gram);
+
+        if ( NULL == single_gram ){
+            single_gram = new SingleGram;
+        }
+
+        /* Both start at zero so that a failed lookup can not feed an
+         * indeterminate value into the counts below. */
+        guint32 freq = 0, total_freq = 0;
+
+        /* increase freq; the updating call stays outside assert(), an
+         * NDEBUG build would otherwise drop the update itself. */
+        bool updated = false;
+        if (single_gram->get_freq(cur_token, freq))
+            updated = single_gram->set_freq(cur_token, freq + 1);
+        else
+            updated = single_gram->insert_freq(cur_token, 1);
+        assert(updated);
+        (void) updated;
+
+        /* increase total freq */
+        single_gram->get_total_freq(total_freq);
+        single_gram->set_total_freq(total_freq + 1);
+
+        bigram.store(last_token, single_gram);
+        delete single_gram;
+    }
+
+    free(linebuf);
+    return 0;
+}
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
new file mode 100644
index 0000000..2dfb3d1
--- /dev/null
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -0,0 +1,411 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+
+#include <glib.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+#include "k_mixture_model.h"
+
+/* HashofDocument: first token -> HashofSecondWord (second token -> pair count). */
+typedef GHashTable * HashofDocument;
+typedef GHashTable * HashofSecondWord;
+/* HashofUnigram: token -> number of occurrences in the current document. */
+typedef GHashTable * HashofUnigram;
+
+
+void print_help(){ /* writes the command line synopsis to stdout. */
+    fputs("Usage: gen_k_mixture_model [--skip-pi-gram-training]\n"
+          "                           [--maximum-occurs-allowed <INT>]\n"
+          "                           [--maximum-increase-rates-allowed <FLOAT>]\n"
+          "                           [--k-mixture-model-file <FILENAME>]\n"
+          "                           {<FILENAME>}+\n", stdout);
+}
+
+
+static gint g_maximum_occurs = 20; /* pair count limit per document, see train_word_pair. */
+static parameter_t g_maximum_increase_rates = 3.; /* multiplied with the stored m_Mr to raise that limit. */
+static gboolean g_train_pi_gram = TRUE; /* when set, a word following a null token is paired with sentence_start. */
+static const gchar * g_k_mixture_model_filename = NULL; /* passed to bigram.attach() in main. */
+/* Command line options; each entry stores its value in one of the variables above. */
+static GOptionEntry entries[] =
+{
+    {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &g_train_pi_gram, "skip pi-gram training", NULL},
+    {"maximum-occurs-allowed", 0, 0, G_OPTION_ARG_INT, &g_maximum_occurs, "maximum occurs allowed", NULL},
+    {"maximum-increase-rates-allowed", 0, 0, G_OPTION_ARG_DOUBLE, &g_maximum_increase_rates, "maximum increase rates allowed", NULL},
+    {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &g_k_mixture_model_filename, "k mixture model file", NULL},
+    {NULL}
+};
+
+
+bool read_document(PhraseLargeTable2 * phrase_table,
+                   FacadePhraseIndex * phrase_index,
+                   FILE * document,
+                   HashofDocument hash_of_document,
+                   HashofUnigram hash_of_unigram){
+    /* Reads the segmented text in document line by line and counts:
+     *  hash_of_unigram:  token -> occurrences of the token;
+     *  hash_of_document: first token -> (second token -> pair count).
+     * phrase_table is not used here.  Always returns true.
+     */
+    char * linebuf = NULL; size_t size = 0;
+    ssize_t length = 0;
+    phrase_token_t last_token, cur_token = last_token = 0;
+
+    /* getline returns -1 at end of file and on error, never zero.  The
+     * former "non-zero result plus feof()" test did not stop on a read
+     * error and dropped a last line without trailing newline.
+     */
+    while ( (length = getline(&linebuf, &size, document)) != -1 ){
+        /* use the returned length: strlen() is 0 after a leading NUL. */
+        if ( length > 0 && '\n' == linebuf[length - 1] ) {
+            linebuf[length - 1] = '\0';
+        }
+
+        TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf);
+
+        last_token = cur_token;
+        cur_token = token;
+
+        /* skip null_token in second word. */
+        if ( null_token == cur_token )
+            continue;
+
+        /* count the occurrence of cur_token. */
+        gpointer value = NULL;
+        gboolean lookup_result = g_hash_table_lookup_extended
+            (hash_of_unigram, GUINT_TO_POINTER(cur_token),
+             NULL, &value);
+        guint32 freq = lookup_result ? GPOINTER_TO_UINT(value) : 0;
+        freq ++;
+        g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token),
+                            GUINT_TO_POINTER(freq));
+
+        /* skip pi-gram training. */
+        if ( null_token == last_token ){
+            if ( !g_train_pi_gram )
+                continue;
+            last_token = sentence_start;
+        }
+
+        /* remember the (last_token, cur_token) word pair. */
+        HashofSecondWord hash_of_second_word = NULL;
+        lookup_result = g_hash_table_lookup_extended
+            (hash_of_document, GUINT_TO_POINTER(last_token),
+             NULL, &value);
+        if ( !lookup_result ){
+            hash_of_second_word = g_hash_table_new
+                (g_direct_hash, g_direct_equal);
+        } else {
+            hash_of_second_word = (HashofSecondWord) value;
+        }
+
+        value = NULL;
+        lookup_result = g_hash_table_lookup_extended
+            (hash_of_second_word, GUINT_TO_POINTER(cur_token),
+             NULL, &value);
+        guint32 count = lookup_result ? GPOINTER_TO_UINT(value) : 0;
+        count ++;
+        g_hash_table_insert(hash_of_second_word,
+                            GUINT_TO_POINTER(cur_token),
+                            GUINT_TO_POINTER(count));
+        g_hash_table_insert(hash_of_document,
+                            GUINT_TO_POINTER(last_token),
+                            hash_of_second_word);
+    }
+
+    free(linebuf);
+
+    return true;
+}
+
+/* Takes count occurrences of token back from hash_of_unigram and drops
+ * the entry once nothing is left; used for rejected word pairs. */
+static void discount_unigram(HashofUnigram hash_of_unigram,
+                             phrase_token_t token, guint32 count){
+    gpointer value = NULL;
+    gboolean found = g_hash_table_lookup_extended
+        (hash_of_unigram, GUINT_TO_POINTER(token), NULL, &value);
+    assert(found);
+
+    guint32 freq = GPOINTER_TO_UINT(value);
+    /* freq is unsigned, so underflow must be checked before subtracting;
+     * the old "else assert(false)" after the subtraction was unreachable. */
+    assert(freq >= count);
+    freq -= count;
+    if ( freq > 0 ) {
+        g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token),
+                            GUINT_TO_POINTER(freq));
+    } else {
+        gboolean stolen = g_hash_table_steal
+            (hash_of_unigram, GUINT_TO_POINTER(token));
+        assert(stolen);
+    }
+}
+
+static void train_word_pair(HashofUnigram hash_of_unigram,
+                            KMixtureModelSingleGram * single_gram,
+                            phrase_token_t token2, guint32 count){
+    /* Merges the count of one (first word, token2) pair of a document
+     * into single_gram.  A count above the allowed maximum is rejected
+     * and taken back from hash_of_unigram instead.  Calls with side
+     * effects are kept out of assert(), NDEBUG would remove them.
+     */
+    KMixtureModelArrayItem array_item;
+
+    bool exists = single_gram->get_array_item(token2, array_item);
+    /* maximum occurs allowed of the word or phrase, in a single document. */
+    guint32 maximum_occurs_allowed = (guint32)g_maximum_occurs;
+    if ( exists ) {
+        maximum_occurs_allowed = std_lite::max
+            (maximum_occurs_allowed,
+             (guint32)ceil(array_item.m_Mr * g_maximum_increase_rates));
+    }
+    if ( count > maximum_occurs_allowed ){
+        discount_unigram(hash_of_unigram, token2, count);
+        return;
+    }
+
+    bool stored = false;
+    if ( exists ) {
+        array_item.m_WC += count;
+        /* array_item.m_T += count; the same as m_WC. */
+        array_item.m_N_n_0 ++;
+        if ( 1 == count )
+            array_item.m_n_1 ++;
+        array_item.m_Mr = std_lite::max(array_item.m_Mr, count);
+        stored = single_gram->set_array_item(token2, array_item);
+    } else { /* item doesn't exist. */
+        memset(&array_item, 0, sizeof(KMixtureModelArrayItem));
+        array_item.m_WC = count;
+        /* array_item.m_T = count; the same as m_WC. */
+        array_item.m_N_n_0 = 1;
+        if ( 1 == count )
+            array_item.m_n_1 = 1;
+        array_item.m_Mr = count;
+        stored = single_gram->insert_array_item(token2, array_item);
+    }
+    assert(stored);
+
+    /* save delta in the array header. */
+    KMixtureModelArrayHeader array_header;
+    single_gram->get_array_header(array_header);
+    array_header.m_WC += count;
+    single_gram->set_array_header(array_header);
+}
+
+bool train_single_gram(HashofUnigram hash_of_unigram,
+                       HashofDocument hash_of_document,
+                       KMixtureModelSingleGram * single_gram,
+                       phrase_token_t token1,
+                       guint32 & delta){
+    /* Trains the word pairs of the document starting with token1 into
+     * single_gram; delta receives the growth of its header m_WC. */
+    assert(NULL != single_gram);
+    delta = 0; /* delta in WC of single_gram. */
+    KMixtureModelArrayHeader array_header;
+    bool retval = single_gram->get_array_header(array_header);
+    assert(retval); /* the call itself stays outside assert() for NDEBUG. */
+    guint32 saved_array_header_WC = array_header.m_WC;
+    gpointer key = NULL, value = NULL;
+    gboolean found = g_hash_table_lookup_extended
+        (hash_of_document, GUINT_TO_POINTER(token1), NULL, &value);
+    assert(found);
+    HashofSecondWord hash_of_second_word = (HashofSecondWord) value;
+    assert(NULL != hash_of_second_word);
+
+    /* train word pair */
+    GHashTableIter iter;
+    g_hash_table_iter_init(&iter, hash_of_second_word);
+    while (g_hash_table_iter_next(&iter, &key, &value)) {
+        train_word_pair(hash_of_unigram, single_gram,
+                        GPOINTER_TO_UINT(key), GPOINTER_TO_UINT(value));
+    }
+    retval = single_gram->get_array_header(array_header);
+    assert(retval);
+    delta = array_header.m_WC - saved_array_header_WC;
+    return true;
+}
+
+static bool train_second_word(HashofUnigram hash_of_unigram,
+                              KMixtureModelBigram * bigram,
+                              HashofDocument hash_of_document,
+                              phrase_token_t token1){
+    /* Trains the pairs of the document starting with token1 into the single
+     * gram stored for token1 and adds the gained count to the magic header.
+     * Returns false if nothing was trained or m_WC would overflow. */
+    guint32 delta = 0;
+    KMixtureModelSingleGram * single_gram = NULL;
+    if ( !bigram->load(token1, single_gram) )
+        single_gram = new KMixtureModelSingleGram;
+    train_single_gram(hash_of_unigram, hash_of_document,
+                      single_gram, token1, delta);
+
+    if ( 0 == delta ){ /* Please consider maximum occurs allowed. */
+        delete single_gram;
+        return false;
+    }
+
+    /* save the single gram, outside assert() so NDEBUG keeps the call. */
+    bool retval = bigram->store(token1, single_gram);
+    assert(retval);
+    delete single_gram;
+
+    KMixtureModelMagicHeader magic_header;
+    /* the first time to access the new k mixture model file. */
+    if (!bigram->get_magic_header(magic_header))
+        memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader));
+    if ( magic_header.m_WC + delta < magic_header.m_WC ){
+        fprintf(stderr, "the m_WC integer in magic header overflows.\n");
+        return false;
+    }
+    magic_header.m_WC += delta;
+    retval = bigram->set_magic_header(magic_header);
+    assert(retval);
+    return true;
+}
+/* Note: this method is a post-processing method, run this last.  Adds the
+ * document's unigram counts to bigram; returns false on m_total_freq overflow. */
+static bool post_processing_unigram(KMixtureModelBigram * bigram,
+                                    HashofUnigram hash_of_unigram){
+    GHashTableIter iter; gpointer key, value;
+    guint32 total_freq = 0;
+    g_hash_table_iter_init(&iter, hash_of_unigram);
+    while (g_hash_table_iter_next(&iter, &key, &value)){
+        guint32 token = GPOINTER_TO_UINT(key);
+        guint32 freq = GPOINTER_TO_UINT(value);
+        KMixtureModelArrayHeader array_header;
+        /* a header that could not be read was used uninitialized before. */
+        if ( !bigram->get_array_header(token, array_header) )
+            memset(&array_header, 0, sizeof(KMixtureModelArrayHeader));
+        array_header.m_freq += freq;
+        total_freq += freq;
+        bigram->set_array_header(token, array_header);
+    }
+    KMixtureModelMagicHeader magic_header;
+    bool retval = bigram->get_magic_header(magic_header);
+    assert(retval); /* the call itself stays outside assert() for NDEBUG. */
+    if ( magic_header.m_total_freq + total_freq < magic_header.m_total_freq ){
+        fprintf(stderr, "the m_total_freq in magic header overflows.\n");
+        return false;
+    }
+    magic_header.m_total_freq += total_freq;
+    retval = bigram->set_magic_header(magic_header);
+    assert(retval);
+    return true;
+}
+
+int main(int argc, char * argv[]){
+    /* Trains every document named on the command line into the k mixture
+     * model file.  Calls with side effects are made outside assert(),
+     * which NDEBUG compiles out together with its argument. */
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- generate k mixture model");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    SystemTableInfo system_table_info;
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    PhraseLargeTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk);
+
+    FacadePhraseIndex phrase_index;
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+    /* the arguments left after option parsing are the documents to train. */
+    for ( int i = 1; i < argc; ++i ){
+        const char * filename = argv[i];
+        FILE * document = fopen(filename, "r");
+        if ( NULL == document ){
+            int err_saved = errno;
+            fprintf(stderr, "can't open file: %s.\n", filename);
+            fprintf(stderr, "error:%s.\n", strerror(err_saved));
+            exit(err_saved);
+        }
+
+        HashofDocument hash_of_document = g_hash_table_new
+            (g_direct_hash, g_direct_equal);
+        HashofUnigram hash_of_unigram = g_hash_table_new
+            (g_direct_hash, g_direct_equal);
+
+        retval = read_document(&phrase_table, &phrase_index, document,
+                               hash_of_document, hash_of_unigram);
+        assert(retval);
+        fclose(document);
+        document = NULL;
+
+        GHashTableIter iter;
+        gpointer key, value;
+
+        /* train the document, and convert it to k mixture model. */
+        g_hash_table_iter_init(&iter, hash_of_document);
+        while (g_hash_table_iter_next(&iter, &key, &value)) {
+            phrase_token_t token1 = GPOINTER_TO_UINT(key);
+            train_second_word(hash_of_unigram, &bigram,
+                              hash_of_document, token1);
+        }
+
+        KMixtureModelMagicHeader magic_header;
+        retval = bigram.get_magic_header(magic_header);
+        assert(retval);
+        magic_header.m_N ++;
+        retval = bigram.set_magic_header(magic_header);
+        assert(retval);
+
+        post_processing_unigram(&bigram, hash_of_unigram);
+
+        /* free resources of g_hash_of_document */
+        g_hash_table_iter_init(&iter, hash_of_document);
+        while (g_hash_table_iter_next(&iter, &key, &value)) {
+            HashofSecondWord second_word = (HashofSecondWord) value;
+            g_hash_table_iter_steal(&iter);
+            g_hash_table_unref(second_word);
+        }
+        g_hash_table_unref(hash_of_document);
+        hash_of_document = NULL;
+        g_hash_table_unref(hash_of_unigram);
+        hash_of_unigram = NULL;
+    }
+
+    return 0;
+}
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
new file mode 100644
index 0000000..1947959
--- /dev/null
+++ b/utils/training/gen_ngram.cpp
@@ -0,0 +1,136 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2006-2007, 2011 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static gboolean train_pi_gram = TRUE; /* when set, a word following a null token is paired with sentence_start. */
+static const gchar * bigram_filename = SYSTEM_BIGRAM; /* passed to bigram.attach() in main. */
+/* Command line options; each entry stores its value in one of the variables above. */
+static GOptionEntry entries[] =
+{
+    {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL},
+    {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "bi-gram file", NULL},
+    {NULL}
+};
+
+int main(int argc, char * argv[]){
+    /* Trains the uni-gram and bi-gram frequencies from the segmented text
+     * read from stdin and saves the phrase index afterwards. */
+    FILE * input = stdin;
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- generate n-gram");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    SystemTableInfo system_table_info;
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    PhraseLargeTable2 phrase_table;
+    /* init phrase table */
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk);
+
+    FacadePhraseIndex phrase_index;
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    Bigram bigram;
+    bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+
+    char* linebuf = NULL; size_t size = 0;
+    ssize_t length = 0;
+    phrase_token_t last_token, cur_token = last_token = 0;
+    /* getline returns -1 at end of file and on error, never zero.  The
+     * former "non-zero result plus feof()" test did not stop on a read
+     * error and dropped a last line without trailing newline. */
+    while( (length = getline(&linebuf, &size, input)) != -1 ){
+        if ( length > 0 && '\n' == linebuf[length - 1] ) {
+            linebuf[length - 1] = '\0';
+        }
+
+        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+        last_token = cur_token;
+        cur_token = token;
+
+        /* skip null_token in second word. */
+        if ( null_token == cur_token )
+            continue;
+
+        /* training uni-gram */
+        phrase_index.add_unigram_frequency(cur_token, 1);
+
+        /* skip pi-gram training. */
+        if ( null_token == last_token ){
+            if ( !train_pi_gram )
+                continue;
+            last_token = sentence_start;
+        }
+
+        /* train bi-gram */
+        SingleGram * single_gram = NULL;
+        bigram.load(last_token, single_gram);
+        if ( NULL == single_gram ){
+            single_gram = new SingleGram;
+        }
+        guint32 freq, total_freq;
+        /* increase freq; the calls stay outside assert() for NDEBUG. */
+        if (single_gram->get_freq(cur_token, freq))
+            retval = single_gram->set_freq(cur_token, freq + 1);
+        else
+            retval = single_gram->insert_freq(cur_token, 1);
+        assert(retval);
+        /* increase total freq */
+        single_gram->get_total_freq(total_freq);
+        single_gram->set_total_freq(total_freq + 1);
+
+        bigram.store(last_token, single_gram);
+        delete single_gram;
+    }
+
+    free(linebuf);
+
+    if (!save_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    return 0;
+}
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
new file mode 100644
index 0000000..f4c51af
--- /dev/null
+++ b/utils/training/gen_unigram.cpp
@@ -0,0 +1,111 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static const gchar * table_dir = "."; /* directory joined with SYSTEM_TABLE_INFO in main. */
+/* Command line options; --table-dir stores its value in table_dir. */
+static GOptionEntry entries[] =
+{
+    {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+    {NULL}
+};
+
+/* increase all unigram frequency by a constant. */
+
+int main(int argc, char * argv[]){
+    /* Adds a constant to the uni-gram frequency of every token of the
+     * SYSTEM_FILE and DICTIONARY tables, then saves index and dictionary. */
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context = g_option_context_new("- increase uni-gram");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    SystemTableInfo system_table_info;
+    gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+    if (!system_table_info.load(filename)) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+    g_free(filename);
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+        const pinyin_table_info_t * table_info = phrase_files + i;
+        assert(table_info->m_dict_index == i);
+
+        /* only system and dictionary tables are processed. */
+        bool wanted = SYSTEM_FILE == table_info->m_file_type ||
+            DICTIONARY == table_info->m_file_type;
+        if (!wanted)
+            continue;
+
+        /* Note: please increase the value when corpus size becomes larger.
+         *  To avoid zero value when computing unigram frequency in float format.
+         */
+        guint32 freq = 1;
+#if 0
+        /* skip GBK_DICTIONARY. */
+        if (GBK_DICTIONARY == table_info->m_dict_index)
+            freq = 1;
+#endif
+
+        const char * binfile = table_info->m_system_filename;
+        MemoryChunk * chunk = new MemoryChunk;
+        if (!chunk->load(binfile)) {
+            fprintf(stderr, "load %s failed!\n", binfile);
+            exit(ENOENT);
+        }
+
+        phrase_index.load(i, chunk);
+
+        PhraseIndexRange range;
+        if (ERROR_OK != phrase_index.get_range(i, range))
+            continue;
+
+        /* NOTE(review): m_range_end itself is visited too; confirm it is inclusive. */
+        size_t token = range.m_range_begin;
+        while (token <= range.m_range_end) {
+            phrase_index.add_unigram_frequency(token, freq);
+            ++token;
+        }
+    }
+
+    if (!save_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    if (!save_dictionary(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    return 0;
+}
diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp
new file mode 100644
index 0000000..40870cf
--- /dev/null
+++ b/utils/training/import_k_mixture_model.cpp
@@ -0,0 +1,322 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+#include "k_mixture_model.h"
+
+static const gchar * k_mixture_model_filename = NULL; /* passed to bigram.attach() in main. */
+/* Command line options; --k-mixture-model-file stores its value in the variable above. */
+static GOptionEntry entries[] =
+{
+    {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL},
+    {NULL}
+};
+
+
+enum LINE_TYPE{ /* ids of the line tags registered with taglib_add_tag. */
+    BEGIN_LINE = 1, /* "\data" head line. */
+    END_LINE, /* "\end" line. */
+    GRAM_1_LINE, /* "\1-gram" section line. */
+    GRAM_2_LINE, /* "\2-gram" section line. */
+    GRAM_1_ITEM_LINE, /* "\item" line registered by parse_unigram. */
+    GRAM_2_ITEM_LINE /* "\item" line registered by parse_bigram. */
+};
+/* Arguments handed to every taglib_read call, shared by all parse functions. */
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+/* Forward declarations of the parsers defined below. */
+bool parse_headline(KMixtureModelBigram * bigram);
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                   FacadePhraseIndex * phrase_index,
+                   KMixtureModelBigram * bigram);
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                  FacadePhraseIndex * phrase_index,
+                  KMixtureModelBigram * bigram);
+
+
+static ssize_t my_getline(FILE * input){
+    /* reads the next line of input into linebuf without its trailing
+     * newline; returns the getline result, -1 at end of file or on error. */
+    ssize_t result = getline(&linebuf, &len, input);
+    /* index by the returned length: strlen() is 0 for a line starting
+     * with a NUL byte and strlen() - 1 then addresses outside linebuf. */
+    if ( result > 0 && '\n' == linebuf[result - 1] )
+        linebuf[result - 1] = '\0';
+    return result;
+}
+
+bool parse_headline(KMixtureModelBigram * bigram){
+    /* Parses the "\data" line held in linebuf and stores its count, N and
+     * total_freq values as magic header of bigram; false on a bad line. */
+    /* enter "\data" line; called outside assert(), NDEBUG would drop it. */
+    bool retval = taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", "");
+    assert(retval);
+    /* read "\data" line */
+    if ( !taglib_read(linebuf, line_type, values, required) ) {
+        fprintf(stderr, "error: k mixture model expected.\n");
+        return false;
+    }
+    assert(line_type == BEGIN_LINE);
+
+    /* check header */
+    TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+    if ( !( strcmp("k mixture model", model) == 0 ) ) {
+        fprintf(stderr, "error: k mixture model expected.\n");
+        return false;
+    }
+
+    TAGLIB_GET_TAGVALUE(glong, count, atol);
+    TAGLIB_GET_TAGVALUE(glong, N, atol);
+    TAGLIB_GET_TAGVALUE(glong, total_freq, atol);
+    KMixtureModelMagicHeader magic_header;
+    memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader));
+    magic_header.m_WC =count; magic_header.m_N = N;
+    magic_header.m_total_freq = total_freq;
+    bigram->set_magic_header(magic_header);
+    return true;
+}
+
+bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table,
+                FacadePhraseIndex * phrase_index,
+                KMixtureModelBigram * bigram){
+    /* Parses the "\1-gram" and "\2-gram" sections until the "\end" line;
+     * linebuf already holds the first line of the body.  Calls with side
+     * effects are made outside assert(), NDEBUG would remove them. */
+    taglib_push_state();
+
+    bool retval = taglib_add_tag(END_LINE, "\\end", 0, "", "");
+    retval = taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "") && retval;
+    retval = taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "") && retval;
+    assert(retval);
+
+    while ( true ) {
+        retval = taglib_read(linebuf, line_type, values, required);
+        assert(retval);
+        if ( END_LINE == line_type )
+            break;
+        /* stop at end of input, else the same line is dispatched forever. */
+        if ( my_getline(input) == -1 )
+            break;
+        if ( GRAM_1_LINE == line_type )
+            parse_unigram(input, phrase_table, phrase_index, bigram);
+        else if ( GRAM_2_LINE == line_type )
+            parse_bigram(input, phrase_table, phrase_index, bigram);
+        else
+            assert(false);
+    }
+
+    taglib_pop_state();
+    return true;
+}
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                   FacadePhraseIndex * phrase_index,
+                   KMixtureModelBigram * bigram){
+    /* Stores the "\item" lines of the 1-gram section as array headers of bigram.
+     * Calls with side effects stay outside assert(), NDEBUG would remove them. */
+    taglib_push_state();
+    bool retval = taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", "");
+    assert(retval);
+
+    do {
+        retval = taglib_read(linebuf, line_type, values, required);
+        assert(retval);
+        switch (line_type) {
+        case GRAM_1_ITEM_LINE:{
+            /* handle \item in \1-gram */
+            TAGLIB_GET_TOKEN(token, 0);
+            TAGLIB_GET_PHRASE_STRING(word, 1);
+            retval = taglib_validate_token_with_string(phrase_index, token, word);
+            assert(retval);
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+            TAGLIB_GET_TAGVALUE(glong, freq, atol);
+            KMixtureModelArrayHeader array_header;
+            memset(&array_header, 0, sizeof(KMixtureModelArrayHeader));
+            array_header.m_WC = count; array_header.m_freq = freq;
+            bigram->set_array_header(token, array_header);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+ end:
+    taglib_pop_state();
+    return true;
+}
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                  FacadePhraseIndex * phrase_index,
+                  KMixtureModelBigram * bigram){
+    /* Parses the "\item" lines of the 2-gram section and stores one single
+     * gram per run of equal first tokens in bigram.  Calls with side effects
+     * are made outside assert(), NDEBUG would remove them. */
+    taglib_push_state();
+    bool retval = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+                                 "count:T:N_n_0:n_1:Mr", "");
+    assert(retval);
+
+    phrase_token_t last_token = null_token;
+    KMixtureModelSingleGram * last_single_gram = NULL;
+    do {
+        retval = taglib_read(linebuf, line_type, values, required);
+        assert(retval);
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE:{
+            /* handle \item in \2-gram: two tokens */
+            TAGLIB_GET_TOKEN(token1, 0);
+            TAGLIB_GET_PHRASE_STRING(word1, 1);
+            retval = taglib_validate_token_with_string(phrase_index, token1, word1);
+            assert(retval);
+
+            TAGLIB_GET_TOKEN(token2, 2);
+            TAGLIB_GET_PHRASE_STRING(word2, 3);
+            retval = taglib_validate_token_with_string(phrase_index, token2, word2);
+            assert(retval);
+
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+            TAGLIB_GET_TAGVALUE(glong, T, atol);
+            assert(count == T);
+            TAGLIB_GET_TAGVALUE(glong, N_n_0, atol);
+            TAGLIB_GET_TAGVALUE(glong, n_1, atol);
+            TAGLIB_GET_TAGVALUE(glong, Mr, atol);
+
+            KMixtureModelArrayItem array_item;
+            memset(&array_item, 0, sizeof(KMixtureModelArrayItem));
+            array_item.m_WC = count; array_item.m_N_n_0 = N_n_0;
+            array_item.m_n_1 = n_1; array_item.m_Mr = Mr;
+
+            if ( last_token != token1 ) {
+                if ( last_token && last_single_gram ) {
+                    bigram->store(last_token, last_single_gram);
+                    delete last_single_gram;
+                    /* safe guard */
+                    last_token = null_token;
+                    last_single_gram = NULL;
+                }
+                KMixtureModelSingleGram * single_gram = NULL;
+                bigram->load(token1, single_gram);
+                /* create the new single gram */
+                if ( single_gram == NULL )
+                    single_gram = new KMixtureModelSingleGram;
+                last_token = token1;
+                last_single_gram = single_gram;
+            }
+
+            assert(NULL != last_single_gram);
+            retval = last_single_gram->insert_array_item(token2, array_item);
+            assert(retval);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    if ( last_token && last_single_gram ) {
+        bigram->store(last_token, last_single_gram);
+        delete last_single_gram;
+    }
+
+    taglib_pop_state();
+    return true;
+}
+
+int main(int argc, char * argv[]){
+    /* Imports the textual k mixture model read from stdin into the
+     * file attached to bigram (option --k-mixture-model-file).
+     */
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context =
+        g_option_context_new("- import k mixture model");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    SystemTableInfo system_table_info;
+    if (!system_table_info.load(SYSTEM_TABLE_INFO)) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    PhraseLargeTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk);
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+    bool loaded = load_phrase_index(phrase_files, &phrase_index);
+    if (!loaded)
+        exit(ENOENT);
+
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+    taglib_init();
+
+    /* prepare to read n-gram model */
+    values = g_ptr_array_new();
+    required = g_hash_table_new(g_str_hash, g_str_equal);
+
+    /* the first line is handed to parse_headline. */
+    FILE * input = stdin;
+    if ( my_getline(input) == -1 ) {
+        fprintf(stderr, "empty file input.\n");
+        exit(ENODATA);
+    }
+
+    if (!parse_headline(&bigram))
+        exit(ENODATA);
+
+    /* the body is parsed only if a further line could be read;
+     * parse_body starts with that line in linebuf. */
+    if ( my_getline(input) != -1 )
+        parse_body(input, &phrase_table, &phrase_index, &bigram);
+
+    taglib_fini();
+
+    return 0;
+}
diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h
new file mode 100644
index 0000000..ad8d3d8
--- /dev/null
+++ b/utils/training/k_mixture_model.h
@@ -0,0 +1,172 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+
+#ifndef K_MIXTURE_MODEL
+#define K_MIXTURE_MODEL
+
+#include <math.h>
+#include "novel_types.h"
+#include "flexible_ngram.h"
+
+namespace pinyin{
+
+typedef guint32 corpus_count_t;
+
+/* Note: storage parameters: N, T, n_r.
+ * N: the total number of documents.
+ * T: the total number of instances of the word or phrase.
+ * n_r: the number of documents having exactly <b>r</b> occurrences.
+ *      only n_0, n_1 are used here.
+ */
+
+static inline parameter_t compute_alpha(corpus_count_t N, corpus_count_t n_0){
+    /* alpha = 1 - n_0 / N, with the division done in parameter_t. */
+    return 1 - n_0 / (parameter_t) N;
+}
+
+static inline parameter_t compute_gamma(corpus_count_t N,
+                                        corpus_count_t n_0,
+                                        corpus_count_t n_1){
+    /* gamma = 1 - n_1 / (N - n_0); N - n_0 documents contain the phrase. */
+    return 1 - n_1 / (parameter_t) (N - n_0);
+}
+
+static inline parameter_t compute_B(corpus_count_t N,
+                                    corpus_count_t T,
+                                    corpus_count_t n_0,
+                                    corpus_count_t n_1){
+    /* Note: re-check this, to see if we can remove if statement. */
+    /* Please consider B_2 is no less than 2 in paper. */
+    const corpus_count_t extra_instances = T - n_1;
+    const corpus_count_t multi_docs = N - n_0 - n_1;
+    /* both counts are zero: return 2 instead of dividing zero by zero. */
+    if ( 0 == extra_instances && 0 == multi_docs )
+        return 2;
+
+    return extra_instances / (parameter_t) multi_docs;
+}
+
+/* three parameters model */
+static inline parameter_t compute_Pr_G_3(corpus_count_t k,
+                                         parameter_t alpha,
+                                         parameter_t gamma,
+                                         parameter_t B){
+    /* Pr(k occurrences) of the three parameter model (alpha, gamma, B). */
+    if ( k == 0 )
+        return 1 - alpha;
+
+    if ( k == 1 )
+        return alpha * (1 - gamma);
+
+    /* k >= 2 from here on: k is unsigned, so this is the last case and
+       every path returns a value even when assert() is compiled out. */
+    const parameter_t ratio = 1 - 1 / (B - 1);
+    return (alpha * gamma / (B - 1)) * pow(ratio, k - 2);
+}
+
+static inline parameter_t compute_Pr_G_3_with_count(corpus_count_t k,
+                                                    corpus_count_t N,
+                                                    corpus_count_t T,
+                                                    corpus_count_t n_0,
+                                                    corpus_count_t n_1){
+    /* derive alpha, gamma and B from the counts, then evaluate the model. */
+    return compute_Pr_G_3(k,
+                          compute_alpha(N, n_0),
+                          compute_gamma(N, n_0, n_1),
+                          compute_B(N, T, n_0, n_1));
+}
+
+/* two parameters model */
+static inline parameter_t compute_Pr_G_2(corpus_count_t k,
+                                         parameter_t alpha,
+                                         parameter_t B){
+    /* the two parameter model derives gamma from B: 1 - 1 / (B - 1). */
+    return compute_Pr_G_3(k, alpha, 1 - 1 / (B - 1), B);
+}
+
+static inline parameter_t compute_Pr_G_2_with_count(corpus_count_t k,
+                                                    corpus_count_t N,
+                                                    corpus_count_t T,
+                                                    corpus_count_t n_0,
+                                                    corpus_count_t n_1){
+    /* alpha and B come from the counts; gamma follows from B. */
+    return compute_Pr_G_2(k, compute_alpha(N, n_0),
+                          compute_B(N, T, n_0, n_1));
+}
+
+#define K_MIXTURE_MODEL_MAGIC_NUMBER "KMMP"
+
+typedef struct{
+    /* the total number of instances of all words. */
+    guint32 m_WC;
+    /* the total number of documents. */
+    guint32 m_N;
+    /* the total freq of all uni-grams; compare m_freq in the array header. */
+    guint32 m_total_freq;
+} KMixtureModelMagicHeader;
+
+typedef struct{
+    /* the total number of instances of word W1. */
+    guint32 m_WC;
+    /* the freq of this uni-gram; see also m_total_freq in the magic header. */
+    guint32 m_freq;
+} KMixtureModelArrayHeader;
+
+typedef struct{
+    /* the total number of instances of the W1,W2 word pair. */
+    guint32 m_WC;
+
+    /* the total number of instances of the word or phrase
+       (two word phrase). */
+    /* guint32 m_T; Please use m_WC instead.
+       It is an alias of m_WC and always holds the same value. */
+
+    /* n_r: the number of documents having exactly r occurrences. */
+    /* guint32 m_n_0;
+       Note: compute this value using the following equation.
+       m_n_0 = KMixtureModelMagicHeader.m_N - m_N_n_0;
+       m_N_n_0 is the number of documents which contain the word or phrase
+       (two word phrase). */
+    guint32 m_N_n_0;
+    guint32 m_n_1; /* n_1: documents having exactly one occurrence. */
+
+    /* maximum instances of the word or phrase (two word phrase)
+       in previous documents last seen. */
+    guint32 m_Mr;
+} KMixtureModelArrayItem;
+
+typedef FlexibleBigram<KMixtureModelMagicHeader,
+                       KMixtureModelArrayHeader,
+                       KMixtureModelArrayItem>
+KMixtureModelBigram;
+
+typedef FlexibleSingleGram<KMixtureModelArrayHeader,
+                           KMixtureModelArrayItem>
+KMixtureModelSingleGram;
+
+typedef KMixtureModelSingleGram::ArrayItemWithToken
+KMixtureModelArrayItemWithToken;
+
+};
+
+
+#endif
diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp
new file mode 100644
index 0000000..c5a66ec
--- /dev/null
+++ b/utils/training/k_mixture_model_to_interpolation.cpp
@@ -0,0 +1,214 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+enum LINE_TYPE{
+    BEGIN_LINE = 1,     /* the "\data" head line. */
+    END_LINE,           /* the "\end" line. */
+    GRAM_1_LINE,        /* the "\1-gram" section line. */
+    GRAM_2_LINE,        /* the "\2-gram" section line. */
+    GRAM_1_ITEM_LINE,   /* an "\item" line inside the \1-gram section. */
+    GRAM_2_ITEM_LINE    /* an "\item" line inside the \2-gram section. */
+};
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline(FILE * input, FILE * output);
+
+bool parse_unigram(FILE * input, FILE * output);
+
+bool parse_bigram(FILE * input, FILE * output);
+
+static ssize_t my_getline(FILE * input){
+    /* read one line into linebuf; -1 signals end of input or an error. */
+    ssize_t result = getline(&linebuf, &len, input);
+    /* drop the newline only if present; a final line may come without it. */
+    if ( result > 0 && '\n' == linebuf[result - 1] )
+        linebuf[result - 1] = '\0';
+    return result;
+}
+
+bool parse_headline(FILE * input, FILE * output) {
+    /* register the "\data" tag; the call must not live inside assert(),
+       otherwise it is dropped together with it when NDEBUG is defined. */
+    bool registered = taglib_add_tag(BEGIN_LINE, "\\data", 0, "model",
+                                     "count:N:total_freq");
+    assert(registered);
+    /* read "\data" line */
+    if ( !registered || !taglib_read(linebuf, line_type, values, required) ) {
+        fprintf(stderr, "error: k mixture model expected.\n");
+        return false;
+    }
+
+    assert(line_type == BEGIN_LINE);
+    TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+    if ( !( strcmp("k mixture model", model) == 0 ) ){
+        fprintf(stderr, "error: k mixture model expected.\n");
+        return false;
+    }
+    /* print header */
+    fprintf(output, "\\data model interpolation\n");
+
+    return true;
+}
+
+bool parse_body(FILE * input, FILE * output){
+    taglib_push_state();
+    /* the taglib calls stay outside assert() so that they survive NDEBUG. */
+    bool ok = taglib_add_tag(END_LINE, "\\end", 0, "", "") &&
+        taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "") &&
+        taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "");
+    do {
+    retry:
+        ok = ok && taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        if ( !ok ) goto end; /* release build: give up on a bad line. */
+        switch(line_type) {
+        case END_LINE:
+            fprintf(output, "\\end\n");
+            goto end;
+        case GRAM_1_LINE:
+            fprintf(output, "\\1-gram\n");
+            my_getline(input);
+            parse_unigram(input, output);
+            goto retry;
+        case GRAM_2_LINE:
+            fprintf(output, "\\2-gram\n");
+            my_getline(input);
+            parse_bigram(input, output);
+            goto retry;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+ end:
+    taglib_pop_state();
+    return ok;
+}
+
+bool parse_unigram(FILE * input, FILE * output){
+    taglib_push_state();
+    /* the taglib calls stay outside assert() so that they survive NDEBUG. */
+    bool ok = taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count");
+    do {
+        ok = ok && taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        if ( !ok ) goto end; /* release build: give up on a bad line. */
+        switch(line_type) {
+        case GRAM_1_ITEM_LINE: {
+            /* handle \item in \1-gram */
+            TAGLIB_GET_TOKEN(token, 0);
+            TAGLIB_GET_PHRASE_STRING(word, 1);
+
+            /* remove the "<start>" in the uni-gram of interpolation model */
+            if ( sentence_start == token )
+                break;
+
+            TAGLIB_GET_TAGVALUE(glong, freq, atol);
+            /* ignore zero unigram freq item */
+            if ( 0 != freq )
+                fprintf(output, "\\item %d %s count %ld\n", token, word, freq);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return ok;
+}
+
+bool parse_bigram(FILE * input, FILE * output){
+    taglib_push_state();
+    /* the taglib calls stay outside assert() so that they survive NDEBUG. */
+    bool ok = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+                             "count", "T:N_n_0:n_1:Mr");
+    do {
+        ok = ok && taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        if ( !ok ) goto end; /* release build: give up on a bad line. */
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE:{
+            /* handle \item in \2-gram: two (token, phrase string) pairs */
+            TAGLIB_GET_TOKEN(token1, 0);
+            TAGLIB_GET_PHRASE_STRING(word1, 1);
+
+            TAGLIB_GET_TOKEN(token2, 2);
+            TAGLIB_GET_PHRASE_STRING(word2, 3);
+
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+            fprintf(output, "\\item %d %s %d %s count %ld\n",
+                    token1, word1, token2, word2, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return ok;
+}
+
+int main(int argc, char * argv[]){
+    FILE * const input = stdin;
+    FILE * const output = stdout;
+
+    taglib_init();
+
+    values = g_ptr_array_new();
+    required = g_hash_table_new(g_str_hash, g_str_equal);
+
+    /* the first line has to be the "\data" head line. */
+    if ( -1 == my_getline(input) ) {
+        fprintf(stderr, "empty file input.\n");
+        exit(ENODATA);
+    }
+    const bool head_ok = parse_headline(input, output);
+    if ( !head_ok )
+        exit(ENODATA);
+
+    /* convert the body only when something follows the head line. */
+    if ( -1 != my_getline(input) )
+        parse_body(input, output);
+
+    taglib_fini();
+
+    return 0;
+}
diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp
new file mode 100644
index 0000000..ab08010
--- /dev/null
+++ b/utils/training/merge_k_mixture_model.cpp
@@ -0,0 +1,239 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+void print_help(){
+    fputs("Usage: merge_k_mixture_model [--result-file <RESULT_FILENAME>]\n"
+          "                             {<SOURCE_FILENAME>}+\n", stdout);
+}
+
+static const gchar * result_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+    {"result-file", 0, 0, G_OPTION_ARG_FILENAME, &result_filename, "merged result file", NULL},
+    {NULL}
+};
+
+static bool merge_two_phrase_array( /* in */  FlexibleBigramPhraseArray first,
+                             /* in */  FlexibleBigramPhraseArray second,
+                             /* out */ FlexibleBigramPhraseArray & merged ){
+    /* Append the union of first and second to merged.  The walk assumes
+       both inputs are ordered by ascending m_token: an item found in only
+       one input is copied, items sharing a token are combined. */
+    assert( NULL != first && NULL != second && NULL != merged );
+
+    guint lhs = 0, rhs = 0;
+    while ( lhs < first->len && rhs < second->len ){
+        KMixtureModelArrayItemWithToken * left =
+            &g_array_index(first, KMixtureModelArrayItemWithToken, lhs);
+        KMixtureModelArrayItemWithToken * right =
+            &g_array_index(second, KMixtureModelArrayItemWithToken, rhs);
+
+        if ( left->m_token < right->m_token ) {
+            g_array_append_val(merged, *left);
+            ++lhs;
+            continue;
+        }
+        if ( right->m_token < left->m_token ) {
+            g_array_append_val(merged, *right);
+            ++rhs;
+            continue;
+        }
+
+        /* same token on both sides: add up the counters, keep the
+           larger m_Mr. */
+        KMixtureModelArrayItemWithToken both;
+        memset(&both, 0, sizeof(KMixtureModelArrayItemWithToken));
+        both.m_token = left->m_token;
+        both.m_item.m_WC = left->m_item.m_WC + right->m_item.m_WC;
+        /* m_T is an alias of m_WC, so there is nothing more to add. */
+        both.m_item.m_N_n_0 = left->m_item.m_N_n_0 + right->m_item.m_N_n_0;
+        both.m_item.m_n_1 = left->m_item.m_n_1 + right->m_item.m_n_1;
+        both.m_item.m_Mr = std_lite::max(left->m_item.m_Mr,
+                                         right->m_item.m_Mr);
+        g_array_append_val(merged, both);
+        ++lhs;
+        ++rhs;
+    }
+
+    /* at most one of the inputs still has items left; copy them. */
+    for ( ; lhs < first->len; ++lhs ){
+        KMixtureModelArrayItemWithToken * left =
+            &g_array_index(first, KMixtureModelArrayItemWithToken, lhs);
+        g_array_append_val(merged, *left);
+    }
+
+    for ( ; rhs < second->len; ++rhs ){
+        KMixtureModelArrayItemWithToken * right =
+            &g_array_index(second, KMixtureModelArrayItemWithToken, rhs);
+        g_array_append_val(merged, *right);
+    }
+
+    return true;
+}
+
+static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target,
+                                /* in */ KMixtureModelBigram * new_one ){
+    /* Add the magic header counters of new_one onto those of target.
+       A target without magic header counts as all zero.  Returns false
+       when new_one has no header, a 32 bit sum wraps or the store fails. */
+    KMixtureModelMagicHeader old_header, new_header, merged;
+    memset(&merged, 0, sizeof(KMixtureModelMagicHeader));
+    if (!target->get_magic_header(old_header))
+        memset(&old_header, 0, sizeof(KMixtureModelMagicHeader));
+    /* keep the call out of assert(): it must still run under NDEBUG. */
+    const bool has_new_header = new_one->get_magic_header(new_header);
+    assert(has_new_header);
+    if ( !has_new_header ) {
+        fprintf(stderr, "no magic header in the model to be merged.\n");
+        return false;
+    }
+
+    merged.m_WC = old_header.m_WC + new_header.m_WC;
+    merged.m_N = old_header.m_N + new_header.m_N;
+    merged.m_total_freq = old_header.m_total_freq + new_header.m_total_freq;
+    /* an unsigned sum smaller than one of its operands has wrapped. */
+    if ( merged.m_WC < old_header.m_WC ){
+        fprintf(stderr, "the m_WC integer in magic header overflows.\n");
+        return false;
+    }
+    if ( merged.m_total_freq < old_header.m_total_freq ){
+        fprintf(stderr, "the m_total_freq in magic header overflows.\n");
+        return false;
+    }
+
+    const bool stored = target->set_magic_header(merged);
+    assert(stored);
+    return stored;
+}
+
+static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
+                               /* in */ KMixtureModelBigram * new_one ){
+    /* Merge every single gram of new_one into target.  Side effects stay
+       out of assert() (NDEBUG safe); returns false once a step fails. */
+    GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    new_one->get_all_items(new_items);
+    bool ok = true;
+    for ( size_t i = 0; ok && i < new_items->len; ++i ){
+        phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i);
+        KMixtureModelSingleGram * target_single_gram = NULL;
+        KMixtureModelSingleGram * new_single_gram = NULL;
+        ok = new_one->load(*token, new_single_gram);
+        assert(ok);
+        if ( !ok )
+            break;
+        if ( !target->load(*token, target_single_gram) ){
+            /* the token is unknown to target: store the gram unchanged. */
+            ok = target->store(*token, new_single_gram);
+            delete new_single_gram;
+            continue;
+        }
+
+        /* word count in array header in parallel with array items */
+        KMixtureModelArrayHeader target_array_header;
+        KMixtureModelArrayHeader new_array_header;
+        KMixtureModelArrayHeader merged_array_header;
+        memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader));
+        ok = NULL != target_single_gram &&
+            new_one->get_array_header(*token, new_array_header) &&
+            target->get_array_header(*token, target_array_header);
+        assert(ok);
+        if ( !ok ){
+            delete target_single_gram; delete new_single_gram;
+            break;
+        }
+        merged_array_header.m_WC = target_array_header.m_WC +
+            new_array_header.m_WC;
+        merged_array_header.m_freq = target_array_header.m_freq +
+            new_array_header.m_freq;
+
+        FlexibleBigramPhraseArray target_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        target_single_gram->retrieve_all(target_array);
+        FlexibleBigramPhraseArray new_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        new_single_gram->retrieve_all(new_array);
+        FlexibleBigramPhraseArray merged_array =
+            g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        ok = merge_two_phrase_array(target_array, new_array, merged_array);
+        assert(ok);
+        g_array_free(target_array, TRUE);
+        g_array_free(new_array, TRUE);
+        delete target_single_gram; delete new_single_gram;
+
+        KMixtureModelSingleGram * merged_gram = new KMixtureModelSingleGram;
+        for ( size_t m = 0; m < merged_array->len; ++m ){
+            KMixtureModelArrayItemWithToken * item = &g_array_index
+                (merged_array, KMixtureModelArrayItemWithToken, m);
+            merged_gram->insert_array_item(item->m_token, item->m_item);
+        }
+        ok = ok && merged_gram->set_array_header(merged_array_header)
+            && target->store(*token, merged_gram);
+        assert(ok);
+        delete merged_gram;
+        g_array_free(merged_array, TRUE);
+    }
+
+    g_array_free(new_items, TRUE);
+    return ok;
+}
+
+bool merge_two_k_mixture_model( /* in & out */ KMixtureModelBigram * target,
+                                /* in */ KMixtureModelBigram * new_one ){
+    assert(NULL != target);
+    assert(NULL != new_one);
+    const bool items_merged = merge_array_items(target, new_one);
+    return items_merged && merge_magic_header(target, new_one);
+}
+
+int main(int argc, char * argv[]){
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context =
+        g_option_context_new("- merge k mixture model");
+
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    /* the merge result is collected in the file named by result_filename. */
+    KMixtureModelBigram target(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    target.attach(result_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+    /* every argument from argv[1] on is attached read-only and merged. */
+    for ( int i = 1; i < argc; ++i ){
+        const char * new_filename = argv[i];
+        KMixtureModelBigram new_one(K_MIXTURE_MODEL_MAGIC_NUMBER);
+
+        new_one.attach(new_filename, ATTACH_READONLY);
+        if ( !merge_two_k_mixture_model(&target, &new_one) )
+            exit(EOVERFLOW);
+    }
+
+    return 0;
+}
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp
new file mode 100644
index 0000000..40dfb87
--- /dev/null
+++ b/utils/training/prune_k_mixture_model.cpp
@@ -0,0 +1,192 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+
+
+#include <errno.h>
+#include <locale.h>
+#include <limits.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+
+void print_help(){
+    fputs("Usage: prune_k_mixture_model -k <INT> --CDF <DOUBLE>  <FILENAME>\n", stdout);
+}
+
+static gint g_prune_k = 3;
+static parameter_t g_prune_poss = 0.99;
+
+static GOptionEntry entries[] =
+{
+    {"pruneK", 'k', 0, G_OPTION_ARG_INT, &g_prune_k, "k parameter", NULL},
+    {"CDF", 0, 0, G_OPTION_ARG_DOUBLE, &g_prune_poss, "CDF parameter", NULL},
+    {NULL}
+};
+
+
+bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header,
+                           KMixtureModelSingleGram * & bigram,
+                           FlexibleBigramPhraseArray removed_array){
+    /* Prune the items whose probability of g_prune_k or more occurrences
+       is below g_prune_poss; pruned items are appended to removed_array. */
+    FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+    bigram->retrieve_all(array);
+
+    for ( size_t i = 0; i < array->len; ++i) {
+        KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i);
+        phrase_token_t token = item->m_token;
+        parameter_t remained_poss = 1; parameter_t one_poss = 0;
+        bool errors = false;
+        for ( gint k = 0; k < g_prune_k; ++k){
+            one_poss = compute_Pr_G_3_with_count
+                (k, magic_header->m_N, item->m_item.m_WC,
+                 magic_header->m_N - item->m_item.m_N_n_0,
+                 item->m_item.m_n_1);
+            if ( !(0 <= one_poss && one_poss <= 1) )
+                errors = true;
+            remained_poss -= one_poss;
+        }
+
+        if ( fabs(remained_poss) < DBL_EPSILON )
+            remained_poss = 0.;
+
+        /* some wrong possibility. */
+        if ( errors || !(0 <= remained_poss && remained_poss <= 1) ) {
+            fprintf(stderr, "some wrong possibility is encountered:%f.\n",
+                    remained_poss);
+            fprintf(stderr, "k:%d N:%u WC:%u n_0:%u n_1:%u\n",
+                    g_prune_k, magic_header->m_N, item->m_item.m_WC,
+                    magic_header->m_N - item->m_item.m_N_n_0,
+                    item->m_item.m_n_1);
+            exit(EDOM);
+        }
+
+        if ( remained_poss < g_prune_poss ) {
+            /* prune this word or phrase. */
+            KMixtureModelArrayItem removed_item;
+            bigram->remove_array_item(token, removed_item);
+            assert( memcmp(&removed_item, &(item->m_item),
+                           sizeof(KMixtureModelArrayItem)) == 0 );
+
+            KMixtureModelArrayItemWithToken removed_item_with_token;
+            removed_item_with_token.m_token = token;
+            removed_item_with_token.m_item = removed_item;
+            g_array_append_val(removed_array, removed_item_with_token);
+
+            KMixtureModelArrayHeader array_header;
+            bigram->get_array_header(array_header);
+            guint32 removed_count = removed_item.m_WC;
+            array_header.m_WC -= removed_count;
+            bigram->set_array_header(array_header);
+            magic_header->m_WC -= removed_count;
+            magic_header->m_total_freq -= removed_count;
+        }
+    }
+    g_array_free(array, TRUE); /* release the snapshot from retrieve_all(). */
+    return true;
+}
+
+int main(int argc, char * argv[]){
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- prune k mixture model");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    /* exactly one positional argument is expected: the model file. */
+    if (2 != argc) {
+        fprintf(stderr, "wrong arguments.\n");
+        exit(EINVAL);
+    }
+
+    const gchar * bigram_filename = argv[1];
+    /* TODO: magic header signature check here. */
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    bigram.attach(bigram_filename, ATTACH_READWRITE);
+
+    KMixtureModelMagicHeader magic_header;
+    if (!bigram.get_magic_header(magic_header)) {
+        fprintf(stderr, "no magic header in k mixture model.\n");
+        exit(ENODATA);
+    }
+
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram.get_all_items(items);
+
+    /* print prune progress */
+    size_t progress = 0; size_t onestep = items->len / 20;
+    for ( size_t i = 0; i < items->len; ++i ){
+        if ( progress >= onestep ) {
+            progress = 0; fprintf(stderr, "*");
+        }
+        progress ++;
+
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        KMixtureModelSingleGram * single_gram = NULL;
+        bigram.load(*token, single_gram);
+        FlexibleBigramPhraseArray removed_array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        prune_k_mixture_model(&magic_header, single_gram, removed_array);
+        bigram.store(*token, single_gram);
+        delete single_gram;
+
+        /* post processing for unigram reduce */
+        for (size_t m = 0; m < removed_array->len; ++m ){
+            KMixtureModelArrayItemWithToken * item =
+                &g_array_index(removed_array,
+                              KMixtureModelArrayItemWithToken, m);
+            KMixtureModelArrayHeader array_header = {0, 0};
+            bool ok = bigram.get_array_header(item->m_token, array_header);
+            /* m_freq is unsigned: check for underflow before subtracting. */
+            assert(ok && array_header.m_freq >= item->m_item.m_WC);
+            array_header.m_freq -= item->m_item.m_WC;
+            ok = ok && bigram.set_array_header(item->m_token, array_header);
+            assert(ok);
+        }
+        g_array_free(removed_array, TRUE);
+        removed_array = NULL;
+    }
+
+    fprintf(stderr, "\n");
+
+    bigram.set_magic_header(magic_header);
+
+    /* post processing clean up zero items */
+    KMixtureModelArrayHeader array_header;
+    for ( size_t i = 0; i < items->len; ++i ){
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        bool ok = bigram.get_array_header(*token, array_header);
+        assert(ok);
+        if ( ok && 0 == array_header.m_WC && 0 == array_header.m_freq ) {
+            ok = bigram.remove(*token);
+            assert(ok);
+        }
+    }
+
+    g_array_free(items, TRUE);
+
+    return 0;
+}
diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp
new file mode 100644
index 0000000..7c057b9
--- /dev/null
+++ b/utils/training/validate_k_mixture_model.cpp
@@ -0,0 +1,228 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+/* Print the command line usage of this tool to stdout. */
+void print_help(){
+    printf("Usage: validate_k_mixture_model <FILENAME>\n");
+}
+
+/*
+ * Check the unigram statistics of the k mixture model.
+ *
+ * The word count and total freq in the magic header must be non-zero,
+ * equal to each other, and equal to the sums of m_WC and m_freq over
+ * the array headers of all items.
+ *
+ * Returns true when every check passes; otherwise prints the reason to
+ * stderr and returns false.
+ */
+bool validate_unigram(KMixtureModelBigram * bigram){
+    KMixtureModelMagicHeader magic_header;
+    if( !bigram->get_magic_header(magic_header) ){
+        fprintf(stderr, "no magic header in k mixture model.\n");
+        return false;
+    }
+
+    guint32 expected_word_count = magic_header.m_WC;
+    if ( 0 == expected_word_count ){
+        fprintf(stderr, "word count in magic header is unexpected zero.\n");
+        return false;
+    }
+    guint32 expected_total_freq = magic_header.m_total_freq;
+    if ( 0 == expected_total_freq ){
+        fprintf(stderr, "total freq in magic header is unexpected zero.\n");
+        return false;
+    }
+
+    if ( expected_word_count != expected_total_freq ){
+        fprintf(stderr, "the word count doesn't match the total freq.\n");
+        return false;
+    }
+
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    bool result = true;
+    guint32 word_count = 0; guint32 total_freq = 0;
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        KMixtureModelArrayHeader array_header;
+        /* report a missing header instead of relying on assert(). */
+        if ( !bigram->get_array_header(*token, array_header) ){
+            fprintf(stderr, "no array header for token %d.\n", *token);
+            result = false;
+            continue;
+        }
+        word_count += array_header.m_WC;
+        total_freq += array_header.m_freq;
+    }
+
+    /* the token list is no longer needed; free it before any return. */
+    g_array_free(items, TRUE);
+
+    if ( !result )
+        return false;
+
+    if ( word_count != expected_word_count ){
+        fprintf(stderr, "word count in magic header:%u\n",
+                expected_word_count);
+        fprintf(stderr, "sum of word count in array headers:%u\n", word_count);
+        fprintf(stderr, "the sum differs from word count.\n");
+        return false;
+    }
+    if ( total_freq != expected_total_freq ){
+        fprintf(stderr, "total freq in magic header:%u\n",
+                expected_total_freq);
+        fprintf(stderr, "sum of freqs in array headers:%u\n", total_freq);
+        fprintf(stderr, "the total freq differs from sum of freqs.\n");
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Check every single gram of the k mixture model.
+ *
+ * For each token the word count in the array header must equal the sum
+ * of the word counts of its array items; a header whose word count and
+ * freq are both zero is reported as an error.
+ *
+ * Returns true when every single gram passes; otherwise prints the
+ * reasons to stderr and returns false.
+ */
+bool validate_bigram(KMixtureModelBigram * bigram){
+    bool result = true;
+
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+        KMixtureModelSingleGram * single_gram = NULL;
+        /* report a failed load instead of relying on assert(). */
+        if ( !bigram->load(*token, single_gram) ){
+            fprintf(stderr, "load single gram of token %d failed.\n", *token);
+            result = false;
+            continue;
+        }
+
+        KMixtureModelArrayHeader array_header;
+        if ( !single_gram->get_array_header(array_header) ){
+            fprintf(stderr, "no array header in single gram of token %d.\n",
+                    *token);
+            result = false;
+            delete single_gram;
+            continue;
+        }
+
+        FlexibleBigramPhraseArray array = g_array_new
+            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+        single_gram->retrieve_all(array);
+
+        guint32 expected_sum = array_header.m_WC;
+        guint32 freq = array_header.m_freq;
+        if ( 0 == expected_sum ){
+            if ( 0 != array->len ){
+                fprintf(stderr, "in the array header of token %d:\n", *token);
+                fprintf(stderr, "word count is zero but has array items.\n");
+                result = false;
+            }
+            if ( 0 != freq ){
+                /* zero word count with non-zero freq is accepted;
+                   release both objects before skipping the sum check. */
+                g_array_free(array, TRUE);
+                delete single_gram;
+                continue;
+            } else {
+                fprintf(stderr, "in the array header of token %d:\n", *token);
+                fprintf(stderr, "both word count and freq are "
+                        "unexpected zero.\n");
+                result = false;
+            }
+        }
+
+        guint32 sum = 0;
+        for (size_t m = 0; m< array->len; ++m){
+            KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m);
+
+            sum += item->m_item.m_WC;
+        }
+
+        if ( sum != expected_sum ){
+            fprintf(stderr, "word count in array header:%u\n", expected_sum);
+            fprintf(stderr, "sum of word count in array items:%u\n", sum);
+            fprintf(stderr, "the sum differs from word count.\n");
+            result = false;
+        }
+
+        g_array_free(array, TRUE);
+        delete single_gram;
+    }
+
+    g_array_free(items, TRUE);
+    return result;
+}
+
+/*
+ * Validate the k mixture model file named on the command line.
+ *
+ * Exits with EINVAL on bad arguments and with ENODATA when a validation
+ * fails; returns 0 when both validations pass.
+ */
+int main(int argc, char * argv[]){
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- validate k mixture model");
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    g_option_context_free(context);
+
+    if (2 != argc) {
+        fprintf(stderr, "wrong arguments.\n");
+        print_help();
+        exit(EINVAL);
+    }
+
+    const char * k_mixture_model_filename = argv[1];
+
+    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
+
+    if (!validate_unigram(&bigram)) {
+        fprintf(stderr, "k mixture model validation failed.\n");
+        exit(ENODATA);
+    }
+
+    if (!validate_bigram(&bigram)) {
+        fprintf(stderr, "k mixture model validation failed.\n");
+        exit(ENODATA);
+    }
+
+    return 0;
+}
diff --git a/utils/utils_helper.h b/utils/utils_helper.h
new file mode 100644
index 0000000..b91067b
--- /dev/null
+++ b/utils/utils_helper.h
@@ -0,0 +1,179 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+
+#ifndef UTILS_HELPER_H
+#define UTILS_HELPER_H
+
+
+/* Declare the phrase_token_t variable `var` and set it with atoi() from
+ * the string stored at `index` of the GPtrArray `values`, which must be
+ * in scope at the point of use. */
+#define TAGLIB_GET_TOKEN(var, index)                                    \
+    phrase_token_t var = null_token;                                    \
+    {                                                                   \
+        const char * string = (const char *) g_ptr_array_index          \
+            (values, index);                                            \
+        var = atoi(string);                                             \
+    }
+
+/* Declare the string variable `var` pointing at the entry stored at
+ * `index` of the GPtrArray `values`; the string is not copied. */
+#define TAGLIB_GET_PHRASE_STRING(var, index)                            \
+    const char * var = NULL;                                            \
+    {                                                                   \
+        var = (const char *) g_ptr_array_index                          \
+            (values, index);                                            \
+    }
+
+/* Declare the variable `var` of `type` and set it to conv() of the value
+ * stored under the key "var" in the GHashTable `required`, which must be
+ * in scope.  The lookup is kept outside assert() so that it is still
+ * performed when NDEBUG is defined. */
+#define TAGLIB_GET_TAGVALUE(type, var, conv)                            \
+    type var;                                                           \
+    {                                                                   \
+        gpointer value = NULL;                                          \
+        if (!g_hash_table_lookup_extended                               \
+            (required, #var, NULL, &value))                             \
+            assert(false);                                              \
+        var = conv((const char *)value);                                \
+    }
+
+/* Declare the phrase_token_t variable `var` parsed from `line`, which
+ * holds a token number and a phrase separated by a space or a tab.
+ * An empty line leaves `var` as null_token; a non-null token is checked
+ * against the phrase with taglib_validate_token_with_string(). */
+#define TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, var, line)            \
+    phrase_token_t var = null_token;                                    \
+    do {                                                                \
+        if (0 == strlen(line))                                          \
+            break;                                                      \
+                                                                        \
+        gchar ** strs = g_strsplit_set(line, " \t", 2);                 \
+        if (2 != g_strv_length(strs))                                   \
+            assert(false);                                              \
+                                                                        \
+        phrase_token_t _token = atoi(strs[0]);                          \
+        const char * phrase = strs[1];                                  \
+        if (null_token != _token)                                       \
+            assert(taglib_validate_token_with_string                    \
+                   (phrase_index, _token, phrase));                     \
+                                                                        \
+        var = _token;                                                   \
+                                                                        \
+        g_strfreev(strs);                                               \
+    } while(false);
+
+
+/*
+ * Load every SYSTEM_FILE entry of phrase_files into phrase_index.
+ *
+ * Returns false, after printing the file name to stderr, as soon as one
+ * file can not be loaded; returns true otherwise.
+ */
+static bool load_phrase_index(const pinyin_table_info_t * phrase_files,
+                              FacadePhraseIndex * phrase_index) {
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+        const pinyin_table_info_t * table_info = phrase_files + i;
+
+        if (SYSTEM_FILE != table_info->m_file_type)
+            continue;
+
+        const char * binfile = table_info->m_system_filename;
+
+        MemoryChunk * chunk = new MemoryChunk;
+        bool retval = chunk->load(binfile);
+        if (!retval) {
+            fprintf(stderr, "load %s failed!\n", binfile);
+            delete chunk;
+            return false;
+        }
+
+        /* not freed here: presumably phrase_index now owns chunk (confirm). */
+        phrase_index->load(i, chunk);
+    }
+    return true;
+}
+
+/*
+ * Store every SYSTEM_FILE entry of phrase_index into its system file,
+ * then hand the saved chunk back to phrase_index with load().
+ *
+ * Returns false, after printing the file name to stderr, as soon as one
+ * file can not be saved; returns true otherwise.
+ */
+static bool save_phrase_index(const pinyin_table_info_t * phrase_files,
+                              FacadePhraseIndex * phrase_index) {
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+        const pinyin_table_info_t * table_info = phrase_files + i;
+
+        if (SYSTEM_FILE != table_info->m_file_type)
+            continue;
+
+        const char * binfile = table_info->m_system_filename;
+
+        MemoryChunk * new_chunk = new MemoryChunk;
+        phrase_index->store(i, new_chunk);
+        bool retval = new_chunk->save(binfile);
+        if (!retval) {
+            fprintf(stderr, "save %s failed.\n", binfile);
+            delete new_chunk;
+            return false;
+        }
+
+        phrase_index->load(i, new_chunk);
+    }
+    return true;
+}
+
+/*
+ * Store every DICTIONARY entry of phrase_index into its system file,
+ * then hand the saved chunk back to phrase_index with load().
+ *
+ * Returns false, after printing the file name to stderr, as soon as one
+ * file can not be saved; returns true otherwise.
+ */
+static bool save_dictionary(const pinyin_table_info_t * phrase_files,
+                            FacadePhraseIndex * phrase_index) {
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+        const pinyin_table_info_t * table_info = phrase_files + i;
+
+        if (DICTIONARY != table_info->m_file_type)
+            continue;
+
+        const char * binfile = table_info->m_system_filename;
+
+        MemoryChunk * new_chunk = new MemoryChunk;
+        phrase_index->store(i, new_chunk);
+        bool retval = new_chunk->save(binfile);
+        if (!retval) {
+            fprintf(stderr, "save %s failed.\n", binfile);
+            delete new_chunk;
+            return false;
+        }
+
+        phrase_index->load(i, new_chunk);
+    }
+    return true;
+}
+
+#endif
author	Peng Wu <alexepico@gmail.com>	2013-07-22 11:37:11 +0800
committer	Peng Wu <alexepico@gmail.com>	2013-07-22 11:37:11 +0800
commit	b78429d78df745dd327b6dada6b9bd71ea5df84e (patch)
tree	82c4625db8674c66d69fd566fce8efc347e3cb3a /utils
download	libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.gz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.xz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.zip