author     Peng Wu <alexepico@gmail.com>    2013-07-22 11:37:11 +0800
committer  Peng Wu <alexepico@gmail.com>    2013-07-22 11:37:11 +0800
commit     b78429d78df745dd327b6dada6b9bd71ea5df84e (patch)
tree       82c4625db8674c66d69fd566fce8efc347e3cb3a /utils
import libpinyin code
Diffstat (limited to 'utils')
-rw-r--r--  utils/CMakeLists.txt                                   3
-rw-r--r--  utils/Makefile.am                                     27
-rw-r--r--  utils/segment/CMakeLists.txt                          19
-rw-r--r--  utils/segment/Makefile.am                             39
-rw-r--r--  utils/segment/mergeseq.cpp                           278
-rw-r--r--  utils/segment/ngseg.cpp                              261
-rw-r--r--  utils/segment/spseg.cpp                              343
-rw-r--r--  utils/storage/CMakeLists.txt                          29
-rw-r--r--  utils/storage/Makefile.am                             45
-rw-r--r--  utils/storage/export_interpolation.cpp               144
-rw-r--r--  utils/storage/gen_binary_files.cpp                   115
-rw-r--r--  utils/storage/gen_pinyin_table.cpp                   330
-rw-r--r--  utils/storage/import_interpolation.cpp               313
-rw-r--r--  utils/training/CMakeLists.txt                        129
-rw-r--r--  utils/training/Makefile.am                            97
-rw-r--r--  utils/training/estimate_interpolation.cpp            144
-rw-r--r--  utils/training/estimate_k_mixture_model.cpp          159
-rw-r--r--  utils/training/eval_correction_rate.cpp              211
-rw-r--r--  utils/training/export_k_mixture_model.cpp            156
-rw-r--r--  utils/training/gen_deleted_ngram.cpp                 128
-rw-r--r--  utils/training/gen_k_mixture_model.cpp               411
-rw-r--r--  utils/training/gen_ngram.cpp                         136
-rw-r--r--  utils/training/gen_unigram.cpp                       111
-rw-r--r--  utils/training/import_k_mixture_model.cpp            322
-rw-r--r--  utils/training/k_mixture_model.h                     172
-rw-r--r--  utils/training/k_mixture_model_to_interpolation.cpp  214
-rw-r--r--  utils/training/merge_k_mixture_model.cpp             239
-rw-r--r--  utils/training/prune_k_mixture_model.cpp             192
-rw-r--r--  utils/training/validate_k_mixture_model.cpp          174
-rw-r--r--  utils/utils_helper.h                                 147
30 files changed, 5088 insertions, 0 deletions
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
new file mode 100644
index 0000000..dbd7855
--- /dev/null
+++ b/utils/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(segment)
+add_subdirectory(storage)
+add_subdirectory(training)
\ No newline at end of file
diff --git a/utils/Makefile.am b/utils/Makefile.am
new file mode 100644
index 0000000..bc0f3e5
--- /dev/null
+++ b/utils/Makefile.am
@@ -0,0 +1,27 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+AUTOMAKE_OPTIONS = gnu
+SUBDIRS = storage segment training
+
+MAINTAINERCLEANFILES = Makefile.in
+
+CLEANFILES = *.bak
+
+ACLOCAL = aclocal -I $(ac_aux_dir)
+
+noinst_HEADERS = utils_helper.h
diff --git a/utils/segment/CMakeLists.txt b/utils/segment/CMakeLists.txt
new file mode 100644
index 0000000..82e4deb
--- /dev/null
+++ b/utils/segment/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_executable(
+ spseg
+ spseg.cpp
+)
+
+target_link_libraries(
+ spseg
+ libpinyin
+)
+
+add_executable(
+ ngseg
+ ngseg.cpp
+)
+
+target_link_libraries(
+ ngseg
+ libpinyin
+)
\ No newline at end of file
diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am
new file mode 100644
index 0000000..579d6e4
--- /dev/null
+++ b/utils/segment/Makefile.am
@@ -0,0 +1,39 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
+
+noinst_PROGRAMS = spseg ngseg mergeseq
+
+spseg_SOURCES = spseg.cpp
+
+spseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+ngseg_SOURCES = ngseg.cpp
+
+ngseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+mergeseq_SOURCES = mergeseq.cpp
+
+mergeseq_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp
new file mode 100644
index 0000000..1a26064
--- /dev/null
+++ b/utils/segment/mergeseq.cpp
@@ -0,0 +1,278 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2013 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <string.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){
+ printf("Usage: mergeseq [-o outputfile] [inputfile]\n");
+}
+
+
+static gboolean gen_extra_enter = FALSE;
+static gchar * outputfile = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+ {NULL}
+};
+
+
+/* data structure definition. */
+typedef struct{
+ phrase_token_t m_token;
+ gint m_token_len;
+} TokenInfo;
+
+
+/* GArray of ucs4 characters. */
+typedef GArray * UnicodeCharVector;
+/* GArray of TokenInfo. */
+typedef GArray * TokenInfoVector;
+
+gint calculate_sequence_length(TokenInfoVector tokeninfos) {
+ gint len = 0;
+
+ size_t i = 0;
+ for (i = 0; i < tokeninfos->len; ++i) {
+ TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, i);
+ len += token_info->m_token_len;
+ }
+
+ return len;
+}
+
+/* if merge sequence found, merge and output it,
+ * if not, just output the first token;
+ * pop the first token or sequence.
+ */
+bool merge_sequence(FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos) {
+ assert(tokeninfos->len > 0);
+
+ bool found = false;
+ TokenInfo * token_info = NULL;
+ phrase_token_t token = null_token;
+
+ ucs4_t * ucs4_str = (ucs4_t *) unichars->data;
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index->prepare_tokens(tokens);
+
+ /* search the merge sequence. */
+ size_t index = tokeninfos->len;
+ gint seq_len = calculate_sequence_length(tokeninfos);
+ while (seq_len > 0) {
+ /* do phrase table search. */
+ int retval = phrase_table->search(seq_len, ucs4_str, tokens);
+
+ if (retval & SEARCH_OK) {
+ int num = get_first_token(tokens, token);
+ found = true;
+ break;
+ }
+
+ --index;
+ token_info = &g_array_index(tokeninfos, TokenInfo, index);
+ seq_len -= token_info->m_token_len;
+ }
+
+ phrase_index->destroy_tokens(tokens);
+
+ /* push the merged sequence back. */
+ if (found) {
+ /* pop up the origin sequence. */
+ g_array_remove_range(tokeninfos, 0, index);
+
+ TokenInfo info;
+ info.m_token = token;
+ info.m_token_len = seq_len;
+ g_array_prepend_val(tokeninfos, info);
+ }
+
+ return found;
+}
+
+bool pop_first_token(UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos,
+ FILE * output) {
+ ucs4_t * ucs4_str = (ucs4_t *) unichars->data;
+
+ /* pop it. */
+ TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, 0);
+ phrase_token_t token = token_info->m_token;
+ gint token_len = token_info->m_token_len;
+
+ glong read = 0;
+ gchar * utf8_str = g_ucs4_to_utf8(ucs4_str, token_len, &read, NULL, NULL);
+ assert(read == token_len);
+ fprintf(output, "%d %s\n", token, utf8_str);
+ g_free(utf8_str);
+
+ g_array_remove_range(unichars, 0, token_len);
+ g_array_remove_index(tokeninfos, 0);
+
+ return true;
+}
+
+bool feed_line(FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos,
+ const char * linebuf,
+ FILE * output) {
+
+ TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf);
+
+ if (null_token == token) {
+ /* empty the queue. */
+ while (0 != tokeninfos->len) {
+ merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+ pop_first_token(unichars, tokeninfos, output);
+ }
+
+ assert(0 == unichars->len);
+ assert(0 == tokeninfos->len);
+
+ /* restore the null token line. */
+ fprintf(output, "%s\n", linebuf);
+
+ return false;
+ }
+
+ PhraseItem item;
+ phrase_index->get_phrase_item(token, item);
+ gint len = item.get_phrase_length();
+
+ TokenInfo info;
+ info.m_token = token;
+ info.m_token_len = len;
+ g_array_append_val(tokeninfos, info);
+
+ ucs4_t buffer[MAX_PHRASE_LENGTH];
+ item.get_phrase_string(buffer);
+ g_array_append_vals(unichars, buffer, len);
+
+ /* probe merge sequence. */
+ len = calculate_sequence_length(tokeninfos);
+ while (len >= MAX_PHRASE_LENGTH) {
+ merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+ pop_first_token(unichars, tokeninfos, output);
+ len = calculate_sequence_length(tokeninfos);
+ }
+
+ return true;
+}
+
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ FILE * output = stdout;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- merge word sequence");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (outputfile) {
+ output = fopen(outputfile, "w");
+ if (NULL == output) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ if (argc > 2) {
+ fprintf(stderr, "too many arguments.\n");
+ exit(EINVAL);
+ }
+
+ if (2 == argc) {
+ input = fopen(argv[1], "r");
+ if (NULL == input) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* init phrase table */
+ FacadePhraseTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk, NULL);
+
+ /* init phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+ GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo));
+
+ char * linebuf = NULL; size_t size = 0; ssize_t read;
+ while( (read = getline(&linebuf, &size, input)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if (0 == strlen(linebuf))
+ continue;
+
+ feed_line(&phrase_table, &phrase_index,
+ unichars, tokeninfos,
+ linebuf, output);
+ }
+
+ g_array_free(unichars, TRUE);
+ g_array_free(tokeninfos, TRUE);
+ free(linebuf);
+ fclose(input);
+ fclose(output);
+ return 0;
+}
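
The feed_line() loop above queues TokenInfo entries until their combined character length reaches MAX_PHRASE_LENGTH; merge_sequence() then replaces the longest queued prefix that the phrase table recognizes as a single phrase with one merged token, and pop_first_token() emits it. A minimal standalone sketch of that queue discipline, using a toy string dictionary in place of FacadePhraseTable2 (the names below are illustrative, not libpinyin API):

    #include <cstdio>
    #include <deque>
    #include <set>
    #include <string>

    // Toy dictionary of mergeable "phrases"; the real tool queries
    // FacadePhraseTable2::search() against the system phrase table.
    static const std::set<std::string> dict = { "ab", "abc", "cd" };

    struct Seg { std::string text; };   // stands in for TokenInfo

    // Merge the longest queued prefix found in dict into one segment,
    // then emit and pop the first (possibly merged) segment.
    static void merge_and_pop(std::deque<Seg> & queue) {
        std::string joined;
        for (const Seg & s : queue) joined += s.text;

        size_t covered = queue.size();
        while (!joined.empty()) {
            if (dict.count(joined)) {   // longest remaining prefix wins
                queue.erase(queue.begin(), queue.begin() + covered);
                queue.push_front(Seg{ joined });
                break;
            }
            joined.resize(joined.size() - queue[--covered].text.size());
        }

        std::printf("%s\n", queue.front().text.c_str());
        queue.pop_front();
    }

    int main() {
        std::deque<Seg> queue = { {"a"}, {"b"}, {"c"}, {"d"} };
        while (!queue.empty())
            merge_and_pop(queue);       // prints "abc", then "d"
        return 0;
    }
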
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
new file mode 100644
index 0000000..03fe5b4
--- /dev/null
+++ b/utils/segment/ngseg.cpp
@@ -0,0 +1,261 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){
+ printf("Usage: ngseg [--generate-extra-enter] [-o outputfile] [inputfile]\n");
+}
+
+
+static gboolean gen_extra_enter = FALSE;
+static gchar * outputfile = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+ {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
+ {NULL}
+};
+
+
+/* n-gram based sentence segment. */
+
+/* Note:
+ * Currently libpinyin supports ucs4 characters.
+ * This is a pre-processor tool for raw corpus,
+ * and skips non-Chinese characters.
+ */
+
+/* TODO:
+ * Try to add punctuation mark and english support,
+ * such as ',', '.', '?', '!', <english>, and other punctuations.
+ */
+
+enum CONTEXT_STATE{
+ CONTEXT_INIT,
+ CONTEXT_SEGMENTABLE,
+ CONTEXT_UNKNOWN
+};
+
+bool deal_with_segmentable(PhraseLookup * phrase_lookup,
+ GArray * current_ucs4,
+ FILE * output){
+ char * result_string = NULL;
+ MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ phrase_lookup->get_best_match(current_ucs4->len,
+ (ucs4_t *) current_ucs4->data, results);
+
+ phrase_lookup->convert_to_utf8(results, result_string);
+
+ if (result_string) {
+ fprintf(output, "%s\n", result_string);
+ } else {
+ char * tmp_string = g_ucs4_to_utf8
+ ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
+ NULL, NULL, NULL);
+ fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
+ tmp_string);
+ g_array_free(results, TRUE);
+ return false;
+ }
+ g_array_free(results, TRUE);
+ g_free(result_string);
+ return true;
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+ char * result_string = g_ucs4_to_utf8
+ ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
+ NULL, NULL, NULL);
+ fprintf(output, "%d %s\n", null_token, result_string);
+ g_free(result_string);
+ return true;
+}
+
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ FILE * output = stdout;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- n-gram segment");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (outputfile) {
+ output = fopen(outputfile, "w");
+ if (NULL == output) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ if (argc > 2) {
+ fprintf(stderr, "too many arguments.\n");
+ exit(EINVAL);
+ }
+
+ if (2 == argc) {
+ input = fopen(argv[1], "r");
+ if (NULL == input) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* init phrase table */
+ FacadePhraseTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk, NULL);
+
+ /* init phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ /* init bi-gram */
+ Bigram system_bigram;
+ system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
+ Bigram user_bigram;
+
+ gfloat lambda = system_table_info.get_lambda();
+
+ /* init phrase lookup */
+ PhraseLookup phrase_lookup(lambda,
+ &phrase_table, &phrase_index,
+ &system_bigram, &user_bigram);
+
+
+ CONTEXT_STATE state, next_state;
+ GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index.prepare_tokens(tokens);
+
+ /* split the sentence */
+ char * linebuf = NULL; size_t size = 0; ssize_t read;
+ while( (read = getline(&linebuf, &size, input)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ /* check non-ucs4 characters */
+ const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+ glong len = 0;
+ ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+ if ( len != num_of_chars ) {
+ fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+ fprintf(output, "%d \n", null_token);
+ continue;
+ }
+
+ /* only new-line persists. */
+ if ( 0 == num_of_chars ) {
+ fprintf(output, "%d \n", null_token);
+ continue;
+ }
+
+ state = CONTEXT_INIT;
+ int result = phrase_table.search( 1, sentence, tokens);
+ g_array_append_val( current_ucs4, sentence[0]);
+ if ( result & SEARCH_OK )
+ state = CONTEXT_SEGMENTABLE;
+ else
+ state = CONTEXT_UNKNOWN;
+
+ for ( int i = 1; i < num_of_chars; ++i) {
+ int result = phrase_table.search( 1, sentence + i, tokens);
+ if ( result & SEARCH_OK )
+ next_state = CONTEXT_SEGMENTABLE;
+ else
+ next_state = CONTEXT_UNKNOWN;
+
+ if ( state == next_state ){
+ g_array_append_val(current_ucs4, sentence[i]);
+ continue;
+ }
+
+ assert ( state != next_state );
+ if ( state == CONTEXT_SEGMENTABLE )
+ deal_with_segmentable(&phrase_lookup, current_ucs4, output);
+
+ if ( state == CONTEXT_UNKNOWN )
+ deal_with_unknown(current_ucs4, output);
+
+ /* save the current character */
+ g_array_set_size(current_ucs4, 0);
+ g_array_append_val(current_ucs4, sentence[i]);
+ state = next_state;
+ }
+
+ if ( current_ucs4->len ) {
+ /* this seems to always be true. */
+ if ( state == CONTEXT_SEGMENTABLE )
+ deal_with_segmentable(&phrase_lookup, current_ucs4, output);
+
+ if ( state == CONTEXT_UNKNOWN )
+ deal_with_unknown(current_ucs4, output);
+ g_array_set_size(current_ucs4, 0);
+ }
+
+ /* print extra enter */
+ if ( gen_extra_enter )
+ fprintf(output, "%d \n", null_token);
+
+ g_free(sentence);
+ }
+ phrase_index.destroy_tokens(tokens);
+
+ /* print enter at file tail */
+ fprintf(output, "%d \n", null_token);
+ g_array_free(current_ucs4, TRUE);
+ free(linebuf);
+ fclose(input);
+ fclose(output);
+ return 0;
+}
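
ngseg.cpp walks each line one character at a time: while the per-character phrase table probe keeps returning the same state, the character is appended to current_ucs4; on a state change the accumulated run is flushed either through the n-gram PhraseLookup (segmentable) or verbatim with null_token (unknown). A standalone sketch of that run-splitting state machine on plain chars, where segmentable() is a toy stand-in for the phrase table probe:

    #include <cstdio>
    #include <set>
    #include <string>

    // Toy stand-in for "this character starts an entry in the phrase table".
    static bool segmentable(char c) {
        static const std::set<char> table = { 'a', 'b', 'c' };
        return table.count(c) != 0;
    }

    // Split a line into maximal segmentable / unknown runs, mirroring the
    // CONTEXT_SEGMENTABLE / CONTEXT_UNKNOWN state machine in main().
    static void split_line(const std::string & line) {
        if (line.empty()) return;

        bool state = segmentable(line[0]);
        std::string run(1, line[0]);

        for (size_t i = 1; i < line.size(); ++i) {
            bool next_state = segmentable(line[i]);
            if (next_state == state) {      // same run: keep accumulating
                run += line[i];
                continue;
            }
            std::printf("%s: %s\n", state ? "segmentable" : "unknown", run.c_str());
            run.assign(1, line[i]);         // start a new run
            state = next_state;
        }
        std::printf("%s: %s\n", state ? "segmentable" : "unknown", run.c_str());
    }

    int main() {
        split_line("abXYcc");   // runs: "ab" (segmentable), "XY" (unknown), "cc" (segmentable)
        return 0;
    }
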
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
new file mode 100644
index 0000000..b543cc5
--- /dev/null
+++ b/utils/segment/spseg.cpp
@@ -0,0 +1,343 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010,2013 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){
+ printf("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n");
+}
+
+static gboolean gen_extra_enter = FALSE;
+static gchar * outputfile = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+ {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
+ {NULL}
+};
+
+
+/* graph shortest path sentence segment. */
+
+/* Note:
+ * Currently libpinyin only supports ucs4 characters. As this is a
+ * pre-processor tool for raw corpus, it skips all sentences
+ * that contain non-ucs4 characters.
+ */
+
+enum CONTEXT_STATE{
+ CONTEXT_INIT,
+ CONTEXT_SEGMENTABLE,
+ CONTEXT_UNKNOWN
+};
+
+struct SegmentStep{
+ phrase_token_t m_handle;
+ ucs4_t * m_phrase;
+ size_t m_phrase_len;
+ //cost formula: W = number of words; a null handle still counts as one word.
+ guint m_nword;
+ //backtrace information: -1 means one step backward.
+ gint m_backward_nstep;
+public:
+ SegmentStep(){
+ m_handle = null_token;
+ m_phrase = NULL;
+ m_phrase_len = 0;
+ m_nword = UINT_MAX;
+ m_backward_nstep = -0;
+ }
+};
+
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
+
+/* Note: do not free phrase, as it is used by strings (array of segment). */
+bool segment(FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ GArray * current_ucs4,
+ GArray * strings /* Array of SegmentStep. */){
+ ucs4_t * phrase = (ucs4_t *)current_ucs4->data;
+ guint phrase_len = current_ucs4->len;
+
+ /* Prepare for shortest path segment dynamic programming. */
+ GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+ SegmentStep step;
+ for ( glong i = 0; i < phrase_len + 1; ++i ){
+ g_array_append_val(steps, step);
+ }
+
+ SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0);
+ first_step->m_nword = 0;
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index->prepare_tokens(tokens);
+
+ for ( glong i = 0; i < phrase_len + 1; ++i ) {
+ SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+ size_t nword = step_begin->m_nword;
+ for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
+ size_t len = k - i;
+ ucs4_t * cur_phrase = phrase + i;
+
+ phrase_token_t token = null_token;
+ int result = phrase_table->search(len, cur_phrase, tokens);
+ int num = get_first_token(tokens, token);
+
+ if ( !(result & SEARCH_OK) ){
+ token = null_token;
+ if ( 1 != len )
+ continue;
+ }
+ ++nword;
+
+ SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+ if ( nword < step_end->m_nword ) {
+ step_end->m_handle = token;
+ step_end->m_phrase = cur_phrase;
+ step_end->m_phrase_len = len;
+ step_end->m_nword = nword;
+ step_end->m_backward_nstep = i - k;
+ }
+ if ( !(result & SEARCH_CONTINUED) )
+ break;
+ }
+ }
+ phrase_index->destroy_tokens(tokens);
+
+ return backtrace(steps, phrase_len, strings);
+}
+
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
+ /* backtracing to get the result. */
+ size_t cur_step = phrase_len;
+ g_array_set_size(strings, 0);
+ while ( cur_step ){
+ SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step);
+ g_array_append_val(strings, *step);
+ cur_step = cur_step + step->m_backward_nstep;
+ /* intended to avoid leaking internal information. */
+ step->m_nword = 0; step->m_backward_nstep = 0;
+ }
+
+ /* reverse the strings. */
+ for ( size_t i = 0; i < strings->len / 2; ++i ) {
+ SegmentStep * head, * tail;
+ head = &g_array_index(strings, SegmentStep, i);
+ tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i );
+ SegmentStep tmp;
+ tmp = *head;
+ *head = *tail;
+ *tail = tmp;
+ }
+
+ g_array_free(steps, TRUE);
+ return true;
+}
+
+bool deal_with_segmentable(FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ GArray * current_ucs4,
+ FILE * output){
+
+ /* do segment stuff. */
+ GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+ segment(phrase_table, phrase_index, current_ucs4, strings);
+
+ /* print out the split phrase. */
+ for ( glong i = 0; i < strings->len; ++i ) {
+ SegmentStep * step = &g_array_index(strings, SegmentStep, i);
+ char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
+ fprintf(output, "%d %s\n", step->m_handle, string);
+ g_free(string);
+ }
+
+ g_array_free(strings, TRUE);
+ return true;
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+ char * result_string = g_ucs4_to_utf8
+ ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
+ NULL, NULL, NULL);
+ fprintf(output, "%d %s\n", null_token, result_string);
+ g_free(result_string);
+ return true;
+}
+
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ FILE * output = stdout;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- shortest path segment");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (outputfile) {
+ output = fopen(outputfile, "w");
+ if (NULL == output) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ if (argc > 2) {
+ fprintf(stderr, "too many arguments.\n");
+ exit(EINVAL);
+ }
+
+ if (2 == argc) {
+ input = fopen(argv[1], "r");
+ if (NULL == input) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* init phrase table */
+ FacadePhraseTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk, NULL);
+
+ /* init phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ CONTEXT_STATE state, next_state;
+ GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index.prepare_tokens(tokens);
+
+ char * linebuf = NULL; size_t size = 0; ssize_t read;
+ while( (read = getline(&linebuf, &size, input)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ /* check non-ucs4 characters. */
+ const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+ glong len = 0;
+ ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+ if ( len != num_of_chars ) {
+ fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+ fprintf(output, "%d \n", null_token);
+ continue;
+ }
+
+ /* only new-line persists. */
+ if ( 0 == num_of_chars ) {
+ fprintf(output, "%d \n", null_token);
+ continue;
+ }
+
+ state = CONTEXT_INIT;
+ int result = phrase_table.search( 1, sentence, tokens);
+ g_array_append_val( current_ucs4, sentence[0]);
+ if ( result & SEARCH_OK )
+ state = CONTEXT_SEGMENTABLE;
+ else
+ state = CONTEXT_UNKNOWN;
+
+ for ( int i = 1; i < num_of_chars; ++i) {
+ int result = phrase_table.search( 1, sentence + i, tokens);
+ if ( result & SEARCH_OK )
+ next_state = CONTEXT_SEGMENTABLE;
+ else
+ next_state = CONTEXT_UNKNOWN;
+
+ if ( state == next_state ){
+ g_array_append_val(current_ucs4, sentence[i]);
+ continue;
+ }
+
+ assert ( state != next_state );
+ if ( state == CONTEXT_SEGMENTABLE )
+ deal_with_segmentable(&phrase_table, &phrase_index,
+ current_ucs4, output);
+
+ if ( state == CONTEXT_UNKNOWN )
+ deal_with_unknown(current_ucs4, output);
+
+ /* save the current character */
+ g_array_set_size(current_ucs4, 0);
+ g_array_append_val(current_ucs4, sentence[i]);
+ state = next_state;
+ }
+
+ if ( current_ucs4->len ) {
+ /* this seems to always be true. */
+ if ( state == CONTEXT_SEGMENTABLE )
+ deal_with_segmentable(&phrase_table, &phrase_index,
+ current_ucs4, output);
+
+ if ( state == CONTEXT_UNKNOWN )
+ deal_with_unknown(current_ucs4, output);
+ g_array_set_size(current_ucs4, 0);
+ }
+
+ /* print extra enter */
+ if ( gen_extra_enter )
+ fprintf(output, "%d \n", null_token);
+
+ g_free(sentence);
+ }
+ phrase_index.destroy_tokens(tokens);
+
+ /* print enter at file tail */
+ fprintf(output, "%d \n", null_token);
+ g_array_free(current_ucs4, TRUE);
+ free(linebuf);
+ fclose(input);
+ fclose(output);
+ return 0;
+}
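
segment() above fills a steps array indexed by character position: each SegmentStep stores the smallest number of words (m_nword) needed to cover the prefix ending at that position, plus a backward offset that backtrace() later follows. A compact standalone version of the same shortest-path recurrence over a toy string dictionary (illustrative names only, not the libpinyin API):

    #include <climits>
    #include <cstdio>
    #include <set>
    #include <string>
    #include <vector>

    // Toy dictionary; spseg.cpp asks FacadePhraseTable2::search() instead.
    static const std::set<std::string> dict = { "a", "ab", "abc", "b", "c", "cd" };

    // Segment text into the fewest dictionary words; single characters are
    // always allowed as a fallback, like the "1 != len" case in segment().
    static std::vector<std::string> segment(const std::string & text) {
        const size_t n = text.size();
        std::vector<unsigned> nword(n + 1, UINT_MAX);   // SegmentStep::m_nword
        std::vector<size_t> prev(n + 1, 0);             // backtrace positions
        nword[0] = 0;

        for (size_t i = 0; i < n; ++i) {
            if (nword[i] == UINT_MAX) continue;         // unreachable prefix
            for (size_t k = i + 1; k <= n; ++k) {
                std::string piece = text.substr(i, k - i);
                if (!dict.count(piece) && piece.size() != 1) continue;
                if (nword[i] + 1 < nword[k]) {          // fewer words wins
                    nword[k] = nword[i] + 1;
                    prev[k] = i;
                }
            }
        }

        std::vector<std::string> result;
        for (size_t k = n; k > 0; k = prev[k])          // walk the backtrace
            result.insert(result.begin(), text.substr(prev[k], k - prev[k]));
        return result;
    }

    int main() {
        for (const std::string & word : segment("abcd"))
            std::printf("%s\n", word);                  // prints "ab", then "cd"
        return 0;
    }
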
diff --git a/utils/storage/CMakeLists.txt b/utils/storage/CMakeLists.txt
new file mode 100644
index 0000000..63cabcd
--- /dev/null
+++ b/utils/storage/CMakeLists.txt
@@ -0,0 +1,29 @@
+add_executable(
+ gen_binary_files
+ gen_binary_files.cpp
+)
+
+target_link_libraries(
+ gen_binary_files
+ libpinyin
+)
+
+add_executable(
+ import_interpolation
+ import_interpolation.cpp
+)
+
+target_link_libraries(
+ import_interpolation
+ libpinyin
+)
+
+add_executable(
+ export_interpolation
+ export_interpolation.cpp
+)
+
+target_link_libraries(
+ export_interpolation
+ libpinyin
+)
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
new file mode 100644
index 0000000..db63488
--- /dev/null
+++ b/utils/storage/Makefile.am
@@ -0,0 +1,45 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
+
+bin_PROGRAMS = gen_binary_files \
+ import_interpolation
+
+noinst_PROGRAMS = export_interpolation \
+ gen_pinyin_table
+
+gen_binary_files_SOURCES = gen_binary_files.cpp
+
+gen_binary_files_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+import_interpolation_SOURCES = import_interpolation.cpp
+
+import_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+export_interpolation_SOURCES = export_interpolation.cpp
+
+export_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_pinyin_table_SOURCES = gen_pinyin_table.cpp
+
+gen_pinyin_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
new file mode 100644
index 0000000..c43eefb
--- /dev/null
+++ b/utils/storage/export_interpolation.cpp
@@ -0,0 +1,144 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <assert.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* export interpolation model as textual format */
+
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
+
+bool begin_data(FILE * output){
+ fprintf(output, "\\data model interpolation\n");
+ return true;
+}
+
+bool end_data(FILE * output){
+ fprintf(output, "\\end\n");
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * output = stdout;
+ const char * bigram_filename = SYSTEM_BIGRAM;
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ bigram.attach(bigram_filename, ATTACH_READONLY);
+
+ begin_data(output);
+
+ gen_unigram(output, &phrase_index);
+ gen_bigram(output, &phrase_index, &bigram);
+
+ end_data(output);
+ return 0;
+}
+
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
+ fprintf(output, "\\1-gram\n");
+ for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {
+
+ PhraseIndexRange range;
+ int result = phrase_index->get_range(i, range);
+ if (ERROR_OK != result )
+ continue;
+
+ PhraseItem item;
+ for (phrase_token_t token = range.m_range_begin;
+ token < range.m_range_end; token++) {
+ int result = phrase_index->get_phrase_item(token, item);
+
+ if ( result == ERROR_NO_ITEM )
+ continue;
+ assert( result == ERROR_OK);
+
+ size_t freq = item.get_unigram_frequency();
+ if ( 0 == freq )
+ continue;
+ char * phrase = taglib_token_to_string(phrase_index, token);
+ if ( phrase )
+ fprintf(output, "\\item %d %s count %ld\n", token, phrase, freq);
+
+ g_free(phrase);
+ }
+ }
+ return true;
+}
+
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){
+ fprintf(output, "\\2-gram\n");
+
+ /* Retrieve all user items. */
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+ bigram->get_all_items(items);
+
+ PhraseItem item;
+
+ for(size_t i = 0; i < items->len; i++){
+ phrase_token_t token = g_array_index(items, phrase_token_t, i);
+ SingleGram * single_gram = NULL;
+ bigram->load(token, single_gram);
+
+ BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+ single_gram->retrieve_all(array);
+ for(size_t j = 0; j < array->len; j++) {
+ BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
+
+ char * word1 = taglib_token_to_string(phrase_index, token);
+ char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+ guint32 freq = item->m_count;
+
+ if ( word1 && word2)
+ fprintf(output, "\\item %d %s %d %s count %d\n",
+ token, word1, item->m_token, word2, freq);
+
+ g_free(word1); g_free(word2);
+ }
+
+ g_array_free(array, TRUE);
+ delete single_gram;
+ }
+
+ g_array_free(items, TRUE);
+ return true;
+}
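
Taken together, begin_data(), gen_unigram(), gen_bigram() and end_data() write a plain-text model whose overall shape looks like the fragment below; the token numbers, words and counts here are invented purely for illustration:

    \data model interpolation
    \1-gram
    \item 16777217 的 count 152340
    \item 16777218 了 count 98021
    \2-gram
    \item 16777217 的 16777218 了 count 310
    \end
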
diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp
new file mode 100644
index 0000000..4216b44
--- /dev/null
+++ b/utils/storage/gen_binary_files.cpp
@@ -0,0 +1,115 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+ {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+ {NULL}
+};
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate binary files");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ bool retval = system_table_info.load(filename);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+ g_free(filename);
+
+ /* generate pinyin index */
+ pinyin_option_t options = USE_TONE;
+ ChewingLargeTable chewing_table(options);
+ PhraseLargeTable2 phrase_table;
+
+ /* generate phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+ assert(table_info->m_dict_index == i);
+
+ if (SYSTEM_FILE != table_info->m_file_type &&
+ DICTIONARY != table_info->m_file_type)
+ continue;
+
+ const char * tablename = table_info->m_table_filename;
+
+ filename = g_build_filename(table_dir, tablename, NULL);
+ FILE * tablefile = fopen(filename, "r");
+
+ if (NULL == tablefile) {
+ fprintf(stderr, "open %s failed!\n", tablename);
+ exit(ENOENT);
+ }
+
+ chewing_table.load_text(tablefile);
+ fseek(tablefile, 0L, SEEK_SET);
+ phrase_table.load_text(tablefile);
+ fseek(tablefile, 0L, SEEK_SET);
+ phrase_index.load_text(i, tablefile);
+ fclose(tablefile);
+ g_free(filename);
+ }
+
+ MemoryChunk * new_chunk = new MemoryChunk;
+ chewing_table.store(new_chunk);
+ new_chunk->save(SYSTEM_PINYIN_INDEX);
+ chewing_table.load(new_chunk);
+
+ new_chunk = new MemoryChunk;
+ phrase_table.store(new_chunk);
+ new_chunk->save(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(new_chunk);
+
+ phrase_index.compact();
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ if (!save_dictionary(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp
new file mode 100644
index 0000000..3b541d1
--- /dev/null
+++ b/utils/storage/gen_pinyin_table.cpp
@@ -0,0 +1,330 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+
+
+void print_help(){
+ printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> \n"
+ "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
+ "<OUTPUTFILE> the result output file\n"
+ "<FILEi> input pinyin files\n"
+ "<PHRASE_INDEX> phrase index identifier\n");
+}
+
+
+static gint phrase_index = 0;
+static const gchar * outputfile = "temp.out";
+
+static GOptionEntry entries[] =
+{
+ {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
+ {NULL}
+};
+
+
+using namespace pinyin;
+
+/* map from phrase_item to GArray of chewing_and_freq_item */
+GTree * g_chewing_tree;
+/* Array of GArray of phrase_and_array_item */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+struct phrase_item{
+ size_t length;
+ gunichar * uniphrase;
+};
+
+struct chewing_and_freq_item{
+ ChewingKeyVector keys;
+ ChewingKeyRestVector key_rests;
+ guint32 freq;
+};
+
+struct phrase_and_array_item{
+ phrase_item phrase; /* the key of g_chewing_tree */
+ /* Array of chewing_and_freq_item */
+ GArray * chewing_and_freq_array; /* the value of g_chewing_tree */
+};
+
+
+void feed_file(const char * filename);
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data);
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+ gpointer userdata);
+
+void gen_phrase_file(const char * outputfile, int phrase_index);
+
+
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+ phrase_item * itema = (phrase_item *) a;
+ phrase_item * itemb = (phrase_item *) b;
+ if ( itema->length != itemb->length )
+ return itema->length - itemb->length;
+ else
+ return memcmp(itema->uniphrase, itemb->uniphrase,
+ sizeof(gunichar) * itema->length);
+}
+
+
+int main(int argc, char * argv[]){
+ int i;
+
+ g_chewing_tree = g_tree_new(phrase_item_compare);
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate pinyin table");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ for (i = 1; i < argc; ++i) {
+ feed_file(argv[i]);
+ }
+
+ printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
+
+ /* store in item array */
+ g_item_array[0] = NULL;
+ for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+ g_item_array[i] = g_array_new
+ (FALSE, TRUE, sizeof(phrase_and_array_item));
+ }
+ g_tree_foreach(g_chewing_tree, store_one_item, NULL);
+
+ /* sort item array */
+ for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+ g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i);
+ }
+
+ gen_phrase_file(outputfile, phrase_index);
+
+ return 0;
+}
+
+void feed_file ( const char * filename){
+ char phrase[1024], pinyin[1024];
+ guint32 freq;
+
+ FILE * infile = fopen(filename, "r");
+ if ( NULL == infile ){
+ fprintf(stderr, "Can't open file %s.\n", filename);
+ exit(ENOENT);
+ }
+
+ while ( !feof(infile)){
+ int num = fscanf(infile, "%s %s %u",
+ phrase, pinyin, &freq);
+
+ if (3 != num)
+ continue;
+
+ if (feof(infile))
+ break;
+
+ feed_line(phrase, pinyin, freq);
+ }
+
+ fclose(infile);
+}
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
+ phrase_item * item = new phrase_item;
+ item->length = g_utf8_strlen(phrase, -1);
+
+ /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp,
+ * which is the code I don't want to touch. :-)
+ */
+
+ if (item->length >= MAX_PHRASE_LENGTH) {
+ fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq);
+ delete item;
+ return;
+ }
+
+ item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+
+ FullPinyinParser2 parser;
+ ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ ChewingKeyRestVector key_rests = g_array_new
+ (FALSE, FALSE, sizeof(ChewingKeyRest));
+
+ pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
+ parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+ assert(keys->len == key_rests->len);
+
+ if (keys->len != item->length) {
+ fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq);
+ delete item;
+ return;
+ }
+
+ GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
+
+ chewing_and_freq_item value_item;
+ value_item.keys = keys; value_item.key_rests = key_rests;
+ value_item.freq = freq;
+
+ assert(item->length == value_item.keys->len);
+ if (NULL == array) {
+ array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
+ g_array_append_val(array, value_item);
+ g_tree_insert(g_chewing_tree, item, array);
+ return;
+ }
+
+ bool found = false;
+ for (size_t i = 0; i < array->len; ++i) {
+ chewing_and_freq_item * cur_item =
+ &g_array_index(array, chewing_and_freq_item, i);
+ int result = pinyin_exact_compare2
+ ((ChewingKey *) value_item.keys->data,
+ (ChewingKey *) cur_item->keys->data,
+ value_item.keys->len);
+
+ if (0 == result) {
+ fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+ phrase, pinyin, freq);
+ cur_item->freq += freq;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ g_array_append_val(array, value_item);
+ g_tree_insert(g_chewing_tree, item, array);
+ } else {
+ /* clean up */
+ g_array_free(keys, TRUE);
+ g_array_free(key_rests, TRUE);
+ }
+
+ delete item;
+}
+
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
+ phrase_and_array_item item;
+ item.phrase = *((phrase_item *) key);
+ item.chewing_and_freq_array = (GArray *) value;
+ int len = item.phrase.length;
+ g_array_append_val(g_item_array[len], item);
+ return FALSE;
+}
+
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+ gpointer userdata) {
+ int phrase_length = *((int *) userdata);
+ phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs;
+ phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs;
+
+ ChewingKeyVector keys_lhs = g_array_index
+ (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
+ ChewingKeyVector keys_rhs = g_array_index
+ (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
+ return pinyin_exact_compare2((ChewingKey *)keys_lhs->data,
+ (ChewingKey *)keys_rhs->data, phrase_length);
+}
+
+
+void gen_phrase_file(const char * outputfile, int phrase_index){
+ FILE * outfile = fopen(outputfile, "w");
+ if (NULL == outfile ) {
+ fprintf(stderr, "Can't write file %s.\n", outputfile);
+ exit(ENOENT);
+ }
+
+ phrase_token_t token = 1;
+
+ /* phrase length index */
+ for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+ GArray * item_array = g_item_array[i];
+
+ /* item array index */
+ for (size_t m = 0; m < item_array->len; ++m) {
+ phrase_and_array_item * item = &g_array_index
+ (item_array, phrase_and_array_item, m);
+ phrase_item phrase = item->phrase;
+ GArray * chewing_and_freqs = item->chewing_and_freq_array;
+
+ gchar * phrase_str = g_ucs4_to_utf8
+ (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
+
+ /* iterate each pinyin */
+ for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
+ chewing_and_freq_item * chewing_and_freq =
+ &g_array_index
+ (chewing_and_freqs, chewing_and_freq_item, n);
+
+ ChewingKeyVector keys = chewing_and_freq->keys;
+ ChewingKeyRestVector key_rests = chewing_and_freq->key_rests;
+
+ GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
+ gchar * pinyin = NULL;
+
+ size_t k;
+ for (k = 0; k < keys->len; ++k) {
+ ChewingKey key = g_array_index(keys, ChewingKey, k);
+ ChewingKeyRest key_rest = g_array_index
+ (key_rests, ChewingKeyRest, k);
+
+ //assert (CHEWING_ZERO_TONE != key.m_tone);
+ pinyin = key.get_pinyin_string();
+ g_array_append_val(pinyins, pinyin);
+ }
+ gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
+
+ for (k = 0; k < pinyins->len; ++k) {
+ g_free(g_array_index(pinyins, gchar *, k));
+ }
+ g_array_free(pinyins, TRUE);
+
+ guint32 freq = chewing_and_freq->freq;
+
+ /* avoid zero freq */
+ if (freq < 3) freq = 3;
+
+ fprintf(outfile, "%s\t%s\t%d\t%d\n",
+ pinyin_str, phrase_str,
+ PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq);
+
+ g_free(pinyin_str);
+ }
+ g_free(phrase_str);
+ token++;
+ }
+ }
+
+ fclose(outfile);
+}
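
gen_phrase_file() writes one tab-separated row per phrase/pinyin pair: the apostrophe-joined pinyin string, the phrase, the packed token from PHRASE_INDEX_MAKE_TOKEN, and the frequency (clamped to at least 3). A row would therefore look roughly like the following, with <TAB> marking the separator and all values invented:

    ni'hao<TAB>你好<TAB>16777217<TAB>2345
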
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
new file mode 100644
index 0000000..205a27a
--- /dev/null
+++ b/utils/storage/import_interpolation.cpp
@@ -0,0 +1,313 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+ {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+ {NULL}
+};
+
+
+enum LINE_TYPE{
+ BEGIN_LINE = 1,
+ END_LINE,
+ GRAM_1_LINE,
+ GRAM_2_LINE,
+ GRAM_1_ITEM_LINE,
+ GRAM_2_ITEM_LINE
+};
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline();
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index);
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * bigram);
+
+static ssize_t my_getline(FILE * input){
+ ssize_t result = getline(&linebuf, &len, input);
+ if ( result == -1 )
+ return result;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+ return result;
+}
+
+bool parse_headline(){
+ /* enter "\data" line */
+ assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
+
+ /* read "\data" line */
+ if ( !taglib_read(linebuf, line_type, values, required) ) {
+ fprintf(stderr, "error: interpolation model expected.\n");
+ return false;
+ }
+
+ assert(line_type == BEGIN_LINE);
+ /* check header */
+ TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+ if ( !( strcmp("interpolation", model) == 0 ) ) {
+ fprintf(stderr, "error: interpolation model expected.\n");
+ return false;
+ }
+ return true;
+}
+
+bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
+ do {
+ retry:
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case END_LINE:
+ goto end;
+ case GRAM_1_LINE:
+ my_getline(input);
+ parse_unigram(input, phrase_table, phrase_index);
+ goto retry;
+ case GRAM_2_LINE:
+ my_getline(input);
+ parse_bigram(input, phrase_table, phrase_index, bigram);
+ goto retry;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1) ;
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", ""));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_1_ITEM_LINE:{
+ /* handle \item in \1-gram */
+ TAGLIB_GET_TOKEN(token, 0);
+ TAGLIB_GET_PHRASE_STRING(word, 1);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token, word));
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ phrase_index->add_unigram_frequency(token, count);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", ""));
+
+ phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL;
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_2_ITEM_LINE:{
+ /* handle \item in \2-gram */
+ /* two tokens */
+ TAGLIB_GET_TOKEN(token1, 0);
+ TAGLIB_GET_PHRASE_STRING(word1, 1);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token1, word1));
+
+ TAGLIB_GET_TOKEN(token2, 2);
+ TAGLIB_GET_PHRASE_STRING(word2, 3);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token2, word2));
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+
+ if ( last_token != token1 ) {
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+
+ /* safe guard */
+ last_token = null_token;
+ last_single_gram = NULL;
+ }
+ SingleGram * single_gram = NULL;
+ bigram->load(token1, single_gram);
+
+ /* create the new single gram */
+ if ( single_gram == NULL )
+ single_gram = new SingleGram;
+ last_token = token1;
+ last_single_gram = single_gram;
+ }
+
+ /* save the freq */
+ assert(NULL != last_single_gram);
+ guint32 total_freq = 0;
+ assert(last_single_gram->get_total_freq(total_freq));
+ assert(last_single_gram->insert_freq(token2, count));
+ total_freq += count;
+ assert(last_single_gram->set_total_freq(total_freq));
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+ //safe guard
+ last_token = 0;
+ last_single_gram = NULL;
+ }
+
+ taglib_pop_state();
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ const char * bigram_filename = SYSTEM_BIGRAM;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- import interpolation model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ bool retval = system_table_info.load(filename);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+ g_free(filename);
+
+ PhraseLargeTable2 phrase_table;
+
+ MemoryChunk * chunk = new MemoryChunk;
+ retval = chunk->load(SYSTEM_PHRASE_INDEX);
+ if (!retval) {
+ fprintf(stderr, "open phrase_index.bin failed!\n");
+ exit(ENOENT);
+ }
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", bigram_filename);
+ exit(ENOENT);
+ }
+
+ taglib_init();
+
+ values = g_ptr_array_new();
+ required = g_hash_table_new(g_str_hash, g_str_equal);
+
+ /* read first line */
+ ssize_t result = my_getline(input);
+ if ( result == -1 ) {
+ fprintf(stderr, "empty file input.\n");
+ exit(ENODATA);
+ }
+
+ if (!parse_headline())
+ exit(ENODATA);
+
+ result = my_getline(input);
+ if ( result != -1 )
+ parse_body(input, &phrase_table, &phrase_index, &bigram);
+
+ taglib_fini();
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
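
parse_body() above keys every input line off its leading tag: the section lines "\1-gram" and "\2-gram" switch the active handler, "\item" lines are consumed by whichever handler is active, and "\end" stops the loop. A minimal sketch of that dispatch without taglib, run over a hard-coded snippet of the textual model (token numbers and words are invented):

    #include <cstdio>
    #include <sstream>
    #include <string>

    int main() {
        std::string text =
            "\\data model interpolation\n"
            "\\1-gram\n"
            "\\item 17 foo count 5\n"
            "\\2-gram\n"
            "\\item 17 foo 18 bar count 2\n"
            "\\end\n";

        std::istringstream input(text);
        std::string line, section;
        while (std::getline(input, line)) {
            if (line.rfind("\\data", 0) == 0 || line == "\\end")
                continue;                   // header / trailer lines
            if (line == "\\1-gram" || line == "\\2-gram") {
                section = line;             // switch the active section
                continue;
            }
            // an "\item ..." line belongs to the current section
            std::printf("%s item: %s\n", section.c_str(), line.c_str());
        }
        return 0;
    }
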
diff --git a/utils/training/CMakeLists.txt b/utils/training/CMakeLists.txt
new file mode 100644
index 0000000..ee59bcd
--- /dev/null
+++ b/utils/training/CMakeLists.txt
@@ -0,0 +1,129 @@
+add_executable(
+ gen_ngram
+ gen_ngram.cpp
+)
+
+target_link_libraries(
+ gen_ngram
+ libpinyin
+)
+
+add_executable(
+ gen_deleted_ngram
+ gen_deleted_ngram.cpp
+)
+
+target_link_libraries(
+ gen_deleted_ngram
+ libpinyin
+)
+
+add_executable(
+ gen_unigram
+ gen_unigram.cpp
+)
+
+target_link_libraries(
+ gen_unigram
+ libpinyin
+)
+
+add_executable(
+ gen_k_mixture_model
+ gen_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ gen_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ estimate_interpolation
+ estimate_interpolation.cpp
+)
+
+target_link_libraries(
+ estimate_interpolation
+ libpinyin
+)
+
+add_executable(
+ estimate_k_mixture_model
+ estimate_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ estimate_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ merge_k_mixture_model
+ merge_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ merge_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ prune_k_mixture_model
+ prune_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ prune_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ import_k_mixture_model
+ import_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ import_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ export_k_mixture_model
+ export_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ export_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ k_mixture_model_to_interpolation
+ k_mixture_model_to_interpolation.cpp
+)
+
+target_link_libraries(
+ k_mixture_model_to_interpolation
+ libpinyin
+)
+
+add_executable(
+ validate_k_mixture_model
+ validate_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ validate_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ eval_correction_rate
+ eval_correction_rate.cpp
+)
+
+target_link_libraries(
+ eval_correction_rate
+ libpinyin
+) \ No newline at end of file
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
new file mode 100644
index 0000000..dc834ec
--- /dev/null
+++ b/utils/training/Makefile.am
@@ -0,0 +1,97 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
+
+noinst_HEADERS = k_mixture_model.h
+
+bin_PROGRAMS = gen_unigram
+
+noinst_PROGRAMS = gen_ngram \
+ gen_deleted_ngram \
+ gen_k_mixture_model \
+ estimate_interpolation \
+ estimate_k_mixture_model \
+ merge_k_mixture_model \
+ prune_k_mixture_model \
+ import_k_mixture_model \
+ export_k_mixture_model \
+ k_mixture_model_to_interpolation \
+ validate_k_mixture_model \
+ eval_correction_rate
+
+gen_ngram_SOURCES = gen_ngram.cpp
+
+gen_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_deleted_ngram_SOURCES = gen_deleted_ngram.cpp
+
+gen_deleted_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_unigram_SOURCES = gen_unigram.cpp
+
+gen_unigram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_k_mixture_model_SOURCES = gen_k_mixture_model.cpp
+
+gen_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+estimate_interpolation_SOURCES = estimate_interpolation.cpp
+
+estimate_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+estimate_k_mixture_model_SOURCES = estimate_k_mixture_model.cpp
+
+estimate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \
+ @GLIB2_LIBS@
+
+merge_k_mixture_model_SOURCES = merge_k_mixture_model.cpp
+
+merge_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+prune_k_mixture_model_SOURCES = prune_k_mixture_model.cpp
+
+prune_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+import_k_mixture_model_SOURCES = import_k_mixture_model.cpp
+
+import_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+export_k_mixture_model_SOURCES = export_k_mixture_model.cpp
+
+export_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp
+
+k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin_internal.la \
+ @GLIB2_LIBS@
+
+validate_k_mixture_model_SOURCES = validate_k_mixture_model.cpp
+
+validate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \
+ @GLIB2_LIBS@
+
+eval_correction_rate_SOURCES = eval_correction_rate.cpp
+
+eval_correction_rate_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp
new file mode 100644
index 0000000..5cdc680
--- /dev/null
+++ b/utils/training/estimate_interpolation.cpp
@@ -0,0 +1,144 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2008 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <math.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+parameter_t compute_interpolation(SingleGram * deleted_bigram,
+ FacadePhraseIndex * unigram,
+ SingleGram * bigram){
+ bool success;
+ parameter_t lambda = 0, next_lambda = 0.6;
+ parameter_t epsilon = 0.001;
+
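+    /* Deleted interpolation: iterate the EM-style update
+     *   lambda' = sum_w count_del(w) * lambda * P_bigram(w) /
+     *             (lambda * P_bigram(w) + (1 - lambda) * P_unigram(w)),
+     * normalized by the total deleted count, until the change in
+     * lambda falls below epsilon. */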
+ while ( fabs(lambda - next_lambda) > epsilon){
+ lambda = next_lambda;
+ next_lambda = 0;
+ guint32 table_num = 0;
+ parameter_t numerator = 0;
+ parameter_t part_of_denominator = 0;
+
+ BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+ deleted_bigram->retrieve_all(array);
+
+ for ( int i = 0; i < array->len; ++i){
+ BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, i);
+ //get the phrase token
+ phrase_token_t token = item->m_token;
+ guint32 deleted_count = item->m_count;
+
+ {
+ guint32 freq = 0;
+ parameter_t elem_poss = 0;
+ if (bigram && bigram->get_freq(token, freq)){
+ guint32 total_freq;
+ assert(bigram->get_total_freq(total_freq));
+ assert(0 != total_freq);
+ elem_poss = freq / (parameter_t) total_freq;
+ }
+ numerator = lambda * elem_poss;
+ }
+
+            {
+                parameter_t elem_poss = 0;
+                PhraseItem phrase_item;
+                /* get_phrase_item returns ERROR_OK (0) on success. */
+                if (!unigram->get_phrase_item(token, phrase_item)){
+                    guint32 freq = phrase_item.get_unigram_frequency();
+                    guint32 total_freq = unigram->get_phrase_index_total_freq();
+                    elem_poss = freq / (parameter_t)total_freq;
+                }
+                part_of_denominator = (1 - lambda) * elem_poss;
+            }
+
+ if (0 == (numerator + part_of_denominator))
+ continue;
+
+ next_lambda += deleted_count * (numerator / (numerator + part_of_denominator));
+ }
+ assert(deleted_bigram->get_total_freq(table_num));
+ next_lambda /= table_num;
+
+ g_array_free(array, TRUE);
+ }
+ lambda = next_lambda;
+ return lambda;
+}
+
+int main(int argc, char * argv[]){
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
+
+ Bigram deleted_bigram;
+ deleted_bigram.attach(DELETED_BIGRAM, ATTACH_READONLY);
+
+ GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ deleted_bigram.get_all_items(deleted_items);
+
+ parameter_t lambda_sum = 0;
+ int lambda_count = 0;
+
+ for ( int i = 0; i < deleted_items->len; ++i ){
+ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
+ SingleGram * single_gram = NULL;
+ bigram.load(*token, single_gram);
+
+ SingleGram * deleted_single_gram = NULL;
+ deleted_bigram.load(*token, deleted_single_gram);
+
+ parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram);
+
+ printf("token:%d lambda:%f\n", *token, lambda);
+
+ lambda_sum += lambda;
+ lambda_count ++;
+
+ if (single_gram)
+ delete single_gram;
+ delete deleted_single_gram;
+ }
+
+ printf("average lambda:%f\n", (lambda_sum/lambda_count));
+ g_array_free(deleted_items, TRUE);
+ return 0;
+}
+
diff --git a/utils/training/estimate_k_mixture_model.cpp b/utils/training/estimate_k_mixture_model.cpp
new file mode 100644
index 0000000..c0fa03f
--- /dev/null
+++ b/utils/training/estimate_k_mixture_model.cpp
@@ -0,0 +1,159 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+static const gchar * bigram_filename = "k_mixture_model_ngram.db";
+static const gchar * deleted_bigram_filename = "k_mixture_model_deleted_ngram.db";
+
+static GOptionEntry entries[] =
+{
+ {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "the bigram file", NULL},
+ {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &deleted_bigram_filename, "the deleted bigram file", NULL},
+ {NULL}
+};
+
+
+parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram,
+ KMixtureModelBigram * unigram,
+ KMixtureModelSingleGram * bigram){
+ bool success;
+ parameter_t lambda = 0, next_lambda = 0.6;
+ parameter_t epsilon = 0.001;
+
+ KMixtureModelMagicHeader magic_header;
+ assert(unigram->get_magic_header(magic_header));
+ assert(0 != magic_header.m_total_freq);
+
+ while (fabs(lambda - next_lambda) > epsilon){
+ lambda = next_lambda;
+ next_lambda = 0;
+ parameter_t numerator = 0;
+ parameter_t part_of_denominator = 0;
+
+ FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ deleted_bigram->retrieve_all(array);
+
+ for ( size_t i = 0; i < array->len; ++i){
+ KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i);
+ //get the phrase token
+ phrase_token_t token = item->m_token;
+ guint32 deleted_count = item->m_item.m_WC;
+
+ {
+ parameter_t elem_poss = 0;
+ KMixtureModelArrayHeader array_header;
+ KMixtureModelArrayItem array_item;
+ if ( bigram && bigram->get_array_item(token, array_item) ){
+ assert(bigram->get_array_header(array_header));
+ assert(0 != array_header.m_WC);
+ elem_poss = array_item.m_WC / (parameter_t) array_header.m_WC;
+ }
+ numerator = lambda * elem_poss;
+ }
+
+ {
+ parameter_t elem_poss = 0;
+ KMixtureModelArrayHeader array_header;
+ if (unigram->get_array_header(token, array_header)){
+ elem_poss = array_header.m_freq / (parameter_t) magic_header.m_total_freq;
+ }
+ part_of_denominator = (1 - lambda) * elem_poss;
+ }
+ if (0 == (numerator + part_of_denominator))
+ continue;
+
+ next_lambda += deleted_count * (numerator / (numerator + part_of_denominator));
+ }
+ KMixtureModelArrayHeader header;
+ assert(deleted_bigram->get_array_header(header));
+ assert(0 != header.m_WC);
+ next_lambda /= header.m_WC;
+
+ g_array_free(array, TRUE);
+ }
+ lambda = next_lambda;
+ return lambda;
+}
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- estimate k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ /* TODO: magic header signature check here. */
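+    /* The unigram statistics (magic and array headers) and the bigram
+     * statistics below are two read-only views of the same model file. */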
+ KMixtureModelBigram unigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ unigram.attach(bigram_filename, ATTACH_READONLY);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(bigram_filename, ATTACH_READONLY);
+
+ KMixtureModelBigram deleted_bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ deleted_bigram.attach(deleted_bigram_filename, ATTACH_READONLY);
+
+ GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ deleted_bigram.get_all_items(deleted_items);
+
+ parameter_t lambda_sum = 0;
+ int lambda_count = 0;
+
+ for( size_t i = 0; i < deleted_items->len; ++i ){
+ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
+ KMixtureModelSingleGram * single_gram = NULL;
+ bigram.load(*token, single_gram);
+
+ KMixtureModelSingleGram * deleted_single_gram = NULL;
+ deleted_bigram.load(*token, deleted_single_gram);
+
+ KMixtureModelArrayHeader array_header;
+ if (single_gram)
+ assert(single_gram->get_array_header(array_header));
+ KMixtureModelArrayHeader deleted_array_header;
+ assert(deleted_single_gram->get_array_header(deleted_array_header));
+
+ if ( 0 != deleted_array_header.m_WC ) {
+ parameter_t lambda = compute_interpolation(deleted_single_gram, &unigram, single_gram);
+
+ printf("token:%d lambda:%f\n", *token, lambda);
+
+ lambda_sum += lambda;
+ lambda_count ++;
+ }
+
+ if (single_gram)
+ delete single_gram;
+ delete deleted_single_gram;
+ }
+
+ printf("average lambda:%f\n", (lambda_sum/lambda_count));
+ g_array_free(deleted_items, TRUE);
+ return 0;
+}
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp
new file mode 100644
index 0000000..b45781d
--- /dev/null
+++ b/utils/training/eval_correction_rate.cpp
@@ -0,0 +1,211 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){
+ printf("Usage: eval_correction_rate\n");
+}
+
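+/* Convert a token sequence back into pinyin keys, choosing the most
+ * frequent recorded pronunciation of each phrase. */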
+bool get_possible_pinyin(FacadePhraseIndex * phrase_index,
+ TokenVector tokens, ChewingKeyVector keys){
+ ChewingKey buffer[MAX_PHRASE_LENGTH];
+ size_t key_index; guint32 max_freq;
+ guint32 freq;
+ g_array_set_size(keys, 0);
+
+ for (size_t i = 0; i < tokens->len; ++i){
+ phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i);
+ PhraseItem item;
+ phrase_index->get_phrase_item(*token, item);
+ key_index = 0; max_freq = 0;
+ for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) {
+ freq = 0;
+ assert(item.get_nth_pronunciation(m, buffer, freq));
+ if ( freq > max_freq ) {
+ key_index = m;
+ max_freq = freq;
+ }
+ }
+
+ assert(item.get_nth_pronunciation(key_index, buffer, freq));
+ assert(max_freq == freq);
+ guint8 len = item.get_phrase_length();
+ g_array_append_vals(keys, buffer, len);
+ }
+ return true;
+}
+
+bool get_best_match(PinyinLookup2 * pinyin_lookup,
+ ChewingKeyVector keys, TokenVector tokens){
+ /* prepare the prefixes for get_best_match. */
+ TokenVector prefixes = g_array_new
+ (FALSE, FALSE, sizeof(phrase_token_t));
+ g_array_append_val(prefixes, sentence_start);
+
+ /* initialize constraints. */
+ CandidateConstraints constraints = g_array_new
+ (TRUE, FALSE, sizeof(lookup_constraint_t));
+ g_array_set_size(constraints, keys->len);
+ for ( size_t i = 0; i < constraints->len; ++i ) {
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+
+ bool retval = pinyin_lookup->get_best_match(prefixes, keys, constraints, tokens);
+
+ g_array_free(prefixes, TRUE);
+ g_array_free(constraints, TRUE);
+ return retval;
+}
+
+bool do_one_test(PinyinLookup2 * pinyin_lookup,
+ FacadePhraseIndex * phrase_index,
+ TokenVector tokens){
+ bool retval = false;
+
+ ChewingKeyVector keys = g_array_new(FALSE, TRUE, sizeof(ChewingKey));
+ TokenVector guessed_tokens = g_array_new
+ (FALSE, TRUE, sizeof(phrase_token_t));
+
+ get_possible_pinyin(phrase_index, tokens, keys);
+ get_best_match(pinyin_lookup, keys, guessed_tokens);
+ /* compare the results */
+ char * sentence = NULL; char * guessed_sentence = NULL;
+ pinyin_lookup->convert_to_utf8(tokens, sentence);
+ pinyin_lookup->convert_to_utf8
+ (guessed_tokens, guessed_sentence);
+
+ if ( strcmp(sentence, guessed_sentence) != 0 ) {
+ fprintf(stderr, "test sentence:%s\n", sentence);
+ fprintf(stderr, "guessed sentence:%s\n", guessed_sentence);
+        fprintf(stderr, "the results do not match.\n");
+ retval = false;
+ } else {
+ retval = true;
+ }
+
+ g_free(sentence); g_free(guessed_sentence);
+ g_array_free(keys, TRUE);
+ g_array_free(guessed_tokens, TRUE);
+ return retval;
+}
+
+int main(int argc, char * argv[]){
+ const char * evals_text = "evals2.text";
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ pinyin_option_t options = USE_TONE;
+ FacadeChewingTable largetable;
+
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PINYIN_INDEX);
+ largetable.load(options, chunk, NULL);
+
+ FacadePhraseTable2 phrase_table;
+ chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk, NULL);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram system_bigram;
+ system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
+ Bigram user_bigram;
+ user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE);
+
+ gfloat lambda = system_table_info.get_lambda();
+
+ PinyinLookup2 pinyin_lookup(lambda, options,
+ &largetable, &phrase_index,
+ &system_bigram, &user_bigram);
+
+ /* open evals text. */
+ FILE * evals_file = fopen(evals_text, "r");
+ if ( NULL == evals_file ) {
+ fprintf(stderr, "Can't open file:%s\n", evals_text);
+ exit(ENOENT);
+ }
+
+ /* Evaluates the correction rate of test text documents. */
+ size_t tested_count = 0; size_t passed_count = 0;
+ char* linebuf = NULL; size_t size = 0;
+ TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t));
+
+ phrase_token_t token = null_token;
+    while( getline(&linebuf, &size, evals_file) != -1 ) {
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+ if ( null_token == token ) {
+ if ( tokens->len ) { /* one test. */
+ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
+ tested_count ++; passed_count ++;
+ } else {
+ tested_count ++;
+ }
+ g_array_set_size(tokens, 0);
+ }
+ } else {
+ g_array_append_val(tokens, token);
+ }
+ }
+
+ if ( tokens->len ) { /* one test. */
+ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
+ tested_count ++; passed_count ++;
+ } else {
+ tested_count ++;
+ }
+ }
+
+ parameter_t rate = passed_count / (parameter_t) tested_count;
+ printf("correction rate:%f\n", rate);
+
+ g_array_free(tokens, TRUE);
+ fclose(evals_file);
+ free(linebuf);
+
+ return 0;
+}
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
new file mode 100644
index 0000000..e446e79
--- /dev/null
+++ b/utils/training/export_k_mixture_model.cpp
@@ -0,0 +1,156 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+#include "utils_helper.h"
+
+static const gchar * k_mixture_model_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL},
+ {NULL}
+};
+
+
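+/* The functions below dump the model in the tagged text format that
+ * import_k_mixture_model reads back: a "\data" head line, the "\1-gram"
+ * and "\2-gram" sections, and a final "\end" line. */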
+bool print_k_mixture_model_magic_header(FILE * output,
+ KMixtureModelBigram * bigram){
+ KMixtureModelMagicHeader magic_header;
+ if ( !bigram->get_magic_header(magic_header) ){
+ fprintf(stderr, "no magic header in k mixture model.\n");
+ exit(ENODATA);
+ }
+ fprintf(output, "\\data model \"k mixture model\" count %d N %d "
+ "total_freq %d\n", magic_header.m_WC, magic_header.m_N,
+ magic_header.m_total_freq);
+ return true;
+}
+
+bool print_k_mixture_model_array_headers(FILE * output,
+ KMixtureModelBigram * bigram,
+ FacadePhraseIndex * phrase_index){
+ fprintf(output, "\\1-gram\n");
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram->get_all_items(items);
+
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t token = g_array_index(items, phrase_token_t, i);
+ KMixtureModelArrayHeader array_header;
+ assert(bigram->get_array_header(token, array_header));
+ char * phrase = taglib_token_to_string(phrase_index, token);
+ if ( phrase )
+ fprintf(output, "\\item %d %s count %d freq %d\n",
+ token, phrase, array_header.m_WC, array_header.m_freq);
+
+ g_free(phrase);
+ }
+
+    g_array_free(items, TRUE);
+    return true;
+}
+
+bool print_k_mixture_model_array_items(FILE * output,
+ KMixtureModelBigram * bigram,
+ FacadePhraseIndex * phrase_index){
+ fprintf(output, "\\2-gram\n");
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram->get_all_items(items);
+
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t token = g_array_index(items, phrase_token_t, i);
+ KMixtureModelSingleGram * single_gram = NULL;
+ assert(bigram->load(token, single_gram));
+ FlexibleBigramPhraseArray array = g_array_new
+ (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ single_gram->retrieve_all(array);
+
+ for (size_t m = 0; m < array->len; ++m){
+ KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m);
+ char * word1 = taglib_token_to_string(phrase_index, token);
+ char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+
+ if (word1 && word2)
+ fprintf(output, "\\item %d %s %d %s count %d T %d N_n_0 %d n_1 %d Mr %d\n",
+ token, word1, item->m_token, word2,
+ item->m_item.m_WC, item->m_item.m_WC,
+ item->m_item.m_N_n_0, item->m_item.m_n_1,
+ item->m_item.m_Mr);
+
+ g_free(word1); g_free(word2);
+ }
+
+ g_array_free(array, TRUE);
+ delete single_gram;
+ }
+
+ g_array_free(items, TRUE);
+ return true;
+}
+
+bool end_data(FILE * output){
+ fprintf(output, "\\end\n");
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * output = stdout;
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- export k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ if (!bigram.attach(k_mixture_model_filename, ATTACH_READONLY)) {
+ fprintf(stderr, "open %s failed.\n", k_mixture_model_filename);
+ exit(ENOENT);
+ }
+
+ print_k_mixture_model_magic_header(output, &bigram);
+ print_k_mixture_model_array_headers(output, &bigram, &phrase_index);
+ print_k_mixture_model_array_items(output, &bigram, &phrase_index);
+
+ end_data(output);
+
+ return 0;
+}
diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp
new file mode 100644
index 0000000..b6f96fa
--- /dev/null
+++ b/utils/training/gen_deleted_ngram.cpp
@@ -0,0 +1,128 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007, 2011 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static gboolean train_pi_gram = TRUE;
+static const gchar * bigram_filename = DELETED_BIGRAM;
+
+static GOptionEntry entries[] =
+{
+ {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL},
+ {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "deleted bi-gram file", NULL},
+ {NULL}
+};
+
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate deleted n-gram");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* load phrase table. */
+ PhraseLargeTable2 phrase_table;
+ MemoryChunk * new_chunk = new MemoryChunk;
+ new_chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(new_chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENODATA);
+
+ Bigram bigram;
+ bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+
+ char* linebuf = NULL; size_t size = 0;
+ phrase_token_t last_token, cur_token = last_token = 0;
+    while( getline(&linebuf, &size, stdin) != -1 ){
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+ last_token = cur_token;
+ cur_token = token;
+
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !train_pi_gram )
+ continue;
+ last_token = sentence_start;
+ }
+
+ /* train bi-gram */
+ SingleGram * single_gram = NULL;
+ bigram.load(last_token, single_gram);
+
+ if ( NULL == single_gram ){
+ single_gram = new SingleGram;
+ }
+ guint32 freq, total_freq;
+        /* increase freq */
+ if (single_gram->get_freq(cur_token, freq))
+ assert(single_gram->set_freq(cur_token, freq + 1));
+ else
+ assert(single_gram->insert_freq(cur_token, 1));
+        /* increase total freq */
+ single_gram->get_total_freq(total_freq);
+ single_gram->set_total_freq(total_freq + 1);
+
+ bigram.store(last_token, single_gram);
+ delete single_gram;
+ }
+
+ free(linebuf);
+ return 0;
+}
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
new file mode 100644
index 0000000..2dfb3d1
--- /dev/null
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -0,0 +1,411 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <glib.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+#include "k_mixture_model.h"
+
+/* Hash from the first token to a hash from the second token
+   to the word-pair count within one document. */
+typedef GHashTable * HashofDocument;
+typedef GHashTable * HashofSecondWord;
+
+/* Hash from a token to its occurrence count within one document. */
+typedef GHashTable * HashofUnigram;
+
+
+void print_help(){
+ printf("Usage: gen_k_mixture_model [--skip-pi-gram-training]\n"
+ " [--maximum-occurs-allowed <INT>]\n"
+ " [--maximum-increase-rates-allowed <FLOAT>]\n"
+ " [--k-mixture-model-file <FILENAME>]\n"
+ " {<FILENAME>}+\n");
+}
+
+
+static gint g_maximum_occurs = 20;
+static parameter_t g_maximum_increase_rates = 3.;
+static gboolean g_train_pi_gram = TRUE;
+static const gchar * g_k_mixture_model_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &g_train_pi_gram, "skip pi-gram training", NULL},
+ {"maximum-occurs-allowed", 0, 0, G_OPTION_ARG_INT, &g_maximum_occurs, "maximum occurs allowed", NULL},
+ {"maximum-increase-rates-allowed", 0, 0, G_OPTION_ARG_DOUBLE, &g_maximum_increase_rates, "maximum increase rates allowed", NULL},
+ {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &g_k_mixture_model_filename, "k mixture model file", NULL},
+ {NULL}
+};
+
+
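+/* Read one pre-segmented document, accumulating word-pair counts into
+ * hash_of_document and per-token counts into hash_of_unigram. */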
+bool read_document(PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ FILE * document,
+ HashofDocument hash_of_document,
+ HashofUnigram hash_of_unigram){
+
+ char * linebuf = NULL;size_t size = 0;
+ phrase_token_t last_token, cur_token = last_token = 0;
+
+    while ( getline(&linebuf, &size, document) != -1 ){
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf);
+
+ last_token = cur_token;
+ cur_token = token;
+
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ gpointer value = NULL;
+ gboolean lookup_result = g_hash_table_lookup_extended
+ (hash_of_unigram, GUINT_TO_POINTER(cur_token),
+ NULL, &value);
+ if ( !lookup_result ){
+ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token),
+ GUINT_TO_POINTER(1));
+ } else {
+ guint32 freq = GPOINTER_TO_UINT(value);
+ freq ++;
+ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token),
+ GUINT_TO_POINTER(freq));
+ }
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !g_train_pi_gram )
+ continue;
+ last_token = sentence_start;
+ }
+
+ /* remember the (last_token, cur_token) word pair. */
+ HashofSecondWord hash_of_second_word = NULL;
+ lookup_result = g_hash_table_lookup_extended
+ (hash_of_document, GUINT_TO_POINTER(last_token),
+ NULL, &value);
+ if ( !lookup_result ){
+ hash_of_second_word = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+ } else {
+ hash_of_second_word = (HashofSecondWord) value;
+ }
+
+ value = NULL;
+ lookup_result = g_hash_table_lookup_extended
+ (hash_of_second_word, GUINT_TO_POINTER(cur_token),
+ NULL, &value);
+ guint32 count = 0;
+ if ( lookup_result ) {
+ count = GPOINTER_TO_UINT(value);
+ }
+ count ++;
+ g_hash_table_insert(hash_of_second_word,
+ GUINT_TO_POINTER(cur_token),
+ GUINT_TO_POINTER(count));
+ g_hash_table_insert(hash_of_document,
+ GUINT_TO_POINTER(last_token),
+ hash_of_second_word);
+ }
+
+ free(linebuf);
+
+ return true;
+}
+
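+/* Update the statistics of one word pair with its count in the current
+ * document:
+ *   m_WC    - total occurrences of the pair across all documents,
+ *   m_N_n_0 - number of documents containing the pair,
+ *   m_n_1   - number of documents containing the pair exactly once,
+ *   m_Mr    - maximum occurrences of the pair within a single document.
+ * A pair occurring more often than allowed in one document is dropped
+ * and its contribution is removed from the unigram counts. */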
+static void train_word_pair(HashofUnigram hash_of_unigram,
+ KMixtureModelSingleGram * single_gram,
+ phrase_token_t token2, guint32 count){
+ KMixtureModelArrayItem array_item;
+
+ bool exists = single_gram->get_array_item(token2, array_item);
+ if ( exists ) {
+ guint32 maximum_occurs_allowed = std_lite::max
+ ((guint32)g_maximum_occurs,
+ (guint32)ceil(array_item.m_Mr * g_maximum_increase_rates));
+        /* The pair count exceeds the maximum number of occurrences
+         * allowed in a single document; drop it and remove its
+         * contribution from the unigram counts.
+         */
+ if ( count > maximum_occurs_allowed ){
+ gpointer value = NULL;
+ assert( g_hash_table_lookup_extended
+ (hash_of_unigram, GUINT_TO_POINTER(token2),
+ NULL, &value) );
+ guint32 freq = GPOINTER_TO_UINT(value);
+ freq -= count;
+ if ( freq > 0 ) {
+ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2),
+ GUINT_TO_POINTER(freq));
+ } else if ( freq == 0 ) {
+ assert(g_hash_table_steal(hash_of_unigram,
+ GUINT_TO_POINTER(token2)));
+ } else {
+ assert(false);
+ }
+ return;
+ }
+ array_item.m_WC += count;
+ /* array_item.m_T += count; the same as m_WC. */
+ array_item.m_N_n_0 ++;
+ if ( 1 == count )
+ array_item.m_n_1 ++;
+ array_item.m_Mr = std_lite::max(array_item.m_Mr, count);
+ assert(single_gram->set_array_item(token2, array_item));
+ } else { /* item doesn't exist. */
+ /* the same as above. */
+ if ( count > g_maximum_occurs ){
+ gpointer value = NULL;
+ assert( g_hash_table_lookup_extended
+ (hash_of_unigram, GUINT_TO_POINTER(token2),
+ NULL, &value) );
+ guint32 freq = GPOINTER_TO_UINT(value);
+ freq -= count;
+ if ( freq > 0 ) {
+ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2),
+ GUINT_TO_POINTER(freq));
+ } else if ( freq == 0 ) {
+ assert(g_hash_table_steal(hash_of_unigram,
+ GUINT_TO_POINTER(token2)));
+ } else {
+ assert(false);
+ }
+ return;
+ }
+ memset(&array_item, 0, sizeof(KMixtureModelArrayItem));
+ array_item.m_WC = count;
+ /* array_item.m_T = count; the same as m_WC. */
+ array_item.m_N_n_0 = 1;
+ if ( 1 == count )
+ array_item.m_n_1 = 1;
+ array_item.m_Mr = count;
+ assert(single_gram->insert_array_item(token2, array_item));
+ }
+
+ /* save delta in the array header. */
+ KMixtureModelArrayHeader array_header;
+ single_gram->get_array_header(array_header);
+ array_header.m_WC += count;
+ single_gram->set_array_header(array_header);
+}
+
+bool train_single_gram(HashofUnigram hash_of_unigram,
+ HashofDocument hash_of_document,
+ KMixtureModelSingleGram * single_gram,
+ phrase_token_t token1,
+ guint32 & delta){
+ assert(NULL != single_gram);
+ delta = 0; /* delta in WC of single_gram. */
+ KMixtureModelArrayHeader array_header;
+ assert(single_gram->get_array_header(array_header));
+ guint32 saved_array_header_WC = array_header.m_WC;
+
+ HashofSecondWord hash_of_second_word = NULL;
+ gpointer key, value = NULL;
+ assert(g_hash_table_lookup_extended
+ (hash_of_document, GUINT_TO_POINTER(token1),
+ NULL, &value));
+ hash_of_second_word = (HashofSecondWord) value;
+ assert(NULL != hash_of_second_word);
+
+ /* train word pair */
+ GHashTableIter iter;
+ g_hash_table_iter_init(&iter, hash_of_second_word);
+ while (g_hash_table_iter_next(&iter, &key, &value)) {
+ phrase_token_t token2 = GPOINTER_TO_UINT(key);
+ guint32 count = GPOINTER_TO_UINT(value);
+ train_word_pair(hash_of_unigram, single_gram, token2, count);
+ }
+
+ assert(single_gram->get_array_header(array_header));
+ delta = array_header.m_WC - saved_array_header_WC;
+ return true;
+}
+
+static bool train_second_word(HashofUnigram hash_of_unigram,
+ KMixtureModelBigram * bigram,
+ HashofDocument hash_of_document,
+ phrase_token_t token1){
+ guint32 delta = 0;
+
+ KMixtureModelSingleGram * single_gram = NULL;
+ bool exists = bigram->load(token1, single_gram);
+ if ( !exists )
+ single_gram = new KMixtureModelSingleGram;
+ train_single_gram(hash_of_unigram, hash_of_document,
+ single_gram, token1, delta);
+
+    if ( 0 == delta ){ /* all pairs were dropped by the maximum occurs check. */
+ delete single_gram;
+ return false;
+ }
+
+ /* save the single gram. */
+ assert(bigram->store(token1, single_gram));
+ delete single_gram;
+
+ KMixtureModelMagicHeader magic_header;
+ if (!bigram->get_magic_header(magic_header)){
+ /* the first time to access the new k mixture model file. */
+ memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader));
+ }
+
+ if ( magic_header.m_WC + delta < magic_header.m_WC ){
+ fprintf(stderr, "the m_WC integer in magic header overflows.\n");
+ return false;
+ }
+ magic_header.m_WC += delta;
+ assert(bigram->set_magic_header(magic_header));
+
+ return true;
+}
+
+/* Note: this is a post-processing step; run it last. */
+static bool post_processing_unigram(KMixtureModelBigram * bigram,
+ HashofUnigram hash_of_unigram){
+ GHashTableIter iter;
+ gpointer key, value;
+ guint32 total_freq = 0;
+
+ g_hash_table_iter_init(&iter, hash_of_unigram);
+ while (g_hash_table_iter_next(&iter, &key, &value)){
+ guint32 token = GPOINTER_TO_UINT(key);
+ guint32 freq = GPOINTER_TO_UINT(value);
+        KMixtureModelArrayHeader array_header;
+        memset(&array_header, 0, sizeof(KMixtureModelArrayHeader));
+        /* The header may not exist yet; in that case start from zero. */
+        bigram->get_array_header(token, array_header);
+        array_header.m_freq += freq;
+        total_freq += freq;
+        bigram->set_array_header(token, array_header);
+ }
+
+ KMixtureModelMagicHeader magic_header;
+ assert(bigram->get_magic_header(magic_header));
+ if ( magic_header.m_total_freq + total_freq < magic_header.m_total_freq ){
+ fprintf(stderr, "the m_total_freq in magic header overflows.\n");
+ return false;
+ }
+ magic_header.m_total_freq += total_freq;
+ assert(bigram->set_magic_header(magic_header));
+
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ int i = 1;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ PhraseLargeTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+ while ( i < argc ){
+ const char * filename = argv[i];
+ FILE * document = fopen(filename, "r");
+ if ( NULL == document ){
+ int err_saved = errno;
+ fprintf(stderr, "can't open file: %s.\n", filename);
+ fprintf(stderr, "error:%s.\n", strerror(err_saved));
+ exit(err_saved);
+ }
+
+ HashofDocument hash_of_document = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+ HashofUnigram hash_of_unigram = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+
+ assert(read_document(&phrase_table, &phrase_index, document,
+ hash_of_document, hash_of_unigram));
+ fclose(document);
+ document = NULL;
+
+ GHashTableIter iter;
+ gpointer key, value;
+
+ /* train the document, and convert it to k mixture model. */
+ g_hash_table_iter_init(&iter, hash_of_document);
+ while (g_hash_table_iter_next(&iter, &key, &value)) {
+ phrase_token_t token1 = GPOINTER_TO_UINT(key);
+ train_second_word(hash_of_unigram, &bigram,
+ hash_of_document, token1);
+ }
+
+ KMixtureModelMagicHeader magic_header;
+ assert(bigram.get_magic_header(magic_header));
+ magic_header.m_N ++;
+ assert(bigram.set_magic_header(magic_header));
+
+ post_processing_unigram(&bigram, hash_of_unigram);
+
+        /* free resources of hash_of_document */
+ g_hash_table_iter_init(&iter, hash_of_document);
+ while (g_hash_table_iter_next(&iter, &key, &value)) {
+ HashofSecondWord second_word = (HashofSecondWord) value;
+ g_hash_table_iter_steal(&iter);
+ g_hash_table_unref(second_word);
+ }
+ g_hash_table_unref(hash_of_document);
+ hash_of_document = NULL;
+
+ g_hash_table_unref(hash_of_unigram);
+ hash_of_unigram = NULL;
+
+ ++i;
+ }
+
+ return 0;
+}
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
new file mode 100644
index 0000000..1947959
--- /dev/null
+++ b/utils/training/gen_ngram.cpp
@@ -0,0 +1,136 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007, 2011 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static gboolean train_pi_gram = TRUE;
+static const gchar * bigram_filename = SYSTEM_BIGRAM;
+
+static GOptionEntry entries[] =
+{
+ {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL},
+ {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "bi-gram file", NULL},
+ {NULL}
+};
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate n-gram");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ PhraseLargeTable2 phrase_table;
+ /* init phrase table */
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+
+ char* linebuf = NULL; size_t size = 0;
+ phrase_token_t last_token, cur_token = last_token = 0;
+    while( getline(&linebuf, &size, input) != -1 ){
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+ last_token = cur_token;
+ cur_token = token;
+
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ /* training uni-gram */
+ phrase_index.add_unigram_frequency(cur_token, 1);
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !train_pi_gram )
+ continue;
+ last_token = sentence_start;
+ }
+
+ /* train bi-gram */
+ SingleGram * single_gram = NULL;
+ bigram.load(last_token, single_gram);
+
+ if ( NULL == single_gram ){
+ single_gram = new SingleGram;
+ }
+ guint32 freq, total_freq;
+ /* increase freq */
+ if (single_gram->get_freq(cur_token, freq))
+ assert(single_gram->set_freq(cur_token, freq + 1));
+ else
+ assert(single_gram->insert_freq(cur_token, 1));
+ /* increase total freq */
+ single_gram->get_total_freq(total_freq);
+ single_gram->set_total_freq(total_freq + 1);
+
+ bigram.store(last_token, single_gram);
+ delete single_gram;
+ }
+
+ free(linebuf);
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
new file mode 100644
index 0000000..f4c51af
--- /dev/null
+++ b/utils/training/gen_unigram.cpp
@@ -0,0 +1,111 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+ {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+ {NULL}
+};
+
+/* Increase all unigram frequencies by a constant. */
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- increase uni-gram");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ bool retval = system_table_info.load(filename);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+ g_free(filename);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+    /* Note: increase this value as the corpus grows, so that unigram
+     * frequencies do not round to zero when computed in float format.
+     */
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+ assert(table_info->m_dict_index == i);
+
+ if (SYSTEM_FILE != table_info->m_file_type &&
+ DICTIONARY != table_info->m_file_type)
+ continue;
+
+ guint32 freq = 1;
+#if 0
+ /* skip GBK_DICTIONARY. */
+ if (GBK_DICTIONARY == table_info->m_dict_index)
+ freq = 1;
+#endif
+
+ const char * binfile = table_info->m_system_filename;
+
+ MemoryChunk * chunk = new MemoryChunk;
+ bool retval = chunk->load(binfile);
+ if (!retval) {
+ fprintf(stderr, "load %s failed!\n", binfile);
+ exit(ENOENT);
+ }
+
+ phrase_index.load(i, chunk);
+
+ PhraseIndexRange range;
+ int result = phrase_index.get_range(i, range);
+ if ( result == ERROR_OK ) {
+ for (size_t token = range.m_range_begin;
+ token <= range.m_range_end; ++token) {
+ phrase_index.add_unigram_frequency(token, freq);
+ }
+ }
+ }
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ if (!save_dictionary(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp
new file mode 100644
index 0000000..40870cf
--- /dev/null
+++ b/utils/training/import_k_mixture_model.cpp
@@ -0,0 +1,322 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+#include "k_mixture_model.h"
+
+static const gchar * k_mixture_model_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL},
+ {NULL}
+};
+
+
+enum LINE_TYPE{
+ BEGIN_LINE = 1,
+ END_LINE,
+ GRAM_1_LINE,
+ GRAM_2_LINE,
+ GRAM_1_ITEM_LINE,
+ GRAM_2_ITEM_LINE
+};
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline(KMixtureModelBigram * bigram);
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram);
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram);
+
+
+static ssize_t my_getline(FILE * input){
+ ssize_t result = getline(&linebuf, &len, input);
+ if ( result == -1 )
+ return result;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+ return result;
+}
+
+bool parse_headline(KMixtureModelBigram * bigram){
+ /* enter "\data" line */
+ assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", ""));
+
+ /* read "\data" line */
+ if ( !taglib_read(linebuf, line_type, values, required) ) {
+ fprintf(stderr, "error: k mixture model expected.\n");
+ return false;
+ }
+
+ assert(line_type == BEGIN_LINE);
+ /* check header */
+ TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+ if ( !( strcmp("k mixture model", model) == 0 ) ) {
+ fprintf(stderr, "error: k mixture model expected.\n");
+ return false;
+ }
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ TAGLIB_GET_TAGVALUE(glong, N, atol);
+ TAGLIB_GET_TAGVALUE(glong, total_freq, atol);
+
+ KMixtureModelMagicHeader magic_header;
+ memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader));
+    magic_header.m_WC = count; magic_header.m_N = N;
+ magic_header.m_total_freq = total_freq;
+ bigram->set_magic_header(magic_header);
+
+ return true;
+}
+
+bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
+ do {
+ retry:
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case END_LINE:
+ goto end;
+ case GRAM_1_LINE:
+ my_getline(input);
+ parse_unigram(input, phrase_table, phrase_index, bigram);
+ goto retry;
+ case GRAM_2_LINE:
+ my_getline(input);
+ parse_bigram(input, phrase_table, phrase_index, bigram);
+ goto retry;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1) ;
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", ""));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_1_ITEM_LINE:{
+ /* handle \item in \1-gram */
+ TAGLIB_GET_TOKEN(token, 0);
+ TAGLIB_GET_PHRASE_STRING(word, 1);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token, word));
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ TAGLIB_GET_TAGVALUE(glong, freq, atol);
+
+ KMixtureModelArrayHeader array_header;
+ memset(&array_header, 0, sizeof(KMixtureModelArrayHeader));
+ array_header.m_WC = count; array_header.m_freq = freq;
+ bigram->set_array_header(token, array_header);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+ "count:T:N_n_0:n_1:Mr", ""));
+
+ phrase_token_t last_token = null_token;
+ KMixtureModelSingleGram * last_single_gram = NULL;
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_2_ITEM_LINE:{
+ /* handle \item in \2-gram */
+ /* two tokens */
+ TAGLIB_GET_TOKEN(token1, 0);
+ TAGLIB_GET_PHRASE_STRING(word1, 1);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token1, word1));
+
+ TAGLIB_GET_TOKEN(token2, 2);
+ TAGLIB_GET_PHRASE_STRING(word2, 3);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token2, word2));
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ TAGLIB_GET_TAGVALUE(glong, T, atol);
+ assert(count == T);
+ TAGLIB_GET_TAGVALUE(glong, N_n_0, atol);
+ TAGLIB_GET_TAGVALUE(glong, n_1, atol);
+ TAGLIB_GET_TAGVALUE(glong, Mr, atol);
+
+ KMixtureModelArrayItem array_item;
+ memset(&array_item, 0, sizeof(KMixtureModelArrayItem));
+ array_item.m_WC = count; array_item.m_N_n_0 = N_n_0;
+ array_item.m_n_1 = n_1; array_item.m_Mr = Mr;
+
+ if ( last_token != token1 ) {
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+ /* safe guard */
+ last_token = null_token;
+ last_single_gram = NULL;
+ }
+ KMixtureModelSingleGram * single_gram = NULL;
+ bigram->load(token1, single_gram);
+
+ /* create the new single gram */
+ if ( single_gram == NULL )
+ single_gram = new KMixtureModelSingleGram;
+ last_token = token1;
+ last_single_gram = single_gram;
+ }
+
+ assert(NULL != last_single_gram);
+ assert(last_single_gram->insert_array_item(token2, array_item));
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+ /* safe guard */
+ last_token = null_token;
+ last_single_gram = NULL;
+ }
+
+ taglib_pop_state();
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- import k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ PhraseLargeTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+ taglib_init();
+
+ /* prepare to read n-gram model */
+ values = g_ptr_array_new();
+ required = g_hash_table_new(g_str_hash, g_str_equal);
+
+ ssize_t result = my_getline(input);
+ if ( result == -1 ) {
+ fprintf(stderr, "empty file input.\n");
+ exit(ENODATA);
+ }
+
+ if (!parse_headline(&bigram))
+ exit(ENODATA);
+
+ result = my_getline(input);
+ if ( result != -1 )
+ parse_body(input, &phrase_table, &phrase_index, &bigram);
+
+ taglib_fini();
+
+ return 0;
+}
diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h
new file mode 100644
index 0000000..ad8d3d8
--- /dev/null
+++ b/utils/training/k_mixture_model.h
@@ -0,0 +1,172 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef K_MIXTURE_MODEL
+#define K_MIXTURE_MODEL
+
+#include <math.h>
+#include "novel_types.h"
+#include "flexible_ngram.h"
+
+namespace pinyin{
+
+typedef guint32 corpus_count_t;
+
+/* Note: storage parameters: N, T, n_r.
+ * N: the total number of documents.
+ * T: the total number of instances of the word or phrase.
+ * n_r: the number of documents containing exactly r occurrences.
+ * Only n_0 and n_1 are used here.
+ */
+
+static inline parameter_t compute_alpha(corpus_count_t N, corpus_count_t n_0){
+ parameter_t alpha = 1 - n_0 / (parameter_t) N;
+ return alpha;
+}
+
+static inline parameter_t compute_gamma(corpus_count_t N,
+ corpus_count_t n_0,
+ corpus_count_t n_1){
+ parameter_t gamma = 1 - n_1 / (parameter_t) (N - n_0);
+ return gamma;
+}
+
+static inline parameter_t compute_B(corpus_count_t N,
+ corpus_count_t T,
+ corpus_count_t n_0,
+ corpus_count_t n_1){
+    /* Note: re-check whether this if statement can be removed. */
+    /* Note that B_2 is assumed to be no less than 2 in the paper. */
+#if 1
+ if ( 0 == T - n_1 && 0 == N - n_0 - n_1 )
+ return 2;
+#endif
+
+ parameter_t B = (T - n_1 ) / (parameter_t) (N - n_0 - n_1);
+ return B;
+}
+
+/* three parameters model */
+static inline parameter_t compute_Pr_G_3(corpus_count_t k,
+ parameter_t alpha,
+ parameter_t gamma,
+ parameter_t B){
+ if ( k == 0 )
+ return 1 - alpha;
+
+ if ( k == 1 )
+ return alpha * (1 - gamma);
+
+ if ( k > 1 ) {
+ return (alpha * gamma / (B - 1)) * pow((1 - 1 / (B - 1)) , k - 2);
+ }
+
+ assert(false);
+}
+
+static inline parameter_t compute_Pr_G_3_with_count(corpus_count_t k,
+ corpus_count_t N,
+ corpus_count_t T,
+ corpus_count_t n_0,
+ corpus_count_t n_1){
+ parameter_t alpha = compute_alpha(N, n_0);
+ parameter_t gamma = compute_gamma(N, n_0, n_1);
+ parameter_t B = compute_B(N, T, n_0, n_1);
+
+ return compute_Pr_G_3(k, alpha, gamma, B);
+}
+
+/* two parameters model */
+static inline parameter_t compute_Pr_G_2(corpus_count_t k,
+ parameter_t alpha,
+ parameter_t B){
+ parameter_t gamma = 1 - 1 / (B - 1);
+ return compute_Pr_G_3(k, alpha, gamma, B);
+}
+
+static inline parameter_t compute_Pr_G_2_with_count(corpus_count_t k,
+ corpus_count_t N,
+ corpus_count_t T,
+ corpus_count_t n_0,
+ corpus_count_t n_1){
+ parameter_t alpha = compute_alpha(N, n_0);
+ parameter_t B = compute_B(N, T, n_0, n_1);
+ return compute_Pr_G_2(k, alpha, B);
+}
+
+#define K_MIXTURE_MODEL_MAGIC_NUMBER "KMMP"
+
+typedef struct{
+ /* the total number of instances of all words. */
+ guint32 m_WC;
+ /* the total number of documents. */
+ guint32 m_N;
+ /* the total freq of uni-gram. */
+ guint32 m_total_freq;
+} KMixtureModelMagicHeader;
+
+typedef struct{
+ /* the total number of instances of word W1. */
+ guint32 m_WC;
+    /* the uni-gram freq; see also m_total_freq in the magic header. */
+ guint32 m_freq;
+} KMixtureModelArrayHeader;
+
+typedef struct{
+    /* the total number of occurrences of the W1,W2 word pair. */
+    guint32 m_WC;
+
+    /* the total number of instances of the word or phrase
+       (the two-word phrase). */
+    /* guint32 m_T; use m_WC instead;
+       m_T is an alias of m_WC and always has the same value. */
+
+    /* n_r: the number of documents containing exactly r occurrences. */
+    /* guint32 m_n_0;
+       Note: compute this value with the following equation:
+       m_n_0 = KMixtureModelMagicHeader.m_N - m_N_n_0;
+       m_N_n_0 is the number of documents which contain the word or phrase
+       (the two-word phrase). */
+    guint32 m_N_n_0;
+    guint32 m_n_1;
+
+    /* the maximum number of occurrences of the word or phrase
+       (the two-word phrase) in any single document seen so far. */
+    guint32 m_Mr;
+} KMixtureModelArrayItem;
+
+typedef FlexibleBigram<KMixtureModelMagicHeader,
+ KMixtureModelArrayHeader,
+ KMixtureModelArrayItem>
+KMixtureModelBigram;
+
+typedef FlexibleSingleGram<KMixtureModelArrayHeader,
+ KMixtureModelArrayItem>
+KMixtureModelSingleGram;
+
+typedef KMixtureModelSingleGram::ArrayItemWithToken
+KMixtureModelArrayItemWithToken;
+
+};
+
+
+#endif
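
The inline helpers above are the whole of the K mixture model arithmetic: alpha = 1 - n_0/N, gamma = 1 - n_1/(N - n_0), B = (T - n_1)/(N - n_0 - n_1), and Pr_G(k) is a point mass at k = 0 and k = 1 plus a geometric tail for k > 1. The following minimal standalone sketch (not part of this patch) mirrors those formulas with plain doubles and hypothetical counts N = 10, T = 12, n_0 = 4, n_1 = 3; it is only meant to show that the distribution behaves as expected and sums to 1.

    // Minimal standalone sketch (not part of this patch): mirrors the inline
    // formulas of k_mixture_model.h with plain doubles and hypothetical counts.
    #include <cmath>
    #include <cstdio>

    static double pr_g_3(unsigned k, double alpha, double gamma, double B) {
        if (k == 0) return 1 - alpha;
        if (k == 1) return alpha * (1 - gamma);
        /* k > 1: geometric tail, as in compute_Pr_G_3 above. */
        return (alpha * gamma / (B - 1)) * std::pow(1 - 1 / (B - 1), double(k - 2));
    }

    int main() {
        const double N = 10, T = 12, n_0 = 4, n_1 = 3;  /* hypothetical counts */
        const double alpha = 1 - n_0 / N;               /* compute_alpha */
        const double gamma = 1 - n_1 / (N - n_0);       /* compute_gamma */
        const double B = (T - n_1) / (N - n_0 - n_1);   /* compute_B */

        double sum = 0;
        for (unsigned k = 0; k <= 20; ++k) {
            double p = pr_g_3(k, alpha, gamma, B);
            sum += p;
            if (k <= 3)
                std::printf("Pr(%u) = %g\n", k, p);
        }
        std::printf("sum over k = 0..20: %g\n", sum);
        return 0;
    }

With these numbers alpha = 0.6, gamma = 0.5 and B = 3, so Pr(0) = 0.4, Pr(1) = 0.3, Pr(2) = 0.15, Pr(3) = 0.075, and the total mass approaches 1.
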
diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp
new file mode 100644
index 0000000..c5a66ec
--- /dev/null
+++ b/utils/training/k_mixture_model_to_interpolation.cpp
@@ -0,0 +1,214 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+enum LINE_TYPE{
+ BEGIN_LINE = 1,
+ END_LINE,
+ GRAM_1_LINE,
+ GRAM_2_LINE,
+ GRAM_1_ITEM_LINE,
+ GRAM_2_ITEM_LINE
+};
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline(FILE * input, FILE * output);
+
+bool parse_unigram(FILE * input, FILE * output);
+
+bool parse_bigram(FILE * input, FILE * output);
+
+static ssize_t my_getline(FILE * input){
+ ssize_t result = getline(&linebuf, &len, input);
+ if ( result == -1 )
+ return result;
+
+ linebuf[strlen(linebuf) - 1] = '\0';
+ return result;
+}
+
+bool parse_headline(FILE * input, FILE * output) {
+ /* enter "\data" line */
+ assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model",
+ "count:N:total_freq"));
+
+ /* read "\data" line */
+ if ( !taglib_read(linebuf, line_type, values, required) ) {
+ fprintf(stderr, "error: k mixture model expected.\n");
+ return false;
+ }
+
+ assert(line_type == BEGIN_LINE);
+ TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+ if ( !( strcmp("k mixture model", model) == 0 ) ){
+ fprintf(stderr, "error: k mixture model expected.\n");
+ return false;
+ }
+
+ /* print header */
+ fprintf(output, "\\data model interpolation\n");
+
+ return true;
+}
+
+bool parse_body(FILE * input, FILE * output){
+ taglib_push_state();
+
+ assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
+ do {
+ retry:
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case END_LINE:
+ fprintf(output, "\\end\n");
+ goto end;
+ case GRAM_1_LINE:
+ fprintf(output, "\\1-gram\n");
+ my_getline(input);
+ parse_unigram(input, output);
+ goto retry;
+ case GRAM_2_LINE:
+ fprintf(output, "\\2-gram\n");
+ my_getline(input);
+ parse_bigram(input, output);
+ goto retry;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_unigram(FILE * input, FILE * output){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count"));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case GRAM_1_ITEM_LINE: {
+ /* handle \item in \1-gram */
+ TAGLIB_GET_TOKEN(token, 0);
+ TAGLIB_GET_PHRASE_STRING(word, 1);
+
+            /* drop the "<start>" token from the uni-gram section of the
+               interpolation model. */
+ if ( sentence_start == token )
+ break;
+
+ TAGLIB_GET_TAGVALUE(glong, freq, atol);
+
+            /* ignore uni-gram items with zero freq. */
+ if ( 0 != freq )
+ fprintf(output, "\\item %d %s count %ld\n", token, word, freq);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_bigram(FILE * input, FILE * output){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+ "count", "T:N_n_0:n_1:Mr"));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_2_ITEM_LINE:{
+ /* handle \item in \2-gram */
+ /* two strings */
+ TAGLIB_GET_TOKEN(token1, 0);
+ TAGLIB_GET_PHRASE_STRING(word1, 1);
+
+ TAGLIB_GET_TOKEN(token2, 2);
+ TAGLIB_GET_PHRASE_STRING(word2, 3);
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ fprintf(output, "\\item %d %s %d %s count %ld\n",
+ token1, word1, token2, word2, count);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ FILE * output = stdout;
+
+ taglib_init();
+
+ values = g_ptr_array_new();
+ required = g_hash_table_new(g_str_hash, g_str_equal);
+
+ ssize_t result = my_getline(input);
+ if ( result == -1 ) {
+ fprintf(stderr, "empty file input.\n");
+ exit(ENODATA);
+ }
+
+ if (!parse_headline(input, output))
+ exit(ENODATA);
+
+ result = my_getline(input);
+ if ( result != -1 )
+ parse_body(input, output);
+
+ taglib_fini();
+
+ return 0;
+}
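
In short, k_mixture_model_to_interpolation is a plain text filter: it reads a k mixture model dump on stdin and writes an interpolation-model dump on stdout. The sentence_start uni-gram and uni-grams with zero freq are dropped, and each bigram item keeps only its count tag; the per-document statistics (T, N_n_0, n_1, Mr) are not carried over. For example, following the fprintf format above, a surviving bigram item with hypothetical tokens 5 and 9, words 你 and 好 and count 123 would be emitted as "\item 5 你 9 好 count 123".
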
diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp
new file mode 100644
index 0000000..ab08010
--- /dev/null
+++ b/utils/training/merge_k_mixture_model.cpp
@@ -0,0 +1,239 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+void print_help(){
+ printf("Usage: merge_k_mixture_model [--result-file <RESULT_FILENAME>]\n");
+ printf(" {<SOURCE_FILENAME>}+\n");
+}
+
+static const gchar * result_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"result-file", 0, 0, G_OPTION_ARG_FILENAME, &result_filename, "merged result file", NULL},
+ {NULL}
+};
+
+static bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
+ /* in */ FlexibleBigramPhraseArray second,
+ /* out */ FlexibleBigramPhraseArray & merged ){
+    /* avoid doing an empty merge. */
+ assert( NULL != first && NULL != second && NULL != merged );
+
+ /* merge two arrays. */
+    guint first_index = 0, second_index = 0;
+    KMixtureModelArrayItemWithToken * first_item = NULL;
+    KMixtureModelArrayItemWithToken * second_item = NULL;
+ while ( first_index < first->len && second_index < second->len ){
+ first_item = &g_array_index(first, KMixtureModelArrayItemWithToken,
+ first_index);
+ second_item = &g_array_index(second, KMixtureModelArrayItemWithToken,
+ second_index);
+ if ( first_item->m_token > second_item->m_token ) {
+ g_array_append_val(merged, *second_item);
+ second_index ++;
+ } else if ( first_item->m_token < second_item->m_token ) {
+ g_array_append_val(merged, *first_item);
+ first_index ++;
+ } else /* first_item->m_token == second_item->m_token */ {
+ KMixtureModelArrayItemWithToken merged_item;
+ memset(&merged_item, 0, sizeof(KMixtureModelArrayItemWithToken));
+ merged_item.m_token = first_item->m_token;/* same as second_item */
+ merged_item.m_item.m_WC = first_item->m_item.m_WC +
+ second_item->m_item.m_WC;
+ /* merged_item.m_item.m_T = first_item->m_item.m_T +
+ second_item->m_item.m_T; */
+ merged_item.m_item.m_N_n_0 = first_item->m_item.m_N_n_0 +
+ second_item->m_item.m_N_n_0;
+ merged_item.m_item.m_n_1 = first_item->m_item.m_n_1 +
+ second_item->m_item.m_n_1;
+ merged_item.m_item.m_Mr = std_lite::max(first_item->m_item.m_Mr,
+ second_item->m_item.m_Mr);
+ g_array_append_val(merged, merged_item);
+ first_index ++; second_index ++;
+ }
+ }
+
+    /* append the remaining items. */
+ while ( first_index < first->len ){
+ first_item = &g_array_index(first, KMixtureModelArrayItemWithToken,
+ first_index);
+ g_array_append_val(merged, *first_item);
+ first_index++;
+ }
+
+ while ( second_index < second->len ){
+ second_item = &g_array_index(second, KMixtureModelArrayItemWithToken,
+ second_index);
+ g_array_append_val(merged, *second_item);
+ second_index++;
+ }
+
+ return true;
+}
+
+static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target,
+ /* in */ KMixtureModelBigram * new_one ){
+
+ KMixtureModelMagicHeader target_magic_header;
+ KMixtureModelMagicHeader new_magic_header;
+ KMixtureModelMagicHeader merged_magic_header;
+
+ memset(&merged_magic_header, 0, sizeof(KMixtureModelMagicHeader));
+ if (!target->get_magic_header(target_magic_header)) {
+ memset(&target_magic_header, 0, sizeof(KMixtureModelMagicHeader));
+ }
+ assert(new_one->get_magic_header(new_magic_header));
+ if ( target_magic_header.m_WC + new_magic_header.m_WC <
+ std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){
+ fprintf(stderr, "the m_WC integer in magic header overflows.\n");
+ return false;
+ }
+ if ( target_magic_header.m_total_freq + new_magic_header.m_total_freq <
+ std_lite::max( target_magic_header.m_total_freq,
+ new_magic_header.m_total_freq ) ){
+ fprintf(stderr, "the m_total_freq in magic header overflows.\n");
+ return false;
+ }
+
+ merged_magic_header.m_WC = target_magic_header.m_WC +
+ new_magic_header.m_WC;
+ merged_magic_header.m_N = target_magic_header.m_N +
+ new_magic_header.m_N;
+ merged_magic_header.m_total_freq = target_magic_header.m_total_freq +
+ new_magic_header.m_total_freq;
+
+ assert(target->set_magic_header(merged_magic_header));
+ return true;
+}
+
+static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
+ /* in */ KMixtureModelBigram * new_one ){
+
+ GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ new_one->get_all_items(new_items);
+
+ for ( size_t i = 0; i < new_items->len; ++i ){
+ phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i);
+ KMixtureModelSingleGram * target_single_gram = NULL;
+ KMixtureModelSingleGram * new_single_gram = NULL;
+
+ assert(new_one->load(*token, new_single_gram));
+ bool exists_in_target = target->load(*token, target_single_gram);
+ if ( !exists_in_target ){
+ target->store(*token, new_single_gram);
+ delete new_single_gram;
+ continue;
+ }
+
+        /* merge the word counts in the array headers alongside the array items. */
+ KMixtureModelArrayHeader target_array_header;
+ KMixtureModelArrayHeader new_array_header;
+ KMixtureModelArrayHeader merged_array_header;
+
+ assert(new_one->get_array_header(*token, new_array_header));
+ assert(target->get_array_header(*token, target_array_header));
+ memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader));
+
+ merged_array_header.m_WC = target_array_header.m_WC +
+ new_array_header.m_WC;
+ merged_array_header.m_freq = target_array_header.m_freq +
+ new_array_header.m_freq;
+        /* end of the array header word count computation. */
+
+ assert(NULL != target_single_gram);
+ KMixtureModelSingleGram * merged_single_gram =
+ new KMixtureModelSingleGram;
+
+ FlexibleBigramPhraseArray target_array =
+ g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ target_single_gram->retrieve_all(target_array);
+
+ FlexibleBigramPhraseArray new_array =
+ g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ new_single_gram->retrieve_all(new_array);
+ FlexibleBigramPhraseArray merged_array =
+ g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+
+ assert(merge_two_phrase_array(target_array, new_array, merged_array));
+
+ g_array_free(target_array, TRUE);
+ g_array_free(new_array, TRUE);
+ delete target_single_gram; delete new_single_gram;
+
+ for ( size_t m = 0; m < merged_array->len; ++m ){
+ KMixtureModelArrayItemWithToken * item =
+ &g_array_index(merged_array,
+ KMixtureModelArrayItemWithToken, m);
+ merged_single_gram->insert_array_item(item->m_token, item->m_item);
+ }
+
+ assert(merged_single_gram->set_array_header(merged_array_header));
+ assert(target->store(*token, merged_single_gram));
+ delete merged_single_gram;
+ g_array_free(merged_array, TRUE);
+ }
+
+ g_array_free(new_items, TRUE);
+ return true;
+}
+
+bool merge_two_k_mixture_model( /* in & out */ KMixtureModelBigram * target,
+ /* in */ KMixtureModelBigram * new_one ){
+ assert(NULL != target);
+ assert(NULL != new_one);
+ return merge_array_items(target, new_one) &&
+ merge_magic_header(target, new_one);
+}
+
+int main(int argc, char * argv[]){
+ int i = 1;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- merge k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ KMixtureModelBigram target(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ target.attach(result_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+ while (i < argc){
+ const char * new_filename = argv[i];
+ KMixtureModelBigram new_one(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ new_one.attach(new_filename, ATTACH_READONLY);
+ if ( !merge_two_k_mixture_model(&target, &new_one) )
+ exit(EOVERFLOW);
+ ++i;
+ }
+
+ return 0;
+}
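
merge_two_phrase_array above is a textbook sorted merge keyed on m_token: when both inputs carry the same token, the additive statistics (m_WC, m_N_n_0, m_n_1) are summed and m_Mr keeps the maximum; otherwise the smaller token is copied through. The sketch below illustrates the same merge step over hypothetical plain structs rather than the GArray-based libpinyin types; it is a simplified illustration, not a drop-in replacement.

    // Simplified, self-contained illustration of the sorted merge used above,
    // over hypothetical plain structs instead of libpinyin's GArray types.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Item { unsigned token; unsigned wc; unsigned mr; };

    static std::vector<Item> merge_sorted(const std::vector<Item> & a,
                                          const std::vector<Item> & b) {
        std::vector<Item> out;
        size_t i = 0, j = 0;
        while (i < a.size() && j < b.size()) {
            if (a[i].token < b[j].token) out.push_back(a[i++]);
            else if (b[j].token < a[i].token) out.push_back(b[j++]);
            else { /* same token: sum the counts, keep the larger mr. */
                out.push_back({a[i].token, a[i].wc + b[j].wc,
                               std::max(a[i].mr, b[j].mr)});
                ++i; ++j;
            }
        }
        /* append the remaining items from either side. */
        for (; i < a.size(); ++i) out.push_back(a[i]);
        for (; j < b.size(); ++j) out.push_back(b[j]);
        return out;
    }

    int main() {
        std::vector<Item> a = {{1, 2, 1}, {3, 5, 2}};
        std::vector<Item> b = {{2, 1, 1}, {3, 4, 3}};
        for (const Item & it : merge_sorted(a, b))
            std::printf("token %u: wc=%u mr=%u\n", it.token, it.wc, it.mr);
        return 0;
    }

The real code additionally merges the per-token array headers and the magic header, with overflow checks on the summed counts.
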
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp
new file mode 100644
index 0000000..40dfb87
--- /dev/null
+++ b/utils/training/prune_k_mixture_model.cpp
@@ -0,0 +1,192 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+
+#include <errno.h>
+#include <locale.h>
+#include <limits.h>
+#include <float.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+
+void print_help(){
+ printf("Usage: prune_k_mixture_model -k <INT> --CDF <DOUBLE> <FILENAME>\n");
+}
+
+static gint g_prune_k = 3;
+static parameter_t g_prune_poss = 0.99;
+
+static GOptionEntry entries[] =
+{
+ {"pruneK", 'k', 0, G_OPTION_ARG_INT, &g_prune_k, "k parameter", NULL},
+ {"CDF", 0, 0, G_OPTION_ARG_DOUBLE, &g_prune_poss, "CDF parameter", NULL},
+ {NULL}
+};
+
+
+bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header,
+ KMixtureModelSingleGram * & bigram,
+ FlexibleBigramPhraseArray removed_array){
+ bool success;
+
+ FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ bigram->retrieve_all(array);
+
+ for ( size_t i = 0; i < array->len; ++i) {
+ KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i);
+ phrase_token_t token = item->m_token;
+ parameter_t remained_poss = 1; parameter_t one_poss = 0;
+ bool errors = false;
+ for ( size_t k = 0; k < g_prune_k; ++k){
+ one_poss = compute_Pr_G_3_with_count
+ (k, magic_header->m_N, item->m_item.m_WC,
+ magic_header->m_N - item->m_item.m_N_n_0,
+ item->m_item.m_n_1);
+ if ( !(0 <= one_poss && one_poss <= 1) )
+ errors = true;
+ remained_poss -= one_poss;
+ }
+
+ if ( fabs(remained_poss) < DBL_EPSILON )
+ remained_poss = 0.;
+
+        /* an invalid probability was computed. */
+        if ( errors || !(0 <= remained_poss && remained_poss <= 1) ) {
+            fprintf(stderr, "an invalid probability was encountered:%f.\n",
+                    remained_poss);
+ fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n",
+ g_prune_k, magic_header->m_N, item->m_item.m_WC,
+ magic_header->m_N - item->m_item.m_N_n_0,
+ item->m_item.m_n_1);
+ exit(EDOM);
+ }
+
+ if ( remained_poss < g_prune_poss ) {
+ /* prune this word or phrase. */
+ KMixtureModelArrayItem removed_item;
+ bigram->remove_array_item(token, removed_item);
+ assert( memcmp(&removed_item, &(item->m_item),
+ sizeof(KMixtureModelArrayItem)) == 0 );
+
+ KMixtureModelArrayItemWithToken removed_item_with_token;
+ removed_item_with_token.m_token = token;
+ removed_item_with_token.m_item = removed_item;
+ g_array_append_val(removed_array, removed_item_with_token);
+
+ KMixtureModelArrayHeader array_header;
+ bigram->get_array_header(array_header);
+ guint32 removed_count = removed_item.m_WC;
+ array_header.m_WC -= removed_count;
+ bigram->set_array_header(array_header);
+ magic_header->m_WC -= removed_count;
+ magic_header->m_total_freq -= removed_count;
+ }
+ }
+
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- prune k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (2 != argc) {
+ fprintf(stderr, "wrong arguments.\n");
+ exit(EINVAL);
+ }
+
+ const gchar * bigram_filename = argv[1];
+
+ /* TODO: magic header signature check here. */
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(bigram_filename, ATTACH_READWRITE);
+
+ KMixtureModelMagicHeader magic_header;
+ if (!bigram.get_magic_header(magic_header)) {
+ fprintf(stderr, "no magic header in k mixture model.\n");
+ exit(ENODATA);
+ }
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram.get_all_items(items);
+
+ /* print prune progress */
+ size_t progress = 0; size_t onestep = items->len / 20;
+ for ( size_t i = 0; i < items->len; ++i ){
+ if ( progress >= onestep ) {
+ progress = 0; fprintf(stderr, "*");
+ }
+ progress ++;
+
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ KMixtureModelSingleGram * single_gram = NULL;
+ bigram.load(*token, single_gram);
+
+ FlexibleBigramPhraseArray removed_array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+
+ prune_k_mixture_model(&magic_header, single_gram, removed_array);
+ bigram.store(*token, single_gram);
+
+ delete single_gram;
+
+        /* post-processing: reduce the uni-gram freq for each removed item. */
+ for (size_t m = 0; m < removed_array->len; ++m ){
+ KMixtureModelArrayItemWithToken * item =
+ &g_array_index(removed_array,
+ KMixtureModelArrayItemWithToken, m);
+ KMixtureModelArrayHeader array_header;
+ assert(bigram.get_array_header(item->m_token, array_header));
+            assert(array_header.m_freq >= item->m_item.m_WC);
+            array_header.m_freq -= item->m_item.m_WC;
+ assert(bigram.set_array_header(item->m_token, array_header));
+ }
+
+ g_array_free(removed_array, TRUE);
+ removed_array = NULL;
+ }
+
+ fprintf(stderr, "\n");
+
+ bigram.set_magic_header(magic_header);
+
+    /* post-processing: remove items whose word count and freq dropped to zero. */
+ KMixtureModelArrayHeader array_header;
+ for ( size_t i = 0; i < items->len; ++i ){
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ assert(bigram.get_array_header(*token, array_header));
+ if ( 0 == array_header.m_WC && 0 == array_header.m_freq )
+ assert(bigram.remove(*token));
+ }
+
+ g_array_free(items, TRUE);
+
+ return 0;
+}
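
To make the pruning rule above concrete: for each bigram item, remained_poss = 1 - sum of Pr_G(k) for k = 0 .. K-1, i.e. the modelled probability that the word pair occurs at least K times in a document, and the item is removed when that value drops below the --CDF threshold. With hypothetical counts N = 10 (documents in the magic header), item WC = 12, n_0 = N - N_n_0 = 4 and n_1 = 3, the formulas in k_mixture_model.h give Pr(0) = 0.4, Pr(1) = 0.3 and Pr(2) = 0.15; with the default -k 3 the remaining mass is 1 - 0.85 = 0.15, which is below the default threshold of 0.99, so such an item would be pruned.
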
diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp
new file mode 100644
index 0000000..7c057b9
--- /dev/null
+++ b/utils/training/validate_k_mixture_model.cpp
@@ -0,0 +1,174 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+void print_help(){
+ printf("Usage: validate_k_mixture_model <FILENAME>\n");
+}
+
+bool validate_unigram(KMixtureModelBigram * bigram){
+ KMixtureModelMagicHeader magic_header;
+ if( !bigram->get_magic_header(magic_header) ){
+ fprintf(stderr, "no magic header in k mixture model.\n");
+ return false;
+ }
+
+ guint32 expected_word_count = magic_header.m_WC;
+ if ( 0 == expected_word_count ){
+ fprintf(stderr, "word count in magic header is unexpected zero.\n");
+ return false;
+ }
+ guint32 expected_total_freq = magic_header.m_total_freq;
+ if ( 0 == expected_total_freq ){
+ fprintf(stderr, "total freq in magic header is unexpected zero.\n");
+ return false;
+ }
+
+ if ( expected_word_count != expected_total_freq ){
+ fprintf(stderr, "the word count doesn't match the total freq.\n");
+ return false;
+ }
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram->get_all_items(items);
+
+ guint32 word_count = 0; guint32 total_freq = 0;
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ KMixtureModelArrayHeader array_header;
+ assert(bigram->get_array_header(*token, array_header));
+ word_count += array_header.m_WC;
+ total_freq += array_header.m_freq;
+ }
+
+ if ( word_count != expected_word_count ){
+ fprintf(stderr, "word count in magic header:%d\n",
+ expected_word_count);
+ fprintf(stderr, "sum of word count in array headers:%d\n", word_count);
+ fprintf(stderr, "the sum differs from word count.\n");
+ return false;
+ }
+ if ( total_freq != expected_total_freq ){
+ fprintf(stderr, "total freq in magic header:%d\n",
+ expected_total_freq);
+ fprintf(stderr, "sum of freqs in array headers:%d\n", total_freq);
+ fprintf(stderr, "the total freq differs from sum of freqs.\n");
+ return false;
+ }
+
+ g_array_free(items, TRUE);
+ return true;
+}
+
+bool validate_bigram(KMixtureModelBigram * bigram){
+ bool result = true;
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram->get_all_items(items);
+
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ KMixtureModelSingleGram * single_gram = NULL;
+ assert(bigram->load(*token, single_gram));
+
+ FlexibleBigramPhraseArray array = g_array_new
+ (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ single_gram->retrieve_all(array);
+
+ KMixtureModelArrayHeader array_header;
+ assert(single_gram->get_array_header(array_header));
+
+ guint32 expected_sum = array_header.m_WC;
+ guint32 freq = array_header.m_freq;
+        if ( 0 == expected_sum ){
+            if ( 0 != array->len ){
+                fprintf(stderr, "in the array header of token %d:\n", *token);
+                fprintf(stderr, "word count is zero but the array has items.\n");
+                result = false;
+            }
+            if ( 0 != freq ){
+                g_array_free(array, TRUE);
+                delete single_gram;
+                continue;
+            } else {
+                fprintf(stderr, "in the array header of token %d:\n", *token);
+                fprintf(stderr, "both word count and freq are "
+                        "unexpectedly zero.\n");
+                result = false;
+            }
+        }
+
+ guint32 sum = 0;
+ for (size_t m = 0; m< array->len; ++m){
+ KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m);
+
+ sum += item->m_item.m_WC;
+ }
+
+ if ( sum != expected_sum ){
+ fprintf(stderr, "word count in array header:%d\n", expected_sum);
+ fprintf(stderr, "sum of word count in array items:%d\n", sum);
+ fprintf(stderr, "the sum differs from word count.\n");
+ result = false;
+ }
+
+ g_array_free(array, TRUE);
+ delete single_gram;
+ }
+
+ g_array_free(items, TRUE);
+ return result;
+}
+
+int main(int argc, char * argv[]){
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- validate k mixture model");
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (2 != argc) {
+ fprintf(stderr, "wrong arguments.\n");
+ exit(EINVAL);
+ }
+
+ const char * k_mixture_model_filename = argv[1];
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
+
+ if (!validate_unigram(&bigram)) {
+ fprintf(stderr, "k mixture model validation failed.\n");
+ exit(ENODATA);
+ }
+
+ if (!validate_bigram(&bigram)) {
+ fprintf(stderr, "k mixture model validation failed.\n");
+ exit(ENODATA);
+ }
+
+ return 0;
+}
diff --git a/utils/utils_helper.h b/utils/utils_helper.h
new file mode 100644
index 0000000..b91067b
--- /dev/null
+++ b/utils/utils_helper.h
@@ -0,0 +1,147 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef UTILS_HELPER_H
+#define UTILS_HELPER_H
+
+
+#define TAGLIB_GET_TOKEN(var, index) \
+ phrase_token_t var = null_token; \
+ { \
+ const char * string = (const char *) g_ptr_array_index \
+ (values, index); \
+ var = atoi(string); \
+ }
+
+#define TAGLIB_GET_PHRASE_STRING(var, index) \
+ const char * var = NULL; \
+ { \
+ var = (const char *) g_ptr_array_index \
+ (values, index); \
+ }
+
+#define TAGLIB_GET_TAGVALUE(type, var, conv) \
+ type var; \
+ { \
+ gpointer value = NULL; \
+ assert(g_hash_table_lookup_extended \
+ (required, #var, NULL, &value)); \
+ var = conv((const char *)value); \
+ }
+
+#define TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, var, line) \
+ phrase_token_t var = null_token; \
+ do { \
+ if (0 == strlen(line)) \
+ break; \
+ \
+ gchar ** strs = g_strsplit_set(line, " \t", 2); \
+ if (2 != g_strv_length(strs)) \
+ assert(false); \
+ \
+ phrase_token_t _token = atoi(strs[0]); \
+ const char * phrase = strs[1]; \
+ if (null_token != _token) \
+ assert(taglib_validate_token_with_string \
+ (phrase_index, _token, phrase)); \
+ \
+ var = _token; \
+ \
+ g_strfreev(strs); \
+ } while(false);
+
+
+static bool load_phrase_index(const pinyin_table_info_t * phrase_files,
+ FacadePhraseIndex * phrase_index) {
+ MemoryChunk * chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (SYSTEM_FILE != table_info->m_file_type)
+ continue;
+
+ const char * binfile = table_info->m_system_filename;
+
+ chunk = new MemoryChunk;
+ bool retval = chunk->load(binfile);
+ if (!retval) {
+ fprintf(stderr, "load %s failed!\n", binfile);
+ delete chunk;
+ return false;
+ }
+
+ phrase_index->load(i, chunk);
+ }
+ return true;
+}
+
+static bool save_phrase_index(const pinyin_table_info_t * phrase_files,
+ FacadePhraseIndex * phrase_index) {
+ MemoryChunk * new_chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (SYSTEM_FILE != table_info->m_file_type)
+ continue;
+
+ const char * binfile = table_info->m_system_filename;
+
+ new_chunk = new MemoryChunk;
+ phrase_index->store(i, new_chunk);
+ bool retval = new_chunk->save(binfile);
+ if (!retval) {
+ fprintf(stderr, "save %s failed.", binfile);
+ delete new_chunk;
+ return false;
+ }
+
+ phrase_index->load(i, new_chunk);
+ }
+ return true;
+}
+
+static bool save_dictionary(const pinyin_table_info_t * phrase_files,
+ FacadePhraseIndex * phrase_index) {
+ MemoryChunk * new_chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (DICTIONARY != table_info->m_file_type)
+ continue;
+
+ const char * binfile = table_info->m_system_filename;
+
+ new_chunk = new MemoryChunk;
+ phrase_index->store(i, new_chunk);
+ bool retval = new_chunk->save(binfile);
+ if (!retval) {
+ fprintf(stderr, "save %s failed.", binfile);
+ delete new_chunk;
+ return false;
+ }
+
+ phrase_index->load(i, new_chunk);
+ }
+ return true;
+}
+
+#endif
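
As a reference for how the TAGLIB_* macros above are used throughout utils/training, here is a minimal standalone sketch of what TAGLIB_GET_TAGVALUE(glong, count, atol) amounts to: the variable name is stringified into the tag key and looked up in the required hash that taglib_read() fills. The hash contents below are built by hand and are purely hypothetical.

    // Standalone sketch of the TAGLIB_GET_TAGVALUE pattern; the "required"
    // hash is filled by hand here, whereas the tools rely on taglib_read().
    #include <assert.h>
    #include <glib.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main() {
        GHashTable * required = g_hash_table_new(g_str_hash, g_str_equal);
        g_hash_table_insert(required, (gpointer) "count", (gpointer) "42");

        /* roughly the expansion of TAGLIB_GET_TAGVALUE(glong, count, atol): */
        glong count;
        {
            gpointer value = NULL;
            assert(g_hash_table_lookup_extended(required, "count", NULL, &value));
            count = atol((const char *) value);
        }

        printf("count = %ld\n", (long) count);
        g_hash_table_destroy(required);
        return 0;
    }
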