diff options
author | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
commit | b78429d78df745dd327b6dada6b9bd71ea5df84e (patch) | |
tree | 82c4625db8674c66d69fd566fce8efc347e3cb3a /utils | |
download | libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.gz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.xz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.zip |
import libpinyin code
Diffstat (limited to 'utils')
30 files changed, 5088 insertions, 0 deletions
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt new file mode 100644 index 0000000..dbd7855 --- /dev/null +++ b/utils/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(segment) +add_subdirectory(storage) +add_subdirectory(training)
\ No newline at end of file diff --git a/utils/Makefile.am b/utils/Makefile.am new file mode 100644 index 0000000..bc0f3e5 --- /dev/null +++ b/utils/Makefile.am @@ -0,0 +1,27 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +AUTOMAKE_OPTIONS = gnu +SUBDIRS = storage segment training + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I $(ac_aux_dir) + +noinst_HEADERS = utils_helper.h diff --git a/utils/segment/CMakeLists.txt b/utils/segment/CMakeLists.txt new file mode 100644 index 0000000..82e4deb --- /dev/null +++ b/utils/segment/CMakeLists.txt @@ -0,0 +1,19 @@ +add_executable( + spseg + spseg.cpp +) + +target_link_libraries( + spseg + libpinyin +) + +add_executable( + ngseg + ngseg.cpp +) + +target_link_libraries( + ngseg + libpinyin +)
\ No newline at end of file diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am new file mode 100644 index 0000000..579d6e4 --- /dev/null +++ b/utils/segment/Makefile.am @@ -0,0 +1,39 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ + +noinst_PROGRAMS = spseg ngseg mergeseq + +spseg_SOURCES = spseg.cpp + +spseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +ngseg_SOURCES = ngseg.cpp + +ngseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +mergeseq_SOURCES = mergeseq.cpp + +mergeseq_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp new file mode 100644 index 0000000..1a26064 --- /dev/null +++ b/utils/segment/mergeseq.cpp @@ -0,0 +1,278 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2013 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <locale.h> +#include <string.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: mergeseq [-o outputfile] [inputfile]\n"); +} + + +static gboolean gen_extra_enter = FALSE; +static gchar * outputfile = NULL; + +static GOptionEntry entries[] = +{ + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, + {NULL} +}; + + +/* data structure definition. */ +typedef struct{ + phrase_token_t m_token; + gint m_token_len; +} TokenInfo; + + +/* GArray of ucs4 characters. */ +typedef GArray * UnicodeCharVector; +/* GArray of TokenInfo. */ +typedef GArray * TokenInfoVector; + +gint calculate_sequence_length(TokenInfoVector tokeninfos) { + gint len = 0; + + size_t i = 0; + for (i = 0; i < tokeninfos->len; ++i) { + TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, i); + len += token_info->m_token_len; + } + + return len; +} + +/* if merge sequence found, merge and output it, + * if not, just output the first token; + * pop the first token or sequence. 
+ */ +bool merge_sequence(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + UnicodeCharVector unichars, + TokenInfoVector tokeninfos) { + assert(tokeninfos->len > 0); + + bool found = false; + TokenInfo * token_info = NULL; + phrase_token_t token = null_token; + + ucs4_t * ucs4_str = (ucs4_t *) unichars->data; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + + /* search the merge sequence. */ + size_t index = tokeninfos->len; + gint seq_len = calculate_sequence_length(tokeninfos); + while (seq_len > 0) { + /* do phrase table search. */ + int retval = phrase_table->search(seq_len, ucs4_str, tokens); + + if (retval & SEARCH_OK) { + int num = get_first_token(tokens, token); + found = true; + break; + } + + --index; + token_info = &g_array_index(tokeninfos, TokenInfo, index); + seq_len -= token_info->m_token_len; + } + + phrase_index->destroy_tokens(tokens); + + /* push the merged sequence back. */ + if (found) { + /* pop up the origin sequence. */ + g_array_remove_range(tokeninfos, 0, index); + + TokenInfo info; + info.m_token = token; + info.m_token_len = seq_len; + g_array_prepend_val(tokeninfos, info); + } + + return found; +} + +bool pop_first_token(UnicodeCharVector unichars, + TokenInfoVector tokeninfos, + FILE * output) { + ucs4_t * ucs4_str = (ucs4_t *) unichars->data; + + /* pop it. 
*/ + TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, 0); + phrase_token_t token = token_info->m_token; + gint token_len = token_info->m_token_len; + + glong read = 0; + gchar * utf8_str = g_ucs4_to_utf8(ucs4_str, token_len, &read, NULL, NULL); + assert(read == token_len); + fprintf(output, "%d %s\n", token, utf8_str); + g_free(utf8_str); + + g_array_remove_range(unichars, 0, token_len); + g_array_remove_index(tokeninfos, 0); + + return true; +} + +bool feed_line(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + UnicodeCharVector unichars, + TokenInfoVector tokeninfos, + const char * linebuf, + FILE * output) { + + TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); + + if (null_token == token) { + /* empty the queue. */ + while (0 != tokeninfos->len) { + merge_sequence(phrase_table, phrase_index, unichars, tokeninfos); + pop_first_token(unichars, tokeninfos, output); + } + + assert(0 == unichars->len); + assert(0 == tokeninfos->len); + + /* restore the null token line. */ + fprintf(output, "%s\n", linebuf); + + return false; + } + + PhraseItem item; + phrase_index->get_phrase_item(token, item); + gint len = item.get_phrase_length(); + + TokenInfo info; + info.m_token = token; + info.m_token_len = len; + g_array_append_val(tokeninfos, info); + + ucs4_t buffer[MAX_PHRASE_LENGTH]; + item.get_phrase_string(buffer); + g_array_append_vals(unichars, buffer, len); + + /* probe merge sequence. 
*/ + len = calculate_sequence_length(tokeninfos); + while (len >= MAX_PHRASE_LENGTH) { + merge_sequence(phrase_table, phrase_index, unichars, tokeninfos); + pop_first_token(unichars, tokeninfos, output); + len = calculate_sequence_length(tokeninfos); + } + + return true; +} + + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- merge word sequence"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (outputfile) { + output = fopen(outputfile, "w"); + if (NULL == output) { + perror("open file failed"); + exit(EINVAL); + } + } + + if (argc > 2) { + fprintf(stderr, "too many arguments.\n"); + exit(EINVAL); + } + + if (2 == argc) { + input = fopen(argv[1], "r"); + if (NULL == input) { + perror("open file failed"); + exit(EINVAL); + } + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* init phrase table */ + FacadePhraseTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); + GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo)); + + char * linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, input)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if (0 == 
strlen(linebuf)) + continue; + + feed_line(&phrase_table, &phrase_index, + unichars, tokeninfos, + linebuf, output); + } + + g_array_free(unichars, TRUE); + g_array_free(tokeninfos, TRUE); + free(linebuf); + fclose(input); + fclose(output); + return 0; +} diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp new file mode 100644 index 0000000..03fe5b4 --- /dev/null +++ b/utils/segment/ngseg.cpp @@ -0,0 +1,261 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: ngseg [--generate-extra-enter] [-o outputfile] [inputfile]\n"); +} + + +static gboolean gen_extra_enter = FALSE; +static gchar * outputfile = NULL; + +static GOptionEntry entries[] = +{ + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, + {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL}, + {NULL} +}; + + +/* n-gram based sentence segment. */ + +/* Note: + * Currently libpinyin supports ucs4 characters. + * This is a pre-processor tool for raw corpus, + * and skips non-Chinese characters. 
+ */ + +/* TODO: + * Try to add punctuation mark and english support, + * such as ',', '.', '?', '!', <english>, and other punctuations. + */ + +enum CONTEXT_STATE{ + CONTEXT_INIT, + CONTEXT_SEGMENTABLE, + CONTEXT_UNKNOWN +}; + +bool deal_with_segmentable(PhraseLookup * phrase_lookup, + GArray * current_ucs4, + FILE * output){ + char * result_string = NULL; + MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + phrase_lookup->get_best_match(current_ucs4->len, + (ucs4_t *) current_ucs4->data, results); + + phrase_lookup->convert_to_utf8(results, result_string); + + if (result_string) { + fprintf(output, "%s\n", result_string); + } else { + char * tmp_string = g_ucs4_to_utf8 + ( (ucs4_t *) current_ucs4->data, current_ucs4->len, + NULL, NULL, NULL); + fprintf(stderr, "Un-segmentable sentence encountered:%s\n", + tmp_string); + g_array_free(results, TRUE); + return false; + } + g_array_free(results, TRUE); + g_free(result_string); + return true; +} + +bool deal_with_unknown(GArray * current_ucs4, FILE * output){ + char * result_string = g_ucs4_to_utf8 + ( (ucs4_t *) current_ucs4->data, current_ucs4->len, + NULL, NULL, NULL); + fprintf(output, "%d %s\n", null_token, result_string); + g_free(result_string); + return true; +} + + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- n-gram segment"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (outputfile) { + output = fopen(outputfile, "w"); + if (NULL == output) { + perror("open file failed"); + exit(EINVAL); + } + } + + if (argc > 2) { + fprintf(stderr, "too many arguments.\n"); + exit(EINVAL); + } + + if (2 == argc) { + input = fopen(argv[1], "r"); + if (NULL == input) { + 
perror("open file failed"); + exit(EINVAL); + } + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* init phrase table */ + FacadePhraseTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + /* init bi-gram */ + Bigram system_bigram; + system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); + Bigram user_bigram; + + gfloat lambda = system_table_info.get_lambda(); + + /* init phrase lookup */ + PhraseLookup phrase_lookup(lambda, + &phrase_table, &phrase_index, + &system_bigram, &user_bigram); + + + CONTEXT_STATE state, next_state; + GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index.prepare_tokens(tokens); + + /* split the sentence */ + char * linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, input)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + /* check non-ucs4 characters */ + const glong num_of_chars = g_utf8_strlen(linebuf, -1); + glong len = 0; + ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); + if ( len != num_of_chars ) { + fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); + fprintf(output, "%d \n", null_token); + continue; + } + + /* only new-line persists. 
*/ + if ( 0 == num_of_chars ) { + fprintf(output, "%d \n", null_token); + continue; + } + + state = CONTEXT_INIT; + int result = phrase_table.search( 1, sentence, tokens); + g_array_append_val( current_ucs4, sentence[0]); + if ( result & SEARCH_OK ) + state = CONTEXT_SEGMENTABLE; + else + state = CONTEXT_UNKNOWN; + + for ( int i = 1; i < num_of_chars; ++i) { + int result = phrase_table.search( 1, sentence + i, tokens); + if ( result & SEARCH_OK ) + next_state = CONTEXT_SEGMENTABLE; + else + next_state = CONTEXT_UNKNOWN; + + if ( state == next_state ){ + g_array_append_val(current_ucs4, sentence[i]); + continue; + } + + assert ( state != next_state ); + if ( state == CONTEXT_SEGMENTABLE ) + deal_with_segmentable(&phrase_lookup, current_ucs4, output); + + if ( state == CONTEXT_UNKNOWN ) + deal_with_unknown(current_ucs4, output); + + /* save the current character */ + g_array_set_size(current_ucs4, 0); + g_array_append_val(current_ucs4, sentence[i]); + state = next_state; + } + + if ( current_ucs4->len ) { + /* this seems always true. */ + if ( state == CONTEXT_SEGMENTABLE ) + deal_with_segmentable(&phrase_lookup, current_ucs4, output); + + if ( state == CONTEXT_UNKNOWN ) + deal_with_unknown(current_ucs4, output); + g_array_set_size(current_ucs4, 0); + } + + /* print extra enter */ + if ( gen_extra_enter ) + fprintf(output, "%d \n", null_token); + + g_free(sentence); + } + phrase_index.destroy_tokens(tokens); + + /* print enter at file tail */ + fprintf(output, "%d \n", null_token); + g_array_free(current_ucs4, TRUE); + free(linebuf); + fclose(input); + fclose(output); + return 0; +} diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp new file mode 100644 index 0000000..b543cc5 --- /dev/null +++ b/utils/segment/spseg.cpp @@ -0,0 +1,343 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2010,2013 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <string.h> +#include <locale.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n"); +} + +static gboolean gen_extra_enter = FALSE; +static gchar * outputfile = NULL; + +static GOptionEntry entries[] = +{ + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, + {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL}, + {NULL} +}; + + +/* graph shortest path sentence segment. */ + +/* Note: + * Currently libpinyin only supports ucs4 characters, as this is a + * pre-processor tool for raw corpus, it will skip all sentences + * which contains non-ucs4 characters. + */ + +enum CONTEXT_STATE{ + CONTEXT_INIT, + CONTEXT_SEGMENTABLE, + CONTEXT_UNKNOWN +}; + +struct SegmentStep{ + phrase_token_t m_handle; + ucs4_t * m_phrase; + size_t m_phrase_len; + //use formula W = number of words. Zero handle means one word. + guint m_nword; + //backtrace information, -1 one step backward. 
+ gint m_backward_nstep; +public: + SegmentStep(){ + m_handle = null_token; + m_phrase = NULL; + m_phrase_len = 0; + m_nword = UINT_MAX; + m_backward_nstep = -0; + } +}; + +bool backtrace(GArray * steps, glong phrase_len, GArray * strings); + +/* Note: do not free phrase, as it is used by strings (array of segment). */ +bool segment(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + GArray * current_ucs4, + GArray * strings /* Array of SegmentStep. */){ + ucs4_t * phrase = (ucs4_t *)current_ucs4->data; + guint phrase_len = current_ucs4->len; + + /* Prepare for shortest path segment dynamic programming. */ + GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); + SegmentStep step; + for ( glong i = 0; i < phrase_len + 1; ++i ){ + g_array_append_val(steps, step); + } + + SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0); + first_step->m_nword = 0; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + + for ( glong i = 0; i < phrase_len + 1; ++i ) { + SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i); + size_t nword = step_begin->m_nword; + for ( glong k = i + 1; k < phrase_len + 1; ++k ) { + size_t len = k - i; + ucs4_t * cur_phrase = phrase + i; + + phrase_token_t token = null_token; + int result = phrase_table->search(len, cur_phrase, tokens); + int num = get_first_token(tokens, token); + + if ( !(result & SEARCH_OK) ){ + token = null_token; + if ( 1 != len ) + continue; + } + ++nword; + + SegmentStep * step_end = &g_array_index(steps, SegmentStep, k); + if ( nword < step_end->m_nword ) { + step_end->m_handle = token; + step_end->m_phrase = cur_phrase; + step_end->m_phrase_len = len; + step_end->m_nword = nword; + step_end->m_backward_nstep = i - k; + } + if ( !(result & SEARCH_CONTINUED) ) + break; + } + } + phrase_index->destroy_tokens(tokens); + + return backtrace(steps, phrase_len, strings); +} + +bool backtrace(GArray * steps, glong phrase_len, 
GArray * strings){ + /* backtracing to get the result. */ + size_t cur_step = phrase_len; + g_array_set_size(strings, 0); + while ( cur_step ){ + SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step); + g_array_append_val(strings, *step); + cur_step = cur_step + step->m_backward_nstep; + /* intended to avoid leaking internal informations. */ + step->m_nword = 0; step->m_backward_nstep = 0; + } + + /* reverse the strings. */ + for ( size_t i = 0; i < strings->len / 2; ++i ) { + SegmentStep * head, * tail; + head = &g_array_index(strings, SegmentStep, i); + tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i ); + SegmentStep tmp; + tmp = *head; + *head = *tail; + *tail = tmp; + } + + g_array_free(steps, TRUE); + return true; +} + +bool deal_with_segmentable(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + GArray * current_ucs4, + FILE * output){ + + /* do segment stuff. */ + GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); + segment(phrase_table, phrase_index, current_ucs4, strings); + + /* print out the split phrase. 
*/ + for ( glong i = 0; i < strings->len; ++i ) { + SegmentStep * step = &g_array_index(strings, SegmentStep, i); + char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL); + fprintf(output, "%d %s\n", step->m_handle, string); + g_free(string); + } + + g_array_free(strings, TRUE); + return true; +} + +bool deal_with_unknown(GArray * current_ucs4, FILE * output){ + char * result_string = g_ucs4_to_utf8 + ( (ucs4_t *) current_ucs4->data, current_ucs4->len, + NULL, NULL, NULL); + fprintf(output, "%d %s\n", null_token, result_string); + g_free(result_string); + return true; +} + + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- shortest path segment"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (outputfile) { + output = fopen(outputfile, "w"); + if (NULL == output) { + perror("open file failed"); + exit(EINVAL); + } + } + + if (argc > 2) { + fprintf(stderr, "too many arguments.\n"); + exit(EINVAL); + } + + if (2 == argc) { + input = fopen(argv[1], "r"); + if (NULL == input) { + perror("open file failed"); + exit(EINVAL); + } + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* init phrase table */ + FacadePhraseTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + CONTEXT_STATE state, 
next_state; + GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index.prepare_tokens(tokens); + + char * linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, input)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + /* check non-ucs4 characters. */ + const glong num_of_chars = g_utf8_strlen(linebuf, -1); + glong len = 0; + ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); + if ( len != num_of_chars ) { + fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); + fprintf(output, "%d \n", null_token); + continue; + } + + /* only new-line persists. */ + if ( 0 == num_of_chars ) { + fprintf(output, "%d \n", null_token); + continue; + } + + state = CONTEXT_INIT; + int result = phrase_table.search( 1, sentence, tokens); + g_array_append_val( current_ucs4, sentence[0]); + if ( result & SEARCH_OK ) + state = CONTEXT_SEGMENTABLE; + else + state = CONTEXT_UNKNOWN; + + for ( int i = 1; i < num_of_chars; ++i) { + int result = phrase_table.search( 1, sentence + i, tokens); + if ( result & SEARCH_OK ) + next_state = CONTEXT_SEGMENTABLE; + else + next_state = CONTEXT_UNKNOWN; + + if ( state == next_state ){ + g_array_append_val(current_ucs4, sentence[i]); + continue; + } + + assert ( state != next_state ); + if ( state == CONTEXT_SEGMENTABLE ) + deal_with_segmentable(&phrase_table, &phrase_index, + current_ucs4, output); + + if ( state == CONTEXT_UNKNOWN ) + deal_with_unknown(current_ucs4, output); + + /* save the current character */ + g_array_set_size(current_ucs4, 0); + g_array_append_val(current_ucs4, sentence[i]); + state = next_state; + } + + if ( current_ucs4->len ) { + /* this seems always true. 
*/ + if ( state == CONTEXT_SEGMENTABLE ) + deal_with_segmentable(&phrase_table, &phrase_index, + current_ucs4, output); + + if ( state == CONTEXT_UNKNOWN ) + deal_with_unknown(current_ucs4, output); + g_array_set_size(current_ucs4, 0); + } + + /* print extra enter */ + if ( gen_extra_enter ) + fprintf(output, "%d \n", null_token); + + g_free(sentence); + } + phrase_index.destroy_tokens(tokens); + + /* print enter at file tail */ + fprintf(output, "%d \n", null_token); + g_array_free(current_ucs4, TRUE); + free(linebuf); + fclose(input); + fclose(output); + return 0; +} diff --git a/utils/storage/CMakeLists.txt b/utils/storage/CMakeLists.txt new file mode 100644 index 0000000..63cabcd --- /dev/null +++ b/utils/storage/CMakeLists.txt @@ -0,0 +1,29 @@ +add_executable( + gen_binary_files + gen_binary_files.cpp +) + +target_link_libraries( + gen_binary_files + libpinyin +) + +add_executable( + import_interpolation + import_interpolation.cpp +) + +target_link_libraries( + import_interpolation + libpinyin +) + +add_executable( + export_interpolation + export_interpolation.cpp +) + +target_link_libraries( + export_interpolation + libpinyin +) diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am new file mode 100644 index 0000000..db63488 --- /dev/null +++ b/utils/storage/Makefile.am @@ -0,0 +1,45 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. 
+## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ + +bin_PROGRAMS = gen_binary_files \ + import_interpolation + +noinst_PROGRAMS = export_interpolation \ + gen_pinyin_table + +gen_binary_files_SOURCES = gen_binary_files.cpp + +gen_binary_files_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +import_interpolation_SOURCES = import_interpolation.cpp + +import_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +export_interpolation_SOURCES = export_interpolation.cpp + +export_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_pinyin_table_SOURCES = gen_pinyin_table.cpp + +gen_pinyin_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp new file mode 100644 index 0000000..c43eefb --- /dev/null +++ b/utils/storage/export_interpolation.cpp @@ -0,0 +1,144 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* export interpolation model as textual format */
+
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
+
+/* Write the "\data" line that opens the textual model. Always returns true. */
+bool begin_data(FILE * output){
+    fprintf(output, "\\data model interpolation\n");
+    return true;
+}
+
+/* Write the "\end" line that closes the textual model. Always returns true. */
+bool end_data(FILE * output){
+    fprintf(output, "\\end\n");
+    return true;
+}
+
+/* Load the system tables and the system bigram, then dump them to stdout as
+ * "\data", "\1-gram" items, "\2-gram" items and "\end".
+ * Exits with ENOENT when a table or the bigram file cannot be loaded. */
+int main(int argc, char * argv[]){
+    FILE * output = stdout;
+    const char * bigram_filename = SYSTEM_BIGRAM;
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    Bigram bigram;
+    /* The attach result used to be ignored; fail early instead of walking
+     * an unattached bigram (same check as import_interpolation). */
+    retval = bigram.attach(bigram_filename, ATTACH_READONLY);
+    if (!retval) {
+        fprintf(stderr, "open %s failed!\n", bigram_filename);
+        exit(ENOENT);
+    }
+
+    begin_data(output);
+
+    gen_unigram(output, &phrase_index);
+    gen_bigram(output, &phrase_index, &bigram);
+
+    end_data(output);
+    return 0;
+}
+
+/* Write the "\1-gram" section: one
+ *   \item <token> <phrase> count <freq>
+ * line for every token of every available sub phrase index whose unigram
+ * frequency is non-zero. Tokens without a phrase item, or whose string
+ * cannot be produced, are skipped. Always returns true. */
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
+    fprintf(output, "\\1-gram\n");
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {
+
+        PhraseIndexRange range;
+        int result = phrase_index->get_range(i, range);
+        if (ERROR_OK != result)
+            continue;
+
+        PhraseItem item;
+        for (phrase_token_t token = range.m_range_begin;
+             token < range.m_range_end; token++) {
+            int item_result = phrase_index->get_phrase_item(token, item);
+
+            if (ERROR_NO_ITEM == item_result)
+                continue;
+            assert(ERROR_OK == item_result);
+
+            size_t freq = item.get_unigram_frequency();
+            if (0 == freq)
+                continue;
+
+            char * phrase = taglib_token_to_string(phrase_index, token);
+            /* "%ld" expects a long; freq is a size_t, so convert explicitly. */
+            if (phrase)
+                fprintf(output, "\\item %d %s count %ld\n",
+                        token, phrase, (long) freq);
+
+            g_free(phrase);
+        }
+    }
+    return true;
+}
+
+/* Write the "\2-gram" section: for every token stored in the bigram, one
+ *   \item <token1> <word1> <token2> <word2> count <freq>
+ * line per following token. Pairs whose strings cannot be produced are
+ * skipped. Always returns true. */
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){
+    fprintf(output, "\\2-gram\n");
+
+    /* Retrieve all user items. */
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    bigram->get_all_items(items);
+
+    for (size_t i = 0; i < items->len; i++) {
+        phrase_token_t token = g_array_index(items, phrase_token_t, i);
+        SingleGram * single_gram = NULL;
+        bigram->load(token, single_gram);
+
+        /* load() can leave the pointer NULL (other callers in this patch
+         * test for it), so do not dereference blindly. */
+        if (NULL == single_gram)
+            continue;
+
+        BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+        single_gram->retrieve_all(array);
+
+        /* The first word is identical for every entry of this single gram. */
+        char * word1 = taglib_token_to_string(phrase_index, token);
+
+        for (size_t j = 0; j < array->len; j++) {
+            BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
+
+            char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+            guint32 freq = item->m_count;
+
+            if (word1 && word2)
+                fprintf(output, "\\item %d %s %d %s count %u\n",
+                        token, word1, item->m_token, word2, freq);
+
+            g_free(word2);
+        }
+
+        g_free(word1);
+        g_array_free(array, TRUE);
+        delete single_gram;
+    }
+
+    g_array_free(items, TRUE);
+    return true;
+}
diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp
new file mode 100644
index 0000000..4216b44
--- /dev/null
+++ b/utils/storage/gen_binary_files.cpp
@@ -0,0 +1,115 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* Directory holding table.conf and the textual tables (--table-dir). */
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+    {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+    {NULL}
+};
+
+/* Read every SYSTEM_FILE / DICTIONARY text table listed in table.conf from
+ * table_dir, feed it to the chewing table, the phrase table and the phrase
+ * index, then save the binary pinyin index, phrase index and dictionaries.
+ * Exits with EINVAL on bad options and ENOENT when a file cannot be
+ * loaded or saved. */
+int main(int argc, char * argv[]){
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- generate binary files");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    /* The context is only needed for parsing; release it right away. */
+    g_option_context_free(context);
+
+    SystemTableInfo system_table_info;
+
+    gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+    bool retval = system_table_info.load(filename);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+    g_free(filename);
+
+    /* generate pinyin index*/
+    pinyin_option_t options = USE_TONE;
+    ChewingLargeTable chewing_table(options);
+    PhraseLargeTable2 phrase_table;
+
+    /* generate phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+        const pinyin_table_info_t * table_info = phrase_files + i;
+        assert(table_info->m_dict_index == i);
+
+        /* only system and dictionary tables are built from text here */
+        if (SYSTEM_FILE != table_info->m_file_type &&
+            DICTIONARY != table_info->m_file_type)
+            continue;
+
+        const char * tablename = table_info->m_table_filename;
+
+        filename = g_build_filename(table_dir, tablename, NULL);
+        FILE * tablefile = fopen(filename, "r");
+
+        if (NULL == tablefile) {
+            /* report the path that was actually tried, not just the name */
+            fprintf(stderr, "open %s failed!\n", filename);
+            exit(ENOENT);
+        }
+
+        /* the same text file is parsed three times, rewinding in between */
+        chewing_table.load_text(tablefile);
+        fseek(tablefile, 0L, SEEK_SET);
+        phrase_table.load_text(tablefile);
+        fseek(tablefile, 0L, SEEK_SET);
+        phrase_index.load_text(i, tablefile);
+        fclose(tablefile);
+        g_free(filename);
+    }
+
+    MemoryChunk * new_chunk = new MemoryChunk;
+    chewing_table.store(new_chunk);
+    new_chunk->save(SYSTEM_PINYIN_INDEX);
+    chewing_table.load(new_chunk);
+
+    new_chunk = new MemoryChunk;
+    phrase_table.store(new_chunk);
+    new_chunk->save(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(new_chunk);
+
+    phrase_index.compact();
+
+    if (!save_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    if (!save_dictionary(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    return 0;
+}
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp
new file mode 100644
index 0000000..3b541d1
--- /dev/null
+++ b/utils/storage/gen_pinyin_table.cpp
@@ -0,0 +1,330 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+
+
+void print_help(){
+    printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> \n"
+           "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
+           "<OUTPUTFILE> the result output file\n"
+           "<FILEi> input pinyin files\n"
+           "<PHRASE_INDEX> phrase index identifier\n");
+}
+
+
+static gint phrase_index = 0;
+static const gchar * outputfile = "temp.out";
+
+static GOptionEntry entries[] =
+{
+    {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
+    {NULL}
+};
+
+
+using namespace pinyin;
+
+/* map from phrase_item to GArray of chewing_and_freq_item */
+GTree * g_chewing_tree;
+/* Array of GArray of phrase_and_array_item */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+struct phrase_item{
+    size_t length;
+    gunichar * uniphrase;
+};
+
+struct chewing_and_freq_item{
+    ChewingKeyVector keys;
+    ChewingKeyRestVector key_rests;
+    guint32 freq;
+};
+
+struct phrase_and_array_item{
+    phrase_item phrase; /* the key of g_chewing_tree */
+    /* Array of chewing_and_freq_item */
+    GArray * chewing_and_freq_array; /* the value of g_chewing_tree */
+};
+
+
+void feed_file(const char * filename);
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data);
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata);
+
+void gen_phrase_file(const char * outputfile, int phrase_index);
+
+
+/* Ordering for g_chewing_tree keys: shorter phrases first, phrases of equal
+ * length by the raw bytes of their UCS-4 code points. */
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+    phrase_item * itema = (phrase_item *) a;
+    phrase_item * itemb = (phrase_item *) b;
+    if ( itema->length != itemb->length )
+        return itema->length - itemb->length;
+    else
+        return memcmp(itema->uniphrase, itemb->uniphrase,
+                      sizeof(gunichar) * itema->length);
+}
+
+
+/* Collect "phrase pinyin freq" records from every input file named on the
+ * command line, group them per phrase, sort each phrase-length bucket by
+ * pinyin and write the resulting table to the output file. */
+int main(int argc, char * argv[]){
+    int i;
+
+    g_chewing_tree = g_tree_new(phrase_item_compare);
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- generate pinyin table");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    /* The context is only needed for parsing; release it right away. */
+    g_option_context_free(context);
+
+    /* whatever is left in argv after option parsing is an input file */
+    for (i = 1; i < argc; ++i) {
+        feed_file(argv[i]);
+    }
+
+    printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
+
+    /* store in item array */
+    g_item_array[0] = NULL;
+    for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+        g_item_array[i] = g_array_new
+            (FALSE, TRUE, sizeof(phrase_and_array_item));
+    }
+    g_tree_foreach(g_chewing_tree, store_one_item, NULL);
+
+    /* sort item array; the bucket index doubles as the phrase length
+     * handed to the comparison callback */
+    for (int length = 1; length < MAX_PHRASE_LENGTH + 1; ++length){
+        g_array_sort_with_data(g_item_array[length],
+                               phrase_array_compare, &length);
+    }
+
+    gen_phrase_file(outputfile, phrase_index);
+
+    return 0;
+}
+
+/* Read whitespace separated "<phrase> <pinyin> <freq>" records from
+ * filename and pass each complete record to feed_line().
+ * Exits with ENOENT when the file cannot be opened. */
+void feed_file ( const char * filename){
+    char phrase[1024], pinyin[1024];
+    guint32 freq;
+
+    FILE * infile = fopen(filename, "r");
+    if ( NULL == infile ){
+        fprintf(stderr, "Can't open file %s.\n", filename);
+        exit(ENOENT);
+    }
+
+    for (;;) {
+        /* The field widths keep over-long tokens from overflowing the
+         * 1024 byte buffers. */
+        int num = fscanf(infile, "%1023s %1023s %u",
+                         phrase, pinyin, &freq);
+
+        /* EOF covers both end of file and read errors. Testing the return
+         * value instead of feof() keeps the last record of a file that
+         * lacks a trailing newline and cannot spin on a read error. */
+        if (EOF == num)
+            break;
+
+        /* incomplete record: skip it and resynchronise on the next token */
+        if (3 != num)
+            continue;
+
+        feed_line(phrase, pinyin, freq);
+    }
+
+    fclose(infile);
+}
+
+/* Register one record in g_chewing_tree.
+ * The phrase (as UCS-4) is the tree key; the value is a GArray of
+ * chewing_and_freq_item, one per distinct pinyin of that phrase. A record
+ * whose pinyin is already present only adds its freq to the existing entry.
+ * Records that are too long, or whose pinyin does not parse into exactly
+ * one key per character, are reported on stderr and dropped. */
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
+    phrase_item * item = new phrase_item;
+    item->length = g_utf8_strlen(phrase, -1);
+    item->uniphrase = NULL;
+
+    /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
+     * where is the code which I don't want to touch. :-)
+     */
+
+    if (item->length >= MAX_PHRASE_LENGTH) {
+        fprintf(stderr, "Too long phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+
+    FullPinyinParser2 parser;
+    ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+    ChewingKeyRestVector key_rests = g_array_new
+        (FALSE, FALSE, sizeof(ChewingKeyRest));
+
+    pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
+    parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+    assert(keys->len == key_rests->len);
+
+    if (keys->len != item->length) {
+        fprintf(stderr, "Invalid pinyin:%s\t%s\t%u\n", phrase, pinyin, freq);
+        /* nothing was stored: release everything allocated above */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+        g_free(item->uniphrase);
+        delete item;
+        return;
+    }
+
+    GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
+
+    chewing_and_freq_item value_item;
+    value_item.keys = keys; value_item.key_rests = key_rests;
+    value_item.freq = freq;
+
+    assert(item->length == value_item.keys->len);
+    if (NULL == array) {
+        /* first pinyin of a new phrase: the tree keeps item as its key */
+        array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
+        g_array_append_val(array, value_item);
+        g_tree_insert(g_chewing_tree, item, array);
+        return;
+    }
+
+    bool found = false;
+    for (size_t i = 0; i < array->len; ++i) {
+        chewing_and_freq_item * cur_item =
+            &g_array_index(array, chewing_and_freq_item, i);
+        int result = pinyin_exact_compare2
+            ((ChewingKey *) value_item.keys->data,
+             (ChewingKey *) cur_item->keys->data,
+             value_item.keys->len);
+
+        if (0 == result) {
+            fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+                    phrase, pinyin, freq);
+            cur_item->freq += freq;
+            found = true;
+        }
+    }
+
+    if (!found) {
+        /* The tree already maps this phrase to array, so appending is
+         * enough; the stored key stays the one inserted first. */
+        g_array_append_val(array, value_item);
+    } else {
+        /* clean up */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+    }
+
+    /* this key copy was only used for the lookup */
+    g_free(item->uniphrase);
+    delete item;
+}
+
+
+/* g_tree_foreach() callback: copy one (phrase, pinyin array) pair into the
+ * bucket of g_item_array selected by the phrase length.
+ * Returns FALSE so that the traversal continues. */
+gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
+    phrase_and_array_item item;
+    item.phrase = *((phrase_item *) key);
+    item.chewing_and_freq_array = (GArray *) value;
+    int len = item.phrase.length;
+    g_array_append_val(g_item_array[len], item);
+    return FALSE;
+}
+
+
+/* g_array_sort_with_data() callback: order two phrase_and_array_item by the
+ * keys of their first pinyin. userdata points to the phrase length (int)
+ * shared by all items of the bucket being sorted. */
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata) {
+    int phrase_length = *((int *) userdata);
+    phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs;
+    phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs;
+
+    ChewingKeyVector keys_lhs = g_array_index
+        (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
+    ChewingKeyVector keys_rhs = g_array_index
+        (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
+    return pinyin_exact_compare2((ChewingKey *)keys_lhs->data,
+                                 (ChewingKey *)keys_rhs->data, phrase_length);
+}
+
+
+/* Write the sorted buckets to outputfile, one line per (phrase, pinyin):
+ *   <pinyin joined by '>\t<phrase>\t<token>\t<freq>
+ * Each phrase gets the next sequential token, combined with phrase_index
+ * through PHRASE_INDEX_MAKE_TOKEN. Frequencies below 3 are written as 3.
+ * Exits with ENOENT when the output file cannot be created. */
+void gen_phrase_file(const char * outputfile, int phrase_index){
+    FILE * outfile = fopen(outputfile, "w");
+    if (NULL == outfile ) {
+        fprintf(stderr, "Can't write file %s.\n", outputfile);
+        exit(ENOENT);
+    }
+
+    phrase_token_t token = 1;
+
+    /* phrase length index */
+    for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+        GArray * item_array = g_item_array[i];
+
+        /* item array index */
+        for (size_t m = 0; m < item_array->len; ++m) {
+            phrase_and_array_item * item = &g_array_index
+                (item_array, phrase_and_array_item, m);
+            phrase_item phrase = item->phrase;
+            GArray * chewing_and_freqs = item->chewing_and_freq_array;
+
+            gchar * phrase_str = g_ucs4_to_utf8
+                (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
+
+            /* iterate each pinyin */
+            for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
+                chewing_and_freq_item * chewing_and_freq =
+                    &g_array_index
+                    (chewing_and_freqs, chewing_and_freq_item, n);
+
+                ChewingKeyVector keys = chewing_and_freq->keys;
+
+                /* zero terminated so that it can be passed to g_strjoinv */
+                GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
+                gchar * pinyin = NULL;
+
+                size_t k;
+                for (k = 0; k < keys->len; ++k) {
+                    ChewingKey key = g_array_index(keys, ChewingKey, k);
+
+                    //assert (CHEWING_ZERO_TONE != key.m_tone);
+                    pinyin = key.get_pinyin_string();
+                    g_array_append_val(pinyins, pinyin);
+                }
+                gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
+
+                for (k = 0; k < pinyins->len; ++k) {
+                    g_free(g_array_index(pinyins, gchar *, k));
+                }
+                g_array_free(pinyins, TRUE);
+
+                guint32 freq = chewing_and_freq->freq;
+
+                /* avoid zero freq */
+                if (freq < 3) freq = 3;
+
+                fprintf(outfile, "%s\t%s\t%d\t%d\n",
+                        pinyin_str, phrase_str,
+                        PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq);
+
+                g_free(pinyin_str);
+            }
+            g_free(phrase_str);
+            token++;
+        }
+    }
+
+    fclose(outfile);
+}
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
new file mode 100644
index 0000000..205a27a
--- /dev/null
+++ b/utils/storage/import_interpolation.cpp
@@ -0,0 +1,313 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+/* Evaluate expr unconditionally and then assert on its result, so that the
+ * call itself is not compiled out when NDEBUG is defined. */
+#define CHECK_CALL(expr)                        \
+    do {                                        \
+        bool check_call_ok_ = (expr);           \
+        assert(check_call_ok_);                 \
+        (void) check_call_ok_;                  \
+    } while (0)
+
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+    {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+    {NULL}
+};
+
+
+enum LINE_TYPE{
+    BEGIN_LINE = 1,
+    END_LINE,
+    GRAM_1_LINE,
+    GRAM_2_LINE,
+    GRAM_1_ITEM_LINE,
+    GRAM_2_ITEM_LINE
+};
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline();
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                   FacadePhraseIndex * phrase_index);
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                  FacadePhraseIndex * phrase_index,
+                  Bigram * bigram);
+
+/* Read the next line of input into the shared linebuf and strip its
+ * trailing newline. Returns what getline() returned: the number of bytes
+ * read, or -1 when no further line could be read. */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    if ( result == -1 )
+        return result;
+
+    /* Use the length reported by getline(): indexing with strlen() - 1
+     * reads before the buffer when the line starts with a NUL byte. */
+    if ( result > 0 && '\n' == linebuf[result - 1] ) {
+        linebuf[result - 1] = '\0';
+    }
+    return result;
+}
+
+/* Check that the line currently held in linebuf is
+ * "\data model interpolation". Returns false, after reporting on stderr,
+ * when it is not. */
+bool parse_headline(){
+    /* enter "\data" line */
+    CHECK_CALL(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
+
+    /* read "\data" line */
+    if ( !taglib_read(linebuf, line_type, values, required) ) {
+        fprintf(stderr, "error: interpolation model expected.\n");
+        return false;
+    }
+
+    assert(line_type == BEGIN_LINE);
+    /* check header */
+    TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+    if ( !( strcmp("interpolation", model) == 0 ) ) {
+        fprintf(stderr, "error: interpolation model expected.\n");
+        return false;
+    }
+    return true;
+}
+
+/* Dispatch the "\1-gram" and "\2-gram" sections to their parsers until
+ * "\end" is seen or the input is exhausted. On entry linebuf holds the
+ * first line after the headline. Always returns true. */
+bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table,
+                FacadePhraseIndex * phrase_index,
+                Bigram * bigram){
+    taglib_push_state();
+
+    CHECK_CALL(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+    CHECK_CALL(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+    CHECK_CALL(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
+    do {
+    retry:
+        CHECK_CALL(taglib_read(linebuf, line_type, values, required));
+        switch(line_type) {
+        case END_LINE:
+            goto end;
+        case GRAM_1_LINE:
+            /* A section header on the last line of input leaves linebuf
+             * unchanged; without this check the header would be parsed
+             * again and again. */
+            if (my_getline(input) == -1)
+                goto end;
+            parse_unigram(input, phrase_table, phrase_index);
+            /* the section parser stopped on a line it did not consume */
+            goto retry;
+        case GRAM_2_LINE:
+            if (my_getline(input) == -1)
+                goto end;
+            parse_bigram(input, phrase_table, phrase_index, bigram);
+            goto retry;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1) ;
+
+ end:
+    taglib_pop_state();
+    return true;
+}
+
+/* Parse the "\item <token> <phrase> count <n>" lines of a "\1-gram"
+ * section, starting with the line held in linebuf, and add each count to
+ * the unigram frequency of its token. Stops at the next section or "\end"
+ * line, which is left in linebuf. Always returns true. */
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                   FacadePhraseIndex * phrase_index){
+    taglib_push_state();
+
+    CHECK_CALL(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", ""));
+
+    do {
+        CHECK_CALL(taglib_read(linebuf, line_type, values, required));
+        switch (line_type) {
+        case GRAM_1_ITEM_LINE:{
+            /* handle \item in \1-gram */
+            TAGLIB_GET_TOKEN(token, 0);
+            TAGLIB_GET_PHRASE_STRING(word, 1);
+            CHECK_CALL(taglib_validate_token_with_string
+                       (phrase_index, token, word));
+
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+            phrase_index->add_unigram_frequency(token, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    return true;
+}
+
+/* Parse the "\item <token1> <phrase1> <token2> <phrase2> count <n>" lines
+ * of a "\2-gram" section, starting with the line held in linebuf.
+ * Consecutive items sharing token1 are accumulated in one SingleGram, which
+ * is stored into bigram when token1 changes and when the section ends.
+ * Stops at the next section or "\end" line, which is left in linebuf.
+ * Always returns true. */
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+                  FacadePhraseIndex * phrase_index,
+                  Bigram * bigram){
+    taglib_push_state();
+
+    CHECK_CALL(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", ""));
+
+    phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL;
+    do {
+        CHECK_CALL(taglib_read(linebuf, line_type, values, required));
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE:{
+            /* handle \item in \2-gram */
+            /* two tokens */
+            TAGLIB_GET_TOKEN(token1, 0);
+            TAGLIB_GET_PHRASE_STRING(word1, 1);
+            CHECK_CALL(taglib_validate_token_with_string
+                       (phrase_index, token1, word1));
+
+            TAGLIB_GET_TOKEN(token2, 2);
+            TAGLIB_GET_PHRASE_STRING(word2, 3);
+            CHECK_CALL(taglib_validate_token_with_string
+                       (phrase_index, token2, word2));
+
+            TAGLIB_GET_TAGVALUE(glong, count, atol);
+
+            if ( last_token != token1 ) {
+                /* flush the single gram of the previous first token */
+                if ( last_token && last_single_gram ) {
+                    bigram->store(last_token, last_single_gram);
+                    delete last_single_gram;
+
+                    /* safe guard */
+                    last_token = null_token;
+                    last_single_gram = NULL;
+                }
+                SingleGram * single_gram = NULL;
+                bigram->load(token1, single_gram);
+
+                /* create the new single gram */
+                if ( single_gram == NULL )
+                    single_gram = new SingleGram;
+                last_token = token1;
+                last_single_gram = single_gram;
+            }
+
+            /* save the freq */
+            assert(NULL != last_single_gram);
+            guint32 total_freq = 0;
+            CHECK_CALL(last_single_gram->get_total_freq(total_freq));
+            CHECK_CALL(last_single_gram->insert_freq(token2, count));
+            total_freq += count;
+            CHECK_CALL(last_single_gram->set_total_freq(total_freq));
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    /* flush the single gram that is still pending */
+    if ( last_token && last_single_gram ) {
+        bigram->store(last_token, last_single_gram);
+        delete last_single_gram;
+        //safe guard
+        last_token = 0;
+        last_single_gram = NULL;
+    }
+
+    taglib_pop_state();
+    return true;
+}
+
+/* Read a textual interpolation model from stdin, add its unigram counts to
+ * the phrase index and its bigram counts to the system bigram, then save
+ * the phrase index. Exits with EINVAL on bad options, ENOENT when a file
+ * cannot be loaded or saved and ENODATA on empty or unrecognised input. */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    const char * bigram_filename = SYSTEM_BIGRAM;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- import interpolation model");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    /* The context is only needed for parsing; release it right away. */
+    g_option_context_free(context);
+
+    SystemTableInfo system_table_info;
+
+    gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+    bool retval = system_table_info.load(filename);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+    g_free(filename);
+
+    PhraseLargeTable2 phrase_table;
+
+    MemoryChunk * chunk = new MemoryChunk;
+    retval = chunk->load(SYSTEM_PHRASE_INDEX);
+    if (!retval) {
+        fprintf(stderr, "open phrase_index.bin failed!\n");
+        exit(ENOENT);
+    }
+    phrase_table.load(chunk);
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    Bigram bigram;
+    retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+    if (!retval) {
+        fprintf(stderr, "open %s failed!\n", bigram_filename);
+        exit(ENOENT);
+    }
+
+    taglib_init();
+
+    values = g_ptr_array_new();
+    required = g_hash_table_new(g_str_hash, g_str_equal);
+
+    /* read first line */
+    ssize_t result = my_getline(input);
+    if ( result == -1 ) {
+        fprintf(stderr, "empty file input.\n");
+        exit(ENODATA);
+    }
+
+    if (!parse_headline())
+        exit(ENODATA);
+
+    result = my_getline(input);
+    if ( result != -1 )
+        parse_body(input, &phrase_table, &phrase_index, &bigram);
+
+    taglib_fini();
+
+    if (!save_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    return 0;
+}
diff --git a/utils/training/CMakeLists.txt b/utils/training/CMakeLists.txt
new file mode 100644
index 0000000..ee59bcd
--- /dev/null
+++ b/utils/training/CMakeLists.txt
@@ -0,0 +1,129 @@
+add_executable(
+    gen_ngram
+    gen_ngram.cpp
+)
+
+target_link_libraries(
+    gen_ngram
+    libpinyin
+)
+
+add_executable(
+    gen_deleted_ngram
+    gen_deleted_ngram.cpp
+)
+
+target_link_libraries(
+    gen_deleted_ngram
+    libpinyin
+)
+
+add_executable(
+    gen_unigram
+    gen_unigram.cpp
+)
+
+target_link_libraries(
+    gen_unigram
+    libpinyin
+)
+
+add_executable(
+    gen_k_mixture_model
+    gen_k_mixture_model.cpp
+)
+
+target_link_libraries(
+    gen_k_mixture_model
+    libpinyin
+)
+
+add_executable(
+    estimate_interpolation
+    estimate_interpolation.cpp
+)
+
+target_link_libraries(
+    estimate_interpolation
+    libpinyin
+)
+
+add_executable(
+    estimate_k_mixture_model
+    estimate_k_mixture_model.cpp
+)
+
+target_link_libraries(
+
estimate_k_mixture_model + libpinyin +) + +add_executable( + merge_k_mixture_model + merge_k_mixture_model.cpp +) + +target_link_libraries( + merge_k_mixture_model + libpinyin +) + +add_executable( + prune_k_mixture_model + prune_k_mixture_model.cpp +) + +target_link_libraries( + prune_k_mixture_model + libpinyin +) + +add_executable( + import_k_mixture_model + import_k_mixture_model.cpp +) + +target_link_libraries( + import_k_mixture_model + libpinyin +) + +add_executable( + export_k_mixture_model + export_k_mixture_model.cpp +) + +target_link_libraries( + export_k_mixture_model + libpinyin +) + +add_executable( + k_mixture_model_to_interpolation + k_mixture_model_to_interpolation.cpp +) + +target_link_libraries( + k_mixture_model_to_interpolation + libpinyin +) + +add_executable( + validate_k_mixture_model + validate_k_mixture_model.cpp +) + +target_link_libraries( + validate_k_mixture_model + libpinyin +) + +add_executable( + eval_correction_rate + eval_correction_rate.cpp +) + +target_link_libraries( + eval_correction_rate + libpinyin +)
\ No newline at end of file diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am new file mode 100644 index 0000000..dc834ec --- /dev/null +++ b/utils/training/Makefile.am @@ -0,0 +1,97 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ + +noinst_HEADERS = k_mixture_model.h + +bin_PROGRAMS = gen_unigram + +noinst_PROGRAMS = gen_ngram \ + gen_deleted_ngram \ + gen_k_mixture_model \ + estimate_interpolation \ + estimate_k_mixture_model \ + merge_k_mixture_model \ + prune_k_mixture_model \ + import_k_mixture_model \ + export_k_mixture_model \ + k_mixture_model_to_interpolation \ + validate_k_mixture_model \ + eval_correction_rate + +gen_ngram_SOURCES = gen_ngram.cpp + +gen_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_deleted_ngram_SOURCES = gen_deleted_ngram.cpp + +gen_deleted_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_unigram_SOURCES = gen_unigram.cpp + +gen_unigram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_k_mixture_model_SOURCES = gen_k_mixture_model.cpp + +gen_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +estimate_interpolation_SOURCES = estimate_interpolation.cpp + +estimate_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +estimate_k_mixture_model_SOURCES = estimate_k_mixture_model.cpp + +estimate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +merge_k_mixture_model_SOURCES = merge_k_mixture_model.cpp + +merge_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +prune_k_mixture_model_SOURCES = prune_k_mixture_model.cpp + +prune_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +import_k_mixture_model_SOURCES = import_k_mixture_model.cpp + +import_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +export_k_mixture_model_SOURCES = export_k_mixture_model.cpp + +export_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +k_mixture_model_to_interpolation_SOURCES = 
k_mixture_model_to_interpolation.cpp + +k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +validate_k_mixture_model_SOURCES = validate_k_mixture_model.cpp + +validate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +eval_correction_rate_SOURCES = eval_correction_rate.cpp + +eval_correction_rate_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp new file mode 100644 index 0000000..5cdc680 --- /dev/null +++ b/utils/training/estimate_interpolation.cpp @@ -0,0 +1,144 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2008 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <locale.h>
+#include <math.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* Estimate the interpolation weight lambda for one first token.
+ *
+ * deleted_bigram: counts of the following tokens in the deleted data;
+ * unigram:        phrase index providing the unigram frequencies;
+ * bigram:         counts of the following tokens in the system bigram, or
+ *                 NULL when that bigram has no entry for the token.
+ *
+ * Starting from 0.6, lambda is re-estimated until two successive values
+ * differ by at most 0.001; each step averages, weighted by the deleted
+ * counts, the share lambda * P_bigram /
+ * (lambda * P_bigram + (1 - lambda) * P_unigram). */
+parameter_t compute_interpolation(SingleGram * deleted_bigram,
+                                  FacadePhraseIndex * unigram,
+                                  SingleGram * bigram){
+    parameter_t lambda = 0, next_lambda = 0.6;
+    parameter_t epsilon = 0.001;
+
+    /* Neither the deleted counts nor their total change between
+     * iterations, so fetch them once. The call is kept out of assert()
+     * so that it still runs when NDEBUG is defined. */
+    BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+    deleted_bigram->retrieve_all(array);
+
+    guint32 table_num = 0;
+    bool have_total = deleted_bigram->get_total_freq(table_num);
+    assert(have_total);
+    (void) have_total;
+
+    while ( fabs(lambda - next_lambda) > epsilon){
+        lambda = next_lambda;
+        next_lambda = 0;
+        parameter_t numerator = 0;
+        parameter_t part_of_denominator = 0;
+
+        for (guint i = 0; i < array->len; ++i){
+            BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, i);
+            //get the phrase token
+            phrase_token_t token = item->m_token;
+            guint32 deleted_count = item->m_count;
+
+            {
+                /* bigram probability of the token, 0 when unseen */
+                guint32 freq = 0;
+                parameter_t elem_poss = 0;
+                if (bigram && bigram->get_freq(token, freq)){
+                    guint32 total_freq = 0;
+                    bool have_bigram_total = bigram->get_total_freq(total_freq);
+                    assert(have_bigram_total);
+                    (void) have_bigram_total;
+                    assert(0 != total_freq);
+                    elem_poss = freq / (parameter_t) total_freq;
+                }
+                numerator = lambda * elem_poss;
+            }
+
+            {
+                /* unigram probability of the token, 0 when it has no item */
+                parameter_t elem_poss = 0;
+                PhraseItem unigram_item;
+                if (ERROR_OK == unigram->get_phrase_item(token, unigram_item)){
+                    guint32 freq = unigram_item.get_unigram_frequency();
+                    guint32 total_freq = unigram->get_phrase_index_total_freq();
+                    elem_poss = freq / (parameter_t)total_freq;
+                }
+                part_of_denominator = (1 - lambda) * elem_poss;
+            }
+
+            if (0 == (numerator + part_of_denominator))
+                continue;
+
+            next_lambda += deleted_count * (numerator / (numerator + part_of_denominator));
+        }
+        next_lambda /= table_num;
+    }
+
+    g_array_free(array, TRUE);
+
+    lambda = next_lambda;
+    return lambda;
+}
+
+/* Print the estimated lambda of every token stored in the deleted bigram,
+ * followed by the average over all of them.
+ * Exits with ENOENT when a table or bigram file cannot be loaded. */
+int main(int argc, char * argv[]){
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    /* The attach results used to be ignored; fail early instead of
+     * estimating from a bigram that is not there. */
+    Bigram bigram;
+    if (!bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed!\n", SYSTEM_BIGRAM);
+        exit(ENOENT);
+    }
+
+    Bigram deleted_bigram;
+    if (!deleted_bigram.attach(DELETED_BIGRAM, ATTACH_READONLY)) {
+        fprintf(stderr, "open %s failed!\n", DELETED_BIGRAM);
+        exit(ENOENT);
+    }
+
+    GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    deleted_bigram.get_all_items(deleted_items);
+
+    parameter_t lambda_sum = 0;
+    int lambda_count = 0;
+
+    for (guint i = 0; i < deleted_items->len; ++i ){
+        phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
+        SingleGram * single_gram = NULL;
+        bigram.load(*token, single_gram);
+
+        SingleGram * deleted_single_gram = NULL;
+        deleted_bigram.load(*token, deleted_single_gram);
+
+        /* compute_interpolation() dereferences the deleted single gram */
+        if (NULL == deleted_single_gram) {
+            delete single_gram;
+            continue;
+        }
+
+        parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram);
+
+        printf("token:%d lambda:%f\n", *token, lambda);
+
+        lambda_sum += lambda;
+        lambda_count ++;
+
+        delete single_gram;
+        delete deleted_single_gram;
+    }
+
+    /* no average exists when nothing was estimated */
+    if (lambda_count > 0)
+        printf("average lambda:%f\n", (lambda_sum/lambda_count));
+    else
+        fprintf(stderr, "no deleted bigram items found.\n");
+
+    g_array_free(deleted_items, TRUE);
+    return 0;
+}
+
diff --git a/utils/training/estimate_k_mixture_model.cpp b/utils/training/estimate_k_mixture_model.cpp
new file mode 100644
index 0000000..c0fa03f
--- /dev/null
+++ b/utils/training/estimate_k_mixture_model.cpp
@@ -0,0 +1,159 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <locale.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" + +static const gchar * bigram_filename = "k_mixture_model_ngram.db"; +static const gchar * deleted_bigram_filename = "k_mixture_model_deleted_ngram.db"; + +static GOptionEntry entries[] = +{ + {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "the bigram file", NULL}, + {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &deleted_bigram_filename, "the deleted bigram file", NULL}, + {NULL} +}; + + +parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram, + KMixtureModelBigram * unigram, + KMixtureModelSingleGram * bigram){ + bool success; + parameter_t lambda = 0, next_lambda = 0.6; + parameter_t epsilon = 0.001; + + KMixtureModelMagicHeader magic_header; + assert(unigram->get_magic_header(magic_header)); + assert(0 != magic_header.m_total_freq); + + while (fabs(lambda - next_lambda) > epsilon){ + lambda = next_lambda; + next_lambda = 0; + parameter_t numerator = 0; + parameter_t part_of_denominator = 0; + + FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + deleted_bigram->retrieve_all(array); + + for ( size_t i = 0; i < array->len; ++i){ + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i); + //get the phrase token + phrase_token_t token = item->m_token; + guint32 deleted_count = item->m_item.m_WC; + + { + parameter_t elem_poss = 0; + KMixtureModelArrayHeader array_header; + 
KMixtureModelArrayItem array_item; + if ( bigram && bigram->get_array_item(token, array_item) ){ + assert(bigram->get_array_header(array_header)); + assert(0 != array_header.m_WC); + elem_poss = array_item.m_WC / (parameter_t) array_header.m_WC; + } + numerator = lambda * elem_poss; + } + + { + parameter_t elem_poss = 0; + KMixtureModelArrayHeader array_header; + if (unigram->get_array_header(token, array_header)){ + elem_poss = array_header.m_freq / (parameter_t) magic_header.m_total_freq; + } + part_of_denominator = (1 - lambda) * elem_poss; + } + if (0 == (numerator + part_of_denominator)) + continue; + + next_lambda += deleted_count * (numerator / (numerator + part_of_denominator)); + } + KMixtureModelArrayHeader header; + assert(deleted_bigram->get_array_header(header)); + assert(0 != header.m_WC); + next_lambda /= header.m_WC; + + g_array_free(array, TRUE); + } + lambda = next_lambda; + return lambda; +} + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- estimate k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + /* TODO: magic header signature check here. 
*/ + KMixtureModelBigram unigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + unigram.attach(bigram_filename, ATTACH_READONLY); + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(bigram_filename, ATTACH_READONLY); + + KMixtureModelBigram deleted_bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + deleted_bigram.attach(deleted_bigram_filename, ATTACH_READONLY); + + GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + deleted_bigram.get_all_items(deleted_items); + + parameter_t lambda_sum = 0; + int lambda_count = 0; + + for( size_t i = 0; i < deleted_items->len; ++i ){ + phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + KMixtureModelSingleGram * deleted_single_gram = NULL; + deleted_bigram.load(*token, deleted_single_gram); + + KMixtureModelArrayHeader array_header; + if (single_gram) + assert(single_gram->get_array_header(array_header)); + KMixtureModelArrayHeader deleted_array_header; + assert(deleted_single_gram->get_array_header(deleted_array_header)); + + if ( 0 != deleted_array_header.m_WC ) { + parameter_t lambda = compute_interpolation(deleted_single_gram, &unigram, single_gram); + + printf("token:%d lambda:%f\n", *token, lambda); + + lambda_sum += lambda; + lambda_count ++; + } + + if (single_gram) + delete single_gram; + delete deleted_single_gram; + } + + printf("average lambda:%f\n", (lambda_sum/lambda_count)); + g_array_free(deleted_items, TRUE); + return 0; +} diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp new file mode 100644 index 0000000..b45781d --- /dev/null +++ b/utils/training/eval_correction_rate.cpp @@ -0,0 +1,211 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: eval_correction_rate\n"); +} + +bool get_possible_pinyin(FacadePhraseIndex * phrase_index, + TokenVector tokens, ChewingKeyVector keys){ + ChewingKey buffer[MAX_PHRASE_LENGTH]; + size_t key_index; guint32 max_freq; + guint32 freq; + g_array_set_size(keys, 0); + + for (size_t i = 0; i < tokens->len; ++i){ + phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i); + PhraseItem item; + phrase_index->get_phrase_item(*token, item); + key_index = 0; max_freq = 0; + for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) { + freq = 0; + assert(item.get_nth_pronunciation(m, buffer, freq)); + if ( freq > max_freq ) { + key_index = m; + max_freq = freq; + } + } + + assert(item.get_nth_pronunciation(key_index, buffer, freq)); + assert(max_freq == freq); + guint8 len = item.get_phrase_length(); + g_array_append_vals(keys, buffer, len); + } + return true; +} + +bool get_best_match(PinyinLookup2 * pinyin_lookup, + ChewingKeyVector keys, TokenVector tokens){ + /* prepare the prefixes for get_best_match. 
*/ + TokenVector prefixes = g_array_new + (FALSE, FALSE, sizeof(phrase_token_t)); + g_array_append_val(prefixes, sentence_start); + + /* initialize constraints. */ + CandidateConstraints constraints = g_array_new + (TRUE, FALSE, sizeof(lookup_constraint_t)); + g_array_set_size(constraints, keys->len); + for ( size_t i = 0; i < constraints->len; ++i ) { + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + bool retval = pinyin_lookup->get_best_match(prefixes, keys, constraints, tokens); + + g_array_free(prefixes, TRUE); + g_array_free(constraints, TRUE); + return retval; +} + +bool do_one_test(PinyinLookup2 * pinyin_lookup, + FacadePhraseIndex * phrase_index, + TokenVector tokens){ + bool retval = false; + + ChewingKeyVector keys = g_array_new(FALSE, TRUE, sizeof(ChewingKey)); + TokenVector guessed_tokens = g_array_new + (FALSE, TRUE, sizeof(phrase_token_t)); + + get_possible_pinyin(phrase_index, tokens, keys); + get_best_match(pinyin_lookup, keys, guessed_tokens); + /* compare the results */ + char * sentence = NULL; char * guessed_sentence = NULL; + pinyin_lookup->convert_to_utf8(tokens, sentence); + pinyin_lookup->convert_to_utf8 + (guessed_tokens, guessed_sentence); + + if ( strcmp(sentence, guessed_sentence) != 0 ) { + fprintf(stderr, "test sentence:%s\n", sentence); + fprintf(stderr, "guessed sentence:%s\n", guessed_sentence); + fprintf(stderr, "the result mis-matches.\n"); + retval = false; + } else { + retval = true; + } + + g_free(sentence); g_free(guessed_sentence); + g_array_free(keys, TRUE); + g_array_free(guessed_tokens, TRUE); + return retval; +} + +int main(int argc, char * argv[]){ + const char * evals_text = "evals2.text"; + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + pinyin_option_t options = USE_TONE; + FacadeChewingTable 
largetable; + + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PINYIN_INDEX); + largetable.load(options, chunk, NULL); + + FacadePhraseTable2 phrase_table; + chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram system_bigram; + system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); + Bigram user_bigram; + user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); + + gfloat lambda = system_table_info.get_lambda(); + + PinyinLookup2 pinyin_lookup(lambda, options, + &largetable, &phrase_index, + &system_bigram, &user_bigram); + + /* open evals text. */ + FILE * evals_file = fopen(evals_text, "r"); + if ( NULL == evals_file ) { + fprintf(stderr, "Can't open file:%s\n", evals_text); + exit(ENOENT); + } + + /* Evaluates the correction rate of test text documents. */ + size_t tested_count = 0; size_t passed_count = 0; + char* linebuf = NULL; size_t size = 0; + TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t)); + + phrase_token_t token = null_token; + while( getline(&linebuf, &size, evals_file) ) { + if ( feof(evals_file) ) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); + + if ( null_token == token ) { + if ( tokens->len ) { /* one test. */ + if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { + tested_count ++; passed_count ++; + } else { + tested_count ++; + } + g_array_set_size(tokens, 0); + } + } else { + g_array_append_val(tokens, token); + } + } + + if ( tokens->len ) { /* one test. 
*/ + if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { + tested_count ++; passed_count ++; + } else { + tested_count ++; + } + } + + parameter_t rate = passed_count / (parameter_t) tested_count; + printf("correction rate:%f\n", rate); + + g_array_free(tokens, TRUE); + fclose(evals_file); + free(linebuf); + + return 0; +} diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp new file mode 100644 index 0000000..e446e79 --- /dev/null +++ b/utils/training/export_k_mixture_model.cpp @@ -0,0 +1,156 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <locale.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" +#include "utils_helper.h" + +static const gchar * k_mixture_model_filename = NULL; + +static GOptionEntry entries[] = +{ + {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL}, + {NULL} +}; + + +bool print_k_mixture_model_magic_header(FILE * output, + KMixtureModelBigram * bigram){ + KMixtureModelMagicHeader magic_header; + if ( !bigram->get_magic_header(magic_header) ){ + fprintf(stderr, "no magic header in k mixture model.\n"); + exit(ENODATA); + } + fprintf(output, "\\data model \"k mixture model\" count %d N %d " + "total_freq %d\n", magic_header.m_WC, magic_header.m_N, + magic_header.m_total_freq); + return true; +} + +bool print_k_mixture_model_array_headers(FILE * output, + KMixtureModelBigram * bigram, + FacadePhraseIndex * phrase_index){ + fprintf(output, "\\1-gram\n"); + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram->get_all_items(items); + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t token = g_array_index(items, phrase_token_t, i); + KMixtureModelArrayHeader array_header; + assert(bigram->get_array_header(token, array_header)); + char * phrase = taglib_token_to_string(phrase_index, token); + if ( phrase ) + fprintf(output, "\\item %d %s count %d freq %d\n", + token, phrase, array_header.m_WC, array_header.m_freq); + + g_free(phrase); + } + return true; +} + +bool print_k_mixture_model_array_items(FILE * output, + KMixtureModelBigram * bigram, + FacadePhraseIndex * phrase_index){ + fprintf(output, "\\2-gram\n"); + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram->get_all_items(items); + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t token = g_array_index(items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + assert(bigram->load(token, single_gram)); + FlexibleBigramPhraseArray array = g_array_new + 
(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + single_gram->retrieve_all(array); + + for (size_t m = 0; m < array->len; ++m){ + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m); + char * word1 = taglib_token_to_string(phrase_index, token); + char * word2 = taglib_token_to_string(phrase_index, item->m_token); + + if (word1 && word2) + fprintf(output, "\\item %d %s %d %s count %d T %d N_n_0 %d n_1 %d Mr %d\n", + token, word1, item->m_token, word2, + item->m_item.m_WC, item->m_item.m_WC, + item->m_item.m_N_n_0, item->m_item.m_n_1, + item->m_item.m_Mr); + + g_free(word1); g_free(word2); + } + + g_array_free(array, TRUE); + delete single_gram; + } + + g_array_free(items, TRUE); + return true; +} + +bool end_data(FILE * output){ + fprintf(output, "\\end\n"); + return true; +} + +int main(int argc, char * argv[]){ + FILE * output = stdout; + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- export k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + if (!bigram.attach(k_mixture_model_filename, ATTACH_READONLY)) { + fprintf(stderr, "open %s failed.\n", k_mixture_model_filename); + exit(ENOENT); + } + + print_k_mixture_model_magic_header(output, &bigram); + print_k_mixture_model_array_headers(output, &bigram, &phrase_index); + 
print_k_mixture_model_array_items(output, &bigram, &phrase_index); + + end_data(output); + + return 0; +} diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp new file mode 100644 index 0000000..b6f96fa --- /dev/null +++ b/utils/training/gen_deleted_ngram.cpp @@ -0,0 +1,128 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007, 2011 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +static gboolean train_pi_gram = TRUE; +static const gchar * bigram_filename = DELETED_BIGRAM; + +static GOptionEntry entries[] = +{ + {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL}, + {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "deleted bi-gram file", NULL}, + {NULL} +}; + + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate deleted n-gram"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* load phrase table. */ + PhraseLargeTable2 phrase_table; + MemoryChunk * new_chunk = new MemoryChunk; + new_chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(new_chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENODATA); + + Bigram bigram; + bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); + + char* linebuf = NULL; size_t size = 0; + phrase_token_t last_token, cur_token = last_token = 0; + while( getline(&linebuf, &size, stdin) ){ + if ( feof(stdin) ) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); + + last_token = cur_token; + cur_token = token; + + /* skip null_token in second word. 
*/ + if ( null_token == cur_token ) + continue; + + /* skip pi-gram training. */ + if ( null_token == last_token ){ + if ( !train_pi_gram ) + continue; + last_token = sentence_start; + } + + /* train bi-gram */ + SingleGram * single_gram = NULL; + bigram.load(last_token, single_gram); + + if ( NULL == single_gram ){ + single_gram = new SingleGram; + } + guint32 freq, total_freq; + //increase freq + if (single_gram->get_freq(cur_token, freq)) + assert(single_gram->set_freq(cur_token, freq + 1)); + else + assert(single_gram->insert_freq(cur_token, 1)); + //increase total freq + single_gram->get_total_freq(total_freq); + single_gram->set_total_freq(total_freq + 1); + + bigram.store(last_token, single_gram); + delete single_gram; + } + + free(linebuf); + return 0; +} diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp new file mode 100644 index 0000000..2dfb3d1 --- /dev/null +++ b/utils/training/gen_k_mixture_model.cpp @@ -0,0 +1,411 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include <glib.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" +#include "k_mixture_model.h" + +/* Hash token of Hash token of word count. */ +typedef GHashTable * HashofDocument; +typedef GHashTable * HashofSecondWord; + +typedef GHashTable * HashofUnigram; + + +void print_help(){ + printf("Usage: gen_k_mixture_model [--skip-pi-gram-training]\n" + " [--maximum-occurs-allowed <INT>]\n" + " [--maximum-increase-rates-allowed <FLOAT>]\n" + " [--k-mixture-model-file <FILENAME>]\n" + " {<FILENAME>}+\n"); +} + + +static gint g_maximum_occurs = 20; +static parameter_t g_maximum_increase_rates = 3.; +static gboolean g_train_pi_gram = TRUE; +static const gchar * g_k_mixture_model_filename = NULL; + +static GOptionEntry entries[] = +{ + {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &g_train_pi_gram, "skip pi-gram training", NULL}, + {"maximum-occurs-allowed", 0, 0, G_OPTION_ARG_INT, &g_maximum_occurs, "maximum occurs allowed", NULL}, + {"maximum-increase-rates-allowed", 0, 0, G_OPTION_ARG_DOUBLE, &g_maximum_increase_rates, "maximum increase rates allowed", NULL}, + {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &g_k_mixture_model_filename, "k mixture model file", NULL}, + {NULL} +}; + + +bool read_document(PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + FILE * document, + HashofDocument hash_of_document, + HashofUnigram hash_of_unigram){ + + char * linebuf = NULL;size_t size = 0; + phrase_token_t last_token, cur_token = last_token = 0; + + while ( getline(&linebuf, &size, document) ){ + if ( feof(document) ) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); + + last_token = cur_token; + cur_token = token; + + /* skip null_token in second word. 
*/ + if ( null_token == cur_token ) + continue; + + gpointer value = NULL; + gboolean lookup_result = g_hash_table_lookup_extended + (hash_of_unigram, GUINT_TO_POINTER(cur_token), + NULL, &value); + if ( !lookup_result ){ + g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token), + GUINT_TO_POINTER(1)); + } else { + guint32 freq = GPOINTER_TO_UINT(value); + freq ++; + g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token), + GUINT_TO_POINTER(freq)); + } + + /* skip pi-gram training. */ + if ( null_token == last_token ){ + if ( !g_train_pi_gram ) + continue; + last_token = sentence_start; + } + + /* remember the (last_token, cur_token) word pair. */ + HashofSecondWord hash_of_second_word = NULL; + lookup_result = g_hash_table_lookup_extended + (hash_of_document, GUINT_TO_POINTER(last_token), + NULL, &value); + if ( !lookup_result ){ + hash_of_second_word = g_hash_table_new + (g_direct_hash, g_direct_equal); + } else { + hash_of_second_word = (HashofSecondWord) value; + } + + value = NULL; + lookup_result = g_hash_table_lookup_extended + (hash_of_second_word, GUINT_TO_POINTER(cur_token), + NULL, &value); + guint32 count = 0; + if ( lookup_result ) { + count = GPOINTER_TO_UINT(value); + } + count ++; + g_hash_table_insert(hash_of_second_word, + GUINT_TO_POINTER(cur_token), + GUINT_TO_POINTER(count)); + g_hash_table_insert(hash_of_document, + GUINT_TO_POINTER(last_token), + hash_of_second_word); + } + + free(linebuf); + + return true; +} + +static void train_word_pair(HashofUnigram hash_of_unigram, + KMixtureModelSingleGram * single_gram, + phrase_token_t token2, guint32 count){ + KMixtureModelArrayItem array_item; + + bool exists = single_gram->get_array_item(token2, array_item); + if ( exists ) { + guint32 maximum_occurs_allowed = std_lite::max + ((guint32)g_maximum_occurs, + (guint32)ceil(array_item.m_Mr * g_maximum_increase_rates)); + /* Exceeds the maximum occurs allowed of the word or phrase, + * in a single document. 
+ */ + if ( count > maximum_occurs_allowed ){ + gpointer value = NULL; + assert( g_hash_table_lookup_extended + (hash_of_unigram, GUINT_TO_POINTER(token2), + NULL, &value) ); + guint32 freq = GPOINTER_TO_UINT(value); + freq -= count; + if ( freq > 0 ) { + g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2), + GUINT_TO_POINTER(freq)); + } else if ( freq == 0 ) { + assert(g_hash_table_steal(hash_of_unigram, + GUINT_TO_POINTER(token2))); + } else { + assert(false); + } + return; + } + array_item.m_WC += count; + /* array_item.m_T += count; the same as m_WC. */ + array_item.m_N_n_0 ++; + if ( 1 == count ) + array_item.m_n_1 ++; + array_item.m_Mr = std_lite::max(array_item.m_Mr, count); + assert(single_gram->set_array_item(token2, array_item)); + } else { /* item doesn't exist. */ + /* the same as above. */ + if ( count > g_maximum_occurs ){ + gpointer value = NULL; + assert( g_hash_table_lookup_extended + (hash_of_unigram, GUINT_TO_POINTER(token2), + NULL, &value) ); + guint32 freq = GPOINTER_TO_UINT(value); + freq -= count; + if ( freq > 0 ) { + g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2), + GUINT_TO_POINTER(freq)); + } else if ( freq == 0 ) { + assert(g_hash_table_steal(hash_of_unigram, + GUINT_TO_POINTER(token2))); + } else { + assert(false); + } + return; + } + memset(&array_item, 0, sizeof(KMixtureModelArrayItem)); + array_item.m_WC = count; + /* array_item.m_T = count; the same as m_WC. */ + array_item.m_N_n_0 = 1; + if ( 1 == count ) + array_item.m_n_1 = 1; + array_item.m_Mr = count; + assert(single_gram->insert_array_item(token2, array_item)); + } + + /* save delta in the array header. 
*/ + KMixtureModelArrayHeader array_header; + single_gram->get_array_header(array_header); + array_header.m_WC += count; + single_gram->set_array_header(array_header); +} + +bool train_single_gram(HashofUnigram hash_of_unigram, + HashofDocument hash_of_document, + KMixtureModelSingleGram * single_gram, + phrase_token_t token1, + guint32 & delta){ + assert(NULL != single_gram); + delta = 0; /* delta in WC of single_gram. */ + KMixtureModelArrayHeader array_header; + assert(single_gram->get_array_header(array_header)); + guint32 saved_array_header_WC = array_header.m_WC; + + HashofSecondWord hash_of_second_word = NULL; + gpointer key, value = NULL; + assert(g_hash_table_lookup_extended + (hash_of_document, GUINT_TO_POINTER(token1), + NULL, &value)); + hash_of_second_word = (HashofSecondWord) value; + assert(NULL != hash_of_second_word); + + /* train word pair */ + GHashTableIter iter; + g_hash_table_iter_init(&iter, hash_of_second_word); + while (g_hash_table_iter_next(&iter, &key, &value)) { + phrase_token_t token2 = GPOINTER_TO_UINT(key); + guint32 count = GPOINTER_TO_UINT(value); + train_word_pair(hash_of_unigram, single_gram, token2, count); + } + + assert(single_gram->get_array_header(array_header)); + delta = array_header.m_WC - saved_array_header_WC; + return true; +} + +static bool train_second_word(HashofUnigram hash_of_unigram, + KMixtureModelBigram * bigram, + HashofDocument hash_of_document, + phrase_token_t token1){ + guint32 delta = 0; + + KMixtureModelSingleGram * single_gram = NULL; + bool exists = bigram->load(token1, single_gram); + if ( !exists ) + single_gram = new KMixtureModelSingleGram; + train_single_gram(hash_of_unigram, hash_of_document, + single_gram, token1, delta); + + if ( 0 == delta ){ /* Please consider maximum occurs allowed. */ + delete single_gram; + return false; + } + + /* save the single gram. 
*/ + assert(bigram->store(token1, single_gram)); + delete single_gram; + + KMixtureModelMagicHeader magic_header; + if (!bigram->get_magic_header(magic_header)){ + /* the first time to access the new k mixture model file. */ + memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader)); + } + + if ( magic_header.m_WC + delta < magic_header.m_WC ){ + fprintf(stderr, "the m_WC integer in magic header overflows.\n"); + return false; + } + magic_header.m_WC += delta; + assert(bigram->set_magic_header(magic_header)); + + return true; +} + +/* Note: this method is a post-processing method, run this last. */ +static bool post_processing_unigram(KMixtureModelBigram * bigram, + HashofUnigram hash_of_unigram){ + GHashTableIter iter; + gpointer key, value; + guint32 total_freq = 0; + + g_hash_table_iter_init(&iter, hash_of_unigram); + while (g_hash_table_iter_next(&iter, &key, &value)){ + guint32 token = GPOINTER_TO_UINT(key); + guint32 freq = GPOINTER_TO_UINT(value); + KMixtureModelArrayHeader array_header; + bool result = bigram->get_array_header(token, array_header); + array_header.m_freq += freq; + total_freq += freq; + bigram->set_array_header(token, array_header); + } + + KMixtureModelMagicHeader magic_header; + assert(bigram->get_magic_header(magic_header)); + if ( magic_header.m_total_freq + total_freq < magic_header.m_total_freq ){ + fprintf(stderr, "the m_total_freq in magic header overflows.\n"); + return false; + } + magic_header.m_total_freq += total_freq; + assert(bigram->set_magic_header(magic_header)); + + return true; +} + +int main(int argc, char * argv[]){ + int i = 1; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + 
bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + PhraseLargeTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); + + while ( i < argc ){ + const char * filename = argv[i]; + FILE * document = fopen(filename, "r"); + if ( NULL == document ){ + int err_saved = errno; + fprintf(stderr, "can't open file: %s.\n", filename); + fprintf(stderr, "error:%s.\n", strerror(err_saved)); + exit(err_saved); + } + + HashofDocument hash_of_document = g_hash_table_new + (g_direct_hash, g_direct_equal); + HashofUnigram hash_of_unigram = g_hash_table_new + (g_direct_hash, g_direct_equal); + + assert(read_document(&phrase_table, &phrase_index, document, + hash_of_document, hash_of_unigram)); + fclose(document); + document = NULL; + + GHashTableIter iter; + gpointer key, value; + + /* train the document, and convert it to k mixture model. 
*/ + g_hash_table_iter_init(&iter, hash_of_document); + while (g_hash_table_iter_next(&iter, &key, &value)) { + phrase_token_t token1 = GPOINTER_TO_UINT(key); + train_second_word(hash_of_unigram, &bigram, + hash_of_document, token1); + } + + KMixtureModelMagicHeader magic_header; + assert(bigram.get_magic_header(magic_header)); + magic_header.m_N ++; + assert(bigram.set_magic_header(magic_header)); + + post_processing_unigram(&bigram, hash_of_unigram); + + /* free resources of g_hash_of_document */ + g_hash_table_iter_init(&iter, hash_of_document); + while (g_hash_table_iter_next(&iter, &key, &value)) { + HashofSecondWord second_word = (HashofSecondWord) value; + g_hash_table_iter_steal(&iter); + g_hash_table_unref(second_word); + } + g_hash_table_unref(hash_of_document); + hash_of_document = NULL; + + g_hash_table_unref(hash_of_unigram); + hash_of_unigram = NULL; + + ++i; + } + + return 0; +} diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp new file mode 100644 index 0000000..1947959 --- /dev/null +++ b/utils/training/gen_ngram.cpp @@ -0,0 +1,136 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007, 2011 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +static gboolean train_pi_gram = TRUE; +static const gchar * bigram_filename = SYSTEM_BIGRAM; + +static GOptionEntry entries[] = +{ + {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL}, + {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "bi-gram file", NULL}, + {NULL} +}; + +int main(int argc, char * argv[]){ + FILE * input = stdin; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate n-gram"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + PhraseLargeTable2 phrase_table; + /* init phrase table */ + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram bigram; + bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); + + char* linebuf = NULL; size_t size = 0; + phrase_token_t last_token, cur_token = last_token = 0; + while( getline(&linebuf, &size, input) ){ + if ( feof(input) ) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); + + last_token = cur_token; + cur_token = token; + + /* skip null_token in second word. 
*/ + if ( null_token == cur_token ) + continue; + + /* training uni-gram */ + phrase_index.add_unigram_frequency(cur_token, 1); + + /* skip pi-gram training. */ + if ( null_token == last_token ){ + if ( !train_pi_gram ) + continue; + last_token = sentence_start; + } + + /* train bi-gram */ + SingleGram * single_gram = NULL; + bigram.load(last_token, single_gram); + + if ( NULL == single_gram ){ + single_gram = new SingleGram; + } + guint32 freq, total_freq; + /* increase freq */ + if (single_gram->get_freq(cur_token, freq)) + assert(single_gram->set_freq(cur_token, freq + 1)); + else + assert(single_gram->insert_freq(cur_token, 1)); + /* increase total freq */ + single_gram->get_total_freq(total_freq); + single_gram->set_total_freq(total_freq + 1); + + bigram.store(last_token, single_gram); + delete single_gram; + } + + free(linebuf); + + if (!save_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + return 0; +} diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp new file mode 100644 index 0000000..f4c51af --- /dev/null +++ b/utils/training/gen_unigram.cpp @@ -0,0 +1,111 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +static const gchar * table_dir = "."; + +static GOptionEntry entries[] = +{ + {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL}, + {NULL} +}; + +/* increase all unigram frequency by a constant. */ + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- increase uni-gram"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + bool retval = system_table_info.load(filename); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + g_free(filename); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + /* Note: please increase the value when corpus size becomes larger. + * To avoid zero value when computing unigram frequency in float format. + */ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + assert(table_info->m_dict_index == i); + + if (SYSTEM_FILE != table_info->m_file_type && + DICTIONARY != table_info->m_file_type) + continue; + + guint32 freq = 1; +#if 0 + /* skip GBK_DICTIONARY. 
*/ + if (GBK_DICTIONARY == table_info->m_dict_index) + freq = 1; +#endif + + const char * binfile = table_info->m_system_filename; + + MemoryChunk * chunk = new MemoryChunk; + bool retval = chunk->load(binfile); + if (!retval) { + fprintf(stderr, "load %s failed!\n", binfile); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + + PhraseIndexRange range; + int result = phrase_index.get_range(i, range); + if ( result == ERROR_OK ) { + for (size_t token = range.m_range_begin; + token <= range.m_range_end; ++token) { + phrase_index.add_unigram_frequency(token, freq); + } + } + } + + if (!save_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + if (!save_dictionary(phrase_files, &phrase_index)) + exit(ENOENT); + + return 0; +} diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp new file mode 100644 index 0000000..40870cf --- /dev/null +++ b/utils/training/import_k_mixture_model.cpp @@ -0,0 +1,322 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" +#include "k_mixture_model.h" + +static const gchar * k_mixture_model_filename = NULL; + +static GOptionEntry entries[] = +{ + {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL}, + {NULL} +}; + + +enum LINE_TYPE{ + BEGIN_LINE = 1, + END_LINE, + GRAM_1_LINE, + GRAM_2_LINE, + GRAM_1_ITEM_LINE, + GRAM_2_ITEM_LINE +}; + +static int line_type = 0; +static GPtrArray * values = NULL; +static GHashTable * required = NULL; +/* variables for line buffer. */ +static char * linebuf = NULL; +static size_t len = 0; + +bool parse_headline(KMixtureModelBigram * bigram); + +bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram); + +bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram); + + +static ssize_t my_getline(FILE * input){ + ssize_t result = getline(&linebuf, &len, input); + if ( result == -1 ) + return result; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + return result; +} + +bool parse_headline(KMixtureModelBigram * bigram){ + /* enter "\data" line */ + assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", "")); + + /* read "\data" line */ + if ( !taglib_read(linebuf, line_type, values, required) ) { + fprintf(stderr, "error: k mixture model expected.\n"); + return false; + } + + assert(line_type == BEGIN_LINE); + /* check header */ + TAGLIB_GET_TAGVALUE(const char *, model, (const char *)); + if ( !( strcmp("k mixture model", model) == 0 ) ) { + fprintf(stderr, "error: k mixture model expected.\n"); + return false; + } + + TAGLIB_GET_TAGVALUE(glong, count, atol); + TAGLIB_GET_TAGVALUE(glong, N, atol); + TAGLIB_GET_TAGVALUE(glong, total_freq, atol); + + KMixtureModelMagicHeader 
magic_header; + memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader)); + magic_header.m_WC =count; magic_header.m_N = N; + magic_header.m_total_freq = total_freq; + bigram->set_magic_header(magic_header); + + return true; +} + +bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); + + do { + retry: + assert(taglib_read(linebuf, line_type, values, required)); + switch(line_type) { + case END_LINE: + goto end; + case GRAM_1_LINE: + my_getline(input); + parse_unigram(input, phrase_table, phrase_index, bigram); + goto retry; + case GRAM_2_LINE: + my_getline(input); + parse_bigram(input, phrase_table, phrase_index, bigram); + goto retry; + default: + assert(false); + } + } while (my_getline(input) != -1) ; + + end: + taglib_pop_state(); + return true; +} + +bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", "")); + + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_1_ITEM_LINE:{ + /* handle \item in \1-gram */ + TAGLIB_GET_TOKEN(token, 0); + TAGLIB_GET_PHRASE_STRING(word, 1); + assert(taglib_validate_token_with_string + (phrase_index, token, word)); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + TAGLIB_GET_TAGVALUE(glong, freq, atol); + + KMixtureModelArrayHeader array_header; + memset(&array_header, 0, sizeof(KMixtureModelArrayHeader)); + array_header.m_WC = count; array_header.m_freq = freq; + bigram->set_array_header(token, array_header); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } 
while (my_getline(input) != -1); + + end: + taglib_pop_state(); + return true; +} + +bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, + "count:T:N_n_0:n_1:Mr", "")); + + phrase_token_t last_token = null_token; + KMixtureModelSingleGram * last_single_gram = NULL; + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_2_ITEM_LINE:{ + /* handle \item in \2-gram */ + /* two tokens */ + TAGLIB_GET_TOKEN(token1, 0); + TAGLIB_GET_PHRASE_STRING(word1, 1); + assert(taglib_validate_token_with_string + (phrase_index, token1, word1)); + + TAGLIB_GET_TOKEN(token2, 2); + TAGLIB_GET_PHRASE_STRING(word2, 3); + assert(taglib_validate_token_with_string + (phrase_index, token2, word2)); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + TAGLIB_GET_TAGVALUE(glong, T, atol); + assert(count == T); + TAGLIB_GET_TAGVALUE(glong, N_n_0, atol); + TAGLIB_GET_TAGVALUE(glong, n_1, atol); + TAGLIB_GET_TAGVALUE(glong, Mr, atol); + + KMixtureModelArrayItem array_item; + memset(&array_item, 0, sizeof(KMixtureModelArrayItem)); + array_item.m_WC = count; array_item.m_N_n_0 = N_n_0; + array_item.m_n_1 = n_1; array_item.m_Mr = Mr; + + if ( last_token != token1 ) { + if ( last_token && last_single_gram ) { + bigram->store(last_token, last_single_gram); + delete last_single_gram; + /* safe guard */ + last_token = null_token; + last_single_gram = NULL; + } + KMixtureModelSingleGram * single_gram = NULL; + bigram->load(token1, single_gram); + + /* create the new single gram */ + if ( single_gram == NULL ) + single_gram = new KMixtureModelSingleGram; + last_token = token1; + last_single_gram = single_gram; + } + + assert(NULL != last_single_gram); + assert(last_single_gram->insert_array_item(token2, array_item)); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + 
default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + if ( last_token && last_single_gram ) { + bigram->store(last_token, last_single_gram); + delete last_single_gram; + /* safe guard */ + last_token = null_token; + last_single_gram = NULL; + } + + taglib_pop_state(); + return true; +} + +int main(int argc, char * argv[]){ + FILE * input = stdin; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- import k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + PhraseLargeTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); + + taglib_init(); + + /* prepare to read n-gram model */ + values = g_ptr_array_new(); + required = g_hash_table_new(g_str_hash, g_str_equal); + + ssize_t result = my_getline(input); + if ( result == -1 ) { + fprintf(stderr, "empty file input.\n"); + exit(ENODATA); + } + + if (!parse_headline(&bigram)) + exit(ENODATA); + + result = my_getline(input); + if ( result != -1 ) + parse_body(input, &phrase_table, &phrase_index, &bigram); + + taglib_fini(); + + return 0; +} diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h new file mode 100644 index 0000000..ad8d3d8 --- /dev/null +++ 
b/utils/training/k_mixture_model.h @@ -0,0 +1,172 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef K_MIXTURE_MODEL +#define K_MIXTURE_MODEL + +#include <math.h> +#include "novel_types.h" +#include "flexible_ngram.h" + +namespace pinyin{ + +typedef guint32 corpus_count_t; + +/* Note: storage parameters: N, T, n_r. + * N: the total number of documents. + * T: the total number of instances of the word or phrase. + * n_r: the number of documents having exactly <b>r</b> occurrences. + * only n_0, n_1 are used here. + */ + +static inline parameter_t compute_alpha(corpus_count_t N, corpus_count_t n_0){ + parameter_t alpha = 1 - n_0 / (parameter_t) N; + return alpha; +} + +static inline parameter_t compute_gamma(corpus_count_t N, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t gamma = 1 - n_1 / (parameter_t) (N - n_0); + return gamma; +} + +static inline parameter_t compute_B(corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + /* Note: re-check this, to see if we can remove if statement. */ + /* Please consider B_2 is no less than 2 in paper. 
*/ +#if 1 + if ( 0 == T - n_1 && 0 == N - n_0 - n_1 ) + return 2; +#endif + + parameter_t B = (T - n_1 ) / (parameter_t) (N - n_0 - n_1); + return B; +} + +/* three parameters model */ +static inline parameter_t compute_Pr_G_3(corpus_count_t k, + parameter_t alpha, + parameter_t gamma, + parameter_t B){ + if ( k == 0 ) + return 1 - alpha; + + if ( k == 1 ) + return alpha * (1 - gamma); + + if ( k > 1 ) { + return (alpha * gamma / (B - 1)) * pow((1 - 1 / (B - 1)) , k - 2); + } + + assert(false); +} + +static inline parameter_t compute_Pr_G_3_with_count(corpus_count_t k, + corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t alpha = compute_alpha(N, n_0); + parameter_t gamma = compute_gamma(N, n_0, n_1); + parameter_t B = compute_B(N, T, n_0, n_1); + + return compute_Pr_G_3(k, alpha, gamma, B); +} + +/* two parameters model */ +static inline parameter_t compute_Pr_G_2(corpus_count_t k, + parameter_t alpha, + parameter_t B){ + parameter_t gamma = 1 - 1 / (B - 1); + return compute_Pr_G_3(k, alpha, gamma, B); +} + +static inline parameter_t compute_Pr_G_2_with_count(corpus_count_t k, + corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t alpha = compute_alpha(N, n_0); + parameter_t B = compute_B(N, T, n_0, n_1); + return compute_Pr_G_2(k, alpha, B); +} + +#define K_MIXTURE_MODEL_MAGIC_NUMBER "KMMP" + +typedef struct{ + /* the total number of instances of all words. */ + guint32 m_WC; + /* the total number of documents. */ + guint32 m_N; + /* the total freq of uni-gram. */ + guint32 m_total_freq; +} KMixtureModelMagicHeader; + +typedef struct{ + /* the total number of instances of word W1. */ + guint32 m_WC; + /* the freq of uni-gram. see m_total_freq in magic header also. */ + guint32 m_freq; +} KMixtureModelArrayHeader; + +typedef struct{ + /* the total number of all W1,W2 word pair. */ + guint32 m_WC; + + /* the total number of instances of the word or phrase. 
+ (two word phrase) */ + /* guint32 m_T; Please use m_WC instead. + alias of m_WC, always the same. */ + + /* n_r: the number of documents having exactly r occurrences. */ + /* guint32 m_n_0; + Note: compute this value using the following equation. + m_n_0 = KMixtureModelMagicHeader.m_N - m_N_n_0; + m_N_n_0, the number of documents which contains the word or phrase. + (two word phrase) */ + guint32 m_N_n_0; + guint32 m_n_1; + + /* maximum instances of the word or phrase (two word phrase) + in previous documents last seen. */ + guint32 m_Mr; +} KMixtureModelArrayItem; + +typedef FlexibleBigram<KMixtureModelMagicHeader, + KMixtureModelArrayHeader, + KMixtureModelArrayItem> +KMixtureModelBigram; + +typedef FlexibleSingleGram<KMixtureModelArrayHeader, + KMixtureModelArrayItem> +KMixtureModelSingleGram; + +typedef KMixtureModelSingleGram::ArrayItemWithToken +KMixtureModelArrayItemWithToken; + +}; + + +#endif diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp new file mode 100644 index 0000000..c5a66ec --- /dev/null +++ b/utils/training/k_mixture_model_to_interpolation.cpp @@ -0,0 +1,214 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "pinyin_internal.h" +#include "utils_helper.h" + +enum LINE_TYPE{ + BEGIN_LINE = 1, + END_LINE, + GRAM_1_LINE, + GRAM_2_LINE, + GRAM_1_ITEM_LINE, + GRAM_2_ITEM_LINE +}; + +static int line_type = 0; +static GPtrArray * values = NULL; +static GHashTable * required = NULL; +/* variables for line buffer. */ +static char * linebuf = NULL; +static size_t len = 0; + +bool parse_headline(FILE * input, FILE * output); + +bool parse_unigram(FILE * input, FILE * output); + +bool parse_bigram(FILE * input, FILE * output); + +static ssize_t my_getline(FILE * input){ + ssize_t result = getline(&linebuf, &len, input); + if ( result == -1 ) + return result; + + linebuf[strlen(linebuf) - 1] = '\0'; + return result; +} + +bool parse_headline(FILE * input, FILE * output) { + /* enter "\data" line */ + assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", + "count:N:total_freq")); + + /* read "\data" line */ + if ( !taglib_read(linebuf, line_type, values, required) ) { + fprintf(stderr, "error: k mixture model expected.\n"); + return false; + } + + assert(line_type == BEGIN_LINE); + TAGLIB_GET_TAGVALUE(const char *, model, (const char *)); + if ( !( strcmp("k mixture model", model) == 0 ) ){ + fprintf(stderr, "error: k mixture model expected.\n"); + return false; + } + + /* print header */ + fprintf(output, "\\data model interpolation\n"); + + return true; +} + +bool parse_body(FILE * input, FILE * output){ + taglib_push_state(); + + assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); + + do { + retry: + assert(taglib_read(linebuf, line_type, values, required)); + switch(line_type) { + case END_LINE: + fprintf(output, 
"\\end\n"); + goto end; + case GRAM_1_LINE: + fprintf(output, "\\1-gram\n"); + my_getline(input); + parse_unigram(input, output); + goto retry; + case GRAM_2_LINE: + fprintf(output, "\\2-gram\n"); + my_getline(input); + parse_bigram(input, output); + goto retry; + default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + taglib_pop_state(); + return true; +} + +bool parse_unigram(FILE * input, FILE * output){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count")); + + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch(line_type) { + case GRAM_1_ITEM_LINE: { + /* handle \item in \1-gram */ + TAGLIB_GET_TOKEN(token, 0); + TAGLIB_GET_PHRASE_STRING(word, 1); + + /* remove the "<start>" in the uni-gram of interpolation model */ + if ( sentence_start == token ) + break; + + TAGLIB_GET_TAGVALUE(glong, freq, atol); + + /* ignore zero unigram freq item */ + if ( 0 != freq ) + fprintf(output, "\\item %d %s count %ld\n", token, word, freq); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + taglib_pop_state(); + return true; +} + +bool parse_bigram(FILE * input, FILE * output){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, + "count", "T:N_n_0:n_1:Mr")); + + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_2_ITEM_LINE:{ + /* handle \item in \2-gram */ + /* two strings */ + TAGLIB_GET_TOKEN(token1, 0); + TAGLIB_GET_PHRASE_STRING(word1, 1); + + TAGLIB_GET_TOKEN(token2, 2); + TAGLIB_GET_PHRASE_STRING(word2, 3); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + fprintf(output, "\\item %d %s %d %s count %ld\n", + token1, word1, token2, word2, count); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } while (my_getline(input) != -1); + + 
end: + taglib_pop_state(); + return true; +} + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + taglib_init(); + + values = g_ptr_array_new(); + required = g_hash_table_new(g_str_hash, g_str_equal); + + ssize_t result = my_getline(input); + if ( result == -1 ) { + fprintf(stderr, "empty file input.\n"); + exit(ENODATA); + } + + if (!parse_headline(input, output)) + exit(ENODATA); + + result = my_getline(input); + if ( result != -1 ) + parse_body(input, output); + + taglib_fini(); + + return 0; +} diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp new file mode 100644 index 0000000..ab08010 --- /dev/null +++ b/utils/training/merge_k_mixture_model.cpp @@ -0,0 +1,239 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <locale.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" + +void print_help(){ + printf("Usage: merge_k_mixture_model [--result-file <RESULT_FILENAME>]\n"); + printf(" {<SOURCE_FILENAME>}+\n"); +} + +static const gchar * result_filename = NULL; + +static GOptionEntry entries[] = +{ + {"result-file", 0, 0, G_OPTION_ARG_FILENAME, &result_filename, "merged result file", NULL}, + {NULL} +}; + +static bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first, + /* in */ FlexibleBigramPhraseArray second, + /* out */ FlexibleBigramPhraseArray & merged ){ + /* avoid to do empty merge. */ + assert( NULL != first && NULL != second && NULL != merged ); + + /* merge two arrays. */ + guint first_index, second_index = first_index = 0; + KMixtureModelArrayItemWithToken * first_item, + * second_item = first_item = NULL; + while ( first_index < first->len && second_index < second->len ){ + first_item = &g_array_index(first, KMixtureModelArrayItemWithToken, + first_index); + second_item = &g_array_index(second, KMixtureModelArrayItemWithToken, + second_index); + if ( first_item->m_token > second_item->m_token ) { + g_array_append_val(merged, *second_item); + second_index ++; + } else if ( first_item->m_token < second_item->m_token ) { + g_array_append_val(merged, *first_item); + first_index ++; + } else /* first_item->m_token == second_item->m_token */ { + KMixtureModelArrayItemWithToken merged_item; + memset(&merged_item, 0, sizeof(KMixtureModelArrayItemWithToken)); + merged_item.m_token = first_item->m_token;/* same as second_item */ + merged_item.m_item.m_WC = first_item->m_item.m_WC + + second_item->m_item.m_WC; + /* merged_item.m_item.m_T = first_item->m_item.m_T + + second_item->m_item.m_T; */ + merged_item.m_item.m_N_n_0 = first_item->m_item.m_N_n_0 + + second_item->m_item.m_N_n_0; + merged_item.m_item.m_n_1 = first_item->m_item.m_n_1 + + second_item->m_item.m_n_1; + merged_item.m_item.m_Mr = std_lite::max(first_item->m_item.m_Mr, + 
second_item->m_item.m_Mr); + g_array_append_val(merged, merged_item); + first_index ++; second_index ++; + } + } + + /* add remained items. */ + while ( first_index < first->len ){ + first_item = &g_array_index(first, KMixtureModelArrayItemWithToken, + first_index); + g_array_append_val(merged, *first_item); + first_index++; + } + + while ( second_index < second->len ){ + second_item = &g_array_index(second, KMixtureModelArrayItemWithToken, + second_index); + g_array_append_val(merged, *second_item); + second_index++; + } + + return true; +} + +static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one ){ + + KMixtureModelMagicHeader target_magic_header; + KMixtureModelMagicHeader new_magic_header; + KMixtureModelMagicHeader merged_magic_header; + + memset(&merged_magic_header, 0, sizeof(KMixtureModelMagicHeader)); + if (!target->get_magic_header(target_magic_header)) { + memset(&target_magic_header, 0, sizeof(KMixtureModelMagicHeader)); + } + assert(new_one->get_magic_header(new_magic_header)); + if ( target_magic_header.m_WC + new_magic_header.m_WC < + std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){ + fprintf(stderr, "the m_WC integer in magic header overflows.\n"); + return false; + } + if ( target_magic_header.m_total_freq + new_magic_header.m_total_freq < + std_lite::max( target_magic_header.m_total_freq, + new_magic_header.m_total_freq ) ){ + fprintf(stderr, "the m_total_freq in magic header overflows.\n"); + return false; + } + + merged_magic_header.m_WC = target_magic_header.m_WC + + new_magic_header.m_WC; + merged_magic_header.m_N = target_magic_header.m_N + + new_magic_header.m_N; + merged_magic_header.m_total_freq = target_magic_header.m_total_freq + + new_magic_header.m_total_freq; + + assert(target->set_magic_header(merged_magic_header)); + return true; +} + +static bool merge_array_items( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one 
){ + + GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + new_one->get_all_items(new_items); + + for ( size_t i = 0; i < new_items->len; ++i ){ + phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i); + KMixtureModelSingleGram * target_single_gram = NULL; + KMixtureModelSingleGram * new_single_gram = NULL; + + assert(new_one->load(*token, new_single_gram)); + bool exists_in_target = target->load(*token, target_single_gram); + if ( !exists_in_target ){ + target->store(*token, new_single_gram); + delete new_single_gram; + continue; + } + + /* word count in array header in parallel with array items */ + KMixtureModelArrayHeader target_array_header; + KMixtureModelArrayHeader new_array_header; + KMixtureModelArrayHeader merged_array_header; + + assert(new_one->get_array_header(*token, new_array_header)); + assert(target->get_array_header(*token, target_array_header)); + memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader)); + + merged_array_header.m_WC = target_array_header.m_WC + + new_array_header.m_WC; + merged_array_header.m_freq = target_array_header.m_freq + + new_array_header.m_freq; + /* end of word count in array header computing. 
*/ + + assert(NULL != target_single_gram); + KMixtureModelSingleGram * merged_single_gram = + new KMixtureModelSingleGram; + + FlexibleBigramPhraseArray target_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + target_single_gram->retrieve_all(target_array); + + FlexibleBigramPhraseArray new_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + new_single_gram->retrieve_all(new_array); + FlexibleBigramPhraseArray merged_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + + assert(merge_two_phrase_array(target_array, new_array, merged_array)); + + g_array_free(target_array, TRUE); + g_array_free(new_array, TRUE); + delete target_single_gram; delete new_single_gram; + + for ( size_t m = 0; m < merged_array->len; ++m ){ + KMixtureModelArrayItemWithToken * item = + &g_array_index(merged_array, + KMixtureModelArrayItemWithToken, m); + merged_single_gram->insert_array_item(item->m_token, item->m_item); + } + + assert(merged_single_gram->set_array_header(merged_array_header)); + assert(target->store(*token, merged_single_gram)); + delete merged_single_gram; + g_array_free(merged_array, TRUE); + } + + g_array_free(new_items, TRUE); + return true; +} + +bool merge_two_k_mixture_model( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one ){ + assert(NULL != target); + assert(NULL != new_one); + return merge_array_items(target, new_one) && + merge_magic_header(target, new_one); +} + +int main(int argc, char * argv[]){ + int i = 1; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- merge k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + KMixtureModelBigram target(K_MIXTURE_MODEL_MAGIC_NUMBER); + 
target.attach(result_filename, ATTACH_READWRITE|ATTACH_CREATE); + + while (i < argc){ + const char * new_filename = argv[i]; + KMixtureModelBigram new_one(K_MIXTURE_MODEL_MAGIC_NUMBER); + new_one.attach(new_filename, ATTACH_READONLY); + if ( !merge_two_k_mixture_model(&target, &new_one) ) + exit(EOVERFLOW); + ++i; + } + + return 0; +} diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp new file mode 100644 index 0000000..40dfb87 --- /dev/null +++ b/utils/training/prune_k_mixture_model.cpp @@ -0,0 +1,192 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + + +#include <errno.h> +#include <locale.h> +#include <limits.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" + + +void print_help(){ + printf("Usage: prune_k_mixture_model -k <INT> --CDF <DOUBLE> <FILENAME>\n"); +} + +static gint g_prune_k = 3; +static parameter_t g_prune_poss = 0.99; + +static GOptionEntry entries[] = +{ + {"pruneK", 'k', 0, G_OPTION_ARG_INT, &g_prune_k, "k parameter", NULL}, + {"CDF", 0, 0, G_OPTION_ARG_DOUBLE, &g_prune_poss, "CDF parameter", NULL}, + {NULL} +}; + + +bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header, + KMixtureModelSingleGram * & bigram, + FlexibleBigramPhraseArray removed_array){ + bool success; + + FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + bigram->retrieve_all(array); + + for ( size_t i = 0; i < array->len; ++i) { + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i); + phrase_token_t token = item->m_token; + parameter_t remained_poss = 1; parameter_t one_poss = 0; + bool errors = false; + for ( size_t k = 0; k < g_prune_k; ++k){ + one_poss = compute_Pr_G_3_with_count + (k, magic_header->m_N, item->m_item.m_WC, + magic_header->m_N - item->m_item.m_N_n_0, + item->m_item.m_n_1); + if ( !(0 <= one_poss && one_poss <= 1) ) + errors = true; + remained_poss -= one_poss; + } + + if ( fabs(remained_poss) < DBL_EPSILON ) + remained_poss = 0.; + + /* some wrong possibility. */ + if ( errors || !(0 <= remained_poss && remained_poss <= 1) ) { + fprintf(stderr, "some wrong possibility is encountered:%f.\n", + remained_poss); + fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n", + g_prune_k, magic_header->m_N, item->m_item.m_WC, + magic_header->m_N - item->m_item.m_N_n_0, + item->m_item.m_n_1); + exit(EDOM); + } + + if ( remained_poss < g_prune_poss ) { + /* prune this word or phrase. 
*/ + KMixtureModelArrayItem removed_item; + bigram->remove_array_item(token, removed_item); + assert( memcmp(&removed_item, &(item->m_item), + sizeof(KMixtureModelArrayItem)) == 0 ); + + KMixtureModelArrayItemWithToken removed_item_with_token; + removed_item_with_token.m_token = token; + removed_item_with_token.m_item = removed_item; + g_array_append_val(removed_array, removed_item_with_token); + + KMixtureModelArrayHeader array_header; + bigram->get_array_header(array_header); + guint32 removed_count = removed_item.m_WC; + array_header.m_WC -= removed_count; + bigram->set_array_header(array_header); + magic_header->m_WC -= removed_count; + magic_header->m_total_freq -= removed_count; + } + } + + return true; +} + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- prune k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (2 != argc) { + fprintf(stderr, "wrong arguments.\n"); + exit(EINVAL); + } + + const gchar * bigram_filename = argv[1]; + + /* TODO: magic header signature check here. 
*/ + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(bigram_filename, ATTACH_READWRITE); + + KMixtureModelMagicHeader magic_header; + if (!bigram.get_magic_header(magic_header)) { + fprintf(stderr, "no magic header in k mixture model.\n"); + exit(ENODATA); + } + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); + + /* print prune progress */ + size_t progress = 0; size_t onestep = items->len / 20; + for ( size_t i = 0; i < items->len; ++i ){ + if ( progress >= onestep ) { + progress = 0; fprintf(stderr, "*"); + } + progress ++; + + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + FlexibleBigramPhraseArray removed_array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + + prune_k_mixture_model(&magic_header, single_gram, removed_array); + bigram.store(*token, single_gram); + + delete single_gram; + + /* post processing for unigram reduce */ + for (size_t m = 0; m < removed_array->len; ++m ){ + KMixtureModelArrayItemWithToken * item = + &g_array_index(removed_array, + KMixtureModelArrayItemWithToken, m); + KMixtureModelArrayHeader array_header; + assert(bigram.get_array_header(item->m_token, array_header)); + array_header.m_freq -= item->m_item.m_WC; + assert(array_header.m_freq >= 0); + assert(bigram.set_array_header(item->m_token, array_header)); + } + + g_array_free(removed_array, TRUE); + removed_array = NULL; + } + + fprintf(stderr, "\n"); + + bigram.set_magic_header(magic_header); + + /* post processing clean up zero items */ + KMixtureModelArrayHeader array_header; + for ( size_t i = 0; i < items->len; ++i ){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + assert(bigram.get_array_header(*token, array_header)); + if ( 0 == array_header.m_WC && 0 == array_header.m_freq ) + assert(bigram.remove(*token)); + } + + g_array_free(items, TRUE); + 
+ return 0; +} diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp new file mode 100644 index 0000000..7c057b9 --- /dev/null +++ b/utils/training/validate_k_mixture_model.cpp @@ -0,0 +1,174 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include "pinyin_internal.h" +#include "k_mixture_model.h" + +void print_help(){ + printf("Usage: validate_k_mixture_model <FILENAME>\n"); +} + +bool validate_unigram(KMixtureModelBigram * bigram){ + KMixtureModelMagicHeader magic_header; + if( !bigram->get_magic_header(magic_header) ){ + fprintf(stderr, "no magic header in k mixture model.\n"); + return false; + } + + guint32 expected_word_count = magic_header.m_WC; + if ( 0 == expected_word_count ){ + fprintf(stderr, "word count in magic header is unexpected zero.\n"); + return false; + } + guint32 expected_total_freq = magic_header.m_total_freq; + if ( 0 == expected_total_freq ){ + fprintf(stderr, "total freq in magic header is unexpected zero.\n"); + return false; + } + + if ( expected_word_count != expected_total_freq ){ + fprintf(stderr, "the word count doesn't match the total freq.\n"); + return false; + } + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram->get_all_items(items); + + guint32 word_count = 0; guint32 total_freq = 0; + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + KMixtureModelArrayHeader array_header; + assert(bigram->get_array_header(*token, array_header)); + word_count += array_header.m_WC; + total_freq += array_header.m_freq; + } + + if ( word_count != expected_word_count ){ + fprintf(stderr, "word count in magic header:%d\n", + expected_word_count); + fprintf(stderr, "sum of word count in array headers:%d\n", word_count); + fprintf(stderr, "the sum differs from word count.\n"); + return false; + } + if ( total_freq != expected_total_freq ){ + fprintf(stderr, "total freq in magic header:%d\n", + expected_total_freq); + fprintf(stderr, "sum of freqs in array headers:%d\n", total_freq); + fprintf(stderr, "the total freq differs from sum of freqs.\n"); + return false; + } + + g_array_free(items, TRUE); + return true; +} + +bool validate_bigram(KMixtureModelBigram * bigram){ + bool result = 
true; + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram->get_all_items(items); + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + assert(bigram->load(*token, single_gram)); + + FlexibleBigramPhraseArray array = g_array_new + (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + single_gram->retrieve_all(array); + + KMixtureModelArrayHeader array_header; + assert(single_gram->get_array_header(array_header)); + + guint32 expected_sum = array_header.m_WC; + guint32 freq = array_header.m_freq; + if ( 0 == expected_sum ){ + if ( 0 != array->len ){ + fprintf(stderr, "in the array header of token %d:\n", *token); + fprintf(stderr, "word count is zero but has array items.\n"); + result = false; + } + if ( 0 != freq ){ + delete single_gram; + continue; + } else { + fprintf(stderr, "in the array header of token %d:\n", *token); + fprintf(stderr, "both word count and freq are " + "unexpected zero.\n"); + result = false; + } + } + + guint32 sum = 0; + for (size_t m = 0; m< array->len; ++m){ + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m); + + sum += item->m_item.m_WC; + } + + if ( sum != expected_sum ){ + fprintf(stderr, "word count in array header:%d\n", expected_sum); + fprintf(stderr, "sum of word count in array items:%d\n", sum); + fprintf(stderr, "the sum differs from word count.\n"); + result = false; + } + + g_array_free(array, TRUE); + delete single_gram; + } + + g_array_free(items, TRUE); + return result; +} + +int main(int argc, char * argv[]){ + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- validate k mixture model"); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (2 != argc) { + fprintf(stderr, "wrong arguments.\n"); + 
exit(EINVAL); + } + + const char * k_mixture_model_filename = argv[1]; + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(k_mixture_model_filename, ATTACH_READONLY); + + if (!validate_unigram(&bigram)) { + fprintf(stderr, "k mixture model validation failed.\n"); + exit(ENODATA); + } + + if (!validate_bigram(&bigram)) { + fprintf(stderr, "k mixture model validation failed.\n"); + exit(ENODATA); + } + + return 0; +} diff --git a/utils/utils_helper.h b/utils/utils_helper.h new file mode 100644 index 0000000..b91067b --- /dev/null +++ b/utils/utils_helper.h @@ -0,0 +1,147 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#ifndef UTILS_HELPER_H +#define UTILS_HELPER_H + + +#define TAGLIB_GET_TOKEN(var, index) \ + phrase_token_t var = null_token; \ + { \ + const char * string = (const char *) g_ptr_array_index \ + (values, index); \ + var = atoi(string); \ + } + +#define TAGLIB_GET_PHRASE_STRING(var, index) \ + const char * var = NULL; \ + { \ + var = (const char *) g_ptr_array_index \ + (values, index); \ + } + +#define TAGLIB_GET_TAGVALUE(type, var, conv) \ + type var; \ + { \ + gpointer value = NULL; \ + assert(g_hash_table_lookup_extended \ + (required, #var, NULL, &value)); \ + var = conv((const char *)value); \ + } + +#define TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, var, line) \ + phrase_token_t var = null_token; \ + do { \ + if (0 == strlen(line)) \ + break; \ + \ + gchar ** strs = g_strsplit_set(line, " \t", 2); \ + if (2 != g_strv_length(strs)) \ + assert(false); \ + \ + phrase_token_t _token = atoi(strs[0]); \ + const char * phrase = strs[1]; \ + if (null_token != _token) \ + assert(taglib_validate_token_with_string \ + (phrase_index, _token, phrase)); \ + \ + var = _token; \ + \ + g_strfreev(strs); \ + } while(false); + + +static bool load_phrase_index(const pinyin_table_info_t * phrase_files, + FacadePhraseIndex * phrase_index) { + MemoryChunk * chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (SYSTEM_FILE != table_info->m_file_type) + continue; + + const char * binfile = table_info->m_system_filename; + + chunk = new MemoryChunk; + bool retval = chunk->load(binfile); + if (!retval) { + fprintf(stderr, "load %s failed!\n", binfile); + delete chunk; + return false; + } + + phrase_index->load(i, chunk); + } + return true; +} + +static bool save_phrase_index(const pinyin_table_info_t * phrase_files, + FacadePhraseIndex * phrase_index) { + MemoryChunk * new_chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = 
phrase_files + i; + + if (SYSTEM_FILE != table_info->m_file_type) + continue; + + const char * binfile = table_info->m_system_filename; + + new_chunk = new MemoryChunk; + phrase_index->store(i, new_chunk); + bool retval = new_chunk->save(binfile); + if (!retval) { + fprintf(stderr, "save %s failed.", binfile); + delete new_chunk; + return false; + } + + phrase_index->load(i, new_chunk); + } + return true; +} + +static bool save_dictionary(const pinyin_table_info_t * phrase_files, + FacadePhraseIndex * phrase_index) { + MemoryChunk * new_chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (DICTIONARY != table_info->m_file_type) + continue; + + const char * binfile = table_info->m_system_filename; + + new_chunk = new MemoryChunk; + phrase_index->store(i, new_chunk); + bool retval = new_chunk->save(binfile); + if (!retval) { + fprintf(stderr, "save %s failed.", binfile); + delete new_chunk; + return false; + } + + phrase_index->load(i, new_chunk); + } + return true; +} + +#endif |