diff options
author | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
commit | b78429d78df745dd327b6dada6b9bd71ea5df84e (patch) | |
tree | 82c4625db8674c66d69fd566fce8efc347e3cb3a /utils/storage | |
download | libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.gz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.xz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.zip |
import libpinyin code
Diffstat (limited to 'utils/storage')
-rw-r--r-- | utils/storage/CMakeLists.txt | 29 | ||||
-rw-r--r-- | utils/storage/Makefile.am | 45 | ||||
-rw-r--r-- | utils/storage/export_interpolation.cpp | 144 | ||||
-rw-r--r-- | utils/storage/gen_binary_files.cpp | 115 | ||||
-rw-r--r-- | utils/storage/gen_pinyin_table.cpp | 330 | ||||
-rw-r--r-- | utils/storage/import_interpolation.cpp | 313 |
6 files changed, 976 insertions, 0 deletions
diff --git a/utils/storage/CMakeLists.txt b/utils/storage/CMakeLists.txt new file mode 100644 index 0000000..63cabcd --- /dev/null +++ b/utils/storage/CMakeLists.txt @@ -0,0 +1,29 @@ +add_executable( + gen_binary_files + gen_binary_files.cpp +) + +target_link_libraries( + gen_binary_files + libpinyin +) + +add_executable( + import_interpolation + import_interpolation.cpp +) + +target_link_libraries( + import_interpolation + libpinyin +) + +add_executable( + export_interpolation + export_interpolation.cpp +) + +target_link_libraries( + export_interpolation + libpinyin +) diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am new file mode 100644 index 0000000..db63488 --- /dev/null +++ b/utils/storage/Makefile.am @@ -0,0 +1,45 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ + +bin_PROGRAMS = gen_binary_files \ + import_interpolation + +noinst_PROGRAMS = export_interpolation \ + gen_pinyin_table + +gen_binary_files_SOURCES = gen_binary_files.cpp + +gen_binary_files_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +import_interpolation_SOURCES = import_interpolation.cpp + +import_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +export_interpolation_SOURCES = export_interpolation.cpp + +export_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_pinyin_table_SOURCES = gen_pinyin_table.cpp + +gen_pinyin_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp new file mode 100644 index 0000000..c43eefb --- /dev/null +++ b/utils/storage/export_interpolation.cpp @@ -0,0 +1,144 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include <stdio.h> +#include <assert.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +/* export interpolation model as textual format */ + +bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index); +bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram); + +bool begin_data(FILE * output){ + fprintf(output, "\\data model interpolation\n"); + return true; +} + +bool end_data(FILE * output){ + fprintf(output, "\\end\n"); + return true; +} + +int main(int argc, char * argv[]){ + FILE * output = stdout; + const char * bigram_filename = SYSTEM_BIGRAM; + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram bigram; + bigram.attach(bigram_filename, ATTACH_READONLY); + + begin_data(output); + + gen_unigram(output, &phrase_index); + gen_bigram(output, &phrase_index, &bigram); + + end_data(output); + return 0; +} + +bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) { + fprintf(output, "\\1-gram\n"); + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) { + + PhraseIndexRange range; + int result = phrase_index->get_range(i, range); + if (ERROR_OK != result ) + continue; + + PhraseItem item; + for (phrase_token_t token = range.m_range_begin; + token < range.m_range_end; token++) { + int result = phrase_index->get_phrase_item(token, item); + + if ( result == ERROR_NO_ITEM ) + continue; + assert( result == ERROR_OK); + + size_t freq = item.get_unigram_frequency(); + if ( 0 == freq ) + continue; + char * phrase = taglib_token_to_string(phrase_index, token); + if ( phrase ) + fprintf(output, "\\item %d %s count %ld\n", token, phrase, freq); + + g_free(phrase); + } + } + return true; +} + +bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){ + fprintf(output, "\\2-gram\n"); + + /* Retrieve all user items. */ + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + bigram->get_all_items(items); + + PhraseItem item; + + for(size_t i = 0; i < items->len; i++){ + phrase_token_t token = g_array_index(items, phrase_token_t, i); + SingleGram * single_gram = NULL; + bigram->load(token, single_gram); + + BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); + single_gram->retrieve_all(array); + for(size_t j = 0; j < array->len; j++) { + BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j); + + char * word1 = taglib_token_to_string(phrase_index, token); + char * word2 = taglib_token_to_string(phrase_index, item->m_token); + guint32 freq = item->m_count; + + if ( word1 && word2) + fprintf(output, "\\item %d %s %d %s count %d\n", + token, word1, item->m_token, word2, freq); + + g_free(word1); g_free(word2); + } + + g_array_free(array, TRUE); + delete single_gram; + } + + g_array_free(items, TRUE); + return true; +} diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp new file mode 100644 index 0000000..4216b44 --- /dev/null +++ b/utils/storage/gen_binary_files.cpp @@ -0,0 +1,115 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +static const gchar * table_dir = "."; + +static GOptionEntry entries[] = +{ + {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL}, + {NULL} +}; + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate binary files"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + bool retval = system_table_info.load(filename); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + g_free(filename); + + /* generate pinyin index*/ + pinyin_option_t options = USE_TONE; + ChewingLargeTable chewing_table(options); + PhraseLargeTable2 phrase_table; + + /* generate phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + assert(table_info->m_dict_index == i); + + if (SYSTEM_FILE != table_info->m_file_type && + DICTIONARY != table_info->m_file_type) + continue; + + const char * tablename = table_info->m_table_filename; + + filename = g_build_filename(table_dir, tablename, NULL); + FILE * tablefile = fopen(filename, "r"); + + if (NULL == tablefile) { + fprintf(stderr, "open %s failed!\n", tablename); + exit(ENOENT); + } + + chewing_table.load_text(tablefile); + fseek(tablefile, 0L, SEEK_SET); + phrase_table.load_text(tablefile); + fseek(tablefile, 0L, SEEK_SET); + phrase_index.load_text(i, tablefile); + fclose(tablefile); + g_free(filename); + } + + MemoryChunk * new_chunk = new MemoryChunk; + chewing_table.store(new_chunk); + new_chunk->save(SYSTEM_PINYIN_INDEX); + chewing_table.load(new_chunk); + + new_chunk = new MemoryChunk; + phrase_table.store(new_chunk); + new_chunk->save(SYSTEM_PHRASE_INDEX); + phrase_table.load(new_chunk); + + phrase_index.compact(); + + if (!save_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + if (!save_dictionary(phrase_files, &phrase_index)) + exit(ENOENT); + + return 0; +} diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp new file mode 100644 index 0000000..3b541d1 --- /dev/null +++ b/utils/storage/gen_pinyin_table.cpp @@ -0,0 +1,330 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include <stdio.h> +#include <glib.h> +#include "pinyin_internal.h" + + +void print_help(){ + printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> \n" + "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n" + "<OUTPUTFILE> the result output file\n" + "<FILEi> input pinyin files\n" + "<PHRASE_INDEX> phrase index identifier\n"); +} + + +static gint phrase_index = 0; +static const gchar * outputfile = "temp.out"; + +static GOptionEntry entries[] = +{ + {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL}, + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL}, + {NULL} +}; + + +using namespace pinyin; + +/* map from phrase_item to GArray of chewing_and_freq_item */ +GTree * g_chewing_tree; +/* Array of GArray of phrase_and_array_item */ +GArray * g_item_array[MAX_PHRASE_LENGTH + 1]; + +struct phrase_item{ + size_t length; + gunichar * uniphrase; +}; + +struct chewing_and_freq_item{ + ChewingKeyVector keys; + ChewingKeyRestVector key_rests; + guint32 freq; +}; + +struct phrase_and_array_item{ + phrase_item phrase; /* the key of g_chewing_tree */ + /* Array of chewing_and_freq_item */ + GArray * chewing_and_freq_array; /* the value of g_chewing_tree */ +}; + + +void feed_file(const char * filename); + +void feed_line(const char * phrase, const char * pinyin, const guint32 freq); + +gboolean store_one_item(gpointer key, gpointer value, gpointer data); + +int phrase_array_compare(gconstpointer lhs, gconstpointer rhs, + gpointer userdata); + +void gen_phrase_file(const char * outputfile, int phrase_index); + + +gint phrase_item_compare(gconstpointer a, gconstpointer b){ + phrase_item * itema = (phrase_item *) a; + phrase_item * itemb = (phrase_item *) b; + if ( itema->length != itemb->length ) + return itema->length - itemb->length; + else + return memcmp(itema->uniphrase, itemb->uniphrase, + sizeof(gunichar) * itema->length); +} + + +int main(int argc, char * argv[]){ + int i; + + g_chewing_tree = g_tree_new(phrase_item_compare); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate pinyin table"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + for (i = 1; i < argc; ++i) { + feed_file(argv[i]); + } + + printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree)); + + /* store in item array */ + g_item_array[0] = NULL; + for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + g_item_array[i] = g_array_new + (FALSE, TRUE, sizeof(phrase_and_array_item)); + } + g_tree_foreach(g_chewing_tree, store_one_item, NULL); + + /* sort item array */ + for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i); + } + + gen_phrase_file(outputfile, phrase_index); + + return 0; +} + +void feed_file ( const char * filename){ + char phrase[1024], pinyin[1024]; + guint32 freq; + + FILE * infile = fopen(filename, "r"); + if ( NULL == infile ){ + fprintf(stderr, "Can't open file %s.\n", filename); + exit(ENOENT); + } + + while ( !feof(infile)){ + int num = fscanf(infile, "%s %s %u", + phrase, pinyin, &freq); + + if (3 != num) + continue; + + if (feof(infile)) + break; + + feed_line(phrase, pinyin, freq); + } + + fclose(infile); +} + +void feed_line(const char * phrase, const char * pinyin, const guint32 freq) { + phrase_item * item = new phrase_item; + item->length = g_utf8_strlen(phrase, -1); + + /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp + * where is the code which I don't want to touch. :-) + */ + + if (item->length >= MAX_PHRASE_LENGTH) { + fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq); + delete item; + return; + } + + item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); + + FullPinyinParser2 parser; + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = g_array_new + (FALSE, FALSE, sizeof(ChewingKeyRest)); + + pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + assert(keys->len == key_rests->len); + + if (keys->len != item->length) { + fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq); + delete item; + return; + } + + GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item); + + chewing_and_freq_item value_item; + value_item.keys = keys; value_item.key_rests = key_rests; + value_item.freq = freq; + + assert(item->length == value_item.keys->len); + if (NULL == array) { + array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item)); + g_array_append_val(array, value_item); + g_tree_insert(g_chewing_tree, item, array); + return; + } + + bool found = false; + for (size_t i = 0; i < array->len; ++i) { + chewing_and_freq_item * cur_item = + &g_array_index(array, chewing_and_freq_item, i); + int result = pinyin_exact_compare2 + ((ChewingKey *) value_item.keys->data, + (ChewingKey *) cur_item->keys->data, + value_item.keys->len); + + if (0 == result) { + fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", + phrase, pinyin, freq); + cur_item->freq += freq; + found = true; + } + } + + if (!found) { + g_array_append_val(array, value_item); + g_tree_insert(g_chewing_tree, item, array); + } else { + /* clean up */ + g_array_free(keys, TRUE); + g_array_free(key_rests, TRUE); + } + + delete item; +} + + +gboolean store_one_item(gpointer key, gpointer value, gpointer data) { + phrase_and_array_item item; + item.phrase = *((phrase_item *) key); + item.chewing_and_freq_array = (GArray *) value; + int len = item.phrase.length; + g_array_append_val(g_item_array[len], item); + return FALSE; +} + + +int phrase_array_compare(gconstpointer lhs, gconstpointer rhs, + gpointer userdata) { + int phrase_length = *((int *) userdata); + phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs; + phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs; + + ChewingKeyVector keys_lhs = g_array_index + (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys; + ChewingKeyVector keys_rhs = g_array_index + (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys; + return pinyin_exact_compare2((ChewingKey *)keys_lhs->data, + (ChewingKey *)keys_rhs->data, phrase_length); +} + + +void gen_phrase_file(const char * outputfile, int phrase_index){ + FILE * outfile = fopen(outputfile, "w"); + if (NULL == outfile ) { + fprintf(stderr, "Can't write file %s.\n", outputfile); + exit(ENOENT); + } + + phrase_token_t token = 1; + + /* phrase length index */ + for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) { + GArray * item_array = g_item_array[i]; + + /* item array index */ + for (size_t m = 0; m < item_array->len; ++m) { + phrase_and_array_item * item = &g_array_index + (item_array, phrase_and_array_item, m); + phrase_item phrase = item->phrase; + GArray * chewing_and_freqs = item->chewing_and_freq_array; + + gchar * phrase_str = g_ucs4_to_utf8 + (phrase.uniphrase, phrase.length, NULL, NULL, NULL); + + /* iterate each pinyin */ + for (size_t n = 0; n < chewing_and_freqs->len; ++n) { + chewing_and_freq_item * chewing_and_freq = + &g_array_index + (chewing_and_freqs, chewing_and_freq_item, n); + + ChewingKeyVector keys = chewing_and_freq->keys; + ChewingKeyRestVector key_rests = chewing_and_freq->key_rests; + + GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *)); + gchar * pinyin = NULL; + + size_t k; + for (k = 0; k < keys->len; ++k) { + ChewingKey key = g_array_index(keys, ChewingKey, k); + ChewingKeyRest key_rest = g_array_index + (key_rests, ChewingKeyRest, k); + + //assert (CHEWING_ZERO_TONE != key.m_tone); + pinyin = key.get_pinyin_string(); + g_array_append_val(pinyins, pinyin); + } + gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data); + + for (k = 0; k < pinyins->len; ++k) { + g_free(g_array_index(pinyins, gchar *, k)); + } + g_array_free(pinyins, TRUE); + + guint32 freq = chewing_and_freq->freq; + + /* avoid zero freq */ + if (freq < 3) freq = 3; + + fprintf(outfile, "%s\t%s\t%d\t%d\n", + pinyin_str, phrase_str, + PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq); + + g_free(pinyin_str); + } + g_free(phrase_str); + token++; + } + } + + fclose(outfile); +} diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp new file mode 100644 index 0000000..205a27a --- /dev/null +++ b/utils/storage/import_interpolation.cpp @@ -0,0 +1,313 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <locale.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +static const gchar * table_dir = "."; + +static GOptionEntry entries[] = +{ + {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL}, + {NULL} +}; + + +enum LINE_TYPE{ + BEGIN_LINE = 1, + END_LINE, + GRAM_1_LINE, + GRAM_2_LINE, + GRAM_1_ITEM_LINE, + GRAM_2_ITEM_LINE +}; + +static int line_type = 0; +static GPtrArray * values = NULL; +static GHashTable * required = NULL; +/* variables for line buffer. */ +static char * linebuf = NULL; +static size_t len = 0; + +bool parse_headline(); + +bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index); + +bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * bigram); + +static ssize_t my_getline(FILE * input){ + ssize_t result = getline(&linebuf, &len, input); + if ( result == -1 ) + return result; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + return result; +} + +bool parse_headline(){ + /* enter "\data" line */ + assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "")); + + /* read "\data" line */ + if ( !taglib_read(linebuf, line_type, values, required) ) { + fprintf(stderr, "error: interpolation model expected.\n"); + return false; + } + + assert(line_type == BEGIN_LINE); + /* check header */ + TAGLIB_GET_TAGVALUE(const char *, model, (const char *)); + if ( !( strcmp("interpolation", model) == 0 ) ) { + fprintf(stderr, "error: interpolation model expected.\n"); + return false; + } + return true; +} + +bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); + + do { + retry: + assert(taglib_read(linebuf, line_type, values, required)); + switch(line_type) { + case END_LINE: + goto end; + case GRAM_1_LINE: + my_getline(input); + parse_unigram(input, phrase_table, phrase_index); + goto retry; + case GRAM_2_LINE: + my_getline(input); + parse_bigram(input, phrase_table, phrase_index, bigram); + goto retry; + default: + assert(false); + } + } while (my_getline(input) != -1) ; + + end: + taglib_pop_state(); + return true; +} + +bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", "")); + + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_1_ITEM_LINE:{ + /* handle \item in \1-gram */ + TAGLIB_GET_TOKEN(token, 0); + TAGLIB_GET_PHRASE_STRING(word, 1); + assert(taglib_validate_token_with_string + (phrase_index, token, word)); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + phrase_index->add_unigram_frequency(token, count); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + taglib_pop_state(); + return true; +} + +bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", "")); + + phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL; + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_2_ITEM_LINE:{ + /* handle \item in \2-gram */ + /* two tokens */ + TAGLIB_GET_TOKEN(token1, 0); + TAGLIB_GET_PHRASE_STRING(word1, 1); + assert(taglib_validate_token_with_string + (phrase_index, token1, word1)); + + TAGLIB_GET_TOKEN(token2, 2); + TAGLIB_GET_PHRASE_STRING(word2, 3); + assert(taglib_validate_token_with_string + (phrase_index, token2, word2)); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + + if ( last_token != token1 ) { + if ( last_token && last_single_gram ) { + bigram->store(last_token, last_single_gram); + delete last_single_gram; + + /* safe guard */ + last_token = null_token; + last_single_gram = NULL; + } + SingleGram * single_gram = NULL; + bigram->load(token1, single_gram); + + /* create the new single gram */ + if ( single_gram == NULL ) + single_gram = new SingleGram; + last_token = token1; + last_single_gram = single_gram; + } + + /* save the freq */ + assert(NULL != last_single_gram); + guint32 total_freq = 0; + assert(last_single_gram->get_total_freq(total_freq)); + assert(last_single_gram->insert_freq(token2, count)); + total_freq += count; + assert(last_single_gram->set_total_freq(total_freq)); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + if ( last_token && last_single_gram ) { + bigram->store(last_token, last_single_gram); + delete last_single_gram; + //safe guard + last_token = 0; + last_single_gram = NULL; + } + + taglib_pop_state(); + return true; +} + +int main(int argc, char * argv[]){ + FILE * input = stdin; + const char * bigram_filename = SYSTEM_BIGRAM; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- import interpolation model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + bool retval = system_table_info.load(filename); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + g_free(filename); + + PhraseLargeTable2 phrase_table; + + MemoryChunk * chunk = new MemoryChunk; + retval = chunk->load(SYSTEM_PHRASE_INDEX); + if (!retval) { + fprintf(stderr, "open phrase_index.bin failed!\n"); + exit(ENOENT); + } + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram bigram; + retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bigram_filename); + exit(ENOENT); + } + + taglib_init(); + + values = g_ptr_array_new(); + required = g_hash_table_new(g_str_hash, g_str_equal); + + /* read first line */ + ssize_t result = my_getline(input); + if ( result == -1 ) { + fprintf(stderr, "empty file input.\n"); + exit(ENODATA); + } + + if (!parse_headline()) + exit(ENODATA); + + result = my_getline(input); + if ( result != -1 ) + parse_body(input, &phrase_table, &phrase_index, &bigram); + + taglib_fini(); + + if (!save_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + return 0; +} |