From ee66af93e3168149a581acc262c3050569f94b72 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Mon, 15 May 2017 15:16:17 +0800 Subject: import gen_zhuyin_table.cpp and test_zhuyin.cpp --- src/storage/table_info.cpp | 2 +- tests/test_zhuyin.cpp | 67 ++++++++ utils/storage/gen_zhuyin_table.cpp | 338 +++++++++++++++++++++++++++++++++++++ 3 files changed, 406 insertions(+), 1 deletion(-) create mode 100644 tests/test_zhuyin.cpp create mode 100644 utils/storage/gen_zhuyin_table.cpp diff --git a/src/storage/table_info.cpp b/src/storage/table_info.cpp index 4cfc842..bd9b03f 100644 --- a/src/storage/table_info.cpp +++ b/src/storage/table_info.cpp @@ -208,7 +208,7 @@ bool SystemTableInfo2::load(const char * filename) { m_model_data_version = modelver; m_lambda = lambda; - /* Note: only support pinyin or zhuyin table now. */ + /* Note: support pinyin and zhuyin table now. */ assert(PINYIN_TABLE == type || ZHUYIN_TABLE == type); m_table_phonetic_type = type; diff --git a/tests/test_zhuyin.cpp b/tests/test_zhuyin.cpp new file mode 100644 index 0000000..0fe840d --- /dev/null +++ b/tests/test_zhuyin.cpp @@ -0,0 +1,67 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2017 Peng Wu + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "zhuyin.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Read zhuyin input line by line from stdin, print the guessed sentence
+ * for each line and train on it.  The line "quit" ends the loop.
+ */
+int main(int argc, char * argv[]){
+    zhuyin_context_t * context = zhuyin_init("../data", "../data");
+    zhuyin_instance_t * instance = zhuyin_alloc_instance(context);
+
+    char * linebuf = NULL;
+    size_t size = 0;
+    ssize_t nread;
+    while ((nread = getline(&linebuf, &size, stdin)) != -1) {
+        /* Trim the newline via getline's length; strlen() - 1 would
+         * index before the buffer if the line began with a NUL byte. */
+        if (nread > 0 && '\n' == linebuf[nread - 1])
+            linebuf[nread - 1] = '\0';
+
+        if (strcmp(linebuf, "quit") == 0)
+            break;
+
+        zhuyin_parse_more_chewings(instance, linebuf);
+        zhuyin_guess_sentence(instance);
+
+        char * sentence = NULL;
+        zhuyin_get_sentence(instance, &sentence);
+        if (sentence)
+            printf("%s\n", sentence);
+        g_free(sentence);
+
+        zhuyin_train(instance);
+        zhuyin_reset(instance);
+        zhuyin_save(context);
+    }
+
+    zhuyin_free_instance(instance);
+    zhuyin_mask_out(context, 0x0, 0x0);
+    zhuyin_save(context);
+    zhuyin_fini(context);
+
+    free(linebuf);
+    return 0;
+}
diff --git a/utils/storage/gen_zhuyin_table.cpp b/utils/storage/gen_zhuyin_table.cpp
new file mode 100644
index 0000000..2cb6a0a
--- /dev/null
+++ b/utils/storage/gen_zhuyin_table.cpp
@@ -0,0 +1,338 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2017 Peng Wu
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+
+/* Print a short usage summary for the command line. */
+void print_help(){
+    printf("Usage: gen_zhuyin_table -t <PHRASE_INDEX> \n"
+           "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
+           "<OUTPUTFILE> the result output file\n"
+           "<FILEi> input pinyin files\n"
+           "<PHRASE_INDEX> phrase index identifier\n");
+}
+
+/* Command line options and their defaults. */
+static gint phrase_index = 0;
+static const gchar * outputfile = "temp.out";
+
+static GOptionEntry entries[] =
+{
+    {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
+    {NULL}
+};
+
+using namespace pinyin;
+
+/* map from phrase_item to GArray of chewing_and_freq_item */
+GTree * g_chewing_tree;
+/* Array of GArray of phrase_and_array_item, indexed by phrase length */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+/* A phrase as UCS-4 code points; used as the key of g_chewing_tree. */
+struct phrase_item{
+    size_t length;
+    gunichar * uniphrase;
+};
+
+/* One pronunciation of a phrase together with its frequency. */
+struct chewing_and_freq_item{
+    ChewingKeyVector keys;
+    ChewingKeyRestVector key_rests;
+    guint32 freq;
+};
+
+/* A tree entry copied into g_item_array for sorting and output. */
+struct phrase_and_array_item{
+    phrase_item phrase; /* the key of g_chewing_tree */
+    /* Array of chewing_and_freq_item */
+    GArray * chewing_and_freq_array; /* the value of g_chewing_tree */
+};
+
+void feed_file(const char * filename);
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data);
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata);
+
+void gen_phrase_file(const char * outputfile, int phrase_index);
+
+/*
+ * GTree comparator: order phrases by length first, then by the raw
+ * bytes of their UCS-4 code points.
+ */
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+    const phrase_item * itema = (const phrase_item *) a;
+    const phrase_item * itemb = (const phrase_item *) b;
+
+    /* Compare explicitly instead of subtracting: the lengths are size_t,
+     * so a difference would wrap before being narrowed to gint. */
+    if (itema->length != itemb->length)
+        return itema->length < itemb->length ? -1 : 1;
+
+    return memcmp(itema->uniphrase, itemb->uniphrase,
+                  sizeof(gunichar) * itema->length);
+}
+
+/*
+ * Entry point: feed every file named on the command line into
+ * g_chewing_tree, copy the tree into g_item_array, sort each slot and
+ * write the table to the output file (-o) using the phrase index (-t).
+ */
+int main(int argc, char * argv[]){
+    g_chewing_tree = g_tree_new(phrase_item_compare);
+
+    GError * error = NULL;
+    GOptionContext * context =
+        g_option_context_new("- generate zhuyin table");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    g_option_context_free(context);
+
+    /* every argument left after option parsing is an input file */
+    for (int i = 1; i < argc; ++i) {
+        feed_file(argv[i]);
+    }
+
+    printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
+
+    /* store in item array; slot 0 is left NULL */
+    g_item_array[0] = NULL;
+    for (int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+        g_item_array[i] = g_array_new
+            (FALSE, TRUE, sizeof(phrase_and_array_item));
+    }
+    g_tree_foreach(g_chewing_tree, store_one_item, NULL);
+
+    /* sort item array; the phrase length is handed to the comparator */
+    for (int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+        g_array_sort_with_data(g_item_array[i], phrase_array_compare, &i);
+    }
+
+    gen_phrase_file(outputfile, phrase_index);
+
+    return 0;
+}
+
+/*
+ * Read one source file and pass every well-formed line to feed_line().
+ * A line holds three single-space separated tokens: phrase, frequency
+ * and zhuyin.  Malformed lines are reported on stderr and skipped.
+ * Exits with ENOENT if the file cannot be opened.
+ */
+void feed_file(const char * filename){
+    FILE * infile = fopen(filename, "r");
+    if (NULL == infile) {
+        fprintf(stderr, "Can't open file %s.\n", filename);
+        exit(ENOENT);
+    }
+
+    char * linebuf = NULL;
+    size_t size = 0;
+    ssize_t nread;
+    while ((nread = getline(&linebuf, &size, infile)) != -1) {
+        /* trim the newline using the length getline reported */
+        if (nread > 0 && '\n' == linebuf[nread - 1])
+            linebuf[nread - 1] = '\0';
+
+        /* assume tsi.src only use the single space to separate tokens. */
+        gchar ** strs = g_strsplit_set(linebuf, " ", 3);
+
+        /* Validate before touching strs[1] and strs[2]: a short or empty
+         * line yields fewer than three tokens. */
+        if (3 != g_strv_length(strs)) {
+            fprintf(stderr, "wrong line format:%s\n", linebuf);
+            g_strfreev(strs);
+            continue;
+        }
+
+        const char * phrase = strs[0];
+        const guint32 freq = atoi(strs[1]);
+        const char * pinyin = strs[2];
+
+        /* A final line without a trailing newline is processed like any
+         * other line. */
+        feed_line(phrase, pinyin, freq);
+        g_strfreev(strs);
+    }
+
+    free(linebuf);
+    fclose(infile);
+}
+
+/*
+ * Record one (phrase, zhuyin, freq) entry in g_chewing_tree.  The zhuyin
+ * must parse into exactly one key per character of the phrase.  An entry
+ * whose keys equal an already stored pronunciation of the same phrase
+ * adds its frequency to that one.  Rejected entries are reported on
+ * stderr and skipped.
+ */
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
+    phrase_item * item = new phrase_item;
+    item->length = g_utf8_strlen(phrase, -1);
+    item->uniphrase = NULL;
+
+    /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
+     *  where is the code which I don't want to touch. :-)
+     */
+    if (item->length >= MAX_PHRASE_LENGTH) {
+        fprintf(stderr, "Too long phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    /* g_item_array[0] is NULL, so an empty phrase has no slot to go to */
+    if (0 == item->length) {
+        fprintf(stderr, "Empty phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    /* g_utf8_to_ucs4() returns NULL when the input is not valid UTF-8 */
+    item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+    if (NULL == item->uniphrase) {
+        fprintf(stderr, "Invalid phrase encoding:%s\t%s\t%u\n",
+                phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    ZhuyinDirectParser2 parser;
+    ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+    ChewingKeyRestVector key_rests = g_array_new
+        (FALSE, FALSE, sizeof(ChewingKeyRest));
+
+    pinyin_option_t options = USE_TONE | FORCE_TONE;
+    parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+    assert(keys->len == key_rests->len);
+
+    if (keys->len != item->length) {
+        fprintf(stderr, "Invalid pinyin:%s\t%s\t%u\n", phrase, pinyin, freq);
+        /* release everything built for the rejected entry */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+        g_free(item->uniphrase);
+        delete item;
+        return;
+    }
+
+    GArray * array = (GArray *) g_tree_lookup(g_chewing_tree, item);
+
+    chewing_and_freq_item value_item;
+    value_item.keys = keys;
+    value_item.key_rests = key_rests;
+    value_item.freq = freq;
+
+    if (NULL == array) {
+        /* first entry for this phrase: item becomes the tree key */
+        array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
+        g_array_append_val(array, value_item);
+        g_tree_insert(g_chewing_tree, item, array);
+        return;
+    }
+
+    bool found = false;
+    for (guint i = 0; i < array->len; ++i) {
+        chewing_and_freq_item * cur_item =
+            &g_array_index(array, chewing_and_freq_item, i);
+        int result = pinyin_exact_compare2
+            ((ChewingKey *) value_item.keys->data,
+             (ChewingKey *) cur_item->keys->data,
+             value_item.keys->len);
+
+        if (0 == result) {
+            fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+                    phrase, pinyin, freq);
+            cur_item->freq += freq;
+            found = true;
+        }
+    }
+
+    if (found) {
+        /* merged into a stored entry; the freshly parsed keys are unused */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+    } else {
+        /* array is already the value stored in the tree for this phrase,
+         * so appending to it in place is enough */
+        g_array_append_val(array, value_item);
+    }
+
+    /* the tree was found to hold an equal key; drop this temporary one */
+    g_free(item->uniphrase);
+    delete item;
+}
+
+/*
+ * g_tree_foreach() callback: append a copy of the (phrase, array) pair to
+ * the g_item_array slot selected by the phrase length.  Always returns
+ * FALSE so that the traversal continues.  The data argument is unused.
+ */
+gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
+    phrase_and_array_item item;
+    item.phrase = *((phrase_item *) key);
+    item.chewing_and_freq_array = (GArray *) value;
+    int len = item.phrase.length;
+    g_array_append_val(g_item_array[len], item);
+    return FALSE;
+}
+
+/*
+ * Comparator for g_array_sort_with_data(): order two
+ * phrase_and_array_item entries by the keys of their first pronunciation
+ * (index 0).  userdata points to an int holding the number of keys to
+ * compare.
+ */
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata) {
+    int phrase_length = *((int *) userdata);
+    phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs;
+    phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs;
+
+    ChewingKeyVector keys_lhs = g_array_index
+        (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
+    ChewingKeyVector keys_rhs = g_array_index
+        (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
+    return pinyin_exact_compare2((ChewingKey *)keys_lhs->data,
+                                 (ChewingKey *)keys_rhs->data, phrase_length);
+}
+
+/*
+ * Write the table to outputfile, one tab separated line per
+ * pronunciation: zhuyin (syllables joined by "'"), phrase, token, freq.
+ * Tokens start at 1 and advance once per phrase, so all pronunciations
+ * of a phrase share one token.  Frequencies below 3 are written as 3.
+ * Exits with ENOENT if the output file cannot be opened.
+ */
+void gen_phrase_file(const char * outputfile, int phrase_index){
+    FILE * outfile = fopen(outputfile, "w");
+    if (NULL == outfile) {
+        fprintf(stderr, "Can't write file %s.\n", outputfile);
+        exit(ENOENT);
+    }
+
+    phrase_token_t token = 1;
+
+    /* phrase length index */
+    for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+        GArray * item_array = g_item_array[i];
+
+        /* item array index */
+        for (size_t m = 0; m < item_array->len; ++m) {
+            phrase_and_array_item * item = &g_array_index
+                (item_array, phrase_and_array_item, m);
+            phrase_item phrase = item->phrase;
+            GArray * chewing_and_freqs = item->chewing_and_freq_array;
+
+            gchar * phrase_str = g_ucs4_to_utf8
+                (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
+
+            /* iterate each pinyin */
+            for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
+                chewing_and_freq_item * chewing_and_freq =
+                    &g_array_index
+                    (chewing_and_freqs, chewing_and_freq_item, n);
+
+                ChewingKeyVector keys = chewing_and_freq->keys;
+
+                /* zero-terminated so it can be handed to g_strjoinv() */
+                GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
+
+                for (size_t k = 0; k < keys->len; ++k) {
+                    ChewingKey key = g_array_index(keys, ChewingKey, k);
+
+                    assert (CHEWING_ZERO_TONE != key.m_tone);
+                    gchar * pinyin = key.get_zhuyin_string();
+                    g_array_append_val(pinyins, pinyin);
+                }
+                gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
+
+                for (size_t k = 0; k < pinyins->len; ++k) {
+                    g_free(g_array_index(pinyins, gchar *, k));
+                }
+                g_array_free(pinyins, TRUE);
+
+                guint32 freq = chewing_and_freq->freq;
+
+                /* avoid zero freq */
+                if (freq < 3) freq = 3;
+
+                fprintf(outfile, "%s\t%s\t%d\t%d\n",
+                        pinyin_str, phrase_str,
+                        PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq);
+
+                g_free(pinyin_str);
+            }
+            g_free(phrase_str);
+            token++;
+        }
+    }
+
+    fclose(outfile);
+}
-- cgit