summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2017-05-15 15:16:17 +0800
committerPeng Wu <alexepico@gmail.com>2017-05-15 15:24:39 +0800
commitee66af93e3168149a581acc262c3050569f94b72 (patch)
tree24cce18b017e4351dc4f9c4aa0d8d991ad8b8c4c
parent5c396d0332a802382e5b66195c2baba893b1d41f (diff)
downloadlibpinyin-ee66af93e3168149a581acc262c3050569f94b72.tar.gz
libpinyin-ee66af93e3168149a581acc262c3050569f94b72.tar.xz
libpinyin-ee66af93e3168149a581acc262c3050569f94b72.zip
import gen_zhuyin_table.cpp and test_zhuyin.cpp
-rw-r--r--src/storage/table_info.cpp2
-rw-r--r--tests/test_zhuyin.cpp67
-rw-r--r--utils/storage/gen_zhuyin_table.cpp338
3 files changed, 406 insertions, 1 deletions
diff --git a/src/storage/table_info.cpp b/src/storage/table_info.cpp
index 4cfc842..bd9b03f 100644
--- a/src/storage/table_info.cpp
+++ b/src/storage/table_info.cpp
@@ -208,7 +208,7 @@ bool SystemTableInfo2::load(const char * filename) {
m_model_data_version = modelver;
m_lambda = lambda;
- /* Note: only support pinyin or zhuyin table now. */
+ /* Note: support pinyin and zhuyin table now. */
assert(PINYIN_TABLE == type || ZHUYIN_TABLE == type);
m_table_phonetic_type = type;
diff --git a/tests/test_zhuyin.cpp b/tests/test_zhuyin.cpp
new file mode 100644
index 0000000..0fe840d
--- /dev/null
+++ b/tests/test_zhuyin.cpp
@@ -0,0 +1,67 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2017 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "zhuyin.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Interactive driver for the zhuyin API: every line read from stdin is
+ * parsed, a sentence is guessed and printed, and the result is trained
+ * and saved.  The loop ends on EOF or when the line is exactly "quit".
+ */
+int main(int argc, char * argv[]){
+    zhuyin_context_t * context = zhuyin_init("../data", "../data");
+    zhuyin_instance_t * instance = zhuyin_alloc_instance(context);
+
+    char * linebuf = NULL;
+    size_t size = 0;
+    ssize_t read;
+
+    for (;;) {
+        read = getline(&linebuf, &size, stdin);
+        if (-1 == read)
+            break;
+
+        /* drop the trailing newline kept by getline */
+        size_t last = strlen(linebuf) - 1;
+        if ('\n' == linebuf[last])
+            linebuf[last] = '\0';
+
+        if (0 == strcmp(linebuf, "quit"))
+            break;
+
+        zhuyin_parse_more_chewings(instance, linebuf);
+        zhuyin_guess_sentence(instance);
+
+        char * sentence = NULL;
+        zhuyin_get_sentence(instance, &sentence);
+        if (NULL != sentence)
+            printf("%s\n", sentence);
+        g_free(sentence);
+
+        /* train on the guessed sentence, then persist after every line */
+        zhuyin_train(instance);
+        zhuyin_reset(instance);
+        zhuyin_save(context);
+    }
+
+    zhuyin_free_instance(instance);
+
+    zhuyin_mask_out(context, 0x0, 0x0);
+    zhuyin_save(context);
+    zhuyin_fini(context);
+
+    free(linebuf);
+    return 0;
+}
diff --git a/utils/storage/gen_zhuyin_table.cpp b/utils/storage/gen_zhuyin_table.cpp
new file mode 100644
index 0000000..2cb6a0a
--- /dev/null
+++ b/utils/storage/gen_zhuyin_table.cpp
@@ -0,0 +1,338 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2017 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <stdio.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+
+
+/* Print the command line usage of this tool (gen_zhuyin_table) to stdout. */
+void print_help(){
+    printf("Usage: gen_zhuyin_table -t <PHRASE_INDEX> \n"
+           "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
+           "<OUTPUTFILE> the result output file\n"
+           "<FILEi> input zhuyin files\n"
+           "<PHRASE_INDEX> phrase index identifier\n");
+}
+
+
+static gint phrase_index = 0; /* set by -t/--phraseindex; defaults to 0 */
+static const gchar * outputfile = "temp.out"; /* set by -o/--outputfile; defaults to "temp.out" */
+
+static GOptionEntry entries[] = /* option table passed to g_option_context_add_main_entries() in main() */
+{
+ {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
+ {NULL} /* terminating entry */
+};
+
+
+using namespace pinyin;
+
+/* map from phrase_item to GArray of chewing_and_freq_item; filled by feed_line() */
+GTree * g_chewing_tree;
+/* indexed by phrase length: GArray of phrase_and_array_item; slot 0 is set to NULL in main() */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+struct phrase_item{ /* a phrase held as UCS-4 code points; key type of g_chewing_tree */
+ size_t length; /* number of characters, as returned by g_utf8_strlen() in feed_line() */
+ gunichar * uniphrase; /* buffer returned by g_utf8_to_ucs4() in feed_line() */
+};
+
+struct chewing_and_freq_item{ /* one pronunciation of a phrase together with its frequency */
+ ChewingKeyVector keys; /* parser output; feed_line() requires one key per phrase character */
+ ChewingKeyRestVector key_rests; /* second parser output; same length as keys */
+ guint32 freq; /* frequency; feed_line() sums the values of duplicate entries */
+};
+
+struct phrase_and_array_item{ /* one g_chewing_tree entry copied out by store_one_item(); element type of g_item_array[] */
+ phrase_item phrase; /* the key of g_chewing_tree */
+ /* Array of chewing_and_freq_item */
+ GArray * chewing_and_freq_array; /* the value of g_chewing_tree */
+};
+
+
+void feed_file(const char * filename); /* read one input file and pass each accepted line to feed_line() */
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq); /* record one (phrase, zhuyin, freq) entry in g_chewing_tree */
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data); /* g_tree_foreach() callback that fills g_item_array */
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+ gpointer userdata); /* g_array_sort_with_data() callback; userdata points to the phrase length (int) */
+
+void gen_phrase_file(const char * outputfile, int phrase_index); /* write the collected entries to outputfile */
+
+
+/*
+ * Key comparator for g_chewing_tree.
+ *
+ * Orders phrase_item values by length first; items of equal length are
+ * ordered by a memcmp() over their UCS-4 buffers.
+ * Returns a negative, zero or positive value as a sorts before, equal to
+ * or after b.
+ */
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+    const phrase_item * itema = (const phrase_item *) a;
+    const phrase_item * itemb = (const phrase_item *) b;
+
+    /* Compare explicitly rather than returning the size_t difference:
+     * that difference is unsigned and gets narrowed to gint on return. */
+    if ( itema->length != itemb->length )
+        return (itema->length < itemb->length) ? -1 : 1;
+
+    return memcmp(itema->uniphrase, itemb->uniphrase,
+                  sizeof(gunichar) * itema->length);
+}
+
+
+/*
+ * Entry point: parse the options, feed every remaining argument as an
+ * input file, bucket the collected phrases by length, sort each bucket
+ * and write the resulting table.
+ */
+int main(int argc, char * argv[]){
+    GError * error = NULL;
+
+    g_chewing_tree = g_tree_new(phrase_item_compare);
+
+    GOptionContext * context = g_option_context_new("- generate pinyin table");
+    g_option_context_add_main_entries(context, entries, NULL);
+    gboolean parsed = g_option_context_parse(context, &argc, &argv, &error);
+    if (!parsed) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    /* whatever is left after option parsing is an input file */
+    for (int arg = 1; arg < argc; ++arg)
+        feed_file(argv[arg]);
+
+    printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
+
+    /* one bucket per phrase length; length 0 is never used */
+    g_item_array[0] = NULL;
+    for (int len = 1; len <= MAX_PHRASE_LENGTH; ++len)
+        g_item_array[len] = g_array_new
+            (FALSE, TRUE, sizeof(phrase_and_array_item));
+
+    g_tree_foreach(g_chewing_tree, store_one_item, NULL);
+
+    /* the comparator receives the bucket's phrase length through userdata */
+    for (int len = 1; len <= MAX_PHRASE_LENGTH; ++len)
+        g_array_sort_with_data(g_item_array[len], phrase_array_compare, &len);
+
+    gen_phrase_file(outputfile, phrase_index);
+
+    return 0;
+}
+
+/*
+ * Read one input file and hand every well-formed line to feed_line().
+ *
+ * A line must consist of exactly three tokens separated by single
+ * spaces: phrase, frequency, zhuyin.  Other lines are reported on
+ * stderr and skipped.  Exits with ENOENT when the file cannot be opened.
+ */
+void feed_file ( const char * filename){
+    FILE * infile = fopen(filename, "r");
+    if ( NULL == infile ){
+        fprintf(stderr, "Can't open file %s.\n", filename);
+        exit(ENOENT);
+    }
+
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, infile)) != -1 ){
+        /* strip the trailing newline; guard against an empty string */
+        size_t len = strlen(linebuf);
+        if ( len > 0 && '\n' == linebuf[len - 1] ) {
+            linebuf[len - 1] = '\0';
+        }
+
+        /* assume tsi.src only use the single space to separate tokens. */
+        gchar ** strs = g_strsplit_set(linebuf, " ", 3);
+
+        /* check the token count before touching strs[1] / strs[2];
+         * on short lines those slots are NULL or past the vector end */
+        if (3 != g_strv_length(strs)) {
+            fprintf(stderr, "wrong line format:%s\n", linebuf);
+            g_strfreev(strs);
+            continue;
+        }
+
+        /* NOTE(review): this skips a final line that has no terminating
+         * newline; kept as in the original, confirm it is intended. */
+        if (feof(infile)) {
+            g_strfreev(strs);
+            break;
+        }
+
+        const char * phrase = strs[0];
+        guint32 freq = atoi(strs[1]);
+        const char * pinyin = strs[2];
+
+        feed_line(phrase, pinyin, freq);
+
+        /* release the token vector once the line has been fed */
+        g_strfreev(strs);
+    }
+
+    free(linebuf);
+    fclose(infile);
+}
+
+/*
+ * Record one (phrase, zhuyin, freq) entry in g_chewing_tree.
+ *
+ * The phrase is converted to UCS-4 and used as the tree key; the value
+ * is a GArray of chewing_and_freq_item, one element per distinct
+ * pronunciation.  A pronunciation that is already present only has its
+ * frequency increased.  Phrases that are too long, or whose zhuyin does
+ * not parse into exactly one key per character, are reported on stderr
+ * and dropped.
+ *
+ * Ownership: on the first insert of a phrase the tree takes over item
+ * and its UCS-4 buffer; on every other path they are released here.
+ * The key arrays are owned by the stored chewing_and_freq_item, or freed
+ * here when no new element is stored.
+ */
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
+    phrase_item * item = new phrase_item;
+    item->length = g_utf8_strlen(phrase, -1);
+    item->uniphrase = NULL;
+
+    /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
+     * where is the code which I don't want to touch. :-)
+     */
+
+    if (item->length >= MAX_PHRASE_LENGTH) {
+        fprintf(stderr, "Too long phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+
+    ZhuyinDirectParser2 parser;
+    ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+    ChewingKeyRestVector key_rests = g_array_new
+        (FALSE, FALSE, sizeof(ChewingKeyRest));
+
+    pinyin_option_t options = USE_TONE | FORCE_TONE;
+    parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+    assert(keys->len == key_rests->len);
+
+    if (keys->len != item->length) {
+        fprintf(stderr, "Invalid pinyin:%s\t%s\t%u\n", phrase, pinyin, freq);
+        /* nothing was stored: release everything allocated above */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+        g_free(item->uniphrase);
+        delete item;
+        return;
+    }
+
+    GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
+
+    chewing_and_freq_item value_item;
+    value_item.keys = keys; value_item.key_rests = key_rests;
+    value_item.freq = freq;
+
+    assert(item->length == value_item.keys->len);
+    if (NULL == array) {
+        /* first pronunciation of this phrase: the tree now owns item */
+        array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
+        g_array_append_val(array, value_item);
+        g_tree_insert(g_chewing_tree, item, array);
+        return;
+    }
+
+    bool found = false;
+    for (size_t i = 0; i < array->len; ++i) {
+        chewing_and_freq_item * cur_item =
+            &g_array_index(array, chewing_and_freq_item, i);
+        int result = pinyin_exact_compare2
+            ((ChewingKey *) value_item.keys->data,
+             (ChewingKey *) cur_item->keys->data,
+             value_item.keys->len);
+
+        if (0 == result) {
+            fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+                    phrase, pinyin, freq);
+            cur_item->freq += freq;
+            found = true;
+        }
+    }
+
+    if (!found) {
+        /* array is already the tree's value for this phrase, so appending
+         * to it is sufficient; no re-insert is needed */
+        g_array_append_val(array, value_item);
+    } else {
+        /* clean up */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+    }
+
+    /* the tree keeps the key it already had; this item was only a lookup
+     * key, so release it together with its UCS-4 buffer */
+    g_free(item->uniphrase);
+    delete item;
+}
+
+
+/*
+ * g_tree_foreach() callback: copy one (phrase, pronunciation array) pair
+ * of g_chewing_tree into the g_item_array bucket for the phrase's length.
+ * Always returns FALSE so the traversal visits every node.
+ */
+gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
+    const phrase_item * phrase = (const phrase_item *) key;
+
+    phrase_and_array_item entry;
+    entry.phrase = *phrase;
+    entry.chewing_and_freq_array = (GArray *) value;
+
+    int bucket = entry.phrase.length;
+    g_array_append_val(g_item_array[bucket], entry);
+
+    return FALSE;
+}
+
+
+/*
+ * g_array_sort_with_data() callback for one g_item_array bucket.
+ * Compares two phrase_and_array_item elements by the keys of their first
+ * pronunciation; userdata points to an int holding the phrase length,
+ * which is the number of keys compared.
+ */
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata) {
+    phrase_and_array_item * left = (phrase_and_array_item *) lhs;
+    phrase_and_array_item * right = (phrase_and_array_item *) rhs;
+
+    /* only the first pronunciation of each phrase takes part in sorting */
+    chewing_and_freq_item * first_left = &g_array_index
+        (left->chewing_and_freq_array, chewing_and_freq_item, 0);
+    chewing_and_freq_item * first_right = &g_array_index
+        (right->chewing_and_freq_array, chewing_and_freq_item, 0);
+
+    int phrase_length = *((int *) userdata);
+
+    return pinyin_exact_compare2((ChewingKey *) first_left->keys->data,
+                                 (ChewingKey *) first_right->keys->data,
+                                 phrase_length);
+}
+
+
+/*
+ * Write the collected table to outputfile.
+ *
+ * One line is written per (phrase, pronunciation) pair, tab separated:
+ *   <zhuyin syllables joined by '>  <phrase>  <token>  <freq>
+ * Buckets are walked by increasing phrase length.  Tokens start at 1 and
+ * advance once per phrase, so all pronunciations of a phrase share one
+ * token; phrase_index is combined into it by PHRASE_INDEX_MAKE_TOKEN.
+ * Exits with ENOENT when the output file cannot be opened.
+ */
+void gen_phrase_file(const char * outputfile, int phrase_index){
+    FILE * outfile = fopen(outputfile, "w");
+    if (NULL == outfile ) {
+        fprintf(stderr, "Can't write file %s.\n", outputfile);
+        exit(ENOENT);
+    }
+
+    phrase_token_t token = 1;
+
+    /* phrase length index */
+    for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+        GArray * item_array = g_item_array[i];
+
+        /* item array index */
+        for (size_t m = 0; m < item_array->len; ++m) {
+            phrase_and_array_item * item = &g_array_index
+                (item_array, phrase_and_array_item, m);
+            phrase_item phrase = item->phrase;
+            GArray * chewing_and_freqs = item->chewing_and_freq_array;
+
+            gchar * phrase_str = g_ucs4_to_utf8
+                (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
+
+            /* iterate each pinyin */
+            for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
+                chewing_and_freq_item * chewing_and_freq =
+                    &g_array_index
+                    (chewing_and_freqs, chewing_and_freq_item, n);
+
+                ChewingKeyVector keys = chewing_and_freq->keys;
+
+                /* zero-terminated so the data can be passed to g_strjoinv() */
+                GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
+
+                for (size_t k = 0; k < keys->len; ++k) {
+                    ChewingKey key = g_array_index(keys, ChewingKey, k);
+
+                    assert (CHEWING_ZERO_TONE != key.m_tone);
+                    gchar * pinyin = key.get_zhuyin_string();
+                    g_array_append_val(pinyins, pinyin);
+                }
+                gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
+
+                for (size_t k = 0; k < pinyins->len; ++k) {
+                    g_free(g_array_index(pinyins, gchar *, k));
+                }
+                g_array_free(pinyins, TRUE);
+
+                guint32 freq = chewing_and_freq->freq;
+
+                /* avoid zero freq */
+                if (freq < 3) freq = 3;
+
+                /* token and freq are unsigned: print them with %u */
+                fprintf(outfile, "%s\t%s\t%u\t%u\n",
+                        pinyin_str, phrase_str,
+                        (guint) PHRASE_INDEX_MAKE_TOKEN(phrase_index, token),
+                        (guint) freq);
+
+                g_free(pinyin_str);
+            }
+            g_free(phrase_str);
+            token++;
+        }
+    }
+
+    fclose(outfile);
+}