summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-12-07 15:56:49 +0800
committerPeng Wu <alexepico@gmail.com>2011-12-07 15:56:49 +0800
commit4aebb8f84e92da3075da3057afbcce5fa9fed2f7 (patch)
treefa670c1b840370dd01f4d4d746aace1e1d646032
parentdcfbeeaa0e381a7d03cdec05e8712753c2afb1e2 (diff)
downloadlibpinyin-4aebb8f84e92da3075da3057afbcce5fa9fed2f7.tar.gz
libpinyin-4aebb8f84e92da3075da3057afbcce5fa9fed2f7.tar.xz
libpinyin-4aebb8f84e92da3075da3057afbcce5fa9fed2f7.zip
write gen chewing table
-rw-r--r--src/storage/chewing_key.h10
-rw-r--r--utils/storage/gen_chewing_table.cpp320
2 files changed, 330 insertions, 0 deletions
diff --git a/src/storage/chewing_key.h b/src/storage/chewing_key.h
index 5bbd6ed..05b6949 100644
--- a/src/storage/chewing_key.h
+++ b/src/storage/chewing_key.h
@@ -193,6 +193,16 @@ struct ChewingKeyRest
const char * get_chewing_string();
};
+/* Render one syllable as a newly allocated string: the text returned by
+ * key_rest.get_pinyin_string(), followed by the decimal value of
+ * key.m_tone unless the tone is CHEWING_ZERO_TONE.
+ * The result is produced by g_strdup()/g_strdup_printf(); release it
+ * with g_free(). */
+static inline gchar * get_pinyin_string(ChewingKey key,
+                                        ChewingKeyRest key_rest) {
+    const bool has_tone = (CHEWING_ZERO_TONE != key.m_tone);
+
+    /* toneless syllable: plain copy of the pinyin text */
+    if (!has_tone)
+        return g_strdup(key_rest.get_pinyin_string());
+
+    return g_strdup_printf("%s%d", key_rest.get_pinyin_string(), key.m_tone);
+}
+
};
#endif
diff --git a/utils/storage/gen_chewing_table.cpp b/utils/storage/gen_chewing_table.cpp
new file mode 100644
index 0000000..d6d3673
--- /dev/null
+++ b/utils/storage/gen_chewing_table.cpp
@@ -0,0 +1,320 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+
+using namespace pinyin;
+
+/* map from phrase_item to GArray of chewing_and_freq_item;
+ * created in main() with phrase_item_compare as the key ordering. */
+GTree * g_chewing_tree;
+/* Array of GArray of phrase_and_array_item, indexed by phrase length.
+ * main() allocates slots 1..MAX_PHRASE_LENGTH only; slot 0 is never set. */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+/* A phrase used as the key of g_chewing_tree. */
+struct phrase_item{
+ size_t length; /* number of characters, from g_utf8_strlen() */
+ gunichar * uniphrase; /* UCS-4 text, from g_utf8_to_ucs4() */
+};
+
+/* One pronunciation of a phrase together with its frequency. */
+struct chewing_and_freq_item{
+ ChewingKeyVector keys; /* filled by FullPinyinParser2::parse() in feed_line() */
+ ChewingKeyRestVector key_rests; /* filled alongside keys; same length */
+ guint32 freq; /* frequencies of duplicate entries are added together */
+};
+
+/* Flattened g_chewing_tree entry as stored in g_item_array:
+ * a copy of the tree key plus the tree value. */
+struct phrase_and_array_item{
+ phrase_item phrase;
+ GArray * chewing_and_freq_array; /* Array of chewing_and_freq_item */
+};
+
+
+/* Read phrase / pinyin / frequency records from filename and pass each
+ * one to feed_line(). */
+void feed_file(const char * filename);
+
+/* Record one phrase / pinyin / frequency entry in g_chewing_tree. */
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
+
+/* g_tree_foreach() callback: copy one tree entry into the g_item_array
+ * slot selected by the phrase length. */
+gboolean store_one_item(gpointer key, gpointer value, gpointer data);
+
+/* g_array_sort_with_data() callback for phrase_and_array_item elements;
+ * userdata points to an int holding the phrase length. */
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+ gpointer userdata);
+
+/* Write the collected entries to outfilename. */
+void gen_phrase_file(const char * outfilename, int phrase_index);
+
+/* Print the command line synopsis to standard output. */
+void print_help(){
+    fputs("Usage: gen_chewing_table -t <PHRASE_INDEX> "
+          "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n", stdout);
+    fputs("<OUTPUTFILE> the result output file\n", stdout);
+    fputs("<FILEi> input pinyin files\n", stdout);
+    fputs("<PHRASE_INDEX> phrase index identifier\n", stdout);
+}
+
+
+/* Key ordering for g_chewing_tree: phrases are ordered by length first;
+ * phrases of equal length are ordered by a byte-wise memcmp() of their
+ * UCS-4 buffers. Both arguments point to phrase_item. */
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+    const phrase_item * lhs = (const phrase_item *) a;
+    const phrase_item * rhs = (const phrase_item *) b;
+
+    if ( lhs->length == rhs->length ) {
+        return memcmp(lhs->uniphrase, rhs->uniphrase,
+                      sizeof(gunichar) * lhs->length);
+    }
+
+    return lhs->length - rhs->length;
+}
+
+
+/* Entry point.
+ * Arguments are handled left to right: "--help" prints the usage and
+ * exits, "-t <n>" sets the phrase index, "-o <file>" sets the output
+ * file name (default "temp.out"), and anything else is read immediately
+ * as an input file. Afterwards the collected tree is flattened into
+ * g_item_array, each slot is sorted, and the table is written out. */
+int main(int argc, char * argv[]){
+    const char * outfilename = "temp.out";
+    int phrase_index = 0;
+
+    g_chewing_tree = g_tree_new(phrase_item_compare);
+
+    for (int arg = 1; arg < argc; ++arg) {
+        const char * opt = argv[arg];
+
+        if (0 == strcmp("--help", opt)) {
+            print_help();
+            exit(0);
+        }
+
+        const bool is_index = (0 == strcmp("-t", opt));
+        const bool is_output = (0 == strcmp("-o", opt));
+
+        /* not an option: treat it as an input file */
+        if (!is_index && !is_output) {
+            feed_file(opt);
+            continue;
+        }
+
+        /* both options require a value in the next argument */
+        if (++arg >= argc) {
+            print_help();
+            exit(EINVAL);
+        }
+
+        if (is_index)
+            phrase_index = atoi(argv[arg]);
+        else
+            outfilename = g_strdup(argv[arg]);
+    }
+
+    printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
+
+    /* store in item array: one GArray per phrase length, starting at 1 */
+    for (int len = 1; len <= MAX_PHRASE_LENGTH; ++len) {
+        g_item_array[len] = g_array_new
+            (FALSE, TRUE, sizeof(phrase_and_array_item));
+    }
+    g_tree_foreach(g_chewing_tree, store_one_item, NULL);
+
+    /* sort item array; the comparator receives the phrase length */
+    for (int len = 1; len <= MAX_PHRASE_LENGTH; ++len) {
+        g_array_sort_with_data(g_item_array[len], phrase_array_compare, &len);
+    }
+
+    gen_phrase_file(outfilename, phrase_index);
+
+    return 0;
+}
+
+/* Read whitespace-separated "phrase pinyin frequency" records from
+ * filename and hand each complete record to feed_line().
+ *
+ * Exits with ENOENT when the file cannot be opened. Reading stops at the
+ * first record that cannot be parsed; a warning is printed when that
+ * happens before the end of the file. */
+void feed_file ( const char * filename){
+    char phrase[1024], pinyin[1024];
+    guint32 freq;
+
+    FILE * infile = fopen(filename, "r");
+    if ( NULL == infile ){
+        fprintf(stderr, "Can't open file %s.\n", filename);
+        exit(ENOENT);
+    }
+
+    /* The field widths keep both tokens inside their 1024-byte buffers.
+     * Testing the conversion count (rather than feof()) means a record
+     * is processed only when all three fields were read, and the last
+     * record is kept even when the file has no trailing newline. */
+    while (3 == fscanf(infile, "%1023s %1023s %u", phrase, pinyin, &freq))
+        feed_line(phrase, pinyin, freq);
+
+    if (!feof(infile))
+        fprintf(stderr, "Stopped reading file %s at a malformed record.\n",
+                filename);
+
+    fclose(infile);
+}
+
+/* Record one input entry in g_chewing_tree.
+ *
+ * phrase: UTF-8 phrase text.
+ * pinyin: pinyin text for the phrase, parsed with FullPinyinParser2
+ *         using the USE_TONE option.
+ * freq:   frequency of this phrase / pinyin pair.
+ *
+ * The entry is rejected with a message on stderr when the phrase is too
+ * long, is not valid UTF-8, or when the number of parsed keys differs
+ * from the number of characters in the phrase. When the phrase already
+ * has an entry with the same keys, freq is added to that entry instead
+ * of storing a second one.
+ *
+ * Ownership: the phrase_item and the key arrays are kept only when they
+ * are actually stored in the tree; on every other path they are freed
+ * here. */
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
+    phrase_item * item = new phrase_item;
+    item->length = g_utf8_strlen(phrase, -1);
+    item->uniphrase = NULL;
+
+    /* FIXME: ">" was changed to ">=" to match pinyin_large_table.cpp,
+     * whose code I don't want to touch. :-)
+     */
+
+    if (item->length >= MAX_PHRASE_LENGTH) {
+        fprintf(stderr, "Too long phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    /* g_utf8_to_ucs4() returns NULL for invalid UTF-8; such a key must
+     * not reach the tree, where phrase_item_compare() would memcmp() it. */
+    item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+    if (NULL == item->uniphrase) {
+        fprintf(stderr, "Invalid phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+        delete item;
+        return;
+    }
+
+    FullPinyinParser2 parser;
+    ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+    ChewingKeyRestVector key_rests = g_array_new
+        (FALSE, FALSE, sizeof(ChewingKeyRest));
+
+    pinyin_option_t options = USE_TONE;
+    parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+    assert(keys->len == key_rests->len);
+
+    if (keys->len != item->length) {
+        fprintf(stderr, "Invalid pinyin:%s\t%s\t%u\n", phrase, pinyin, freq);
+        /* nothing was stored: release everything allocated above */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+        g_free(item->uniphrase);
+        delete item;
+        return;
+    }
+
+    GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
+
+    chewing_and_freq_item value_item;
+    value_item.keys = keys; value_item.key_rests = key_rests;
+    value_item.freq = freq;
+
+    assert(item->length == value_item.keys->len);
+    if (NULL == array) {
+        /* first entry for this phrase: the tree keeps item as its key */
+        array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
+        g_array_append_val(array, value_item);
+        g_tree_insert(g_chewing_tree, item, array);
+        return;
+    }
+
+    bool found = false;
+    for (size_t i = 0; i < array->len; ++i) {
+        chewing_and_freq_item * cur_item =
+            &g_array_index(array, chewing_and_freq_item, i);
+        int result = pinyin_exact_compare2
+            ((ChewingKey *) value_item.keys->data,
+             (ChewingKey *) cur_item->keys->data,
+             value_item.keys->len);
+
+        if (0 == result) {
+            fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+                    phrase, pinyin, freq);
+            cur_item->freq += freq;
+            found = true;
+        }
+    }
+
+    if (!found) {
+        /* New pronunciation for a known phrase. The GArray stored in the
+         * tree is extended in place, so the tree needs no update. */
+        g_array_append_val(array, value_item);
+    } else {
+        /* clean up */
+        g_array_free(keys, TRUE);
+        g_array_free(key_rests, TRUE);
+    }
+
+    /* The tree still uses the phrase_item inserted earlier as its key;
+     * this lookup copy, including its UCS-4 buffer, is ours to free. */
+    g_free(item->uniphrase);
+    delete item;
+}
+
+
+/* g_tree_foreach() callback used by main(): append a copy of the tree
+ * key (a phrase_item) together with the tree value (a GArray) to the
+ * g_item_array slot selected by the phrase length. data is unused.
+ * Always returns FALSE. */
+gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
+    const phrase_item * phrase = (const phrase_item *) key;
+
+    phrase_and_array_item entry;
+    entry.phrase = *phrase;
+    entry.chewing_and_freq_array = (GArray *) value;
+
+    const int bucket = entry.phrase.length;
+    g_array_append_val(g_item_array[bucket], entry);
+
+    return FALSE;
+}
+
+
+/* g_array_sort_with_data() callback for phrase_and_array_item elements.
+ * Only the first chewing_and_freq_item of each element takes part in the
+ * comparison: their keys are compared with pinyin_exact_compare2() over
+ * the phrase length that userdata (an int *) points to. */
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+                         gpointer userdata) {
+    const int phrase_length = *((int *) userdata);
+
+    GArray * left_array =
+        ((phrase_and_array_item *) lhs)->chewing_and_freq_array;
+    GArray * right_array =
+        ((phrase_and_array_item *) rhs)->chewing_and_freq_array;
+
+    chewing_and_freq_item * left_first =
+        &g_array_index(left_array, chewing_and_freq_item, 0);
+    chewing_and_freq_item * right_first =
+        &g_array_index(right_array, chewing_and_freq_item, 0);
+
+    return pinyin_exact_compare2((ChewingKey *) left_first->keys->data,
+                                 (ChewingKey *) right_first->keys->data,
+                                 phrase_length);
+}
+
+
+/* Write the collected table to outfilename.
+ *
+ * outfilename:  file to create or overwrite.
+ * phrase_index: passed to PHRASE_INDEX_MAKE_TOKEN() together with a
+ *               running counter that starts at 1 and advances once per
+ *               phrase.
+ *
+ * One line is written per phrase / pronunciation pair:
+ *     <pinyin joined with '>\t<phrase>\t<token>\t<freq>
+ * Frequencies below 3 are written as 3. All pronunciations of a phrase
+ * share the same token. Exits with ENOENT when the file cannot be
+ * opened. */
+void gen_phrase_file(const char * outfilename, int phrase_index){
+    FILE * outfile = fopen(outfilename, "w");
+    if (NULL == outfile ) {
+        fprintf(stderr, "Can't write file %s.\n", outfilename);
+        exit(ENOENT);
+    }
+
+    phrase_token_t token = 1;
+
+    /* phrase length index */
+    for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+        GArray * item_array = g_item_array[i];
+
+        /* item array index */
+        for (size_t m = 0; m < item_array->len; ++m) {
+            phrase_and_array_item * item = &g_array_index
+                (item_array, phrase_and_array_item, m);
+            phrase_item phrase = item->phrase;
+            GArray * chewing_and_freqs = item->chewing_and_freq_array;
+
+            /* converted once per phrase and shared by every line below */
+            gchar * phrase_str = g_ucs4_to_utf8
+                (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
+
+            /* iterate each pinyin */
+            for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
+                chewing_and_freq_item * chewing_and_freq =
+                    &g_array_index
+                    (chewing_and_freqs, chewing_and_freq_item, n);
+
+                ChewingKeyVector keys = chewing_and_freq->keys;
+                ChewingKeyRestVector key_rests = chewing_and_freq->key_rests;
+
+                /* zero-terminated so it can be handed to g_strjoinv() */
+                GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
+
+                size_t k;
+                for (k = 0; k < keys->len; ++k) {
+                    ChewingKey key = g_array_index(keys, ChewingKey, k);
+                    ChewingKeyRest key_rest = g_array_index
+                        (key_rests, ChewingKeyRest, k);
+
+                    assert (CHEWING_ZERO_TONE != key.m_tone);
+                    gchar * pinyin = get_pinyin_string(key, key_rest);
+                    g_array_append_val(pinyins, pinyin);
+                }
+                gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
+
+                for (k = 0; k < pinyins->len; ++k) {
+                    g_free(g_array_index(pinyins, gchar *, k));
+                }
+                g_array_free(pinyins, TRUE);
+
+                guint32 freq = chewing_and_freq->freq;
+
+                /* avoid zero freq */
+                if (freq < 3) freq = 3;
+
+                /* freq is unsigned, hence %u */
+                fprintf(outfile, "%s\t%s\t%d\t%u\n",
+                        pinyin_str, phrase_str,
+                        PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq);
+
+                g_free(pinyin_str);
+            }
+
+            /* Free the phrase text only after all of its pronunciations
+             * are written; freeing it inside the loop above would leave
+             * later iterations with a dangling pointer. */
+            g_free(phrase_str);
+            token++;
+        }
+    }
+
+    fclose(outfile);
+}