diff options
author | Peng Wu <alexepico@gmail.com> | 2011-12-07 15:56:49 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-12-07 15:56:49 +0800 |
commit | 4aebb8f84e92da3075da3057afbcce5fa9fed2f7 (patch) | |
tree | fa670c1b840370dd01f4d4d746aace1e1d646032 /utils/storage | |
parent | dcfbeeaa0e381a7d03cdec05e8712753c2afb1e2 (diff) | |
download | libpinyin-4aebb8f84e92da3075da3057afbcce5fa9fed2f7.tar.gz libpinyin-4aebb8f84e92da3075da3057afbcce5fa9fed2f7.tar.xz libpinyin-4aebb8f84e92da3075da3057afbcce5fa9fed2f7.zip |
write gen chewing table
Diffstat (limited to 'utils/storage')
-rw-r--r-- | utils/storage/gen_chewing_table.cpp | 320 |
1 files changed, 320 insertions, 0 deletions
diff --git a/utils/storage/gen_chewing_table.cpp b/utils/storage/gen_chewing_table.cpp new file mode 100644 index 0000000..d6d3673 --- /dev/null +++ b/utils/storage/gen_chewing_table.cpp @@ -0,0 +1,320 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include <stdio.h> +#include <glib.h> +#include "pinyin_internal.h" + +using namespace pinyin; + +/* map from phrase_item to GArray of chewing_and_freq_item */ +GTree * g_chewing_tree; +/* Array of GArray of phrase_and_array_item */ +GArray * g_item_array[MAX_PHRASE_LENGTH + 1]; + +struct phrase_item{ + size_t length; + gunichar * uniphrase; +}; + +struct chewing_and_freq_item{ + ChewingKeyVector keys; + ChewingKeyRestVector key_rests; + guint32 freq; +}; + +struct phrase_and_array_item{ + phrase_item phrase; + GArray * chewing_and_freq_array; /* Array of chewing_and_freq_item */ +}; + + +void feed_file(const char * filename); + +void feed_line(const char * phrase, const char * pinyin, const guint32 freq); + +gboolean store_one_item(gpointer key, gpointer value, gpointer data); + +int phrase_array_compare(gconstpointer lhs, gconstpointer rhs, + gpointer userdata); + +void gen_phrase_file(const char * outfilename, int phrase_index); + +void print_help(){ + printf("Usage: gen_chewing_table -t <PHRASE_INDEX> " + "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"); + printf("<OUTPUTFILE> the result output file\n"); + printf("<FILEi> input pinyin files\n"); + printf("<PHRASE_INDEX> phrase index identifier\n"); +} + + +gint phrase_item_compare(gconstpointer a, gconstpointer b){ + phrase_item * itema = (phrase_item *) a; + phrase_item * itemb = (phrase_item *) b; + if ( itema->length != itemb->length ) + return itema->length - itemb->length; + else + return memcmp(itema->uniphrase, itemb->uniphrase, + sizeof(gunichar) * itema->length); +} + + +int main(int argc, char * argv[]){ + const char * outfilename = "temp.out"; + int phrase_index = 0; + int i = 1; + + g_chewing_tree = g_tree_new(phrase_item_compare); + + while ( i < argc ){ + if ( strcmp("--help", argv[i] ) == 0) { + print_help(); + exit(0); + }else if ( strcmp("-t", argv[i] ) == 0){ + if ( ++i >= argc ) { + print_help(); + exit(EINVAL); + } + phrase_index = atoi(argv[i]); + }else if ( strcmp("-o", argv[i] ) == 0 ){ + if ( ++i >= argc ) { + print_help(); + exit(EINVAL); + } + outfilename = g_strdup(argv[i]); + } else { + feed_file(argv[i]); + } + ++i; + } + + printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree)); + + /* store in item array */ + for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + g_item_array[i] = g_array_new + (FALSE, TRUE, sizeof(phrase_and_array_item)); + } + g_tree_foreach(g_chewing_tree, store_one_item, NULL); + + /* sort item array */ + for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i); + } + + gen_phrase_file(outfilename, phrase_index); + + return 0; +} + +void feed_file ( const char * filename){ + char phrase[1024], pinyin[1024]; + guint32 freq; + + FILE * infile = fopen(filename, "r"); + if ( NULL == infile ){ + fprintf(stderr, "Can't open file %s.\n", filename); + exit(ENOENT); + } + + while ( !feof(infile)){ + fscanf(infile, "%s", phrase); + fscanf(infile, "%s", pinyin); + fscanf(infile, "%u", &freq); + if (feof(infile)) + break; + feed_line(phrase, pinyin, freq); + } + + fclose(infile); +} + +void feed_line(const char * phrase, const char * pinyin, const guint32 freq) { + phrase_item * item = new phrase_item; + item->length = g_utf8_strlen(phrase, -1); + + /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp + * where is the code which I don't want to touch. :-) + */ + + if (item->length >= MAX_PHRASE_LENGTH) { + fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq); + delete item; + return; + } + + item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); + + FullPinyinParser2 parser; + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = g_array_new + (FALSE, FALSE, sizeof(ChewingKeyRest)); + + pinyin_option_t options = USE_TONE; + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + assert(keys->len == key_rests->len); + + if (keys->len != item->length) { + fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq); + delete item; + return; + } + + GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item); + + chewing_and_freq_item value_item; + value_item.keys = keys; value_item.key_rests = key_rests; + value_item.freq = freq; + + assert(item->length == value_item.keys->len); + if (NULL == array) { + array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item)); + g_array_append_val(array, value_item); + g_tree_insert(g_chewing_tree, item, array); + return; + } + + bool found = false; + for (size_t i = 0; i < array->len; ++i) { + chewing_and_freq_item * cur_item = + &g_array_index(array, chewing_and_freq_item, i); + int result = pinyin_exact_compare2 + ((ChewingKey *) value_item.keys->data, + (ChewingKey *) cur_item->keys->data, + value_item.keys->len); + + if (0 == result) { + fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", + phrase, pinyin, freq); + cur_item->freq += freq; + found = true; + } + } + + if (!found) { + g_array_append_val(array, value_item); + g_tree_insert(g_chewing_tree, item, array); + } else { + /* clean up */ + g_array_free(keys, TRUE); + g_array_free(key_rests, TRUE); + } + + delete item; +} + + +gboolean store_one_item(gpointer key, gpointer value, gpointer data) { + phrase_and_array_item item; + item.phrase = *((phrase_item *) key); + item.chewing_and_freq_array = (GArray *) value; + int len = item.phrase.length; + g_array_append_val(g_item_array[len], item); + return FALSE; +} + + +int phrase_array_compare(gconstpointer lhs, gconstpointer rhs, + gpointer userdata) { + int phrase_length = *((int *) userdata); + phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs; + phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs; + + ChewingKeyVector keys_lhs = g_array_index + (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys; + ChewingKeyVector keys_rhs = g_array_index + (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys; + return pinyin_exact_compare2((ChewingKey *)keys_lhs->data, + (ChewingKey *)keys_rhs->data, phrase_length); +} + + +void gen_phrase_file(const char * outfilename, int phrase_index){ + FILE * outfile = fopen(outfilename, "w"); + if (NULL == outfile ) { + fprintf(stderr, "Can't write file %s.\n", outfilename); + exit(ENOENT); + } + + phrase_token_t token = 1; + + /* phrase length index */ + for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) { + GArray * item_array = g_item_array[i]; + + /* item array index */ + for (size_t m = 0; m < item_array->len; ++m) { + phrase_and_array_item * item = &g_array_index + (item_array, phrase_and_array_item, m); + phrase_item phrase = item->phrase; + GArray * chewing_and_freqs = item->chewing_and_freq_array; + + gchar * phrase_str = g_ucs4_to_utf8 + (phrase.uniphrase, phrase.length, NULL, NULL, NULL); + + /* iterate each pinyin */ + for (size_t n = 0; n < chewing_and_freqs->len; ++n) { + chewing_and_freq_item * chewing_and_freq = + &g_array_index + (chewing_and_freqs, chewing_and_freq_item, n); + + ChewingKeyVector keys = chewing_and_freq->keys; + ChewingKeyRestVector key_rests = chewing_and_freq->key_rests; + + GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *)); + gchar * pinyin = NULL; + + size_t k; + for (k = 0; k < keys->len; ++k) { + ChewingKey key = g_array_index(keys, ChewingKey, k); + ChewingKeyRest key_rest = g_array_index + (key_rests, ChewingKeyRest, k); + + assert (CHEWING_ZERO_TONE != key.m_tone); + pinyin = get_pinyin_string(key, key_rest); + g_array_append_val(pinyins, pinyin); + } + gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data); + + for (k = 0; k < pinyins->len; ++k) { + g_free(g_array_index(pinyins, gchar *, k)); + } + g_array_free(pinyins, TRUE); + + guint32 freq = chewing_and_freq->freq; + + /* avoid zero freq */ + if (freq < 3) freq = 3; + + fprintf(outfile, "%s\t%s\t%d\t%d\n", + pinyin_str, phrase_str, + PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq); + + g_free(pinyin_str); g_free(phrase_str); + } + token++; + } + } + + fclose(outfile); +} |