summaryrefslogtreecommitdiffstats
path: root/utils/storage/gen_zhuyin_table.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'utils/storage/gen_zhuyin_table.cpp')
-rw-r--r--utils/storage/gen_zhuyin_table.cpp339
1 files changed, 0 insertions, 339 deletions
diff --git a/utils/storage/gen_zhuyin_table.cpp b/utils/storage/gen_zhuyin_table.cpp
deleted file mode 100644
index 87bc591..0000000
--- a/utils/storage/gen_zhuyin_table.cpp
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * libzhuyin
- * Library to deal with zhuyin.
- *
- * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-
-#include <stdio.h>
-#include <glib.h>
-#include "zhuyin_internal.h"
-
-
-void print_help(){
- printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> \n"
- "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
- "<OUTPUTFILE> the result output file\n"
- "<FILEi> input pinyin files\n"
- "<PHRASE_INDEX> phrase index identifier\n");
-}
-
-
-static gint phrase_index = 0;
-static const gchar * outputfile = "temp.out";
-
-static GOptionEntry entries[] =
-{
- {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
- {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
- {NULL}
-};
-
-
-using namespace zhuyin;
-
-/* map from phrase_item to GArray of chewing_and_freq_item */
-GTree * g_chewing_tree;
-/* Array of GArray of phrase_and_array_item */
-GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
-
-struct phrase_item{
- size_t length;
- gunichar * uniphrase;
-};
-
-struct chewing_and_freq_item{
- ChewingKeyVector keys;
- ChewingKeyRestVector key_rests;
- guint32 freq;
-};
-
-struct phrase_and_array_item{
- phrase_item phrase; /* the key of g_chewing_tree */
- /* Array of chewing_and_freq_item */
- GArray * chewing_and_freq_array; /* the value of g_chewing_tree */
-};
-
-
-void feed_file(const char * filename);
-
-void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
-
-gboolean store_one_item(gpointer key, gpointer value, gpointer data);
-
-int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
- gpointer userdata);
-
-void gen_phrase_file(const char * outputfile, int phrase_index);
-
-
-gint phrase_item_compare(gconstpointer a, gconstpointer b){
- phrase_item * itema = (phrase_item *) a;
- phrase_item * itemb = (phrase_item *) b;
- if ( itema->length != itemb->length )
- return itema->length - itemb->length;
- else
- return memcmp(itema->uniphrase, itemb->uniphrase,
- sizeof(gunichar) * itema->length);
-}
-
-
-int main(int argc, char * argv[]){
- int i;
-
- g_chewing_tree = g_tree_new(phrase_item_compare);
-
- GError * error = NULL;
- GOptionContext * context;
-
- context = g_option_context_new("- generate pinyin table");
- g_option_context_add_main_entries(context, entries, NULL);
- if (!g_option_context_parse(context, &argc, &argv, &error)) {
- g_print("option parsing failed:%s\n", error->message);
- exit(EINVAL);
- }
-
- for (i = 1; i < argc; ++i) {
- feed_file(argv[i]);
- }
-
- printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
-
- /* store in item array */
- g_item_array[0] = NULL;
- for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
- g_item_array[i] = g_array_new
- (FALSE, TRUE, sizeof(phrase_and_array_item));
- }
- g_tree_foreach(g_chewing_tree, store_one_item, NULL);
-
- /* sort item array */
- for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
- g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i);
- }
-
- gen_phrase_file(outputfile, phrase_index);
-
- return 0;
-}
-
-void feed_file ( const char * filename){
- FILE * infile = fopen(filename, "r");
- if ( NULL == infile ){
- fprintf(stderr, "Can't open file %s.\n", filename);
- exit(ENOENT);
- }
-
- char * linebuf = NULL; size_t size = 0; ssize_t read;
- while( (read = getline(&linebuf, &size, infile)) != -1 ){
- if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
- linebuf[strlen(linebuf) - 1] = '\0';
- }
-
- /* assume tsi.src only use the single space to separate tokens. */
- gchar ** strs = g_strsplit_set(linebuf, " ", 3);
-
- const char * phrase = strs[0];
- guint32 freq = atoi(strs[1]);
- const char * pinyin = strs[2];
-
- if (3 != g_strv_length(strs)) {
- fprintf(stderr, "wrong line format:%s\n", linebuf);
- continue;
- }
-
- if (feof(infile))
- break;
-
- feed_line(phrase, pinyin, freq);
- }
-
- free(linebuf);
- fclose(infile);
-}
-
-void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
- phrase_item * item = new phrase_item;
- item->length = g_utf8_strlen(phrase, -1);
-
- /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
- * where is the code which I don't want to touch. :-)
- */
-
- if (item->length >= MAX_PHRASE_LENGTH) {
- fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq);
- delete item;
- return;
- }
-
- item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
-
- ChewingDirectParser2 parser;
- ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
- ChewingKeyRestVector key_rests = g_array_new
- (FALSE, FALSE, sizeof(ChewingKeyRest));
-
- pinyin_option_t options = USE_TONE | FORCE_TONE;
- parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
- assert(keys->len == key_rests->len);
-
- if (keys->len != item->length) {
- fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq);
- delete item;
- return;
- }
-
- GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
-
- chewing_and_freq_item value_item;
- value_item.keys = keys; value_item.key_rests = key_rests;
- value_item.freq = freq;
-
- assert(item->length == value_item.keys->len);
- if (NULL == array) {
- array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
- g_array_append_val(array, value_item);
- g_tree_insert(g_chewing_tree, item, array);
- return;
- }
-
- bool found = false;
- for (size_t i = 0; i < array->len; ++i) {
- chewing_and_freq_item * cur_item =
- &g_array_index(array, chewing_and_freq_item, i);
- int result = pinyin_exact_compare2
- ((ChewingKey *) value_item.keys->data,
- (ChewingKey *) cur_item->keys->data,
- value_item.keys->len);
-
- if (0 == result) {
- fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
- phrase, pinyin, freq);
- cur_item->freq += freq;
- found = true;
- }
- }
-
- if (!found) {
- g_array_append_val(array, value_item);
- g_tree_insert(g_chewing_tree, item, array);
- } else {
- /* clean up */
- g_array_free(keys, TRUE);
- g_array_free(key_rests, TRUE);
- }
-
- delete item;
-}
-
-
-gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
- phrase_and_array_item item;
- item.phrase = *((phrase_item *) key);
- item.chewing_and_freq_array = (GArray *) value;
- int len = item.phrase.length;
- g_array_append_val(g_item_array[len], item);
- return FALSE;
-}
-
-
-int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
- gpointer userdata) {
- int phrase_length = *((int *) userdata);
- phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs;
- phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs;
-
- ChewingKeyVector keys_lhs = g_array_index
- (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
- ChewingKeyVector keys_rhs = g_array_index
- (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
- return pinyin_exact_compare2((ChewingKey *)keys_lhs->data,
- (ChewingKey *)keys_rhs->data, phrase_length);
-}
-
-
-void gen_phrase_file(const char * outputfile, int phrase_index){
- FILE * outfile = fopen(outputfile, "w");
- if (NULL == outfile ) {
- fprintf(stderr, "Can't write file %s.\n", outputfile);
- exit(ENOENT);
- }
-
- phrase_token_t token = 1;
-
- /* phrase length index */
- for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
- GArray * item_array = g_item_array[i];
-
- /* item array index */
- for (size_t m = 0; m < item_array->len; ++m) {
- phrase_and_array_item * item = &g_array_index
- (item_array, phrase_and_array_item, m);
- phrase_item phrase = item->phrase;
- GArray * chewing_and_freqs = item->chewing_and_freq_array;
-
- gchar * phrase_str = g_ucs4_to_utf8
- (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
-
- /* iterate each pinyin */
- for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
- chewing_and_freq_item * chewing_and_freq =
- &g_array_index
- (chewing_and_freqs, chewing_and_freq_item, n);
-
- ChewingKeyVector keys = chewing_and_freq->keys;
- ChewingKeyRestVector key_rests = chewing_and_freq->key_rests;
-
- GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
- gchar * pinyin = NULL;
-
- size_t k;
- for (k = 0; k < keys->len; ++k) {
- ChewingKey key = g_array_index(keys, ChewingKey, k);
- ChewingKeyRest key_rest = g_array_index
- (key_rests, ChewingKeyRest, k);
-
- assert (CHEWING_ZERO_TONE != key.m_tone);
- pinyin = key.get_bopomofo_string();
- g_array_append_val(pinyins, pinyin);
- }
- gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
-
- for (k = 0; k < pinyins->len; ++k) {
- g_free(g_array_index(pinyins, gchar *, k));
- }
- g_array_free(pinyins, TRUE);
-
- guint32 freq = chewing_and_freq->freq;
-
- /* avoid zero freq */
- if (freq < 3) freq = 3;
-
- fprintf(outfile, "%s\t%s\t%d\t%d\n",
- pinyin_str, phrase_str,
- PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq);
-
- g_free(pinyin_str);
- }
- g_free(phrase_str);
- token++;
- }
- }
-
- fclose(outfile);
-}