diff options
author | Peng Wu <alexepico@gmail.com> | 2012-02-24 11:55:52 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-02-24 11:55:52 +0800 |
commit | dbf7a2dcd8e35f0db28e0a8f4ef324c12a3af1b4 (patch) | |
tree | a6fbc14e3f1d648a6a9a57565de9de57264507ef /utils/storage | |
parent | 23b89bb317cdf38e645cd25b31e2a5dbaaf1fe84 (diff) | |
download | libpinyin-dbf7a2dcd8e35f0db28e0a8f4ef324c12a3af1b4.tar.gz libpinyin-dbf7a2dcd8e35f0db28e0a8f4ef324c12a3af1b4.tar.xz libpinyin-dbf7a2dcd8e35f0db28e0a8f4ef324c12a3af1b4.zip |
remove old parsers
Diffstat (limited to 'utils/storage')
-rw-r--r-- | utils/storage/gen_pinyin_table.cpp | 278 | ||||
-rw-r--r-- | utils/storage/gen_zhuyin_map.cpp | 117 |
2 files changed, 0 insertions, 395 deletions
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp deleted file mode 100644 index 99a4a0e..0000000 --- a/utils/storage/gen_pinyin_table.cpp +++ /dev/null @@ -1,278 +0,0 @@ -/* - * libpinyin - * Library to deal with pinyin. - * - * Copyright (C) 2010 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - - -#include "novel_types.h" -#include "pinyin_base.h" -#include "pinyin_phrase.h" -#include <stdio.h> -#include <errno.h> -#include <locale.h> -#include <glib.h> - -using namespace pinyin; - - -GTree * g_pinyin_tree; -GArray * g_item_array[MAX_PHRASE_LENGTH + 1]; - -struct phrase_item{ - size_t length; - gunichar * uniphrase; -}; - -struct pinyin_and_freq_item{ - GArray * pinyin; - guint32 freq; -}; - -struct item{ - phrase_item * phrase; - GArray * pinyin_and_freq_array; /* Array of pinyin_and_freq_item. */ -}; - -void feed_file(const char * filename); - -void feed_line(const char * phrase, const char * pinyin, const guint32 freq); - -void store_in_item_array(); - -void sort_item_array(); - -void gen_phrase_file(const char * outfilename, int phrase_index); - -void print_help(){ - printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> " - "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"); - printf("<OUTPUTFILE> the result output file\n"); - printf("<FILEi> input pinyin files\n"); - printf("<PHRASE_INDEX> phrase index identifier\n"); -} - -gint phrase_item_compare(gconstpointer a, gconstpointer b){ - phrase_item * itema = (phrase_item *) a; - phrase_item * itemb = (phrase_item *) b; - if ( itema->length != itemb->length ) - return itema->length - itemb->length; - else - return memcmp(itema->uniphrase, itemb->uniphrase, - sizeof(gunichar) * itema->length); -} - -int main(int argc, char * argv[]){ - char * outfilename = "temp.out"; - int phrase_index = 0; - int i = 1; - - g_pinyin_tree = g_tree_new(phrase_item_compare); - - setlocale(LC_ALL,""); - while ( i < argc ){ - if ( strcmp("--help", argv[i] ) == 0) { - print_help(); - exit(0); - }else if ( strcmp("-t", argv[i] ) == 0){ - if ( ++i >= argc ) { - print_help(); - exit(EINVAL); - } - phrase_index = atoi(argv[i]); - }else if ( strcmp("-o", argv[i] ) == 0 ){ - if ( ++i >= argc ) { - print_help(); - exit(EINVAL); - } - outfilename = g_strdup(argv[i]); - } else { - feed_file(argv[i]); - } - ++i; - } - - printf("nnodes: %d\n", g_tree_nnodes(g_pinyin_tree)); - - store_in_item_array(); - sort_item_array(); - gen_phrase_file(outfilename, phrase_index); - - return 0; -} - - -void feed_file ( const char * filename){ - char phrase[1024], pinyin[1024]; - guint32 n_freq; - FILE * infile = fopen(filename, "r"); - if ( NULL == infile ){ - fprintf(stderr, "Can't open file %s.\n", filename); - exit(ENOENT); - } - while ( !feof(infile)){ - fscanf(infile, "%s", phrase); - fscanf(infile, "%s", pinyin); - fscanf(infile, "%u", &n_freq); - if (feof(infile)) - break; - feed_line(phrase, pinyin, n_freq); - } - fclose(infile); -} - -void feed_line (const char * phrase, const char * pinyin, const guint32 freq){ - phrase_item * new_phrase_ptr = (phrase_item *) - malloc( sizeof(phrase_item)); - new_phrase_ptr->length = g_utf8_strlen(phrase, -1); - /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp - * where is the code which I don't want to touch. :-) - */ - if (new_phrase_ptr->length >= MAX_PHRASE_LENGTH ) { - fprintf(stderr, "too long phrase:%s\t%s\t%d\n", phrase, - pinyin, freq); - free(new_phrase_ptr); - return; - } - new_phrase_ptr->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); - - PinyinDefaultParser parser; - NullPinyinValidator validator; - PinyinKeyVector keys; - PinyinKeyPosVector poses; - - keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); - poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); - parser.parse(validator, keys, poses, pinyin); - - GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, new_phrase_ptr); - - pinyin_and_freq_item value_item; - value_item.pinyin = keys; - value_item.freq = freq; - - if(new_phrase_ptr->length != value_item.pinyin->len){ - fprintf(stderr, "error:phrase:%s\tpinyin:%s\n", phrase, pinyin); - return; - } - - if ( array == NULL){ - array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item)); - g_array_append_val(array, value_item); - g_tree_insert(g_pinyin_tree, new_phrase_ptr, array); - return; - } - bool found = false; - for ( size_t i = 0; i < array->len ; ++i){ - pinyin_and_freq_item * old_value_item = &g_array_index(array, pinyin_and_freq_item, i); - int result = pinyin_exact_compare((PinyinKey *)value_item.pinyin->data, - (PinyinKey *)old_value_item->pinyin->data , value_item.pinyin->len); - if ( result == 0 ){ - printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", - phrase, pinyin, freq); - old_value_item->freq += freq; - found = true; - } - } - - g_array_free(poses, TRUE); - - if ( !found ){ - g_array_append_val(array, value_item); - g_tree_insert(g_pinyin_tree, new_phrase_ptr, array); - }else - g_array_free(keys, TRUE); - - free(new_phrase_ptr); - //g_array_free(keys, TRUE); -} - -gboolean store_one_item (gpointer key, gpointer value, gpointer data){ - item oneitem; - oneitem.phrase = (phrase_item *)key; - oneitem.pinyin_and_freq_array = (GArray *)value; - int length = oneitem.phrase->length; - g_array_append_val(g_item_array[length], oneitem); - return FALSE; -} - -void store_in_item_array(){ - for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ - g_item_array[i] = g_array_new(FALSE, TRUE, sizeof(item)); - } - g_tree_foreach(g_pinyin_tree, store_one_item, NULL); -} - -gint phrase_array_compare ( gconstpointer a, gconstpointer b, gpointer user_data){ - int phrase_length = *((int *) user_data); - GArray * arraya = - g_array_index(((item *)a)->pinyin_and_freq_array, pinyin_and_freq_item, 0).pinyin; - GArray * arrayb = - g_array_index(((item *)b)->pinyin_and_freq_array, pinyin_and_freq_item, 0).pinyin; - return pinyin_exact_compare((PinyinKey *)arraya->data, (PinyinKey*)arrayb->data, phrase_length); -} - -void sort_item_array(){ - for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ - g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i); - } -} - -void gen_phrase_file(const char * outfilename, int phrase_index){ - FILE * outfile = fopen(outfilename, "w"); - if (NULL == outfile ) { - fprintf(stderr, "Can't write file %s.\n", outfilename); - exit(ENOENT); - } - phrase_token_t token = 1; - char pinyin_buffer[4096]; - //phrase length - for ( size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ - GArray * item_array = g_item_array[i]; - //item array - for( size_t m = 0; m < item_array->len; ++m){ - item* oneitem = & g_array_index(item_array, item, m); - phrase_item * phrase = oneitem->phrase; - GArray * pinyin_and_freqs = oneitem->pinyin_and_freq_array; - const char * phrase_buffer = g_ucs4_to_utf8(phrase->uniphrase, - phrase->length, - NULL, NULL, NULL); - //each pinyin - for( size_t n = 0 ; n < pinyin_and_freqs->len; ++n){ - pinyin_and_freq_item * pinyin_and_freq = &g_array_index(pinyin_and_freqs, pinyin_and_freq_item, n); - GArray * pinyin = pinyin_and_freq->pinyin; - PinyinKey * key = &g_array_index(pinyin, PinyinKey, 0); - strcpy(pinyin_buffer,key->get_key_string()); - for (size_t k = 1; k < pinyin->len; ++k){ - strcat(pinyin_buffer, "'"); - PinyinKey * key = &g_array_index(pinyin, PinyinKey, k); - strcat(pinyin_buffer, key->get_key_string ()); - } - guint32 freq = pinyin_and_freq -> freq; - if ( freq < 3 ) - freq = 3; - fprintf( outfile, "%s\t%s\t%d\t%d\n", - pinyin_buffer, phrase_buffer, - PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), - freq); - } - token++; - } - } - fclose(outfile); -} diff --git a/utils/storage/gen_zhuyin_map.cpp b/utils/storage/gen_zhuyin_map.cpp deleted file mode 100644 index bc6c647..0000000 --- a/utils/storage/gen_zhuyin_map.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * libpinyin - * Library to deal with pinyin. - * - * Copyright (C) 2006 James Su <suzhe@tsinghua.org.cn> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - - -#include "pinyin_base.h" -#include <stdio.h> -#include <string.h> - -using namespace pinyin; - -static const char *map_names [] = { - "__zhuyin_standard_map", - "__zhuyin_hsu_map", - "__zhuyin_ibm_map", - "__zhuyin_gin_yieh_map", - "__zhuyin_et_map", - "__zhuyin_et26_map", - 0 -}; - -static const char *input_keys [] = { - "1qaz2wsxedcrfv5tgbyhnujm8ik,9ol.0p;/-7634", /* standard kb */ - "bpmfdtnlgkhjvcjvcrzasexuyhgeiawomnkllsdfj", /* hsu */ - "1234567890-qwertyuiopasdfghjkl;zxcvbn/m,.", /* IBM */ - "2wsx3edcrfvtgb6yhnujm8ik,9ol.0p;/-['=1qaz", /* Gin-yieh */ - "bpmfdtnlvkhg7c,./j;'sexuaorwiqzy890-=1234", /* ET */ - "bpmfdtnlvkhgvcgycjqwsexuaorwiqzpmntlhdfjk", /* ET26 */ - 0 -}; - -static PinyinKey pinyin_keys [] = -{ - PinyinKey (PINYIN_Bo), PinyinKey (PINYIN_Po), PinyinKey (PINYIN_Mo), PinyinKey (PINYIN_Fo), - PinyinKey (PINYIN_De), PinyinKey (PINYIN_Te), PinyinKey (PINYIN_Ne), PinyinKey (PINYIN_Le), - PinyinKey (PINYIN_Ge), PinyinKey (PINYIN_Ke), PinyinKey (PINYIN_He), PinyinKey (PINYIN_Ji), - PinyinKey (PINYIN_Qi), PinyinKey (PINYIN_Xi), PinyinKey (PINYIN_Zhi), PinyinKey (PINYIN_Chi), - PinyinKey (PINYIN_Shi), PinyinKey (PINYIN_Ri), PinyinKey (PINYIN_Zi), PinyinKey (PINYIN_Ci), - PinyinKey (PINYIN_Si), PinyinKey (PINYIN_ZeroInitial,PINYIN_I), PinyinKey (PINYIN_ZeroInitial,PINYIN_U), PinyinKey (PINYIN_ZeroInitial,PINYIN_V), - PinyinKey (PINYIN_ZeroInitial,PINYIN_A), PinyinKey (PINYIN_ZeroInitial,PINYIN_O), PinyinKey (PINYIN_ZeroInitial,PINYIN_E), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ea), - PinyinKey (PINYIN_ZeroInitial,PINYIN_Ai), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ei), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ao), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ou), - PinyinKey (PINYIN_ZeroInitial,PINYIN_An), PinyinKey (PINYIN_ZeroInitial,PINYIN_En), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ang),PinyinKey (PINYIN_ZeroInitial,PINYIN_Eng), - PinyinKey (PINYIN_ZeroInitial,PINYIN_Er), - PinyinKey (PINYIN_ZeroInitial,PINYIN_ZeroFinal,PINYIN_Fifth), - PinyinKey (PINYIN_ZeroInitial,PINYIN_ZeroFinal,PINYIN_Second), - PinyinKey (PINYIN_ZeroInitial,PINYIN_ZeroFinal,PINYIN_Third), - PinyinKey (PINYIN_ZeroInitial,PINYIN_ZeroFinal,PINYIN_Fourth) -}; - -void print_map (int num) -{ - PinyinKey map[93][3]; - - map[0][0].set_tone (PINYIN_First); - - const char *p = input_keys [num]; - - for (size_t i=0; *p; ++i, ++p) { - size_t idx = *p - 0x20; - size_t n; - for (n=0; n<3; ++n) - if (map[idx][n].is_empty ()) break; - - map[idx][n] = pinyin_keys [i]; - } - - printf("static const PinyinKey %s [][3] = \n{\n", map_names[num]); - - char buf11[40]; - char buf12[40]; - char buf13[40]; - - char buf21[40]; - char buf22[40]; - char buf23[40]; - - for (size_t i=0; i<93; ++i) { - snprintf (buf11, 40, "PinyinKey(%d)", map[i][0].get_value ()); - snprintf (buf12, 40, "PinyinKey(%d)", map[i][1].get_value ()); - snprintf (buf13, 40, "PinyinKey(%d)", map[i][2].get_value ()); - - snprintf (buf21, 40, "/* %s */", map[i][0].get_key_string ()); - snprintf (buf22, 40, "/* %s */", map[i][1].get_key_string ()); - snprintf (buf23, 40, "/* %s */", map[i][2].get_key_string ()); - - printf ("/* %c */{%-15s%9s, %-15s%9s, %-15s%9s},\n", i+0x20, buf11, buf21, buf12, buf22, buf13, buf23); - } - - printf("};\n\n"); -} - -int main () -{ - for (int i=0; input_keys[i]; ++i) - print_map (i); -} - -/* -vi:ts=4:nowrap:ai:expandtab -*/ |