summary | refs | log | tree | commit | diff | stats
path: root/utils/storage
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2012-02-24 11:55:52 +0800
committer Peng Wu <alexepico@gmail.com> 2012-02-24 11:55:52 +0800
commit dbf7a2dcd8e35f0db28e0a8f4ef324c12a3af1b4 (patch)
tree a6fbc14e3f1d648a6a9a57565de9de57264507ef /utils/storage
parent 23b89bb317cdf38e645cd25b31e2a5dbaaf1fe84 (diff)
download libpinyin-dbf7a2dcd8e35f0db28e0a8f4ef324c12a3af1b4.tar.gz
libpinyin-dbf7a2dcd8e35f0db28e0a8f4ef324c12a3af1b4.tar.xz
libpinyin-dbf7a2dcd8e35f0db28e0a8f4ef324c12a3af1b4.zip
remove old parsers
Diffstat (limited to 'utils/storage')
-rw-r--r-- utils/storage/gen_pinyin_table.cpp 278
-rw-r--r-- utils/storage/gen_zhuyin_map.cpp 117
2 files changed, 0 insertions, 395 deletions
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp
deleted file mode 100644
index 99a4a0e..0000000
--- a/utils/storage/gen_pinyin_table.cpp
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * libpinyin
- * Library to deal with pinyin.
- *
- * Copyright (C) 2010 Peng Wu
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-
-#include "novel_types.h"
-#include "pinyin_base.h"
-#include "pinyin_phrase.h"
-#include <stdio.h>
-#include <errno.h>
-#include <locale.h>
-#include <glib.h>
-
-using namespace pinyin;
-
-
-GTree * g_pinyin_tree;
-GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
-
-struct phrase_item{
- size_t length;
- gunichar * uniphrase;
-};
-
-struct pinyin_and_freq_item{
- GArray * pinyin;
- guint32 freq;
-};
-
-struct item{
- phrase_item * phrase;
- GArray * pinyin_and_freq_array; /* Array of pinyin_and_freq_item. */
-};
-
-void feed_file(const char * filename);
-
-void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
-
-void store_in_item_array();
-
-void sort_item_array();
-
-void gen_phrase_file(const char * outfilename, int phrase_index);
-
-void print_help(){
- printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> "
- "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n");
- printf("<OUTPUTFILE> the result output file\n");
- printf("<FILEi> input pinyin files\n");
- printf("<PHRASE_INDEX> phrase index identifier\n");
-}
-
-gint phrase_item_compare(gconstpointer a, gconstpointer b){
- phrase_item * itema = (phrase_item *) a;
- phrase_item * itemb = (phrase_item *) b;
- if ( itema->length != itemb->length )
- return itema->length - itemb->length;
- else
- return memcmp(itema->uniphrase, itemb->uniphrase,
- sizeof(gunichar) * itema->length);
-}
-
-int main(int argc, char * argv[]){
- char * outfilename = "temp.out";
- int phrase_index = 0;
- int i = 1;
-
- g_pinyin_tree = g_tree_new(phrase_item_compare);
-
- setlocale(LC_ALL,"");
- while ( i < argc ){
- if ( strcmp("--help", argv[i] ) == 0) {
- print_help();
- exit(0);
- }else if ( strcmp("-t", argv[i] ) == 0){
- if ( ++i >= argc ) {
- print_help();
- exit(EINVAL);
- }
- phrase_index = atoi(argv[i]);
- }else if ( strcmp("-o", argv[i] ) == 0 ){
- if ( ++i >= argc ) {
- print_help();
- exit(EINVAL);
- }
- outfilename = g_strdup(argv[i]);
- } else {
- feed_file(argv[i]);
- }
- ++i;
- }
-
- printf("nnodes: %d\n", g_tree_nnodes(g_pinyin_tree));
-
- store_in_item_array();
- sort_item_array();
- gen_phrase_file(outfilename, phrase_index);
-
- return 0;
-}
-
-
-void feed_file ( const char * filename){
- char phrase[1024], pinyin[1024];
- guint32 n_freq;
- FILE * infile = fopen(filename, "r");
- if ( NULL == infile ){
- fprintf(stderr, "Can't open file %s.\n", filename);
- exit(ENOENT);
- }
- while ( !feof(infile)){
- fscanf(infile, "%s", phrase);
- fscanf(infile, "%s", pinyin);
- fscanf(infile, "%u", &n_freq);
- if (feof(infile))
- break;
- feed_line(phrase, pinyin, n_freq);
- }
- fclose(infile);
-}
-
-void feed_line (const char * phrase, const char * pinyin, const guint32 freq){
- phrase_item * new_phrase_ptr = (phrase_item *)
- malloc( sizeof(phrase_item));
- new_phrase_ptr->length = g_utf8_strlen(phrase, -1);
- /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
- * where is the code which I don't want to touch. :-)
- */
- if (new_phrase_ptr->length >= MAX_PHRASE_LENGTH ) {
- fprintf(stderr, "too long phrase:%s\t%s\t%d\n", phrase,
- pinyin, freq);
- free(new_phrase_ptr);
- return;
- }
- new_phrase_ptr->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
-
- PinyinDefaultParser parser;
- NullPinyinValidator validator;
- PinyinKeyVector keys;
- PinyinKeyPosVector poses;
-
- keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
- poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
- parser.parse(validator, keys, poses, pinyin);
-
- GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, new_phrase_ptr);
-
- pinyin_and_freq_item value_item;
- value_item.pinyin = keys;
- value_item.freq = freq;
-
- if(new_phrase_ptr->length != value_item.pinyin->len){
- fprintf(stderr, "error:phrase:%s\tpinyin:%s\n", phrase, pinyin);
- return;
- }
-
- if ( array == NULL){
- array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item));
- g_array_append_val(array, value_item);
- g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
- return;
- }
- bool found = false;
- for ( size_t i = 0; i < array->len ; ++i){
- pinyin_and_freq_item * old_value_item = &g_array_index(array, pinyin_and_freq_item, i);
- int result = pinyin_exact_compare((PinyinKey *)value_item.pinyin->data,
- (PinyinKey *)old_value_item->pinyin->data , value_item.pinyin->len);
- if ( result == 0 ){
- printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
- phrase, pinyin, freq);
- old_value_item->freq += freq;
- found = true;
- }
- }
-
- g_array_free(poses, TRUE);
-
- if ( !found ){
- g_array_append_val(array, value_item);
- g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
- }else
- g_array_free(keys, TRUE);
-
- free(new_phrase_ptr);
- //g_array_free(keys, TRUE);
-}
-
-gboolean store_one_item (gpointer key, gpointer value, gpointer data){
- item oneitem;
- oneitem.phrase = (phrase_item *)key;
- oneitem.pinyin_and_freq_array = (GArray *)value;
- int length = oneitem.phrase->length;
- g_array_append_val(g_item_array[length], oneitem);
- return FALSE;
-}
-
-void store_in_item_array(){
- for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
- g_item_array[i] = g_array_new(FALSE, TRUE, sizeof(item));
- }
- g_tree_foreach(g_pinyin_tree, store_one_item, NULL);
-}
-
-gint phrase_array_compare ( gconstpointer a, gconstpointer b, gpointer user_data){
- int phrase_length = *((int *) user_data);
- GArray * arraya =
- g_array_index(((item *)a)->pinyin_and_freq_array, pinyin_and_freq_item, 0).pinyin;
- GArray * arrayb =
- g_array_index(((item *)b)->pinyin_and_freq_array, pinyin_and_freq_item, 0).pinyin;
- return pinyin_exact_compare((PinyinKey *)arraya->data, (PinyinKey*)arrayb->data, phrase_length);
-}
-
-void sort_item_array(){
- for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
- g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i);
- }
-}
-
-void gen_phrase_file(const char * outfilename, int phrase_index){
- FILE * outfile = fopen(outfilename, "w");
- if (NULL == outfile ) {
- fprintf(stderr, "Can't write file %s.\n", outfilename);
- exit(ENOENT);
- }
- phrase_token_t token = 1;
- char pinyin_buffer[4096];
- //phrase length
- for ( size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
- GArray * item_array = g_item_array[i];
- //item array
- for( size_t m = 0; m < item_array->len; ++m){
- item* oneitem = & g_array_index(item_array, item, m);
- phrase_item * phrase = oneitem->phrase;
- GArray * pinyin_and_freqs = oneitem->pinyin_and_freq_array;
- const char * phrase_buffer = g_ucs4_to_utf8(phrase->uniphrase,
- phrase->length,
- NULL, NULL, NULL);
- //each pinyin
- for( size_t n = 0 ; n < pinyin_and_freqs->len; ++n){
- pinyin_and_freq_item * pinyin_and_freq = &g_array_index(pinyin_and_freqs, pinyin_and_freq_item, n);
- GArray * pinyin = pinyin_and_freq->pinyin;
- PinyinKey * key = &g_array_index(pinyin, PinyinKey, 0);
- strcpy(pinyin_buffer,key->get_key_string());
- for (size_t k = 1; k < pinyin->len; ++k){
- strcat(pinyin_buffer, "'");
- PinyinKey * key = &g_array_index(pinyin, PinyinKey, k);
- strcat(pinyin_buffer, key->get_key_string ());
- }
- guint32 freq = pinyin_and_freq -> freq;
- if ( freq < 3 )
- freq = 3;
- fprintf( outfile, "%s\t%s\t%d\t%d\n",
- pinyin_buffer, phrase_buffer,
- PHRASE_INDEX_MAKE_TOKEN(phrase_index, token),
- freq);
- }
- token++;
- }
- }
- fclose(outfile);
-}
diff --git a/utils/storage/gen_zhuyin_map.cpp b/utils/storage/gen_zhuyin_map.cpp
deleted file mode 100644
index bc6c647..0000000
--- a/utils/storage/gen_zhuyin_map.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * libpinyin
- * Library to deal with pinyin.
- *
- * Copyright (C) 2006 James Su <suzhe@tsinghua.org.cn>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-
-#include "pinyin_base.h"
-#include <stdio.h>
-#include <string.h>
-
-using namespace pinyin;
-
-static const char *map_names [] = {
- "__zhuyin_standard_map",
- "__zhuyin_hsu_map",
- "__zhuyin_ibm_map",
- "__zhuyin_gin_yieh_map",
- "__zhuyin_et_map",
- "__zhuyin_et26_map",
- 0
-};
-
-static const char *input_keys [] = {
- "1qaz2wsxedcrfv5tgbyhnujm8ik,9ol.0p;/-7634", /* standard kb */
- "bpmfdtnlgkhjvcjvcrzasexuyhgeiawomnkllsdfj", /* hsu */
- "1234567890-qwertyuiopasdfghjkl;zxcvbn/m,.", /* IBM */
- "2wsx3edcrfvtgb6yhnujm8ik,9ol.0p;/-['=1qaz", /* Gin-yieh */
- "bpmfdtnlvkhg7c,./j;'sexuaorwiqzy890-=1234", /* ET */
- "bpmfdtnlvkhgvcgycjqwsexuaorwiqzpmntlhdfjk", /* ET26 */
- 0
-};
-
-static PinyinKey pinyin_keys [] =
-{
- PinyinKey (PINYIN_Bo), PinyinKey (PINYIN_Po), PinyinKey (PINYIN_Mo), PinyinKey (PINYIN_Fo),
- PinyinKey (PINYIN_De), PinyinKey (PINYIN_Te), PinyinKey (PINYIN_Ne), PinyinKey (PINYIN_Le),
- PinyinKey (PINYIN_Ge), PinyinKey (PINYIN_Ke), PinyinKey (PINYIN_He), PinyinKey (PINYIN_Ji),
- PinyinKey (PINYIN_Qi), PinyinKey (PINYIN_Xi), PinyinKey (PINYIN_Zhi), PinyinKey (PINYIN_Chi),
- PinyinKey (PINYIN_Shi), PinyinKey (PINYIN_Ri), PinyinKey (PINYIN_Zi), PinyinKey (PINYIN_Ci),
- PinyinKey (PINYIN_Si), PinyinKey (PINYIN_ZeroInitial,PINYIN_I), PinyinKey (PINYIN_ZeroInitial,PINYIN_U), PinyinKey (PINYIN_ZeroInitial,PINYIN_V),
- PinyinKey (PINYIN_ZeroInitial,PINYIN_A), PinyinKey (PINYIN_ZeroInitial,PINYIN_O), PinyinKey (PINYIN_ZeroInitial,PINYIN_E), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ea),
- PinyinKey (PINYIN_ZeroInitial,PINYIN_Ai), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ei), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ao), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ou),
- PinyinKey (PINYIN_ZeroInitial,PINYIN_An), PinyinKey (PINYIN_ZeroInitial,PINYIN_En), PinyinKey (PINYIN_ZeroInitial,PINYIN_Ang),PinyinKey (PINYIN_ZeroInitial,PINYIN_Eng),
- PinyinKey (PINYIN_ZeroInitial,PINYIN_Er),
- PinyinKey (PINYIN_ZeroInitial,PINYIN_ZeroFinal,PINYIN_Fifth),
- PinyinKey (PINYIN_ZeroInitial,PINYIN_ZeroFinal,PINYIN_Second),
- PinyinKey (PINYIN_ZeroInitial,PINYIN_ZeroFinal,PINYIN_Third),
- PinyinKey (PINYIN_ZeroInitial,PINYIN_ZeroFinal,PINYIN_Fourth)
-};
-
-void print_map (int num)
-{
- PinyinKey map[93][3];
-
- map[0][0].set_tone (PINYIN_First);
-
- const char *p = input_keys [num];
-
- for (size_t i=0; *p; ++i, ++p) {
- size_t idx = *p - 0x20;
- size_t n;
- for (n=0; n<3; ++n)
- if (map[idx][n].is_empty ()) break;
-
- map[idx][n] = pinyin_keys [i];
- }
-
- printf("static const PinyinKey %s [][3] = \n{\n", map_names[num]);
-
- char buf11[40];
- char buf12[40];
- char buf13[40];
-
- char buf21[40];
- char buf22[40];
- char buf23[40];
-
- for (size_t i=0; i<93; ++i) {
- snprintf (buf11, 40, "PinyinKey(%d)", map[i][0].get_value ());
- snprintf (buf12, 40, "PinyinKey(%d)", map[i][1].get_value ());
- snprintf (buf13, 40, "PinyinKey(%d)", map[i][2].get_value ());
-
- snprintf (buf21, 40, "/* %s */", map[i][0].get_key_string ());
- snprintf (buf22, 40, "/* %s */", map[i][1].get_key_string ());
- snprintf (buf23, 40, "/* %s */", map[i][2].get_key_string ());
-
- printf ("/* %c */{%-15s%9s, %-15s%9s, %-15s%9s},\n", i+0x20, buf11, buf21, buf12, buf22, buf13, buf23);
- }
-
- printf("};\n\n");
-}
-
-int main ()
-{
- for (int i=0; input_keys[i]; ++i)
- print_map (i);
-}
-
-/*
-vi:ts=4:nowrap:ai:expandtab
-*/