From f41d1fdf83408e042ab07925710a8913bad0c27c Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Tue, 3 Aug 2010 10:42:47 +0800
Subject: import from pinyin.

---
 utils/Makefile.am                  |  25 +++
 utils/storage/Makefile.am          |  30 ++++
 utils/storage/gen_binary_files.cpp |  86 ++++++++++
 utils/storage/gen_pinyin_table.cpp | 326 +++++++++++++++++++++++++++++++++++++
 4 files changed, 467 insertions(+)
 create mode 100644 utils/Makefile.am
 create mode 100644 utils/storage/Makefile.am
 create mode 100644 utils/storage/gen_binary_files.cpp
 create mode 100644 utils/storage/gen_pinyin_table.cpp

(limited to 'utils')

diff --git a/utils/Makefile.am b/utils/Makefile.am
new file mode 100644
index 0000000..1f0d85d
--- /dev/null
+++ b/utils/Makefile.am
@@ -0,0 +1,25 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+AUTOMAKE_OPTIONS = gnu
+SUBDIRS = storage
+
+MAINTAINERCLEANFILES = Makefile.in
+
+CLEANFILES = *.bak
+
+ACLOCAL = aclocal -I $(ac_aux_dir)
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
new file mode 100644
index 0000000..9328174
--- /dev/null
+++ b/utils/storage/Makefile.am
@@ -0,0 +1,30 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+INCLUDES = -I$(top_srcdir)/src/include \
+           -I$(top_srcdir)/src/storage \
+           @GLIB2_CPPFLAGS@
+
+noinst_PROGRAMS = gen_pinyin_table gen_binary_files
+
+gen_pinyin_table_SOURCES = gen_pinyin_table.cpp
+
+gen_pinyin_table_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
+
+gen_binary_files_SOURCES = gen_binary_files.cpp
+
+gen_binary_files_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp
new file mode 100644
index 0000000..7386106
--- /dev/null
+++ b/utils/storage/gen_binary_files.cpp
@@ -0,0 +1,86 @@
+/*
+ * gen_binary_files -- build the binary pinyin and phrase index files from
+ * the text character tables.
+ *
+ * Reads  ../../data/gb_char.table and ../../data/gbk_char.table.
+ * Writes ../../data/pinyin_index.bin, ../../data/gb_char.bin and
+ *        ../../data/gbk_char.bin.
+ * All paths are relative to the current working directory.
+ */
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+
+static const char * const GB_TABLE_PATH  = "../../data/gb_char.table";
+static const char * const GBK_TABLE_PATH = "../../data/gbk_char.table";
+
+/* Open a text table for reading; on failure name the path on stderr
+ * and return NULL. */
+static FILE * open_table(const char * path){
+    FILE * table = fopen(path, "r");
+    if ( NULL == table )
+        fprintf(stderr, "open %s failed!\n", path);
+    return table;
+}
+
+/*
+ * Exit status: 0 on success, 1 when an input table cannot be opened.
+ * Command-line arguments are ignored.
+ */
+int main(int argc, char * argv[]){
+    /* generate pinyin index */
+    PinyinCustomSettings custom;
+    PinyinLargeTable largetable(&custom);
+
+    FILE * gbfile = open_table(GB_TABLE_PATH);
+    if ( NULL == gbfile )
+        return 1;
+    FILE * gbkfile = open_table(GBK_TABLE_PATH);
+    if ( NULL == gbkfile ){
+        fclose(gbfile);
+        return 1;
+    }
+
+    largetable.load_text(gbfile);
+    fclose(gbfile);
+    largetable.load_text(gbkfile);
+    fclose(gbkfile);
+
+    MemoryChunk * new_chunk = new MemoryChunk;
+    largetable.store(new_chunk);
+    new_chunk->save("../../data/pinyin_index.bin");
+    /* NOTE(review): the chunk is loaded back instead of deleted;
+     * presumably load() takes ownership of it -- confirm. */
+    largetable.load(new_chunk);
+
+    /* generate phrase index: sub-index 1 = GB table, 2 = GBK table */
+    FacadePhraseIndex phrase_index;
+
+    FILE * infile = open_table(GB_TABLE_PATH);
+    if ( NULL == infile )
+        return 1;
+    phrase_index.load_text(1, infile);
+    fclose(infile);
+
+    infile = open_table(GBK_TABLE_PATH);
+    if ( NULL == infile )
+        return 1;
+    phrase_index.load_text(2, infile);
+    fclose(infile);
+
+    new_chunk = new MemoryChunk;
+    phrase_index.store(1, new_chunk);
+    new_chunk->save("../../data/gb_char.bin");
+    phrase_index.load(1, new_chunk);
+
+    new_chunk = new MemoryChunk;
+    phrase_index.store(2, new_chunk);
+    new_chunk->save("../../data/gbk_char.bin");
+    phrase_index.load(2, new_chunk);
+
+    return 0;
+}
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp
new file mode 100644
index 0000000..38e6a27
--- /dev/null
+++ b/utils/storage/gen_pinyin_table.cpp
@@ -0,0 +1,326 @@
+/*
+ * gen_pinyin_table -- merge "phrase pinyin frequency" text files into a
+ * single table ordered by phrase length and pinyin, giving every distinct
+ * phrase its own token.
+ */
+#include <limits.h>
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+
+/* Frequencies below this value are raised to it in the output file. */
+static const guint32 MIN_OUTPUT_FREQ = 3;
+
+/* Maps phrase_item * keys to a GArray of pinyin_and_freq_item. */
+GTree * g_pinyin_tree;
+/* Items grouped by phrase length; slot 0 is never allocated. */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+struct phrase_item{
+    size_t length;          /* number of characters in uniphrase */
+    gunichar * uniphrase;   /* UCS-4 text from g_utf8_to_ucs4() */
+};
+
+struct pinyin_and_freq_item{
+    GArray * pinyin;        /* Array of PinyinKey. */
+    guint32 freq;           /* Summed frequency of this reading. */
+};
+
+struct item{
+    phrase_item * phrase;
+    GArray * pinyin_and_freq_array; /* Array of pinyin_and_freq_item. */
+};
+
+void feed_file(const char * filename);
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
+
+void store_in_item_array();
+
+void sort_item_array();
+
+void gen_phrase_file(const char * outfilename, int phrase_index);
+
+/* Print the usage text and terminate with exit status 1. */
+void print_help(){
+    /* NOTE(review): the <...> placeholder names are reconstructed; the
+     * reviewed text had lost them -- confirm against the repository. */
+    printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> "
+           "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n");
+    printf("<OUTPUTFILE> the result output file\n");
+    printf("<FILEi> input pinyin files\n");
+    printf("<PHRASE_INDEX> phrase index identifier\n");
+    exit(1);
+}
+
+/*
+ * GTree ordering for phrase_item keys: shorter phrases first, then a
+ * byte-wise comparison of the UCS-4 buffers.
+ */
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+    const phrase_item * itema = (const phrase_item *) a;
+    const phrase_item * itemb = (const phrase_item *) b;
+    /* Compare explicitly: a size_t difference does not fit a gint. */
+    if ( itema->length != itemb->length )
+        return itema->length < itemb->length ? -1 : 1;
+    return memcmp(itema->uniphrase, itemb->uniphrase,
+                  sizeof(gunichar) * itema->length);
+}
+
+/*
+ * Parse the command line, feeding every non-option argument as an input
+ * file in the order given, then write the sorted table.
+ *   -t <n>     phrase index identifier (default 0)
+ *   -o <file>  output file (default "temp.out")
+ */
+int main(int argc, char * argv[]){
+    /* Point at argv instead of copying into a fixed-size buffer. */
+    const char * outfilename = "temp.out";
+    int phrase_index = 0;
+
+    g_pinyin_tree = g_tree_new(phrase_item_compare);
+
+    setlocale(LC_ALL,"");
+    for ( int i = 1; i < argc; ++i ){
+        if ( strcmp("--help", argv[i]) == 0 ){
+            print_help();
+        }else if ( strcmp("-t", argv[i]) == 0 ){
+            if ( ++i >= argc )
+                print_help();
+            char * end = NULL;
+            long value = strtol(argv[i], &end, 10);
+            /* Reject empty, non-numeric or out-of-range identifiers. */
+            if ( end == argv[i] || *end != '\0' ||
+                 value < INT_MIN || value > INT_MAX )
+                print_help();
+            phrase_index = (int) value;
+        }else if ( strcmp("-o", argv[i]) == 0 ){
+            if ( ++i >= argc )
+                print_help();
+            outfilename = argv[i];
+        }else{
+            feed_file(argv[i]);
+        }
+    }
+
+    printf("nnodes: %d\n", g_tree_nnodes(g_pinyin_tree));
+
+    store_in_item_array();
+    sort_item_array();
+    gen_phrase_file(outfilename, phrase_index);
+
+    return 0;
+}
+
+/*
+ * Read whitespace-separated "phrase pinyin frequency" records from
+ * filename and pass each one to feed_line(). Exits with status 1 when
+ * the file cannot be opened.
+ */
+void feed_file ( const char * filename){
+    char phrase[1024], pinyin[1024];
+    guint32 n_freq;
+    FILE * infile = fopen(filename, "r");
+    if ( NULL == infile ){
+        fprintf(stderr, "Can't open file %s.\n", filename);
+        exit(1);
+    }
+    /* Field widths keep both tokens inside their buffers, and testing the
+     * conversion count keeps a final record with no trailing newline. */
+    int nfields;
+    while ( (nfields = fscanf(infile, "%1023s %1023s %u",
+                              phrase, pinyin, &n_freq)) == 3 )
+        feed_line(phrase, pinyin, n_freq);
+    if ( nfields != EOF )
+        fprintf(stderr, "Malformed record in file %s.\n", filename);
+    fclose(infile);
+}
+
+/*
+ * Record one (phrase, pinyin, freq) entry in g_pinyin_tree.
+ *
+ * A new phrase gets a new tree node; a new reading of a known phrase is
+ * appended to that node; a repeated (phrase, pinyin) pair only adds freq
+ * to the stored entry. Rejected entries are reported on stdout.
+ * Everything allocated here is either handed to the tree or released.
+ */
+void feed_line (const char * phrase, const char * pinyin, const guint32 freq){
+    const size_t length = g_utf8_strlen(phrase, -1);
+    /* FIXME: this test uses ">=" rather than ">" to match
+     * pinyin_large_table.cpp, which is deliberately left untouched.
+     */
+    if ( length >= MAX_PHRASE_LENGTH ){
+        printf("too long phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+        return;
+    }
+
+    /* NULL means phrase is not valid UTF-8; an empty phrase would have
+     * no slot in g_item_array. */
+    gunichar * uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+    if ( NULL == uniphrase || 0 == length ){
+        printf("error:phrase:%s\tpinyin:%s\n", phrase, pinyin);
+        g_free(uniphrase);
+        return;
+    }
+
+    PinyinDefaultParser parser;
+    NullPinyinValidator validator;
+    PinyinKeyVector keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+    PinyinKeyPosVector poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+    parser.parse(validator, keys, poses, pinyin);
+    /* Only the keys are used; the positions can go immediately. */
+    g_array_free(poses, TRUE);
+
+    /* Each character of the phrase needs exactly one pinyin key. */
+    if ( length != keys->len ){
+        printf("error:phrase:%s\tpinyin:%s\n", phrase, pinyin);
+        g_array_free(keys, TRUE);
+        g_free(uniphrase);
+        return;
+    }
+
+    pinyin_and_freq_item value_item;
+    value_item.pinyin = keys;
+    value_item.freq = freq;
+
+    /* Look up with a stack key; allocate one only for a new node. */
+    phrase_item probe;
+    probe.length = length;
+    probe.uniphrase = uniphrase;
+    GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, &probe);
+
+    if ( NULL == array ){
+        phrase_item * new_phrase_ptr = g_new(phrase_item, 1);
+        *new_phrase_ptr = probe;
+        array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item));
+        g_array_append_val(array, value_item);
+        /* The tree now owns new_phrase_ptr, uniphrase, array and keys. */
+        g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
+        return;
+    }
+
+    /* Known phrase: the tree keeps its existing key. */
+    g_free(uniphrase);
+
+    for ( guint i = 0; i < array->len; ++i ){
+        pinyin_and_freq_item * old_value_item =
+            &g_array_index(array, pinyin_and_freq_item, i);
+        int result = pinyin_exact_compare((PinyinKey *)keys->data,
+                                          (PinyinKey *)old_value_item->pinyin->data,
+                                          keys->len);
+        if ( result == 0 ){
+            printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+                   phrase, pinyin, freq);
+            old_value_item->freq += freq;
+            g_array_free(keys, TRUE);
+            return;
+        }
+    }
+
+    /* New reading; array is the same GArray the tree already holds. */
+    g_array_append_val(array, value_item);
+}
+
+/*
+ * g_tree_foreach() callback: append the (phrase, readings) pair to the
+ * g_item_array slot for the phrase's length. Returns FALSE so the
+ * traversal continues.
+ */
+gboolean store_one_item (gpointer key, gpointer value, gpointer data){
+    item oneitem;
+    oneitem.phrase = (phrase_item *)key;
+    oneitem.pinyin_and_freq_array = (GArray *)value;
+    g_array_append_val(g_item_array[oneitem.phrase->length], oneitem);
+    return FALSE;
+}
+
+/* Allocate slots 1..MAX_PHRASE_LENGTH and fill them from g_pinyin_tree. */
+void store_in_item_array(){
+    for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+        g_item_array[i] = g_array_new(FALSE, TRUE, sizeof(item));
+    }
+    g_tree_foreach(g_pinyin_tree, store_one_item, NULL);
+}
+
+/*
+ * Sort callback: order two items by the first pinyin reading of each.
+ * user_data points to an int holding the phrase length of the slot.
+ */
+gint phrase_array_compare ( gconstpointer a, gconstpointer b, gpointer user_data){
+    int phrase_length = *((int *) user_data);
+    GArray * arraya =
+        g_array_index(((item *)a)->pinyin_and_freq_array, pinyin_and_freq_item, 0).pinyin;
+    GArray * arrayb =
+        g_array_index(((item *)b)->pinyin_and_freq_array, pinyin_and_freq_item, 0).pinyin;
+    return pinyin_exact_compare((PinyinKey *)arraya->data, (PinyinKey*)arrayb->data, phrase_length);
+}
+
+/* Sort every length slot of g_item_array with phrase_array_compare(). */
+void sort_item_array(){
+    for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+        g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i);
+    }
+}
+
+/*
+ * Write the table to outfilename, one line per (phrase, pinyin) pair:
+ *     pinyin <TAB> phrase <TAB> token <TAB> frequency
+ * Pinyin keys are joined with "'". Tokens start at 1 and increase by one
+ * per phrase, combined with phrase_index by PHRASE_INDEX_MAKE_TOKEN.
+ * Exits with status 1 when the file cannot be opened or fully written.
+ */
+void gen_phrase_file(const char * outfilename, int phrase_index){
+    FILE * outfile = fopen(outfilename, "w");
+    if (NULL == outfile ) {
+        fprintf(stderr, "Can't write file %s.\n", outfilename);
+        exit(1);
+    }
+    phrase_token_t token = 1;
+    //phrase length
+    for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+        GArray * item_array = g_item_array[i];
+        //item array
+        for ( guint m = 0; m < item_array->len; ++m){
+            item * oneitem = &g_array_index(item_array, item, m);
+            phrase_item * phrase = oneitem->phrase;
+            GArray * pinyin_and_freqs = oneitem->pinyin_and_freq_array;
+            gchar * phrase_buffer = g_ucs4_to_utf8(phrase->uniphrase,
+                                                   phrase->length,
+                                                   NULL, NULL, NULL);
+            /* NOTE(review): assumes the made token fits phrase_token_t. */
+            phrase_token_t full_token =
+                PHRASE_INDEX_MAKE_TOKEN(phrase_index, token);
+            //each pinyin
+            for ( guint n = 0; n < pinyin_and_freqs->len; ++n){
+                pinyin_and_freq_item * pinyin_and_freq =
+                    &g_array_index(pinyin_and_freqs, pinyin_and_freq_item, n);
+                GArray * pinyin = pinyin_and_freq->pinyin;
+                /* Stream the keys instead of joining them in a buffer. */
+                for ( guint k = 0; k < pinyin->len; ++k){
+                    PinyinKey * key = &g_array_index(pinyin, PinyinKey, k);
+                    if ( k > 0 )
+                        fputc('\'', outfile);
+                    fputs(key->get_key_string (), outfile);
+                }
+                guint32 freq = pinyin_and_freq -> freq;
+                if ( freq < MIN_OUTPUT_FREQ )
+                    freq = MIN_OUTPUT_FREQ;
+                fprintf( outfile, "\t%s\t%u\t%u\n", phrase_buffer,
+                         (unsigned int) full_token, (unsigned int) freq);
+            }
+            /* g_ucs4_to_utf8() returns a newly allocated string. */
+            g_free(phrase_buffer);
+            token++;
+        }
+    }
+    /* fclose() flushes, so a failure here means lost output. */
+    if ( fclose(outfile) != 0 ){
+        fprintf(stderr, "Can't write file %s.\n", outfilename);
+        exit(1);
+    }
+}
-- 
cgit