diff options
author | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
commit | f41d1fdf83408e042ab07925710a8913bad0c27c (patch) | |
tree | 1757833ac4cdd0830834d2f9ef92be07c0bc1a5b /tests | |
parent | 34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff) | |
download | libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip |
import from pinyin.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/Makefile.am | 25 | ||||
-rw-r--r-- | tests/include/Makefile.am | 25 | ||||
-rwxr-xr-x | tests/include/test_memory_chunk.cpp | 90 | ||||
-rw-r--r-- | tests/lookup/Makefile.am | 27 | ||||
-rw-r--r-- | tests/lookup/test_simple_lookup.cpp | 108 | ||||
-rw-r--r-- | tests/storage/Makefile.am | 41 | ||||
-rw-r--r-- | tests/storage/test_ngram.cpp | 126 | ||||
-rw-r--r-- | tests/storage/test_parser.cpp | 165 | ||||
-rw-r--r-- | tests/storage/test_phrase_index.cpp | 141 | ||||
-rw-r--r-- | tests/storage/test_pinyin_index.cpp | 148 |
10 files changed, 896 insertions, 0 deletions
diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..f36e5f9 --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,25 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +AUTOMAKE_OPTIONS = gnu +SUBDIRS = include storage lookup + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I $(ac_aux_dir) diff --git a/tests/include/Makefile.am b/tests/include/Makefile.am new file mode 100644 index 0000000..53bc089 --- /dev/null +++ b/tests/include/Makefile.am @@ -0,0 +1,25 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. 
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+INCLUDES = -I$(top_srcdir)/src/include
+
+noinst_PROGRAMS = test_memory_chunk
+
+test_memory_chunk_SOURCES = test_memory_chunk.cpp
+
+test_memory_chunk_LDADD = ../../src/storage/libstorage.la @GLIB2_LIBS@
+
diff --git a/tests/include/test_memory_chunk.cpp b/tests/include/test_memory_chunk.cpp
new file mode 100755
index 0000000..6282d93
--- /dev/null
+++ b/tests/include/test_memory_chunk.cpp
@@ -0,0 +1,90 @@
+#include <stdio.h>
+#include <iostream>
+#include "memory_chunk.h"
+// Test Memory Chunk Functionality
+// Exercises set/insert/remove/get on a MemoryChunk, then a save/load round trip.
+int main(int argc, char * argv[]){
+    MemoryChunk* chunk;
+    chunk = new MemoryChunk();
+    int i = 12;
+    chunk->set_content(0, &i, sizeof(int));
+
+    int * p = (int *)chunk->begin();
+    assert(chunk->size() == sizeof(int));
+    std::cout<<*p<<std::endl;
+    std::cout<<chunk->capacity()<<std::endl;
+    p = & i;
+    chunk->set_chunk(p, sizeof(int), NULL);  // wrap the stack variable, no free function
+    short t = 5;
+    chunk->set_content(sizeof(int), &t, sizeof(short));
+    assert( sizeof(int) + sizeof(short) == chunk->size());
+    std::cout<<chunk->capacity()<<std::endl;
+
+    p = (int *)chunk->begin();
+    short * p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+    std::cout<<*p<<'\t'<<*p2<<std::endl;
+
+    chunk->set_content(sizeof(int) + sizeof(short), &t, sizeof(short));
+
+    assert( sizeof(int) + (sizeof(short) << 1) == chunk->size());
+    std::cout<<chunk->capacity()<<std::endl;
+    p = (int *)chunk->begin();
+    p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+    std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<std::endl;
+
+    chunk->set_size(sizeof(int) + sizeof(short) *3);
+    p = (int *)chunk->begin();
+    p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+
+    chunk->set_content(0, &i, sizeof(int));
+
+    *(p2+2) = 3;
+    std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<'\t'<<*(p2+2)<<std::endl;
+
+    int m = 10;
+    chunk->set_chunk(&m, sizeof(int), NULL);
+    int n = 12;
+    chunk->insert_content(sizeof(int), &n, sizeof(int));
+    n = 11;
+    chunk->insert_content(sizeof(int), &n, sizeof(int));
+
+    int * p3 = (int *)chunk->begin();
+    std::cout<<*p3<<'\t'<<*(p3+1)<<'\t'<<*(p3+2)<<std::endl;
+
+    chunk->remove_content(sizeof(int), sizeof(int));
+    std::cout<<*p3<<'\t'<<*(p3+1)<<std::endl;
+
+    int tmp = 0;  // initialised so the print below is defined even if assert is compiled out
+    assert(chunk->get_content(sizeof(int), &tmp, sizeof(int)));
+    std::cout<<tmp<<std::endl;
+
+
+    delete chunk;
+
+    const char * filename = "/tmp/version";
+    const char * version = "0.2.0";
+
+    chunk = new MemoryChunk;
+    bool retval = chunk->load(filename);
+    if ( !retval ){
+        std::cerr<<"can't find chunk"<<std::endl;
+    }else{
+        if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){
+            std::cout<<"match"<<std::endl;
+        }
+
+    }
+
+    chunk->set_content(0, version, strlen(version) + 1);
+    chunk->save(filename);
+
+    retval = chunk->load(filename);
+    if ( !retval ){
+        std::cerr<<"can't find chunk"<<std::endl;
+    }else if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){  // compare only after a successful reload
+        std::cout<<"match"<<std::endl;
+    }
+
+    delete chunk;  // release the second chunk as well
+    return 0;
+}
diff --git a/tests/lookup/Makefile.am b/tests/lookup/Makefile.am
new file mode 100644
index 0000000..ca863ce
--- /dev/null
+++ b/tests/lookup/Makefile.am
@@ -0,0 +1,27 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+INCLUDES = -I$(top_srcdir)/src/include \
+           -I$(top_srcdir)/src/storage \
+           -I$(top_srcdir)/src/lookup \
+           @GLIB2_CPPFLAGS@
+
+noinst_PROGRAMS = test_simple_lookup
+
+test_simple_lookup_SOURCES = test_simple_lookup.cpp
+
+test_simple_lookup_LDADD = ../../src/storage/libstorage.la ../../src/lookup/liblookup.la @GLIB2_LDFLAGS@
diff --git a/tests/lookup/test_simple_lookup.cpp b/tests/lookup/test_simple_lookup.cpp
new file mode 100644
index 0000000..04f4dce
--- /dev/null
+++ b/tests/lookup/test_simple_lookup.cpp
@@ -0,0 +1,108 @@
+#include <string.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+#include "ngram.h"
+#include "lookup.h"
+
+size_t bench_times = 1000;
+// Current wall-clock time in microseconds, truncated to 32 bits (wraps around).
+guint32 record_time ()
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+    return (guint32) tv.tv_sec * 1000000 + tv.tv_usec;
+}
+// Print the time elapsed since old_time (a record_time() value) and the per-operation rate.
+void print_time (guint32 old_time, guint32 times)
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+
+    guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time;
+
+    printf("Spent %u us for %u operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted );
+}
+
+// Reads pinyin lines from stdin until EOF or "quit" and prints the best match for each.
+int main( int argc, char * argv[]){
+
+    PinyinCustomSettings custom;
+    PinyinLargeTable largetable(&custom);
+
+    BitmapPinyinValidator validator;
+    validator.initialize(&largetable);
+
+    MemoryChunk * new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/pinyin_index.bin");
+    largetable.load(new_chunk);
+
+    FacadePhraseIndex phrase_index;
+    new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, new_chunk);
+    new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, new_chunk);
+
+    Bigram bigram;
+    bigram.attach("../../data/bigram.db", "/tmp/bigram.db");
+
+    PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index, &bigram);
+
+    char* linebuf = (char *)malloc ( 1024 * sizeof (char) );
+    size_t size = 1024;
+    while( getline(&linebuf, &size, stdin) != -1 ){  // getline() returns -1 at EOF, which is truthy
+        linebuf[strcspn(linebuf, "\n")] = '\0';  // drop the newline only if one is present
+        if ( strcmp ( linebuf, "quit" ) == 0)
+            break;
+
+        PinyinDefaultParser parser;
+        PinyinKeyVector keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+        PinyinKeyPosVector poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+
+        validator.initialize(&largetable);
+        parser.parse(validator, keys, poses,linebuf);
+
+        if ( 0 == keys->len ){  // nothing parsed: release the arrays before reading the next line
+            g_array_free(keys, TRUE);
+            g_array_free(poses, TRUE);
+            continue;
+        }
+        CandidateConstraints constraints = g_array_new(FALSE, FALSE, sizeof(lookup_constraint_t));
+        g_array_set_size(constraints, keys->len);
+        for ( size_t i = 0; i < constraints->len; ++i){
+            lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+            constraint->m_type = NO_CONSTRAINT;
+        }
+
+        MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+        guint32 start_time = record_time();
+        size_t times = 100;
+        for ( size_t i = 0; i < times; ++i)
+            pinyin_lookup.get_best_match(keys, constraints, results);
+        print_time(start_time, times);
+        for ( size_t i = 0; i < results->len; ++i){
+            phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+            if ( 0 == *token)  // zero-valued tokens are skipped
+                continue;
+            printf("pos:%u,token:%u\t", (guint) i, *token);
+        }
+        printf("\n");
+        char * sentence = NULL;
+        pinyin_lookup.convert_to_utf8(results, sentence);
+        printf("%s\n", sentence);
+
+        g_array_free(constraints, TRUE);
+        g_array_free(results, TRUE);
+        g_array_free(keys, TRUE);
+        g_array_free(poses, TRUE);
+        g_free(sentence);
+    }
+    free(linebuf);
+}
diff --git a/tests/storage/Makefile.am b/tests/storage/Makefile.am
new file mode 100644
index 0000000..e38c690
--- /dev/null
+++ b/tests/storage/Makefile.am
@@ -0,0 +1,41 @@
+## Makefile.am -- Process this
file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+INCLUDES = -I$(top_srcdir)/src/include \
+           -I$(top_srcdir)/src/storage \
+           @GLIB2_CPPFLAGS@
+
+noinst_PROGRAMS = test_parser \
+                  test_pinyin_index \
+                  test_phrase_index \
+                  test_ngram
+
+test_parser_SOURCES = test_parser.cpp
+
+test_parser_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
+
+test_pinyin_index_SOURCES = test_pinyin_index.cpp
+
+test_pinyin_index_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
+
+test_phrase_index_SOURCES = test_phrase_index.cpp
+
+test_phrase_index_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
+
+test_ngram_SOURCES = test_ngram.cpp
+
+test_ngram_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@
diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp
new file mode 100644
index 0000000..7bdb141
--- /dev/null
+++ b/tests/storage/test_ngram.cpp
@@ -0,0 +1,126 @@
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "ngram.h"
+
+// Exercises SingleGram and Bigram; stores under /tmp/system.db, then attaches it as the system db.
+int main(int argc, char * argv[]){
+    SingleGram single_gram;
+
+    const guint32 total_freq = 16;
+    assert(single_gram.set_total_freq(total_freq));
+
+    phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3};
+    guint32 freqs[6] = { 1, 2, 4, 8, 16, 32};
+
+    for(int i = 0; i < 6 ;++i){
+        single_gram.set_freq(tokens[i], freqs[i]);
+    }
+
+    guint32 freq;
+    single_gram.get_freq(3, freq);
+    assert(freq == 32);  // token 3 is set twice above; the second value (32) is expected
+
+    printf("--------------------------------------------------------\n");
+    PhraseIndexRange range;
+    BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem));
+    range.m_range_begin = 0; range.m_range_end = 8;
+    single_gram.search(&range,array);
+    for ( guint i = 0; i < array->len; ++i){
+        BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+        printf("item:%u:%f\n", item->m_token, item->m_freq);
+    }
+
+    assert(single_gram.get_total_freq(freq));
+    assert(freq == total_freq);
+
+
+    Bigram bigram;
+    assert(bigram.attach(NULL, "/tmp/system.db"));
+    bigram.store(1, &single_gram);
+    single_gram.set_freq(5, 8);
+    single_gram.set_total_freq(32);
+
+    bigram.store(2, &single_gram);
+
+    printf("--------------------------------------------------------\n");
+    SingleGram * system, * user;
+    bigram.load(1, system, user);
+    assert(NULL == system);
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    user->search(&range,array);
+    for ( guint i = 0; i < array->len; ++i){
+        BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+        printf("item:%u:%f\n", item->m_token, item->m_freq);
+    }
+    delete user;
+
+    printf("--------------------------------------------------------\n");
+    bigram.load(2, system, user);
+    assert(NULL == system);
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    user->search(&range,array);
+    for ( guint i = 0; i < array->len; ++i){
+        BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+        printf("item:%u:%f\n", item->m_token, item->m_freq);
+    }
+    delete user;
+
+    bigram.attach("/tmp/system.db", NULL);  // same file, now attached as the system db
+    printf("--------------------------------------------------------\n");
+    bigram.load(1, system, user);
+    assert(NULL == user);
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    system->search(&range,array);
+    for ( guint i = 0; i < array->len; ++i){
+        BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+        printf("item:%u:%f\n", item->m_token, item->m_freq);
+    }
+    delete system;
+
+    printf("--------------------------------------------------------\n");
+    bigram.load(2, system, user);
+    assert(NULL == user);
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    system->search(&range,array);
+    for ( guint i = 0; i < array->len; ++i){
+        BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+        printf("item:%u:%f\n", item->m_token, item->m_freq);
+    }
+    delete system;
+
+    printf("--------------------------------------------------------\n");
+    single_gram.prune();
+    g_array_set_size(array, 0);
+    range.m_range_begin = 0; range.m_range_end = 8;
+    single_gram.search(&range,array);
+    for ( guint i = 0; i < array->len; ++i){
+        BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+        printf("item:%u:%f\n", item->m_token, item->m_freq);
+    }
+    assert(single_gram.get_total_freq(freq));
+    printf("total_freq:%u\n", freq);
+
+    g_array_free(array, TRUE);
+
+    GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    bigram.get_all_items(system_items, user_items);
+
+    printf("----------------------system----------------------------\n");
+    for ( guint i = 0; i < system_items->len; ++i){
+        phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i);
+        printf("item:%u\n", *token);
+    }
+    printf("-----------------------user-----------------------------\n");
+    for ( guint i = 0; i < user_items->len; ++i){
+        phrase_token_t * token = &g_array_index(user_items, phrase_token_t, i);
+        printf("item:%u\n", *token);
+    }
+    g_array_free(system_items, TRUE);
+    g_array_free(user_items, TRUE);
+}
diff --git a/tests/storage/test_parser.cpp b/tests/storage/test_parser.cpp
new file mode 100644
index 0000000..ba5bfb8
--- /dev/null
+++ b/tests/storage/test_parser.cpp
@@ -0,0 +1,165 @@
+/*
+ * libpinyin
+ *
+ * Copyright (c) 2006 James Su <suzhe@tsinghua.org.cn>
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ *
+ * $Id$
+ *
+ */
+
+#include <string.h>
+#include <iostream>
+#include "pinyin_base.h"
+
+typedef std::string String;
+
+static const char *help_msg =
+    "Too few argument!\n"
+    "Usage:\n"
+    " test-parser [options]\n\n"
+    " -i Use incomplete pinyin.\n"
+    " -f table Use specified pinyin table file.\n"
+    " -p parser Use specified parser instead of Default.\n"
+    " parser could be:\n"
+    " sp-stone\n"
+    " sp-zrm\n"
+    " sp-ms\n"
+    " sp-ziguang\n"
+    " sp-abc\n"
+    " sp-liushi\n"
+    " zy-zhuyin\n"
+    " zy-standard\n"
+    " zy-hsu\n"
+    " zy-ibm\n"
+    " zy-gin-yieh\n"
+    " zy-et\n"
+    " zy-et26\n";
+
+int main (int argc, char * argv [])
+{
+    NullPinyinValidator validator;
+    PinyinKeyVector keys;
+    PinyinKeyPosVector poses;
+    PinyinCustomSettings custom;
+    PinyinParser *parser = 0;
+    //PinyinTable table;
+    const char *tablefile = "../data/pinyin-table.txt";
+
+    keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+    poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+
+    int i = 0;
+    while (i<argc) {
+        if (++i >= argc) break;
+
+        if (String ("-h") == argv [i] || String ("--help") == argv [i]) {
+            std::cout << help_msg;
+            return 0;
+        }
+
+        if (String ("-i") == argv [i]) {
+            custom.set_use_incomplete (true);
+            continue;
+        }
+
+        if (String ("-p") == argv [i]) {
+            if (++i >= argc) {
+                std::cerr << "No argument for option " << argv [i-1] << "\n";
+                return -1;
+            }
+            if (!strcmp (argv[i], "sp") || !strcmp (argv[i], "sp-default"))
+                parser = new PinyinShuangPinParser ();
+            else if (!strcmp (argv[i], "sp-stone"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_STONE);
+            else if (!strcmp (argv[i], "sp-zrm"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_ZRM);
+            else if (!strcmp (argv[i], "sp-ms"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_MS);
+            else if (!strcmp (argv[i], "sp-ziguang"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_ZIGUANG);
+            else if (!strcmp (argv[i], "sp-abc"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_ABC);
+            else if (!strcmp (argv[i], "sp-liushi"))
+                parser = new PinyinShuangPinParser (SHUANG_PIN_LIUSHI);
+            continue;  // any other name leaves parser unset, so the default parser is used below
+        }
+
+        if (String ("-f") == argv [i]) {
+            if (++i >= argc) {
+                std::cerr << "No argument for option " << argv [i-1] << "\n";
+                return -1;
+            }
+            tablefile = argv [i];
+            continue;
+        }
+
+        std::cerr << "Invalid option: " << argv [i] << "\n";
+        return -1;
+    };
+
+    if (!parser) parser = new PinyinDefaultParser ();
+
+/*
+    if (!table.load (tablefile)) {
+        std::cerr << "Failed to load tablefile: " << tablefile << "\n";
+        return -1;
+    }
+*/
+    //table.update_custom_settings (custom);
+
+    char buf[1024];
+
+    while (1) {
+        std::cout << "Input:" << std::flush;
+        if (!std::cin.getline (buf, 1023, '\n')) break;  // stop on EOF or stream failure instead of looping forever
+
+        if (strncmp (buf, "quit", 4) == 0) break;
+
+        int len = parser->parse (validator, keys, poses,(const char *) buf);
+
+        std::cout << "Parsed " << len << " chars, " << keys->len << " keys:\n";
+
+        for (size_t i=0; i < keys->len; ++i){
+            PinyinKey * key = &g_array_index(keys, PinyinKey, i);
+            std::cout << key->get_key_string () << " ";
+        }
+
+        std::cout << std::endl;
+
+        for ( size_t i=0; i < poses->len; ++i){
+            PinyinKeyPos * pos = &g_array_index(poses, PinyinKeyPos, i);
+            std::cout << pos->get_pos() << " " << pos->get_length()<<" ";
+        }
+
+        std::cout << std::endl;
+
+        for (size_t i=0; i < keys->len; ++i){
+            PinyinKey * key = &g_array_index(keys, PinyinKey, i);
+            std::cout << key->get_key_zhuyin_string () << " ";
+        }
+
+        std::cout << std::endl;
+    }
+    g_array_free(keys, TRUE);
+    g_array_free(poses, TRUE);
+}
+
+/*
+vi:ts=4:nowrap:ai:expandtab
+*/
diff --git a/tests/storage/test_phrase_index.cpp b/tests/storage/test_phrase_index.cpp
new file mode 100644
index 0000000..d858ae2
--- /dev/null
+++ b/tests/storage/test_phrase_index.cpp
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include <glib.h>
+#include "memory_chunk.h"
+#include "pinyin_base.h"
+#include "phrase_index.h"
+
+size_t bench_times = 100000;
+// Current wall-clock time in microseconds, truncated to 32 bits (wraps around).
+guint32 record_time ()
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+    return (guint32) tv.tv_sec * 1000000 + tv.tv_usec;
+}
+// Print the time elapsed since old_time (a record_time() value) and the per-operation rate.
+void print_time (guint32 old_time, guint32 times)
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+
+    guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time;
+
+    printf("Spent %u us for %u operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted );
+}
+
+// Checks PhraseItem accessors, then store/load round trips of FacadePhraseIndex.
+int main(int argc, char * argv[]){
+    PhraseItem phrase_item;
+    utf16_t string1 = 2;
+    PinyinKey key1 = PinyinKey((PinyinInitial)3,(PinyinFinal)3,(PinyinTone)3);
+    PinyinKey key2 = PinyinKey((PinyinInitial)4,(PinyinFinal)4,(PinyinTone)4);
+
+
+    phrase_item.set_phrase_string(1, &string1);
+    phrase_item.append_pronunciation(&key1, 100);
+    phrase_item.append_pronunciation(&key2, 300);
+
+    assert(phrase_item.get_phrase_length() == 1);
+
+    PinyinKey key3;
+    guint32 freq;
+    phrase_item.get_nth_pronunciation(0, &key3, freq);
+    assert(key3 == key1);
+    assert(freq == 100);
+    phrase_item.get_nth_pronunciation(1, &key3, freq);
+    assert(key3 == key2);
+    assert(freq == 300);
+
+    PinyinCustomSettings custom;
+    gfloat poss = phrase_item.get_pinyin_possibility(custom, &key1);
+    printf("pinyin possiblitiy:%f\n", poss);
+
+    assert(phrase_item.get_unigram_frequency() == 0);
+
+    utf16_t string2;
+    phrase_item.get_phrase_string(&string2);
+    assert(string1 == string2);
+
+    FacadePhraseIndex phrase_index;
+    assert(phrase_index.add_phrase_item(1, &phrase_item));
+
+    MemoryChunk* chunk = new MemoryChunk;
+    assert(phrase_index.store(0, chunk));
+    assert(phrase_index.load(0, chunk));
+
+    PhraseItem item2;
+    guint32 time = record_time();
+    for ( size_t i = 0; i < bench_times; ++i){
+        phrase_index.get_phrase_item(1, item2);
+        assert(item2.get_unigram_frequency() == 0);
+        assert(item2.get_n_pronunciation() == 2);
+        assert(item2.get_phrase_length() == 1);
+        assert(item2.get_pinyin_possibility(custom, &key2) == 0.75);  // 300 / (100 + 300)
+    }
+    print_time(time, bench_times);
+
+    {
+        PhraseItem item3;
+        phrase_index.get_phrase_item(1, item3);
+        item3.increase_pinyin_possibility(custom, &key1, 200);
+        assert(item3.get_pinyin_possibility(custom, &key1) == 0.5) ;
+    }
+
+    {
+        PhraseItem item5;
+        phrase_index.get_phrase_item(1, item5);
+        gfloat poss = item5.get_pinyin_possibility(custom, &key1);
+        printf("pinyin poss:%f\n", poss);
+        assert(poss == 0.5);
+    }
+
+    FacadePhraseIndex phrase_index_load;
+
+    FILE* infile = fopen("../../data/gb_char.table", "r");
+    if ( NULL == infile ){
+        printf("open gb_char.table failed!\n");
+        exit(1);
+    }
+
+    phrase_index_load.load_text(1, infile);
+    fclose(infile);
+
+    infile = fopen("../../data/gbk_char.table", "r");
+    if ( NULL == infile ){
+        printf("open gbk_char.table failed!\n");
+        exit(1);
+    }
+
+    phrase_index_load.load_text(2, infile);
+    fclose(infile);
+
+    MemoryChunk* store1 = new MemoryChunk;
+    phrase_index_load.store(1, store1);
+    phrase_index_load.load(1, store1);
+
+    MemoryChunk* store2 = new MemoryChunk;
+    phrase_index_load.store(2, store2);
+    phrase_index_load.load(2, store2);
+
+    phrase_index_load.get_phrase_item(16870555, item2);
+    assert( item2.get_phrase_length() == 14);
+    assert( item2.get_n_pronunciation() == 1);
+
+    gunichar2 buf[1024];
+    item2.get_phrase_string(buf);
+    char * string = g_utf16_to_utf8( buf, 14, NULL, NULL, NULL);
+    printf("%s\n", string);
+    g_free(string);
+
+    guint32 delta = 3;
+    phrase_index_load.add_unigram_frequency(16870555, delta);
+    phrase_index_load.get_phrase_item(16870555, item2);
+    assert( item2.get_unigram_frequency() == 3);
+
+    phrase_index_load.get_phrase_item(16777222, item2);
+    assert(item2.get_phrase_length() == 1);
+    assert(item2.get_n_pronunciation() == 5);
+
+    return 0;
+}
diff --git a/tests/storage/test_pinyin_index.cpp b/tests/storage/test_pinyin_index.cpp
new file mode 100644
index 0000000..e79eb3b
--- /dev/null
+++ b/tests/storage/test_pinyin_index.cpp
@@ -0,0 +1,148 @@
+#include <string.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+
+size_t bench_times = 1000;
+// Current wall-clock time in microseconds, truncated to 32 bits (wraps around).
+guint32 record_time ()
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+    return (guint32) tv.tv_sec * 1000000 + tv.tv_usec;
+}
+// Print the time elapsed since old_time (a record_time() value) and the per-operation rate.
+void print_time (guint32 old_time, guint32 times)
+{
+    timeval tv;
+    gettimeofday (&tv, NULL);
+
+    guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time;
+
+    printf("Spent %u us for %u operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted );
+}
+
+int main( int argc, char * argv[]){
+
+    PinyinCustomSettings custom;
+    PinyinLargeTable largetable(&custom);
+
+    FILE * gbfile = fopen("../../data/gb_char.table", "r");
+    if ( gbfile == NULL) {
+        printf("open gb_char.table failed!");
+        return 1;
+    }
+    FILE * gbkfile = fopen("../../data/gbk_char.table","r");
+    if ( gbkfile == NULL) {
+        printf("open gbk_char.table failed!");  // name the file that actually failed to open
+        fclose(gbfile);
+        return 1;
+    }
+
+    largetable.load_text(gbfile);
+    fclose(gbfile);
+    largetable.load_text(gbkfile);
+    fclose(gbkfile);
+
+    FacadePhraseIndex phrase_index;
+
+    FILE* infile = fopen("../../data/gb_char.table", "r");
+    if ( NULL == infile ){
+        printf("open gb_char.table failed!\n");
+        exit(1);
+    }
+
+    phrase_index.load_text(1, infile);
+    fclose(infile);
+
+    infile = fopen("../../data/gbk_char.table", "r");
+    if ( NULL == infile ){
+        printf("open gbk_char.table failed!\n");
+        exit(1);
+    }
+
+    phrase_index.load_text(2, infile);
+    fclose(infile);
+
+    MemoryChunk* new_chunk = new MemoryChunk;
+    largetable.store(new_chunk);
+    largetable.load(new_chunk);
+
+    char* linebuf = (char *)malloc ( 1024 * sizeof (char) );
+    size_t size = 1024;
+    while( getline(&linebuf, &size, stdin) != -1 ){  // getline() returns -1 at EOF, which is truthy
+        linebuf[strcspn(linebuf, "\n")] = '\0';  // drop the newline only if one is present
+        if ( strcmp ( linebuf, "quit" ) == 0)
+            break;
+
+        PinyinDefaultParser parser;
+        NullPinyinValidator validator;
+        PinyinKeyVector keys;
+        PinyinKeyPosVector poses;
+
+        keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+        poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+        parser.parse(validator, keys, poses, linebuf);
+
+        guint32 start = record_time();
+
+        PhraseIndexRanges ranges;
+        for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
+            ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange));
+        }
+        for ( size_t i = 0 ; i < bench_times; ++i){
+            largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
+        }
+
+        for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
+            GArray * range = ranges[i];
+            g_array_set_size( range, 0);
+        }
+        print_time(start, bench_times);
+
+        largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
+        for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
+            GArray * range = ranges[i];
+            if ( range ){
+                for (guint k = 0; k < range->len; ++k){
+                    PhraseIndexRange* onerange = &g_array_index(range, PhraseIndexRange, k);
+                    printf("start:%u\tend:%u\n", onerange->m_range_begin, onerange->m_range_end);
+                    PhraseItem item;
+                    for ( phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){
+                        phrase_index.get_phrase_item( token, item);
+                        gunichar2 bufstr[1024];
+                        item.get_phrase_string(bufstr);
+                        char * string = g_utf16_to_utf8
+                            ( bufstr, item.get_phrase_length(),
+                              NULL, NULL, NULL);
+                        printf("%s\t", string);
+                        g_free(string);
+                        PinyinKey pinyin_buffer[1024];
+                        size_t npron = item.get_n_pronunciation();
+                        guint32 freq;
+                        for ( size_t n = 0; n < npron; ++n){
+                            item.get_nth_pronunciation(n, pinyin_buffer, freq);
+                            for ( size_t o = 0; o < item.get_phrase_length(); ++o){
+                                printf("%s'", pinyin_buffer[o].get_key_string());
+                            }
+                            printf("\b \t %u", freq);
+                        }
+                        printf("\n");
+                    }
+                }
+                if ( range->len)
+                    printf("range items number:%u\n", range->len);
+            }
+            g_array_free(range, TRUE);  // allocated anew for every input line, so release it here
+        }
+
+        g_array_free(keys, TRUE);
+        g_array_free(poses, TRUE);
+    }
+    free(linebuf);
+}
|