diff options
author | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
commit | b78429d78df745dd327b6dada6b9bd71ea5df84e (patch) | |
tree | 82c4625db8674c66d69fd566fce8efc347e3cb3a /tests/storage | |
download | libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.gz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.xz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.zip |
import libpinyin code
Diffstat (limited to 'tests/storage')
-rw-r--r-- | tests/storage/CMakeLists.txt | 71 | ||||
-rw-r--r-- | tests/storage/Makefile.am | 71 | ||||
-rw-r--r-- | tests/storage/test_chewing_table.cpp | 148 | ||||
-rw-r--r-- | tests/storage/test_flexible_ngram.cpp | 138 | ||||
-rw-r--r-- | tests/storage/test_ngram.cpp | 87 | ||||
-rw-r--r-- | tests/storage/test_parser2.cpp | 144 | ||||
-rw-r--r-- | tests/storage/test_phrase_index.cpp | 122 | ||||
-rw-r--r-- | tests/storage/test_phrase_index_logger.cpp | 67 | ||||
-rw-r--r-- | tests/storage/test_phrase_table.cpp | 86 | ||||
-rw-r--r-- | tests/storage/test_table_info.cpp | 84 |
10 files changed, 1018 insertions, 0 deletions
diff --git a/tests/storage/CMakeLists.txt b/tests/storage/CMakeLists.txt new file mode 100644 index 0000000..96b12fc --- /dev/null +++ b/tests/storage/CMakeLists.txt @@ -0,0 +1,71 @@ +include_directories(..) + +add_executable( + test_parser2 + test_parser2.cpp +) + +target_link_libraries( + test_parser2 + libpinyin +) + +add_executable( + test_chewing_table + test_chewing_table.cpp +) + +target_link_libraries( + test_chewing_table + libpinyin +) + +add_executable( + test_phrase_index + test_phrase_index.cpp +) + +target_link_libraries( + test_phrase_index + libpinyin +) + +add_executable( + test_phrase_index_logger + test_phrase_index_logger.cpp +) + +target_link_libraries( + test_phrase_index_logger + libpinyin +) + +add_executable( + test_phrase_table + test_phrase_table.cpp +) + +target_link_libraries( + test_phrase_table + libpinyin +) + +add_executable( + test_ngram + test_ngram.cpp +) + +target_link_libraries( + test_ngram + libpinyin +) + +add_executable( + test_flexible_ngram + test_flexible_ngram.cpp +) + +target_link_libraries( + test_flexible_ngram + libpinyin +) diff --git a/tests/storage/Makefile.am b/tests/storage/Makefile.am new file mode 100644 index 0000000..b7ed8b6 --- /dev/null +++ b/tests/storage/Makefile.am @@ -0,0 +1,71 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/tests \ + @GLIB2_CFLAGS@ + +TESTS = test_phrase_index_logger \ + test_ngram \ + test_flexible_ngram + +noinst_PROGRAMS = test_phrase_index \ + test_phrase_index_logger \ + test_phrase_table \ + test_ngram \ + test_flexible_ngram \ + test_parser2 \ + test_chewing_table \ + test_table_info + + +test_phrase_index_SOURCES = test_phrase_index.cpp + +test_phrase_index_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_phrase_index_logger_SOURCES = test_phrase_index_logger.cpp + +test_phrase_index_logger_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +test_phrase_table_SOURCES = test_phrase_table.cpp + +test_phrase_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_ngram_SOURCES = test_ngram.cpp + +test_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_flexible_ngram_SOURCES = test_flexible_ngram.cpp + +test_flexible_ngram_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +test_parser2_SOURCES = test_parser2.cpp + +test_parser2_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_chewing_table_SOURCES = test_chewing_table.cpp + +test_chewing_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_table_info_SOURCES = test_table_info.cpp + +test_table_info_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/tests/storage/test_chewing_table.cpp b/tests/storage/test_chewing_table.cpp new file mode 100644 index 0000000..f3d0f5d --- /dev/null +++ b/tests/storage/test_chewing_table.cpp @@ -0,0 +1,148 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "timer.h" +#include <string.h> +#include "pinyin_internal.h" +#include "tests_helper.h" + +size_t bench_times = 1000; + +int main(int argc, char * argv[]) { + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE; + ChewingLargeTable largetable(options); + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_table(phrase_files, &largetable, NULL, &phrase_index)) + exit(ENOENT); + + MemoryChunk * new_chunk = new MemoryChunk; + largetable.store(new_chunk); + largetable.load(new_chunk); + + char* linebuf = NULL; size_t size = 0; ssize_t read; + while ((read = getline(&linebuf, &size, stdin)) != -1) { + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + FullPinyinParser2 parser; + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); + if (0 == keys->len) { + fprintf(stderr, "Invalid input.\n"); + continue; + } + + guint32 start = record_time(); + PhraseIndexRanges ranges; + memset(ranges, 0, sizeof(PhraseIndexRanges)); + + phrase_index.prepare_ranges(ranges); + + for (size_t i = 0; i < bench_times; ++i) { + phrase_index.clear_ranges(ranges); + largetable.search(keys->len, (ChewingKey *)keys->data, ranges); + } + print_time(start, bench_times); + + phrase_index.clear_ranges(ranges); + largetable.search(keys->len, (ChewingKey *)keys->data, ranges); + + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & range = ranges[i]; + if (!range) + continue; + + if (range->len) + printf("range items number:%d\n", range->len); + + for (size_t k = 0; k < range->len; ++k) { + PhraseIndexRange * onerange = + &g_array_index(range, PhraseIndexRange, k); + printf("start:%d\tend:%d\n", onerange->m_range_begin, + onerange->m_range_end); + + PhraseItem item; + for ( phrase_token_t token = onerange->m_range_begin; + token != onerange->m_range_end; ++token){ + + phrase_index.get_phrase_item( token, item); + + /* get phrase string */ + ucs4_t buffer[MAX_PHRASE_LENGTH + 1]; + item.get_phrase_string(buffer); + char * string = g_ucs4_to_utf8 + ( buffer, item.get_phrase_length(), + NULL, NULL, NULL); + printf("%s\t", string); + g_free(string); + + ChewingKey chewing_buffer[MAX_PHRASE_LENGTH]; + size_t npron = item.get_n_pronunciation(); + guint32 freq; + for (size_t m = 0; m < npron; ++m){ + item.get_nth_pronunciation(m, chewing_buffer, freq); + for (size_t n = 0; n < item.get_phrase_length(); + ++n){ + gchar * pinyins = + chewing_buffer[n].get_pinyin_string(); + printf("%s'", pinyins); + g_free(pinyins); + } + printf("\b\t%d\t", freq); + } + } + printf("\n"); + } + g_array_set_size(range, 0); + } + + phrase_index.destroy_ranges(ranges); + g_array_free(keys, TRUE); + g_array_free(key_rests, TRUE); + } + + if (linebuf) + free(linebuf); + + /* mask out all index items. */ + largetable.mask_out(0x0, 0x0); + + return 0; +} diff --git a/tests/storage/test_flexible_ngram.cpp b/tests/storage/test_flexible_ngram.cpp new file mode 100644 index 0000000..d7d7950 --- /dev/null +++ b/tests/storage/test_flexible_ngram.cpp @@ -0,0 +1,138 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "pinyin_internal.h" + +int main(int argc, char * argv[]) { + FlexibleSingleGram<guint32, guint32> single_gram; + typedef FlexibleSingleGram<guint32, guint32>::ArrayItemWithToken array_item_t; + + const guint32 total_freq = 16; + assert(single_gram.set_array_header(total_freq)); + + phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3 }; + guint32 freqs[6] = { 1, 2, 4, 8, 16, 32}; + + guint32 freq; + + for ( size_t i = 0; i < G_N_ELEMENTS(tokens); ++i ){ + if ( single_gram.get_array_item(tokens[i], freq) ) + assert(single_gram.set_array_item(tokens[i], freqs[i])); + else + assert(single_gram.insert_array_item(tokens[i], freqs[i])); + } + + single_gram.get_array_item(3, freq); + assert(freq == 32); + + printf("--------------------------------------------------------\n"); + PhraseIndexRange range; + FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(array_item_t)); + range.m_range_begin = 0; range.m_range_end = 8; + single_gram.search(&range, array); + for ( size_t i = 0; i < array->len; ++i ){ + array_item_t * item = &g_array_index(array, array_item_t, i); + printf("item:%d:%d\n", item->m_token, item->m_item); + } + + assert(single_gram.get_array_header(freq)); + assert(freq == total_freq); + + FlexibleBigram<guint32, guint32, guint32> bigram("TEST"); + assert(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE)); + bigram.store(1, &single_gram); + assert(single_gram.insert_array_item(5, 8)); + assert(single_gram.remove_array_item(1, freq)); + assert(single_gram.set_array_header(32)); + assert(single_gram.get_array_header(freq)); + printf("new array header:%d\n", freq); + bigram.store(2, &single_gram); + + for (int m = 1; m <= 2; ++m ){ + printf("--------------------------------------------------------\n"); + FlexibleSingleGram<guint32, guint32> * train_gram; + bigram.load(m, train_gram); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + train_gram->search(&range, array); + for ( size_t i = 0; i < array->len; ++i ){ + array_item_t * item = &g_array_index(array, array_item_t, i); + printf("item:%d:%d\n", item->m_token, item->m_item); + } + delete train_gram; + } + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); + printf("-----------------------items----------------------------\n"); + for ( size_t i = 0; i < items->len; ++i ){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + printf("item:%d\n", *token); + } + + printf("-----------------------magic header---------------------\n"); + bigram.set_magic_header(total_freq); + bigram.get_magic_header(freq); + assert(total_freq == freq); + printf("magic header:%d\n", freq); + + printf("-----------------------array header---------------------\n"); + for ( int i = 1; i <= 2; ++i){ + bigram.get_array_header(i, freq); + printf("single gram: %d, freq:%d\n", i, freq); + } + + bigram.set_array_header(1, 1); + + printf("-----------------------array header---------------------\n"); + for ( int i = 1; i <= 2; ++i){ + bigram.get_array_header(i, freq); + printf("single gram: %d, freq:%d\n", i, freq); + } + + for (int m = 1; m <= 2; ++m ){ + printf("--------------------------------------------------------\n"); + FlexibleSingleGram<guint32, guint32> * train_gram; + bigram.load(m, train_gram); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + train_gram->search(&range, array); + for ( size_t i = 0; i < array->len; ++i ){ + array_item_t * item = &g_array_index(array, array_item_t, i); + printf("item:%d:%d\n", item->m_token, item->m_item); + } + delete train_gram; + } + + assert(bigram.remove(1)); + + bigram.get_all_items(items); + printf("-----------------------items----------------------------\n"); + for ( size_t i = 0; i < items->len; ++i ){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + printf("item:%d\n", *token); + } + + g_array_free(items, TRUE); + g_array_free(array, TRUE); + return 0; +} diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp new file mode 100644 index 0000000..f82cf1f --- /dev/null +++ b/tests/storage/test_ngram.cpp @@ -0,0 +1,87 @@ +#include <stdio.h> +#include "pinyin_internal.h" + + +int main(int argc, char * argv[]){ + SingleGram single_gram; + + const guint32 total_freq = 16; + assert(single_gram.set_total_freq(total_freq)); + + phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3}; + guint32 freqs[6] = { 1, 2, 4, 8, 16, 32}; + + guint32 freq; + + for(size_t i = 0; i < 6 ;++i){ + if ( single_gram.get_freq(tokens[i], freq)) + assert(single_gram.set_freq(tokens[i], freqs[i])); + else + assert(single_gram.insert_freq(tokens[i], freqs[i])); + } + + single_gram.get_freq(3, freq); + assert(freq == 32); + + printf("--------------------------------------------------------\n"); + PhraseIndexRange range; + BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem)); + range.m_range_begin = 0; range.m_range_end = 8; + single_gram.search(&range,array); + for ( size_t i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + + assert(single_gram.get_total_freq(freq)); + assert(freq == total_freq); + + Bigram bigram; + assert(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE)); + bigram.store(1, &single_gram); + assert(single_gram.insert_freq(5, 8)); + assert(single_gram.remove_freq(1, freq)); + single_gram.set_total_freq(32); + + bigram.store(2, &single_gram); + + + SingleGram * gram = NULL; + for ( int m = 1; m <= 2; ++m ){ + printf("--------------------------------------------------------\n"); + bigram.load(m, gram); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + gram->search(&range,array); + for ( size_t i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + delete gram; + } + + printf("--------------------------------------------------------\n"); + assert(single_gram.get_total_freq(freq)); + printf("total_freq:%d\n", freq); + + g_array_free(array, TRUE); + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); + + printf("----------------------system----------------------------\n"); + for ( size_t i = 0; i < items->len; ++i){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + printf("item:%d\n", *token); + } + + assert(bigram.load_db("/tmp/test.db")); + assert(bigram.save_db("/tmp/test.db")); + + g_array_free(items, TRUE); + + /* mask out all index items. */ + bigram.mask_out(0x0, 0x0); + + return 0; +} diff --git a/tests/storage/test_parser2.cpp b/tests/storage/test_parser2.cpp new file mode 100644 index 0000000..638cd96 --- /dev/null +++ b/tests/storage/test_parser2.cpp @@ -0,0 +1,144 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "timer.h" +#include <errno.h> +#include <stdio.h> +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include "pinyin_parser2.h" + + +static const gchar * parsername = ""; +static gboolean incomplete = FALSE; + +static GOptionEntry entries[] = +{ + {"parser", 'p', 0, G_OPTION_ARG_STRING, &parsername, "parser", "fullpinyin doublepinyin chewing"}, + {"incomplete", 'i', 0, G_OPTION_ARG_NONE, &incomplete, "incomplete pinyin", NULL}, + {NULL} +}; + +#if 0 + " -s <scheme> specify scheme for doublepinyin/chewing.\n" + " schemes for doublepinyin: zrm, ms, ziguang, abc, pyjj, xhe.\n" + " schemes for chewing: standard, ibm, ginyieh, eten.\n" +#endif + + +size_t bench_times = 1000; + +using namespace pinyin; + + +int main(int argc, char * argv[]) { + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- test pinyin parser"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE | USE_RESPLIT_TABLE; + if (incomplete) + options |= PINYIN_INCOMPLETE | CHEWING_INCOMPLETE; + + PinyinParser2 * parser = NULL; + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + /* create the parser */ + if (strcmp("fullpinyin", parsername) == 0) { + parser = new FullPinyinParser2(); + } else if (strcmp("doublepinyin", parsername) == 0) { + parser = new DoublePinyinParser2(); + } else if (strcmp("chewing", parsername) == 0) { + parser = new ChewingParser2(); + } + + if (!parser) + parser = new FullPinyinParser2(); + + char* linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, stdin)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + +#if 0 + ChewingKey key; + bool success = parser->parse_one_key(options, key, + linebuf, strlen(linebuf)); + if (success) { + gchar * pinyins = key.get_pinyin_string(); + printf("pinyin:%s\n", pinyins); + g_free(pinyins); + } +#endif + +#if 1 + int len = 0; + guint32 start_time = record_time(); + for ( size_t i = 0; i < bench_times; ++i) + len = parser->parse(options, keys, key_rests, + linebuf, strlen(linebuf)); + + print_time(start_time, bench_times); + + printf("parsed %d chars, %d keys.\n", len, keys->len); + + assert(keys->len == key_rests->len); + + for (size_t i = 0; i < keys->len; ++i) { + ChewingKey * key = + &g_array_index(keys, ChewingKey, i); + ChewingKeyRest * key_rest = + &g_array_index(key_rests, ChewingKeyRest, i); + + gchar * pinyins = key->get_pinyin_string(); + printf("%s %d %d\t", pinyins, + key_rest->m_raw_begin, key_rest->m_raw_end); + g_free(pinyins); + } + printf("\n"); +#endif + + } + + if (linebuf) + free(linebuf); + + delete parser; + + g_array_free(key_rests, TRUE); + g_array_free(keys, TRUE); + + return 0; +} diff --git a/tests/storage/test_phrase_index.cpp b/tests/storage/test_phrase_index.cpp new file mode 100644 index 0000000..79a3ca4 --- /dev/null +++ b/tests/storage/test_phrase_index.cpp @@ -0,0 +1,122 @@ +#include "timer.h" +#include <stdio.h> +#include <errno.h> +#include "pinyin_internal.h" +#include "tests_helper.h" + +size_t bench_times = 100000; + +int main(int argc, char * argv[]){ + PhraseItem phrase_item; + ucs4_t string1 = 2; + ChewingKey key1 = ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ENG); + ChewingKey key2 = ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ANG); + + + phrase_item.set_phrase_string(1, &string1); + phrase_item.add_pronunciation(&key1, 100); + phrase_item.add_pronunciation(&key2, 300); + + assert(phrase_item.get_phrase_length() == 1); + + ChewingKey key3; + guint32 freq; + phrase_item.get_nth_pronunciation(0, &key3, freq); + assert(key3 == key1); + assert(freq == 100); + phrase_item.get_nth_pronunciation(1, &key3, freq); + assert(key3 == key2); + assert(freq == 300); + + pinyin_option_t options = 0; + gfloat poss = phrase_item.get_pronunciation_possibility(options, &key1); + printf("pinyin possiblitiy:%f\n", poss); + + assert(phrase_item.get_unigram_frequency() == 0); + + ucs4_t string2; + phrase_item.get_phrase_string(&string2); + assert(string1 == string2); + + FacadePhraseIndex phrase_index_test; + assert(!phrase_index_test.add_phrase_item(1, &phrase_item)); + + MemoryChunk* chunk = new MemoryChunk; + assert(phrase_index_test.store(0, chunk)); + assert(phrase_index_test.load(0, chunk)); + + PhraseItem item2; + guint32 time = record_time(); + for ( size_t i = 0; i < bench_times; ++i){ + phrase_index_test.get_phrase_item(1, item2); + assert(item2.get_unigram_frequency() == 0); + assert(item2.get_n_pronunciation() == 2); + assert(item2.get_phrase_length() == 1); + assert(item2.get_pronunciation_possibility(options, &key2) == 0.75); + } + print_time(time, bench_times); + + { + PhraseItem item3; + phrase_index_test.get_phrase_item(1, item3); + item3.increase_pronunciation_possibility(options, &key1, 200); + assert(item3.get_pronunciation_possibility(options, &key1) == 0.5) ; + } + + { + PhraseItem item5; + phrase_index_test.get_phrase_item(1, item5); + gfloat poss = item5.get_pronunciation_possibility(options, &key1); + printf("pinyin poss:%f\n", poss); + assert(poss == 0.5); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_table(phrase_files, NULL, NULL, &phrase_index)) + exit(ENOENT); + + phrase_index.compact(); + + MemoryChunk* store1 = new MemoryChunk; + phrase_index.store(1, store1); + phrase_index.load(1, store1); + + MemoryChunk* store2 = new MemoryChunk; + phrase_index.store(2, store2); + phrase_index.load(2, store2); + + phrase_index.compact(); + + phrase_index.get_phrase_item(16870553, item2); + assert( item2.get_phrase_length() == 14); + assert( item2.get_n_pronunciation() == 1); + + ucs4_t buf[1024]; + item2.get_phrase_string(buf); + char * string = g_ucs4_to_utf8( buf, 14, NULL, NULL, NULL); + printf("%s\n", string); + g_free(string); + + guint32 delta = 3; + phrase_index.add_unigram_frequency(16870553, delta); + phrase_index.get_phrase_item(16870553, item2); + assert( item2.get_unigram_frequency() == 3); + + phrase_index.get_phrase_item(16777222, item2); + assert(item2.get_phrase_length() == 1); + assert(item2.get_n_pronunciation() == 2); + + return 0; +} diff --git a/tests/storage/test_phrase_index_logger.cpp b/tests/storage/test_phrase_index_logger.cpp new file mode 100644 index 0000000..c423c40 --- /dev/null +++ b/tests/storage/test_phrase_index_logger.cpp @@ -0,0 +1,67 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "pinyin_internal.h" + + +/* TODO: check whether gb_char.bin and gb_char2.bin should be the same. */ + +int main(int argc, char * argv[]){ + FacadePhraseIndex phrase_index; + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + + PhraseIndexRange range; + assert(ERROR_OK == phrase_index.get_range(1, range)); + for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) { + phrase_index.add_unigram_frequency(i, 1); + } + + printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq()); + + MemoryChunk * new_chunk = new MemoryChunk; + phrase_index.store(1, new_chunk); + new_chunk->save("/tmp/gb_char.bin"); + delete new_chunk; + + chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + new_chunk = new MemoryChunk; + assert(phrase_index.diff(1, chunk, new_chunk)); + new_chunk->save("/tmp/gb_char.dbin"); + delete new_chunk; + + chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + new_chunk = new MemoryChunk; + new_chunk->load("/tmp/gb_char.dbin"); + assert(phrase_index.merge(1, new_chunk)); + chunk = new MemoryChunk; + phrase_index.store(1, chunk); + chunk->save("/tmp/gb_char2.bin"); + delete chunk; + + printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq()); + + return 0; +} diff --git a/tests/storage/test_phrase_table.cpp b/tests/storage/test_phrase_table.cpp new file mode 100644 index 0000000..a9c8ed5 --- /dev/null +++ b/tests/storage/test_phrase_table.cpp @@ -0,0 +1,86 @@ +#include "timer.h" +#include <string.h> +#include "pinyin_internal.h" +#include "tests_helper.h" + +size_t bench_times = 1000; + +int main(int argc, char * argv[]){ + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + PhraseLargeTable2 largetable; + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_table(phrase_files, NULL, &largetable, &phrase_index)) + exit(ENOENT); + + MemoryChunk * chunk = new MemoryChunk; + largetable.store(chunk); + largetable.load(chunk); + + char* linebuf = NULL; size_t size = 0; ssize_t read; + while ((read = getline(&linebuf, &size, stdin)) != -1) { + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + glong phrase_len = g_utf8_strlen(linebuf, -1); + ucs4_t * new_phrase = g_utf8_to_ucs4(linebuf, -1, NULL, NULL, NULL); + + if (0 == phrase_len) + continue; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index.prepare_tokens(tokens); + + guint32 start = record_time(); + for (size_t i = 0; i < bench_times; ++i){ + phrase_index.clear_tokens(tokens); + largetable.search(phrase_len, new_phrase, tokens); + } + print_time(start, bench_times); + + phrase_index.clear_tokens(tokens); + int retval = largetable.search(phrase_len, new_phrase, tokens); + + if (retval & SEARCH_OK) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * array = tokens[i]; + if (NULL == array) + continue; + + for (size_t k = 0; k < array->len; ++k) { + phrase_token_t token = g_array_index + (array, phrase_token_t, k); + + printf("token:%d\t", token); + } + } + printf("\n"); + } + + phrase_index.destroy_tokens(tokens); + g_free(new_phrase); + } + + if ( linebuf ) + free(linebuf); + + /* mask out all index items. */ + largetable.mask_out(0x0, 0x0); + + return 0; +} diff --git a/tests/storage/test_table_info.cpp b/tests/storage/test_table_info.cpp new file mode 100644 index 0000000..68b4735 --- /dev/null +++ b/tests/storage/test_table_info.cpp @@ -0,0 +1,84 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2013 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include "pinyin_internal.h" + + +int main(int argc, char * argv[]) { + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + printf("lambda:%f\n", system_table_info.get_lambda()); + + size_t i; + for (i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = + system_table_info.get_table_info() + i; + + assert(i == table_info->m_dict_index); + printf("table index:%d\n", table_info->m_dict_index); + + switch(table_info->m_file_type) { + case NOT_USED: + printf("not used.\n"); + break; + + case SYSTEM_FILE: + printf("system file:%s %s %s.\n", table_info->m_table_filename, + table_info->m_system_filename, table_info->m_user_filename); + break; + + case DICTIONARY: + printf("dictionary:%s %s %s.\n", table_info->m_table_filename, + table_info->m_system_filename, table_info->m_user_filename); + break; + + case USER_FILE: + printf("user file:%s.\n", table_info->m_user_filename); + break; + + default: + assert(false); + } + } + + UserTableInfo user_table_info; + retval = user_table_info.is_conform(&system_table_info); + assert(!retval); + + user_table_info.make_conform(&system_table_info); + retval = user_table_info.is_conform(&system_table_info); + assert(retval); + + assert(user_table_info.save("/tmp/user.conf")); + assert(user_table_info.load("/tmp/user.conf")); + + retval = user_table_info.is_conform(&system_table_info); + assert(retval); + + return 0; +} |