summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.github/workflows/make-check.yml30
-rw-r--r--CMakeLists.txt5
-rw-r--r--ChangeLog144
-rw-r--r--Makefile.am5
-rw-r--r--NEWS177
-rw-r--r--configure.ac22
-rw-r--r--data/CMakeLists.txt23
-rw-r--r--data/Makefile.am6
-rw-r--r--data/table.conf.in29
-rw-r--r--libpinyin.pc.in1
-rw-r--r--libzhuyin.pc.in1
-rw-r--r--scripts2/fullpinyintable.py23
-rw-r--r--scripts2/options.py72
-rw-r--r--scripts2/specials.txt6
-rw-r--r--scripts2/specialtable.py2
-rw-r--r--src/Makefile.am81
-rw-r--r--src/include/Makefile.am8
-rw-r--r--src/include/memory_chunk.h38
-rw-r--r--src/include/pinyin_utils.h32
-rw-r--r--src/include/unaligned_memory.h61
-rw-r--r--src/libpinyin.exp74
-rw-r--r--src/libzhuyin.exp52
-rw-r--r--src/lookup/Makefile.am34
-rw-r--r--src/lookup/phonetic_lookup.cpp2
-rw-r--r--src/lookup/phonetic_lookup.h35
-rw-r--r--src/lookup/pinyin_lookup2.cpp11
-rw-r--r--src/pinyin.cpp542
-rw-r--r--src/pinyin.h33
-rw-r--r--src/pinyin_internal.h1
-rw-r--r--src/storage/Makefile.am16
-rw-r--r--src/storage/bdb_utils.h4
-rw-r--r--src/storage/chewing_large_table.cpp21
-rw-r--r--src/storage/chewing_large_table.h1
-rw-r--r--src/storage/chewing_large_table2.cpp11
-rw-r--r--src/storage/chewing_large_table2.h82
-rw-r--r--src/storage/chewing_large_table2_bdb.cpp158
-rw-r--r--src/storage/chewing_large_table2_bdb.h17
-rw-r--r--src/storage/chewing_large_table2_kyotodb.cpp183
-rw-r--r--src/storage/chewing_large_table2_kyotodb.h17
-rw-r--r--src/storage/facade_chewing_table2.h26
-rw-r--r--src/storage/facade_phrase_table3.h25
-rw-r--r--src/storage/flexible_ngram.h1
-rw-r--r--src/storage/flexible_ngram_bdb.h6
-rw-r--r--src/storage/flexible_ngram_kyotodb.h8
-rw-r--r--src/storage/kyotodb_utils.h2
-rw-r--r--src/storage/ngram.cpp46
-rw-r--r--src/storage/ngram.h1
-rw-r--r--src/storage/ngram_bdb.cpp12
-rw-r--r--src/storage/ngram_kyotodb.cpp26
-rw-r--r--src/storage/phonetic_key_matrix.cpp94
-rw-r--r--src/storage/phonetic_key_matrix.h5
-rw-r--r--src/storage/phrase_index.cpp32
-rw-r--r--src/storage/phrase_index.h13
-rw-r--r--src/storage/phrase_index_logger.h2
-rw-r--r--src/storage/phrase_large_table2.cpp22
-rw-r--r--src/storage/phrase_large_table3.cpp5
-rw-r--r--src/storage/phrase_large_table3.h9
-rw-r--r--src/storage/phrase_large_table3_bdb.cpp100
-rw-r--r--src/storage/phrase_large_table3_bdb.h2
-rw-r--r--src/storage/phrase_large_table3_kyotodb.cpp113
-rw-r--r--src/storage/phrase_large_table3_kyotodb.h2
-rw-r--r--src/storage/pinyin_custom2.h35
-rw-r--r--src/storage/pinyin_custom3.h142
-rw-r--r--src/storage/pinyin_parser2.cpp60
-rw-r--r--src/storage/pinyin_parser2.h9
-rw-r--r--src/storage/pinyin_parser_table.h1320
-rw-r--r--src/storage/pinyin_phrase3.h4
-rw-r--r--src/storage/special_table.h3
-rw-r--r--src/storage/table_info.cpp49
-rw-r--r--src/storage/table_info.h8
-rw-r--r--src/storage/zhuyin_parser2.cpp25
-rw-r--r--src/storage/zhuyin_parser2.h8
-rw-r--r--src/zhuyin.cpp50
-rw-r--r--src/zhuyin.h4
-rw-r--r--tests/Makefile.am10
-rw-r--r--tests/include/CMakeLists.txt2
-rw-r--r--tests/include/Makefile.am2
-rw-r--r--tests/include/test_memory_chunk.cpp8
-rw-r--r--tests/lookup/Makefile.am9
-rw-r--r--tests/lookup/test_pinyin_lookup.cpp2
-rw-r--r--tests/storage/CMakeLists.txt6
-rw-r--r--tests/storage/Makefile.am9
-rw-r--r--tests/storage/test_flexible_ngram.cpp20
-rw-r--r--tests/storage/test_ngram.cpp20
-rw-r--r--tests/storage/test_phrase_index.cpp6
-rw-r--r--tests/storage/test_phrase_index_logger.cpp6
-rw-r--r--tests/storage/test_table_info.cpp6
-rw-r--r--tests/test_pinyin.cpp4
-rw-r--r--utils/segment/Makefile.am17
-rw-r--r--utils/storage/Makefile.am9
-rw-r--r--utils/storage/gen_binary_files.cpp4
-rw-r--r--utils/storage/import_interpolation.cpp45
-rw-r--r--utils/training/Makefile.am19
-rw-r--r--utils/training/estimate_interpolation.cpp4
-rw-r--r--utils/training/estimate_k_mixture_model.cpp10
-rw-r--r--utils/training/eval_correction_rate.cpp6
-rw-r--r--utils/training/export_k_mixture_model.cpp4
-rw-r--r--utils/training/gen_deleted_ngram.cpp4
-rw-r--r--utils/training/gen_k_mixture_model.cpp54
-rw-r--r--utils/training/gen_ngram.cpp4
-rw-r--r--utils/training/gen_unigram.cpp3
-rw-r--r--utils/training/import_k_mixture_model.cpp40
-rw-r--r--utils/training/k_mixture_model.h2
-rw-r--r--utils/training/k_mixture_model_to_interpolation.cpp28
-rw-r--r--utils/training/merge_k_mixture_model.cpp16
-rw-r--r--utils/training/prune_k_mixture_model.cpp8
-rw-r--r--utils/training/validate_k_mixture_model.cpp6
-rw-r--r--utils/utils_helper.h11
108 files changed, 3138 insertions, 1595 deletions
diff --git a/.github/workflows/make-check.yml b/.github/workflows/make-check.yml
new file mode 100644
index 0000000..47a83b6
--- /dev/null
+++ b/.github/workflows/make-check.yml
@@ -0,0 +1,30 @@
+name: C/C++ CI
+
+on:
+ push:
+ branches: [ main ]
+ pull_request:
+ branches: [ main ]
+
+jobs:
+ build:
+
+ runs-on: ubuntu-latest
+
+ container:
+ image: fedora:rawhide
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: install build dependency
+ run: sudo dnf install -y gcc-c++ libdb-devel glib2-devel make gnome-common wget
+ - name: autoconf
+ run: ./autogen.sh
+ - name: configure
+ run: ./configure --with-dbm=BerkeleyDB
+ - name: make
+ run: make V=1 VERBOSE=1
+ - name: make check
+ run: make check
+ - name: make distcheck
+ run: make distcheck
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78761aa..4f62a9f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,6 +71,7 @@ if (DB_FOUND)
include_directories ( ${DB_INCLUDE_DIR} )
SET (LIBS ${LIBS} ${DB_LIBRARIES})
SET (HAVE_BERKELEY_DB 1)
+ SET (DATABASE_FORMAT "BerkeleyDB")
SET (CMAKE_CXX_LINK_EXECUTABLE
"${CMAKE_CXX_LINK_EXECUTABLE} ${LIBS}")
endif (DB_FOUND)
@@ -82,6 +83,7 @@ if (NOT HAVE_BERKELEY_DB)
include_directories ( ${KyotoCabinet_INCLUDE_PATH} )
SET (LIBS ${LIBS} ${KyotoCabinet_LIBRARY})
SET (HAVE_KYOTO_CABINET 1)
+ SET (DATABASE_FORMAT "KyotoCabinet")
SET (CMAKE_CXX_LINK_EXECUTABLE
"${CMAKE_CXX_LINK_EXECUTABLE} ${LIBS}")
endif (KyotoCabinet_FOUND)
@@ -123,7 +125,7 @@ if (DEFINED SYSCONF_INSTALL_DIR)
endif (DEFINED SYSCONF_INSTALL_DIR)
set (DIR_SHARE_LIBPINYIN ${DIR_SHARE}/libpinyin)
-set (DIR_INCLUDE_LIBPINYIN ${DIR_INCLUDE}/libpinyin-${LIBPINYIN_BINARY_VERSION})
+set (DIR_INCLUDE_LIBPINYIN ${DIR_INCLUDE}/libpinyin-${VERSION})
######## Configuration
@@ -162,6 +164,7 @@ if (CMAKE_BUILD_TYPE MATCHES Debug)
endif (CMAKE_BUILD_TYPE MATCHES Debug)
include_directories(
+ ${CMAKE_BINARY_DIR}
${GLIB2_INCLUDE_DIR}
${PROJECT_SOURCE_DIR}/
${PROJECT_SOURCE_DIR}/src
diff --git a/ChangeLog b/ChangeLog
index ab12deb..e69de29 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,144 +0,0 @@
-version 2.2.2
-* minor fixes
-
-version 2.2.1
-* fixes predicted candidates
-
-version 2.2.0
-* bug fixes
-
-version 2.1.91
-* fixes zhuyin parsers;
-
-version 2.1.0
-* support sort option in pinyin_guess_candidates function;
-
-version 2.0.92
-* reduce memory consumption after imported user dictionary;
-
-version 2.0.91
-* merge libzhuyin code;
-
-version 2.0.0
-* the first official release of 2.0.x;
-* fixes autoconf;
-
-version 1.9.92
-* fixes crash in double pinyin;
-
-version 1.9.91
-* multiple sentence candidates;
-
-version 1.7.0
-* fixes build on FreeBSD;
-* update cmake files;
-
-version 1.6.91
-* change license to GPLv3+;
-* import open-gram dictionary and remove pinyin tones;
-* add some checks when load data from file;
-
-version 1.6.0
-* bug fixes.
-
-version 1.5.91
-* change pinyin/phrase tables to use dbm.
-* enhance pinyin key representation and pinyin parsers.
-
-version 1.2.0
-* bug fixes.
-
-version 1.1.91
-* support Kyoto Cabinet as alternative to Berkeley DB.
-* improve multiple dictionaries support feature.
-
-version 1.1.0
-* support to export user phrases.
-
-version 1.0.0
-* the first official release of 1.0.x.
-
-version 0.9.94
-* bug fixes.
-
-version 0.9.93
-* fixes libpinyin issues from coverity scan report.
-
-version 0.9.92
-* bug fixes.
-
-version 0.9.91
-* code re-factor.
-
-version 0.9.0
-* the first official release of 0.9.x.
-* fixes import dictionary.
-
-version 0.8.93
-* add back pinyin_clear_constraint.
-
-version 0.8.92
-* fixes model data.
-
-version 0.8.91
-* multiple dictioniares and user dictionary support.
-
-version 0.8.1
-* bug fixes.
-
-version 0.8.0
-* the first official release of 0.8.x.
-
-version 0.7.92
-* re-factor PhraseLookup class.
-* all tests passed simple valgrind memory check.
-
-version 0.7.91
-* simplify PinyinLookup class.
-
-version 0.7.1
-* add API to lookup pinyin for characters.
-
-version 0.7.0
-* the first official release of 0.7.x.
-
-version 0.6.92
-* draft support for multiple professional phrase libraries.
-
-version 0.6.91
-* support ucs4 characters.
-* support guess sentence with prefix.
-* initially support fuzzy pinyin segment.
-
-version 0.6.0
-* the first official release of 0.6.x.
-
-version 0.5.92
-* fixes new parsers and chewing large table.
-* improves pinyin_save.
-
-version 0.5.91
-* some code re-factor and simplify.
-* fixes the self-learning work around.
-
-version 0.5.0
-* the first official release of 0.5.x.
-
-version 0.4.93
-* fixes some bugs in new parsers.
-
-version 0.4.92
-* enable parallel make.
-
-version 0.4.91
-* New parsers for full pinyin/double pinyin/chewing.
- * libpinyin now fully supports all pinyin auto corrections of
-ibus-pinyin.
- * libpinyin now better supports an/ang, en/eng, in/ing fuzzy
-pinyin match.
-
-version 0.3.0
-* the first official release of 0.3.x.
-
-version 0.2.99
-* import from pinyin.
diff --git a/Makefile.am b/Makefile.am
index 6266b2d..fbaefed 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -31,3 +31,8 @@ pkgconfig_DATA = libpinyin.pc
if ENABLE_LIBZHUYIN
pkgconfig_DATA += libzhuyin.pc
endif
+
+dist-hook:
+ if test -d .git ; then \
+ git log --name-status --date=iso > $(distdir)/ChangeLog ; \
+ fi
diff --git a/NEWS b/NEWS
index e69de29..ff89d75 100644
--- a/NEWS
+++ b/NEWS
@@ -0,0 +1,177 @@
+version 2.8.1
+* bug fixes
+
+version 2.8.0
+* bug fixes for ARMv7
+
+version 2.7.92
+* bug fixes
+
+version 2.7.91
+* improve suggestion candidates
+* support longer candidates
+
+version 2.6.2
+* bug fixes
+
+version 2.6.1
+* bug fixes
+
+version 2.6.0
+* bug fixes
+
+version 2.4.92
+* update pinyin data
+* bug fixes
+
+version 2.4.91
+* improve full pinyin auto correction
+* bug fixes
+
+version 2.3.0
+* update pinyin data
+
+version 2.2.2
+* minor fixes
+
+version 2.2.1
+* fixes predicted candidates
+
+version 2.2.0
+* bug fixes
+
+version 2.1.91
+* fixes zhuyin parsers;
+
+version 2.1.0
+* support sort option in pinyin_guess_candidates function;
+
+version 2.0.92
+* reduce memory consumption after importing user dictionary;
+
+version 2.0.91
+* merge libzhuyin code;
+
+version 2.0.0
+* the first official release of 2.0.x;
+* fixes autoconf;
+
+version 1.9.92
+* fixes crash in double pinyin;
+
+version 1.9.91
+* multiple sentence candidates;
+
+version 1.7.0
+* fixes build on FreeBSD;
+* update cmake files;
+
+version 1.6.91
+* change license to GPLv3+;
+* import open-gram dictionary and remove pinyin tones;
+* add some checks when loading data from file;
+
+version 1.6.0
+* bug fixes.
+
+version 1.5.91
+* change pinyin/phrase tables to use dbm.
+* enhance pinyin key representation and pinyin parsers.
+
+version 1.2.0
+* bug fixes.
+
+version 1.1.91
+* support Kyoto Cabinet as alternative to Berkeley DB.
+* improve multiple dictionaries support feature.
+
+version 1.1.0
+* support to export user phrases.
+
+version 1.0.0
+* the first official release of 1.0.x.
+
+version 0.9.94
+* bug fixes.
+
+version 0.9.93
+* fixes libpinyin issues from coverity scan report.
+
+version 0.9.92
+* bug fixes.
+
+version 0.9.91
+* code re-factor.
+
+version 0.9.0
+* the first official release of 0.9.x.
+* fixes import dictionary.
+
+version 0.8.93
+* add back pinyin_clear_constraint.
+
+version 0.8.92
+* fixes model data.
+
+version 0.8.91
+* multiple dictionaries and user dictionary support.
+
+version 0.8.1
+* bug fixes.
+
+version 0.8.0
+* the first official release of 0.8.x.
+
+version 0.7.92
+* re-factor PhraseLookup class.
+* all tests passed simple valgrind memory check.
+
+version 0.7.91
+* simplify PinyinLookup class.
+
+version 0.7.1
+* add API to lookup pinyin for characters.
+
+version 0.7.0
+* the first official release of 0.7.x.
+
+version 0.6.92
+* draft support for multiple professional phrase libraries.
+
+version 0.6.91
+* support ucs4 characters.
+* support guess sentence with prefix.
+* initially support fuzzy pinyin segment.
+
+version 0.6.0
+* the first official release of 0.6.x.
+
+version 0.5.92
+* fixes new parsers and chewing large table.
+* improves pinyin_save.
+
+version 0.5.91
+* some code re-factor and simplify.
+* fixes the self-learning workaround.
+
+version 0.5.0
+* the first official release of 0.5.x.
+
+version 0.4.93
+* fixes some bugs in new parsers.
+
+version 0.4.92
+* enable parallel make.
+
+version 0.4.91
+* New parsers for full pinyin/double pinyin/chewing.
+ * libpinyin now fully supports all pinyin auto corrections of
+ibus-pinyin.
+ * libpinyin now better supports an/ang, en/eng, in/ing fuzzy
+pinyin match.
+
+version 0.3.0
+* the first official release of 0.3.x.
+
+version 0.2.99
+* import from pinyin.
diff --git a/configure.ac b/configure.ac
index cb39888..80c1040 100644
--- a/configure.ac
+++ b/configure.ac
@@ -5,12 +5,12 @@
# if not 1, append datestamp to the version number.
m4_define([libpinyin_released], [1])
m4_define([libpinyin_major_version], [2])
-m4_define([libpinyin_minor_version], [3])
-m4_define([libpinyin_micro_version], [0])
+m4_define([libpinyin_minor_version], [8])
+m4_define([libpinyin_micro_version], [1])
m4_define(libpinyin_maybe_datestamp,
m4_esyscmd([if test x]libpinyin_released[ != x1; then date +.%Y%m%d | tr -d '\n\r'; fi]))
-m4_define([libpinyin_abi_current], [13])
+m4_define([libpinyin_abi_current], [15])
m4_define([libpinyin_abi_revision], [0])
m4_define([libpinyin_version],
@@ -44,11 +44,19 @@ AC_PROG_CPP
AC_PROG_INSTALL
AC_PROG_LN_S
AC_PROG_MAKE_SET
+AC_PROG_LD
AC_GNU_SOURCE
AX_CXX_COMPILE_STDCXX([11])
+# Detect whether LLVM ld is being used
+using_lld=no
+if `$LD -v 2>&1 | grep 'LLVM' >/dev/null 2>&1` ; then
+ using_lld=yes
+fi
+AM_CONDITIONAL([LLVMLD], [test "$using_lld" = "yes"])
+
# Init libtool
AC_PROG_LIBTOOL
AC_SUBST(LIBTOOL_DEPS)
@@ -92,9 +100,9 @@ AC_ARG_WITH(dbm,
if test x"$DBM" = x"BerkeleyDB"; then
# Check Berkeley DB
- AC_CHECK_HEADER([db.h], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4]))
+ AC_CHECK_HEADER([db.h], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 5]))
- AC_SEARCH_LIBS([db_create], [db], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4]))
+ AC_SEARCH_LIBS([db_create], [db], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 5]))
AC_DEFINE([HAVE_BERKELEY_DB], [], [Have Berkeley DB.])
fi
@@ -112,6 +120,9 @@ fi
AM_CONDITIONAL([KYOTOCABINET], [test x"$DBM" = x"KyotoCabinet"])
+DATABASE_FORMAT="$DBM"
+AC_SUBST(DATABASE_FORMAT)
+
# --enable-libzhuyin
AC_ARG_ENABLE(libzhuyin,
AC_HELP_STRING([--enable-libzhuyin],
@@ -128,6 +139,7 @@ AC_CONFIG_FILES([libpinyin.pc
Makefile
doc/Makefile
data/Makefile
+ data/table.conf
src/Makefile
src/include/Makefile
src/storage/Makefile
diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt
index 2c7d09b..d3df1d0 100644
--- a/data/CMakeLists.txt
+++ b/data/CMakeLists.txt
@@ -45,12 +45,13 @@ add_custom_command(
${CMAKE_SOURCE_DIR}/data/gb_char.table
${CMAKE_SOURCE_DIR}/data/gbk_char.table
${CMAKE_SOURCE_DIR}/data/interpolation2.text
+ ${CMAKE_SOURCE_DIR}/data/table.conf
COMMENT
"Downloading textual model data..."
COMMAND
- wget http://downloads.sourceforge.net/libpinyin/models/model17.text.tar.gz
+ wget http://downloads.sourceforge.net/libpinyin/models/model19.text.tar.gz
COMMAND
- tar xvf model17.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data
+ tar xvf model19.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data
)
add_custom_command(
@@ -67,9 +68,9 @@ add_custom_command(
${gen_binary_files_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data
DEPENDS
gen_binary_files
- ${CMAKE_SOURCE_DIR}/data/gb_char.table
- ${CMAKE_SOURCE_DIR}/data/gbk_char.table
- ${CMAKE_SOURCE_DIR}/data/table.conf
+ ${CMAKE_SOURCE_DIR}/data/gb_char.table
+ ${CMAKE_SOURCE_DIR}/data/gbk_char.table
+ ${CMAKE_SOURCE_DIR}/data/table.conf
)
add_custom_command(
@@ -83,7 +84,11 @@ add_custom_command(
${gen_unigram_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data
DEPENDS
import_interpolation
- ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+ ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+ ${CMAKE_BINARY_DIR}/data/gb_char.bin
+ ${CMAKE_BINARY_DIR}/data/gbk_char.bin
+ ${CMAKE_BINARY_DIR}/data/phrase_index.bin
+ ${CMAKE_BINARY_DIR}/data/pinyin_index.bin
)
install(
@@ -98,3 +103,9 @@ set_directory_properties(
ADDITIONAL_MAKE_CLEAN_FILES
${BINARY_MODEL_DATA_FILES}
)
+
+configure_file(
+ table.conf.in
+ table.conf
+ @ONLY
+)
diff --git a/data/Makefile.am b/data/Makefile.am
index 2e1d40d..b8c6716 100644
--- a/data/Makefile.am
+++ b/data/Makefile.am
@@ -38,7 +38,7 @@ binary_model_data = phrase_index.bin pinyin_index.bin \
MAINTAINERCLEANFILES = Makefile.in
EXTRA_DIST = $(textual_model_data) \
- table.conf
+ table.conf.in
libpinyin_db_DATA = $(binary_model_data) \
table.conf
@@ -48,8 +48,8 @@ libpinyin_dbdir = $(libdir)/libpinyin/data
CLEANFILES = $(binary_model_data)
interpolation2.text:
- wget http://downloads.sourceforge.net/libpinyin/models/model17.text.tar.gz
- tar xvf model17.text.tar.gz -C $(top_srcdir)/data
+ wget http://downloads.sourceforge.net/libpinyin/models/model19.text.tar.gz
+ tar xvf model19.text.tar.gz -C $(top_srcdir)/data
$(tablefiles) table.conf: interpolation2.text
diff --git a/data/table.conf.in b/data/table.conf.in
new file mode 100644
index 0000000..26a12b0
--- /dev/null
+++ b/data/table.conf.in
@@ -0,0 +1,29 @@
+binary format version:7
+model data version:14
+lambda parameter:0.312699
+
+source table format:pinyin
+database format:@DATABASE_FORMAT@
+
+default RESERVED NULL NULL NULL NOT_USED
+default GB_DICTIONARY gb_char.table gb_char.bin gb_char.dbin SYSTEM_FILE
+default GBK_DICTIONARY gbk_char.table gbk_char.bin gbk_char.dbin SYSTEM_FILE
+default OPENGRAM_DICTIONARY opengram.table opengram.bin opengram.dbin SYSTEM_FILE
+default MERGED_DICTIONARY merged.table merged.bin merged.dbin SYSTEM_FILE
+default ADDON_DICTIONARY NULL NULL addon.bin USER_FILE
+default NETWORK_DICTIONARY NULL NULL network.bin USER_FILE
+default USER_DICTIONARY NULL NULL user.bin USER_FILE
+
+addon 4 art.table art.bin NULL DICTIONARY
+addon 5 culture.table culture.bin NULL DICTIONARY
+addon 6 economy.table economy.bin NULL DICTIONARY
+addon 7 geology.table geology.bin NULL DICTIONARY
+addon 8 history.table history.bin NULL DICTIONARY
+
+addon 9 life.table life.bin NULL DICTIONARY
+addon 10 nature.table nature.bin NULL DICTIONARY
+addon 11 people.table people.bin NULL DICTIONARY
+addon 12 science.table science.bin NULL DICTIONARY
+addon 13 society.table society.bin NULL DICTIONARY
+addon 14 sport.table sport.bin NULL DICTIONARY
+addon 15 technology.table technology.bin NULL DICTIONARY
diff --git a/libpinyin.pc.in b/libpinyin.pc.in
index ea08282..fef958d 100644
--- a/libpinyin.pc.in
+++ b/libpinyin.pc.in
@@ -3,6 +3,7 @@ exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
pkgdatadir=@libdir@/libpinyin
+database_format=@DATABASE_FORMAT@
libpinyinincludedir=${includedir}/libpinyin-@VERSION@
libpinyin_binary_version=@LIBPINYIN_BINARY_VERSION@
diff --git a/libzhuyin.pc.in b/libzhuyin.pc.in
index 66ae943..f248d1c 100644
--- a/libzhuyin.pc.in
+++ b/libzhuyin.pc.in
@@ -2,6 +2,7 @@ prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
+database_format=@DATABASE_FORMAT@
libzhuyinincludedir=${includedir}/libpinyin-@VERSION@
libzhuyin_binary_version=@LIBPINYIN_BINARY_VERSION@
diff --git a/scripts2/fullpinyintable.py b/scripts2/fullpinyintable.py
index 6f39acd..4af94a9 100644
--- a/scripts2/fullpinyintable.py
+++ b/scripts2/fullpinyintable.py
@@ -100,6 +100,7 @@ def gen_pinyin_list():
def gen_pinyins():
#generate all pinyins
+ distance = 0
for pinyin in pinyin_list:
flags = []
if pinyin in PINYIN_ZHUYIN_MAP.keys():
@@ -113,7 +114,7 @@ def gen_pinyins():
if zhuyin in chewing.CHEWING_ASCII_INITIAL_MAP and \
pinyin not in ZHUYIN_SPECIAL_INITIAL_SET_IN_PINYIN_FORM:
flags.append("ZHUYIN_INCOMPLETE")
- yield pinyin, pinyin, zhuyin, flags, get_chewing(pinyin)
+ yield pinyin, pinyin, zhuyin, flags, get_chewing(pinyin), distance
def get_shengmu_chewing(shengmu):
@@ -127,6 +128,7 @@ def get_shengmu_chewing(shengmu):
def gen_shengmu():
#generate all shengmu
+ distance = 0
for shengmu in shengmu_list:
if shengmu in pinyin_list:
continue
@@ -135,12 +137,12 @@ def gen_shengmu():
chewing_initial = chewing_key[0]
if chewing_initial in chewing.ASCII_CHEWING_INITIAL_MAP:
chewing_initial = chewing.ASCII_CHEWING_INITIAL_MAP[chewing_initial]
- yield shengmu, shengmu, chewing_initial, flags, chewing_key
+ yield shengmu, shengmu, chewing_initial, flags, chewing_key, distance
def gen_corrects():
#generate corrections
- for correct, wrong in auto_correct:
+ for correct, wrong, distance in auto_correct:
flags = ['IS_PINYIN', 'PINYIN_CORRECT_{0}_{1}'.format(wrong.upper(),
correct.upper())]
for pinyin in pinyin_list:
@@ -149,17 +151,17 @@ def gen_corrects():
zhuyin = PINYIN_ZHUYIN_MAP[pinyin]
wrong_pinyin = pinyin.replace(correct, wrong)
yield pinyin, wrong_pinyin, zhuyin,\
- flags, get_chewing(pinyin)
+ flags, get_chewing(pinyin), distance
def gen_u_to_v():
#generate U to V
- for correct, wrong, flags in auto_correct_ext:
+ for correct, wrong, flags, distance in auto_correct_ext:
#over-ride flags
flags = ['IS_PINYIN', 'PINYIN_CORRECT_V_U']
pinyin = correct
zhuyin = PINYIN_ZHUYIN_MAP[pinyin]
- yield correct, wrong, zhuyin, flags, get_chewing(pinyin)
+ yield correct, wrong, zhuyin, flags, get_chewing(pinyin), distance
#pinyin table
@@ -174,7 +176,8 @@ eten26_zhuyin_index = []
def filter_pinyin_list():
- for (correct, wrong, zhuyin, flags, chewing_key) in gen_pinyin_list():
+ for (correct, wrong, zhuyin, flags, chewing_key, distance) in \
+ gen_pinyin_list():
(luoma, secondary) = (None, None)
if zhuyin in ZHUYIN_LUOMA_PINYIN_MAP:
@@ -190,7 +193,7 @@ def filter_pinyin_list():
content_table.append((correct, zhuyin, luoma, secondary, chewing_key))
if "IS_PINYIN" in flags:
- pinyin_index.append((wrong, flags, correct))
+ pinyin_index.append((wrong, flags, correct, distance))
#skip pinyin correct options
if correct != wrong:
continue
@@ -292,9 +295,9 @@ def gen_content_table():
def gen_pinyin_index():
entries = []
- for (wrong, flags, correct) in pinyin_index:
+ for (wrong, flags, correct, distance) in pinyin_index:
index = [x[0] for x in content_table].index(correct)
- entry = '{{"{0}", {1}, {2}}}'.format(wrong, flags, index)
+ entry = '{{"{0}", {1}, {2}, {3}}}'.format(wrong, flags, index, distance)
entries.append(entry)
return ',\n'.join(entries)
diff --git a/scripts2/options.py b/scripts2/options.py
index fcfb9fd..e4bd01f 100644
--- a/scripts2/options.py
+++ b/scripts2/options.py
@@ -22,47 +22,47 @@
auto_correct = [
# "correct", "wrong"
- ("ng", "gn"),
- ("ng", "mg"),
- ("iu", "iou"),
- ("ui", "uei"),
- ("un", "uen"),
+ ("ng", "gn", 1),
+ ("ng", "mg", 1),
+ ("iu", "iou", 1),
+ ("ui", "uei", 1),
+ ("un", "uen", 1),
# ("ue", "ve"),
- ("ve", "ue"),
- ("ong", "on"),
+ ("ve", "ue", 1),
+ ("ong", "on", 1),
]
auto_correct_ext = [
# "correct", "wrong", flag
- ("ju", "jv", "PINYIN_CORRECT_V_U"),
- ("qu", "qv", "PINYIN_CORRECT_V_U"),
- ("xu", "xv", "PINYIN_CORRECT_V_U"),
- ("yu", "yv", "PINYIN_CORRECT_V_U"),
-
- ("jue", "jve", "PINYIN_CORRECT_V_U"),
- ("que", "qve", "PINYIN_CORRECT_V_U"),
- ("xue", "xve", "PINYIN_CORRECT_V_U"),
- ("yue", "yve", "PINYIN_CORRECT_V_U"),
-
- ("juan", "jvan", "PINYIN_CORRECT_V_U"),
- ("quan", "qvan", "PINYIN_CORRECT_V_U"),
- ("xuan", "xvan", "PINYIN_CORRECT_V_U"),
- ("yuan", "yvan", "PINYIN_CORRECT_V_U"),
-
- ("jun", "jvn", "PINYIN_CORRECT_V_U"),
- ("qun", "qvn", "PINYIN_CORRECT_V_U"),
- ("xun", "xvn", "PINYIN_CORRECT_V_U"),
- ("yun", "yvn", "PINYIN_CORRECT_V_U"),
-
-# ("juang", "jvang", "PINYIN_CORRECT_V_U"),
-# ("quang", "qvang", "PINYIN_CORRECT_V_U"),
-# ("xuang", "xvang", "PINYIN_CORRECT_V_U"),
-# ("yuang", "yvang", "PINYIN_CORRECT_V_U"),
-
-# ("jun", "jven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"),
-# ("qun", "qven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"),
-# ("xun", "xven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"),
-# ("yun", "yven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"),
+ ("ju", "jv", "PINYIN_CORRECT_V_U", 1),
+ ("qu", "qv", "PINYIN_CORRECT_V_U", 1),
+ ("xu", "xv", "PINYIN_CORRECT_V_U", 1),
+ ("yu", "yv", "PINYIN_CORRECT_V_U", 1),
+
+ ("jue", "jve", "PINYIN_CORRECT_V_U", 1),
+ ("que", "qve", "PINYIN_CORRECT_V_U", 1),
+ ("xue", "xve", "PINYIN_CORRECT_V_U", 1),
+ ("yue", "yve", "PINYIN_CORRECT_V_U", 1),
+
+ ("juan", "jvan", "PINYIN_CORRECT_V_U", 1),
+ ("quan", "qvan", "PINYIN_CORRECT_V_U", 1),
+ ("xuan", "xvan", "PINYIN_CORRECT_V_U", 1),
+ ("yuan", "yvan", "PINYIN_CORRECT_V_U", 1),
+
+ ("jun", "jvn", "PINYIN_CORRECT_V_U", 1),
+ ("qun", "qvn", "PINYIN_CORRECT_V_U", 1),
+ ("xun", "xvn", "PINYIN_CORRECT_V_U", 1),
+ ("yun", "yvn", "PINYIN_CORRECT_V_U", 1),
+
+# ("juang", "jvang", "PINYIN_CORRECT_V_U", 1),
+# ("quang", "qvang", "PINYIN_CORRECT_V_U", 1),
+# ("xuang", "xvang", "PINYIN_CORRECT_V_U", 1),
+# ("yuang", "yvang", "PINYIN_CORRECT_V_U", 1),
+
+# ("jun", "jven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U", 1),
+# ("qun", "qven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U", 1),
+# ("xun", "xven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U", 1),
+# ("yun", "yven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U", 1),
]
diff --git a/scripts2/specials.txt b/scripts2/specials.txt
new file mode 100644
index 0000000..387a384
--- /dev/null
+++ b/scripts2/specials.txt
@@ -0,0 +1,6 @@
+e'nen 100
+en'en 300
+qun'a 100
+qu'na 300
+jia'nao 100
+jian'ao 300
diff --git a/scripts2/specialtable.py b/scripts2/specialtable.py
index 17bd673..e0182b0 100644
--- a/scripts2/specialtable.py
+++ b/scripts2/specialtable.py
@@ -170,7 +170,7 @@ def gen_resplit_table():
#init code
load_phrase("pinyins.txt")
-#load_phrase("specials.txt")
+load_phrase("specials.txt")
divided_list = filter_divided()
resplit_list = filter_resplit()
sort_all()
diff --git a/src/Makefile.am b/src/Makefile.am
index c821d04..70419b7 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -14,19 +14,18 @@
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
-AUTOMAKE_OPTIONS = gnu
-SUBDIRS = include storage lookup
+AUTOMAKE_OPTIONS = gnu subdir-objects
+SUBDIRS = include storage lookup
-EXTRA_DIST = libpinyin.ver \
- libzhuyin.ver
+EXTRA_DIST = libpinyin.ver libzhuyin.ver
-MAINTAINERCLEANFILES = Makefile.in
+MAINTAINERCLEANFILES = Makefile.in
-CLEANFILES = *.bak
+CLEANFILES = *.bak
-ACLOCAL = aclocal -I $(ac_aux_dir)
+ACLOCAL = aclocal -I $(ac_aux_dir)
-INCLUDES = -I$(top_srcdir)/src \
+AM_CPPFLAGS = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
@@ -40,33 +39,79 @@ if ENABLE_LIBZHUYIN
libpinyininclude_HEADERS += zhuyin.h
endif
+pinyin_SOURCES = \
+ storage/phrase_index.cpp \
+ storage/phrase_large_table2.cpp \
+ storage/phrase_large_table3.cpp \
+ storage/ngram.cpp \
+ storage/tag_utility.cpp \
+ storage/chewing_key.cpp \
+ storage/pinyin_parser2.cpp \
+ storage/zhuyin_parser2.cpp \
+ storage/phonetic_key_matrix.cpp \
+ storage/chewing_large_table.cpp \
+ storage/chewing_large_table2.cpp \
+ storage/table_info.cpp \
+ lookup/pinyin_lookup2.cpp \
+ lookup/phrase_lookup.cpp \
+ lookup/lookup.cpp \
+ lookup/phonetic_lookup.cpp \
+ $(NULL)
+
+if BERKELEYDB
+pinyin_SOURCES += storage/ngram_bdb.cpp \
+ storage/phrase_large_table3_bdb.cpp \
+ storage/chewing_large_table2_bdb.cpp
+endif
+
+if KYOTOCABINET
+pinyin_SOURCES += storage/ngram_kyotodb.cpp \
+ storage/phrase_large_table3_kyotodb.cpp \
+ storage/chewing_large_table2_kyotodb.cpp
+endif
+
+
noinst_HEADERS = pinyin_internal.h
-lib_LTLIBRARIES = libpinyin.la
+lib_LTLIBRARIES = libpinyin.la
-noinst_LTLIBRARIES = libpinyin_internal.la
+noinst_LIBRARIES = libpinyin_internal.a
-libpinyin_la_SOURCES = pinyin.cpp
+libpinyin_la_SOURCES = $(pinyin_SOURCES) pinyin.cpp
-libpinyin_la_LIBADD = storage/libstorage.la lookup/liblookup.la @GLIB2_LIBS@
+libpinyin_la_LIBADD = @GLIB2_LIBS@
-libpinyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libpinyin.ver \
+if LLVMLD
+## LLVM linker does not support --version-script,
+## use -exported_symbols_list instead
+libpinyin_la_LDFLAGS = -Wl,-exported_symbols_list,$(srcdir)/libpinyin.exp \
+ -version-info @LT_VERSION_INFO@
+else
+libpinyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libpinyin.ver \
-version-info @LT_VERSION_INFO@
+endif
if ENABLE_LIBZHUYIN
lib_LTLIBRARIES += libzhuyin.la
-libzhuyin_la_SOURCES = zhuyin.cpp
+libzhuyin_la_SOURCES = $(pinyin_SOURCES) zhuyin.cpp
-libzhuyin_la_LIBADD = storage/libstorage.la lookup/liblookup.la @GLIB2_LIBS@
+libzhuyin_la_LIBADD = @GLIB2_LIBS@
-libzhuyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libzhuyin.ver \
+if LLVMLD
+## LLVM linker does not support --version-script,
+## use -exported_symbols_list instead
+libzhuyin_la_LDFLAGS = -Wl,-exported_symbols_list,$(srcdir)/libzhuyin.exp \
+ -version-info @LT_VERSION_INFO@
+else
+libzhuyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libzhuyin.ver \
-version-info @LT_VERSION_INFO@
endif
+endif
-libpinyin_internal_la_SOURCES = pinyin_internal.cpp
+libpinyin_internal_a_SOURCES = pinyin_internal.cpp
-libpinyin_internal_la_LIBADD = storage/libstorage.la lookup/liblookup.la
+libpinyin_internal_a_LIBADD = storage/libstorage.a lookup/liblookup.a
## Note:
## As libpinyin internal interface will change, only provides static library
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
index 8f9b417..e81e5d7 100644
--- a/src/include/Makefile.am
+++ b/src/include/Makefile.am
@@ -14,11 +14,13 @@
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
-MAINTAINERCLEANFILES = Makefile.in
+MAINTAINERCLEANFILES = Makefile.in
libpinyinincludedir = $(includedir)/libpinyin-@VERSION@
libpinyininclude_HEADERS= novel_types.h
-noinst_HEADERS = memory_chunk.h \
- stl_lite.h
+noinst_HEADERS = memory_chunk.h \
+ pinyin_utils.h \
+ stl_lite.h \
+ unaligned_memory.h
diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h
index 044fa0d..713fb5c 100644
--- a/src/include/memory_chunk.h
+++ b/src/include/memory_chunk.h
@@ -33,6 +33,7 @@
#define LIBPINYIN_USE_MMAP
#endif
#include "stl_lite.h"
+#include "pinyin_utils.h"
namespace pinyin{
@@ -72,7 +73,7 @@ private:
munmap(m_data_begin - header, header + capacity());
#endif
else
- assert(FALSE);
+ abort();
}
@@ -134,7 +135,7 @@ private:
/* checksum for aligned parts. */
guint32 index = 0;
for (; index < aligns; index += sizeof(guint32)) {
- const char * p = data + index;
+ const unsigned char * p = (const unsigned char *)data + index;
/* use little endian here. */
guint32 item = *p | *(p + 1) << 8 |
@@ -146,7 +147,7 @@ private:
/* checksum for remained parts. */
guint32 shift = 0;
for (; index < length; index++) {
- const char * p = data + index;
+ const unsigned char * p = (const unsigned char *)data + index;
guint32 item = *p << shift;
shift += 8;
@@ -290,6 +291,20 @@ public:
}
/**
+ * MemoryChunk::set_content:
+ * @offset: the offset in this MemoryChunk.
+ * @data: the data to be copied.
+ * @returns: whether the data is copied successfully.
+ *
+ * Data are written directly to the memory area in this MemoryChunk.
+ *
+ */
+ template <typename T>
+ bool set_content(size_t offset, T data){
+ return set_content(offset, &data, sizeof(T));
+ }
+
+ /**
* MemoryChunk::append_content:
* @data: the begin of the data to be copied.
* @len: the length of the data to be copied.
@@ -349,7 +364,7 @@ public:
* Get the content in this MemoryChunk.
*
*/
- bool get_content(size_t offset, void * buffer, size_t length){
+ bool get_content(size_t offset, void * buffer, size_t length) const {
if ( size() < offset + length )
return false;
memcpy( buffer, m_data_begin + offset, length);
@@ -357,6 +372,21 @@ public:
}
/**
+ * MemoryChunk::get_content:
+ * @offset: the offset in this MemoryChunk.
+ * @returns: the sizeof(T) bytes at @offset as a T; a value-initialized
+ * T when the read fails and result checking does not abort.
+ *
+ * Get the content in this MemoryChunk.  The bytes are copied out with
+ * the (offset, buffer, length) overload.
+ *
+ */
+ template <typename T>
+ T get_content(size_t offset) const {
+     /* value-initialize: check_result() does not abort when NDEBUG or
+      * G_DISABLE_ASSERT is defined, so a failed read must not hand
+      * back an indeterminate value. */
+     T value = T();
+     check_result(get_content(offset, &value, sizeof(T)));
+     return value;
+ }
+
+ /**
* MemoryChunk::compact_memory:
*
* Compact memory, reduce the size.
diff --git a/src/include/pinyin_utils.h b/src/include/pinyin_utils.h
new file mode 100644
index 0000000..9be3bd1
--- /dev/null
+++ b/src/include/pinyin_utils.h
@@ -0,0 +1,32 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2022 Peng Wu
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef PINYIN_UTILS_H
+#define PINYIN_UTILS_H
+
+#include <assert.h>
+
+/**
+ * check_result:
+ * @expr: an expression whose side effects must happen in every build.
+ *
+ * Evaluate @expr exactly once.  When assertions are enabled the result
+ * is also asserted to be true; when NDEBUG or G_DISABLE_ASSERT is
+ * defined the result is discarded.  Unlike a bare assert(), the
+ * expression itself is never compiled out.
+ *
+ * The macro is a void expression in both configurations, so code that
+ * compiles in one build mode also compiles in the other.
+ */
+#if defined(NDEBUG) || defined(G_DISABLE_ASSERT)
+#define check_result(expr) ((void)(expr))
+#else
+#define check_result(expr) assert(expr)
+#endif
+
+#endif /* PINYIN_UTILS_H */
diff --git a/src/include/unaligned_memory.h b/src/include/unaligned_memory.h
new file mode 100644
index 0000000..d748c35
--- /dev/null
+++ b/src/include/unaligned_memory.h
@@ -0,0 +1,61 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2022 Matias Larsson
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef UNALIGNED_MEMORY_H
+#define UNALIGNED_MEMORY_H
+
+#include <cstring>
+
+/**
+ * UnalignedMemory: Safe unaligned memory access.
+ *
+ * Some instruction sets, or some instructions in some instruction sets
+ * require that memory access is aligned to a specific boundary. These
+ * instructions may trap on unaligned access.
+ *
+ * This class provides methods to load and store values at unaligned
+ * addresses. It ensures that the compiler doesn't generate instructions
+ * that could trap on the unaligned memory access.
+ */
+
+namespace pinyin{
+
+template <typename T>
+class UnalignedMemory{
+public:
+    /**
+     * UnalignedMemory::load:
+     * @src: address of sizeof(T) readable bytes; need not be aligned.
+     * @returns: the value copied out of @src.
+     *
+     * Read a value from a possibly unaligned memory address.
+     */
+    static T load(const void * src) {
+        T value;
+        /* <cstring> only guarantees the std:: qualified name. */
+        std::memcpy(&value, src, sizeof(T));
+        return value;
+    }
+
+    /**
+     * UnalignedMemory::store:
+     * @value: the value to write.
+     * @dest: address of sizeof(T) writable bytes; need not be aligned.
+     *
+     * Store a value into a possibly unaligned memory address.
+     */
+    static void store(T value, void * dest) {
+        std::memcpy(dest, &value, sizeof(T));
+    }
+};
+
+} /* namespace pinyin */
+
+
+#endif
diff --git a/src/libpinyin.exp b/src/libpinyin.exp
new file mode 100644
index 0000000..570cf58
--- /dev/null
+++ b/src/libpinyin.exp
@@ -0,0 +1,74 @@
+_pinyin_init
+_pinyin_save
+_pinyin_set_full_pinyin_scheme
+_pinyin_set_double_pinyin_scheme
+_pinyin_set_zhuyin_scheme
+_pinyin_load_phrase_library
+_pinyin_unload_phrase_library
+_pinyin_load_addon_phrase_library
+_pinyin_unload_addon_phrase_library
+_pinyin_begin_add_phrases
+_pinyin_iterator_add_phrase
+_pinyin_end_add_phrases
+_pinyin_begin_get_phrases
+_pinyin_iterator_has_next_phrase
+_pinyin_iterator_get_next_phrase
+_pinyin_end_get_phrases
+_pinyin_fini
+_pinyin_mask_out
+_pinyin_set_options
+_pinyin_alloc_instance
+_pinyin_free_instance
+_pinyin_get_context
+_pinyin_guess_sentence
+_pinyin_guess_sentence_with_prefix
+_pinyin_guess_predicted_candidates
+_pinyin_phrase_segment
+_pinyin_get_sentence
+_pinyin_parse_full_pinyin
+_pinyin_parse_more_full_pinyins
+_pinyin_parse_double_pinyin
+_pinyin_parse_more_double_pinyins
+_pinyin_parse_chewing
+_pinyin_parse_more_chewings
+_pinyin_get_parsed_input_length
+_pinyin_in_chewing_keyboard
+_pinyin_guess_candidates
+_pinyin_choose_candidate
+_pinyin_choose_predicted_candidate
+_pinyin_clear_constraint
+_pinyin_lookup_tokens
+_pinyin_train
+_pinyin_reset
+_pinyin_get_zhuyin_string
+_pinyin_get_pinyin_string
+_pinyin_get_luoma_pinyin_string
+_pinyin_get_secondary_zhuyin_string
+_pinyin_get_pinyin_strings
+_pinyin_get_pinyin_is_incomplete
+_pinyin_token_get_phrase
+_pinyin_token_get_n_pronunciation
+_pinyin_token_get_nth_pronunciation
+_pinyin_token_get_unigram_frequency
+_pinyin_token_add_unigram_frequency
+_pinyin_get_n_candidate
+_pinyin_get_candidate
+_pinyin_get_candidate_type
+_pinyin_get_candidate_string
+_pinyin_get_candidate_nbest_index
+_pinyin_get_pinyin_key
+_pinyin_get_pinyin_key_rest
+_pinyin_get_pinyin_key_rest_positions
+_pinyin_get_pinyin_key_rest_length
+_pinyin_get_pinyin_offset
+_pinyin_get_left_pinyin_offset
+_pinyin_get_right_pinyin_offset
+_pinyin_get_character_offset
+_pinyin_get_n_phrase
+_pinyin_get_phrase_token
+_pinyin_get_full_pinyin_auxiliary_text
+_pinyin_get_double_pinyin_auxiliary_text
+_pinyin_get_chewing_auxiliary_text
+_pinyin_remember_user_input
+_pinyin_is_user_candidate
+_pinyin_remove_user_candidate
diff --git a/src/libzhuyin.exp b/src/libzhuyin.exp
new file mode 100644
index 0000000..fb30130
--- /dev/null
+++ b/src/libzhuyin.exp
@@ -0,0 +1,52 @@
+_zhuyin_init
+_zhuyin_save
+_zhuyin_set_chewing_scheme
+_zhuyin_set_full_pinyin_scheme
+_zhuyin_load_phrase_library
+_zhuyin_unload_phrase_library
+_zhuyin_begin_add_phrases
+_zhuyin_iterator_add_phrase
+_zhuyin_end_add_phrases
+_zhuyin_fini
+_zhuyin_mask_out
+_zhuyin_set_options
+_zhuyin_alloc_instance
+_zhuyin_free_instance
+_zhuyin_guess_sentence
+_zhuyin_guess_sentence_with_prefix
+_zhuyin_phrase_segment
+_zhuyin_get_sentence
+_zhuyin_parse_full_pinyin
+_zhuyin_parse_more_full_pinyins
+_zhuyin_parse_chewing
+_zhuyin_parse_more_chewings
+_zhuyin_get_parsed_input_length
+_zhuyin_in_chewing_keyboard
+_zhuyin_guess_candidates_after_cursor
+_zhuyin_guess_candidates_before_cursor
+_zhuyin_choose_candidate
+_zhuyin_clear_constraint
+_zhuyin_lookup_tokens
+_zhuyin_train
+_zhuyin_reset
+_zhuyin_get_zhuyin_string
+_zhuyin_get_pinyin_string
+_zhuyin_token_get_phrase
+_zhuyin_token_get_n_pronunciation
+_zhuyin_token_get_nth_pronunciation
+_zhuyin_token_get_unigram_frequency
+_zhuyin_token_add_unigram_frequency
+_zhuyin_get_n_candidate
+_zhuyin_get_candidate
+_zhuyin_get_candidate_type
+_zhuyin_get_candidate_string
+_zhuyin_get_zhuyin_key
+_zhuyin_get_zhuyin_key_rest
+_zhuyin_get_zhuyin_key_rest_positions
+_zhuyin_get_zhuyin_key_rest_length
+_zhuyin_get_zhuyin_offset
+_zhuyin_get_left_zhuyin_offset
+_zhuyin_get_right_zhuyin_offset
+_zhuyin_get_character_offset
+_zhuyin_get_n_phrase
+_zhuyin_get_phrase_token
diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am
index 55c1881..75a7ae5 100644
--- a/src/lookup/Makefile.am
+++ b/src/lookup/Makefile.am
@@ -14,27 +14,23 @@
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
-MAINTAINERCLEANFILES = Makefile.in
+MAINTAINERCLEANFILES = Makefile.in
-INCLUDES = -I$(top_srcdir)/src/include \
- -I$(top_srcdir)/src/storage \
- @GLIB2_CFLAGS@
+AM_CPPFLAGS = -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ @GLIB2_CFLAGS@
-noinst_HEADERS = lookup.h \
- pinyin_lookup2.h \
- phrase_lookup.h \
- phonetic_lookup.h \
- phonetic_lookup_linear.h \
- phonetic_lookup_heap.h
+noinst_HEADERS = lookup.h \
+ pinyin_lookup2.h \
+ phrase_lookup.h \
+ phonetic_lookup.h \
+ phonetic_lookup_linear.h \
+ phonetic_lookup_heap.h
-noinst_LTLIBRARIES = liblookup.la
+noinst_LIBRARIES = liblookup.a
-liblookup_la_CXXFLAGS = "-fPIC"
-
-liblookup_la_LDFLAGS = -static
-
-liblookup_la_SOURCES = pinyin_lookup2.cpp \
- phrase_lookup.cpp \
- lookup.cpp \
- phonetic_lookup.cpp
+liblookup_a_SOURCES = pinyin_lookup2.cpp \
+ phrase_lookup.cpp \
+ lookup.cpp \
+ phonetic_lookup.cpp
diff --git a/src/lookup/phonetic_lookup.cpp b/src/lookup/phonetic_lookup.cpp
index e658911..4205630 100644
--- a/src/lookup/phonetic_lookup.cpp
+++ b/src/lookup/phonetic_lookup.cpp
@@ -200,7 +200,7 @@ bool ForwardPhoneticConstraints::diff_result(MatchResult best,
}
}
- assert(add_constraint(pos, next_pos, other_token));
+ check_result(add_constraint(pos, next_pos, other_token));
}
return changed;
diff --git a/src/lookup/phonetic_lookup.h b/src/lookup/phonetic_lookup.h
index 78fac1b..72e70a5 100644
--- a/src/lookup/phonetic_lookup.h
+++ b/src/lookup/phonetic_lookup.h
@@ -22,9 +22,10 @@
#define PHONETIC_LOOKUP_H
-#include "novel_types.h"
#include <limits.h>
#include <math.h>
+#include "novel_types.h"
+#include "pinyin_utils.h"
#include "phonetic_key_matrix.h"
#include "ngram.h"
#include "lookup.h"
@@ -250,7 +251,7 @@ public:
initial_value.m_handles[1] = token;
trellis_node<nstore> initial_node;
- assert(initial_node.eval_item(&initial_value));
+ check_result(initial_node.eval_item(&initial_value));
LookupStepContent initial_step_content = (LookupStepContent)
g_ptr_array_index(m_steps_content, 0);
@@ -308,7 +309,7 @@ public:
if (!lookup_result) {
trellis_node<nstore> node;
- assert(node.eval_item(candidate));
+ check_result(node.eval_item(candidate));
g_array_append_val(step_content, node);
g_hash_table_insert(step_index, GUINT_TO_POINTER(token), GUINT_TO_POINTER(step_content->len - 1));
@@ -321,7 +322,7 @@ public:
return node->eval_item(candidate);
}
- assert(FALSE);
+ abort();
}
/* get tails */
@@ -388,7 +389,7 @@ bool extract_result(const ForwardPhoneticTrellis<nstore, nbest> * trellis,
phrase_token_t last_token = tail->m_handles[0];
int sub_index = tail->m_sub_index;
- assert(trellis->get_candidate(index, last_token, sub_index, tail));
+ check_result(trellis->get_candidate(index, last_token, sub_index, tail));
}
/* no need to reverse the result */
@@ -406,7 +407,7 @@ public:
/* set tail node */
bool set_tail(const matrix_step<nstore> * tail);
/* back trace */
- /* always assume/assert matrix_step.eval_item(...) return true? */
+ /* always assume/check_result matrix_step.eval_item(...) return true? */
bool back_trace(const ForwardPhoneticTrellis * trellis);
/* extract results */
int extract(/* out */ GPtrArray * arrays);
@@ -546,7 +547,7 @@ protected:
g_ptr_array_index(topresults, 0);
const trellis_constraint_t * constraint = NULL;
- assert(m_constraints->get_constraint(start, constraint));
+ check_result(m_constraints->get_constraint(start, constraint));
if (CONSTRAINT_ONESTEP == constraint->m_type) {
return unigram_gen_next_step(start, constraint->m_constraint_step,
@@ -578,7 +579,7 @@ protected:
int start, int end,
PhraseIndexRanges ranges) {
const trellis_constraint_t * constraint = NULL;
- assert(m_constraints->get_constraint(start, constraint));
+ check_result(m_constraints->get_constraint(start, constraint));
bool found = false;
BigramPhraseArray bigram_phrase_items = g_array_new
@@ -761,7 +762,7 @@ public:
/* begin the viterbi beam search. */
for ( int i = 0; i < nstep - 1; ++i ){
const trellis_constraint_t * cur_constraint = NULL;
- assert(m_constraints->get_constraint(i, cur_constraint));
+ check_result(m_constraints->get_constraint(i, cur_constraint));
if (CONSTRAINT_NOSEARCH == cur_constraint->m_type)
continue;
@@ -792,7 +793,7 @@ public:
for ( int m = i + 1; m < nstep; ++m ){
const trellis_constraint_t * next_constraint = NULL;
- assert(m_constraints->get_constraint(m, next_constraint));
+ check_result(m_constraints->get_constraint(m, next_constraint));
if (CONSTRAINT_NOSEARCH == next_constraint->m_type)
break;
@@ -830,7 +831,7 @@ public:
const trellis_value_t * tail = (const trellis_value_t *)
g_ptr_array_index(tails, i);
- assert(extract_result<nstore>(&m_trellis, tail, result));
+ check_result(extract_result<nstore>(&m_trellis, tail, result));
results->add_result(result);
}
@@ -860,7 +861,7 @@ public:
continue;
const trellis_constraint_t * constraint = NULL;
- assert(constraints->get_constraint(i, constraint));
+ check_result(constraints->get_constraint(i, constraint));
if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) {
if (CONSTRAINT_ONESTEP == constraint->m_type) {
@@ -880,12 +881,12 @@ public:
if (!user) {
user = new SingleGram;
}
- assert(user->get_total_freq(total_freq));
+ check_result(user->get_total_freq(total_freq));
guint32 freq = 0;
/* compute train factor */
if (!user->get_freq(token, freq)) {
- assert(user->insert_freq(token, 0));
+ check_result(user->insert_freq(token, 0));
seed = initial_seed;
} else {
seed = std_lite::max(freq, initial_seed);
@@ -897,10 +898,10 @@ public:
if (seed > 0 && total_freq > total_freq + seed)
goto next;
- assert(user->set_total_freq(total_freq + seed));
+ check_result(user->set_total_freq(total_freq + seed));
/* if total_freq is not overflow, then freq won't overflow. */
- assert(user->set_freq(token, freq + seed));
- assert(m_user_bigram->store(last_token, user));
+ check_result(user->set_freq(token, freq + seed));
+ check_result(m_user_bigram->store(last_token, user));
next:
assert(NULL != user);
if (user)
diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp
index bb3d053..8946de3 100644
--- a/src/lookup/pinyin_lookup2.cpp
+++ b/src/lookup/pinyin_lookup2.cpp
@@ -22,6 +22,7 @@
#include "facade_chewing_table2.h"
#include "pinyin_lookup2.h"
#include "stl_lite.h"
+#include "pinyin_utils.h"
using namespace pinyin;
@@ -602,12 +603,12 @@ bool PinyinLookup2::train_result2(PhoneticKeyMatrix * matrix,
if (!user) {
user = new SingleGram;
}
- assert(user->get_total_freq(total_freq));
+ check_result(user->get_total_freq(total_freq));
guint32 freq = 0;
/* compute train factor */
if (!user->get_freq(token, freq)) {
- assert(user->insert_freq(token, 0));
+ check_result(user->insert_freq(token, 0));
seed = initial_seed;
} else {
seed = std_lite::max(freq, initial_seed);
@@ -619,10 +620,10 @@ bool PinyinLookup2::train_result2(PhoneticKeyMatrix * matrix,
if (seed > 0 && total_freq > total_freq + seed)
goto next;
- assert(user->set_total_freq(total_freq + seed));
+ check_result(user->set_total_freq(total_freq + seed));
/* if total_freq is not overflow, then freq won't overflow. */
- assert(user->set_freq(token, freq + seed));
- assert(m_user_bigram->store(last_token, user));
+ check_result(user->set_freq(token, freq + seed));
+ check_result(m_user_bigram->store(last_token, user));
next:
assert(NULL != user);
if (user)
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 75eb41b..9dd784b 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -71,18 +71,24 @@ struct _pinyin_instance_t{
/* pointer of pinyin_context_t. */
pinyin_context_t * m_context;
+ ucs4_t * m_prefix_ucs4;
+ glong m_prefix_len;
/* the tokens of phrases before the user input. */
TokenVector m_prefixes;
/* cached parsed pinyin keys. */
PhoneticKeyMatrix m_matrix;
size_t m_parsed_len;
+ size_t m_parsed_key_len;
/* cached pinyin lookup variables. */
ForwardPhoneticConstraints * m_constraints;
NBestMatchResults m_nbest_results;
TokenVector m_phrase_result;
CandidateVector m_candidates;
+
+ /* cache the sort option here. */
+ guint m_sort_option;
};
struct _lookup_candidate_t{
@@ -671,7 +677,7 @@ bool pinyin_iterator_get_next_phrase(export_iterator_t * iter,
/* fill phrase and pronunciation pair. */
ucs4_t phrase_ucs4[MAX_PHRASE_LENGTH];
guint8 len = item.get_phrase_length();
- assert(item.get_phrase_string(phrase_ucs4));
+ check_result(item.get_phrase_string(phrase_ucs4));
gchar * phrase_utf8 = g_ucs4_to_utf8
(phrase_ucs4, len, NULL, NULL, NULL);
@@ -681,7 +687,7 @@ bool pinyin_iterator_get_next_phrase(export_iterator_t * iter,
assert(nth_pronun < n_pronuns);
ChewingKey keys[MAX_PHRASE_LENGTH];
guint32 freq = 0;
- assert(item.get_nth_pronunciation(nth_pronun, keys, freq));
+ check_result(item.get_nth_pronunciation(nth_pronun, keys, freq));
GPtrArray * array = g_ptr_array_new();
for(size_t i = 0; i < len; ++i) {
@@ -732,15 +738,7 @@ void pinyin_end_get_phrases(export_iterator_t * iter){
delete iter;
}
-bool pinyin_save(pinyin_context_t * context){
- if (!context->m_user_dir)
- return false;
-
- if (!context->m_modified)
- return false;
-
- context->m_phrase_index->compact();
-
+static bool _write_files(pinyin_context_t * context){
const pinyin_table_info_t * phrase_files =
context->m_system_table_info.get_default_tables();
@@ -788,11 +786,94 @@ bool pinyin_save(pinyin_context_t * context){
gchar * tmppathname = g_build_filename(context->m_user_dir,
tmpfilename, NULL);
+
+ log->save(tmppathname);
+
+ g_free(tmpfilename);
+ g_free(tmppathname);
+ delete log;
+ }
+
+ if (USER_FILE == table_info->m_file_type) {
+ /* user phrase library */
+ MemoryChunk * chunk = new MemoryChunk;
+ context->m_phrase_index->store(i, chunk);
+
+ const char * userfilename = table_info->m_user_filename;
+ gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
+ gchar * tmppathname = g_build_filename(context->m_user_dir,
+ tmpfilename, NULL);
+
+ chunk->save(tmppathname);
+
+ g_free(tmpfilename);
+ g_free(tmppathname);
+ delete chunk;
+ }
+ }
+
+ /* save user pinyin table */
+ gchar * tmpfilename = g_build_filename
+ (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL);
+ unlink(tmpfilename);
+
+ context->m_pinyin_table->store(tmpfilename);
+
+ g_free(tmpfilename);
+
+ /* save user phrase table */
+ tmpfilename = g_build_filename
+ (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL);
+ unlink(tmpfilename);
+
+ context->m_phrase_table->store(tmpfilename);
+
+ g_free(tmpfilename);
+
+ /* save user bi-gram */
+ tmpfilename = g_build_filename
+ (context->m_user_dir, USER_BIGRAM ".tmp", NULL);
+ unlink(tmpfilename);
+ context->m_user_bigram->save_db(tmpfilename);
+
+ g_free(tmpfilename);
+
+ return true;
+}
+
+static bool _rename_files(pinyin_context_t * context){
+ const pinyin_table_info_t * phrase_files =
+ context->m_system_table_info.get_default_tables();
+
+ /* skip the reserved zero phrase library. */
+ for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ PhraseIndexRange range;
+ int retval = context->m_phrase_index->get_range(i, range);
+
+ if (ERROR_NO_SUB_PHRASE_INDEX == retval)
+ continue;
+
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (NOT_USED == table_info->m_file_type)
+ continue;
+
+ const char * userfilename = table_info->m_user_filename;
+
+ if (NULL == userfilename)
+ continue;
+
+ if (SYSTEM_FILE == table_info->m_file_type ||
+ DICTIONARY == table_info->m_file_type) {
+ const char * userfilename = table_info->m_user_filename;
+ gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
+
+ gchar * tmppathname = g_build_filename(context->m_user_dir,
+ tmpfilename, NULL);
g_free(tmpfilename);
gchar * chunkpathname = g_build_filename(context->m_user_dir,
userfilename, NULL);
- log->save(tmppathname);
int result = rename(tmppathname, chunkpathname);
if (0 != result)
@@ -801,14 +882,9 @@ bool pinyin_save(pinyin_context_t * context){
g_free(chunkpathname);
g_free(tmppathname);
- delete log;
}
if (USER_FILE == table_info->m_file_type) {
- /* user phrase library */
- MemoryChunk * chunk = new MemoryChunk;
- context->m_phrase_index->store(i, chunk);
-
const char * userfilename = table_info->m_user_filename;
gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
gchar * tmppathname = g_build_filename(context->m_user_dir,
@@ -818,8 +894,6 @@ bool pinyin_save(pinyin_context_t * context){
gchar * chunkpathname = g_build_filename(context->m_user_dir,
userfilename, NULL);
- chunk->save(tmppathname);
-
int result = rename(tmppathname, chunkpathname);
if (0 != result)
fprintf(stderr, "rename %s to %s failed.\n",
@@ -827,19 +901,15 @@ bool pinyin_save(pinyin_context_t * context){
g_free(chunkpathname);
g_free(tmppathname);
- delete chunk;
}
}
/* save user pinyin table */
gchar * tmpfilename = g_build_filename
(context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL);
- unlink(tmpfilename);
gchar * filename = g_build_filename
(context->m_user_dir, USER_PINYIN_INDEX, NULL);
- context->m_pinyin_table->store(tmpfilename);
-
int result = rename(tmpfilename, filename);
if (0 != result)
fprintf(stderr, "rename %s to %s failed.\n",
@@ -851,12 +921,9 @@ bool pinyin_save(pinyin_context_t * context){
/* save user phrase table */
tmpfilename = g_build_filename
(context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL);
- unlink(tmpfilename);
filename = g_build_filename
(context->m_user_dir, USER_PHRASE_INDEX, NULL);
- context->m_phrase_table->store(tmpfilename);
-
result = rename(tmpfilename, filename);
if (0 != result)
fprintf(stderr, "rename %s to %s failed.\n",
@@ -868,9 +935,7 @@ bool pinyin_save(pinyin_context_t * context){
/* save user bi-gram */
tmpfilename = g_build_filename
(context->m_user_dir, USER_BIGRAM ".tmp", NULL);
- unlink(tmpfilename);
filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL);
- context->m_user_bigram->save_db(tmpfilename);
result = rename(tmpfilename, filename);
if (0 != result)
@@ -880,10 +945,24 @@ bool pinyin_save(pinyin_context_t * context){
g_free(tmpfilename);
g_free(filename);
+ return true;
+}
+
+bool pinyin_save(pinyin_context_t * context){
+ if (!context->m_user_dir)
+ return false;
+
+ if (!context->m_modified)
+ return false;
+
+ context->m_phrase_index->compact();
+
+ bool retval = _write_files(context) && _rename_files(context);
+
mark_version(context);
context->m_modified = false;
- return true;
+ return retval;
}
bool pinyin_set_full_pinyin_scheme(pinyin_context_t * context,
@@ -926,7 +1005,7 @@ bool pinyin_set_zhuyin_scheme(pinyin_context_t * context,
context->m_chewing_parser = new ZhuyinDaChenCP26Parser2();
break;
default:
- assert(FALSE);
+ abort();
}
return true;
}
@@ -1043,9 +1122,12 @@ pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
pinyin_instance_t * instance = new pinyin_instance_t;
instance->m_context = context;
+ instance->m_prefix_ucs4 = NULL;
+ instance->m_prefix_len = 0;
instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
instance->m_parsed_len = 0;
+ instance->m_parsed_key_len = 0;
instance->m_constraints = new ForwardPhoneticConstraints
(context->m_phrase_index);
@@ -1055,6 +1137,9 @@ pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
instance->m_candidates =
g_array_new(TRUE, TRUE, sizeof(lookup_candidate_t));
+ instance->m_sort_option =
+ SORT_BY_PHRASE_LENGTH | SORT_BY_PINYIN_LENGTH | SORT_BY_FREQUENCY;
+
return instance;
}
@@ -1071,6 +1156,7 @@ static bool _free_candidates(CandidateVector candidates) {
}
void pinyin_free_instance(pinyin_instance_t * instance){
+ g_free(instance->m_prefix_ucs4);
g_array_free(instance->m_prefixes, TRUE);
delete instance->m_constraints;
g_array_free(instance->m_phrase_result, TRUE);
@@ -1116,17 +1202,22 @@ static void _compute_prefixes(pinyin_instance_t * instance,
pinyin_context_t * & context = instance->m_context;
FacadePhraseIndex * & phrase_index = context->m_phrase_index;
- glong len_str = 0;
- ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL);
GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ g_free (instance->m_prefix_ucs4);
+ instance->m_prefix_ucs4 = g_utf8_to_ucs4(prefix, -1, NULL,
+ &(instance->m_prefix_len), NULL);
+
+ const ucs4_t * ucs4_str = instance->m_prefix_ucs4;
+ const glong len_str = instance->m_prefix_len;
+
if (ucs4_str && len_str) {
/* add prefixes. */
for (ssize_t i = 1; i <= len_str; ++i) {
if (i > MAX_PHRASE_LENGTH)
break;
- ucs4_t * start = ucs4_str + len_str - i;
+ const ucs4_t * start = ucs4_str + len_str - i;
PhraseTokens tokens;
memset(tokens, 0, sizeof(tokens));
@@ -1141,7 +1232,6 @@ static void _compute_prefixes(pinyin_instance_t * instance,
}
}
g_array_free(tokenarray, TRUE);
- g_free(ucs4_str);
}
bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
@@ -1193,7 +1283,7 @@ bool pinyin_get_sentence(pinyin_instance_t * instance,
MatchResult result = NULL;
assert(index < results.size());
- assert(results.get_result(index, result));
+ check_result(results.get_result(index, result));
bool retval = pinyin::convert_to_utf8
(context->m_phrase_index, result,
@@ -1208,9 +1298,10 @@ bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
pinyin_context_t * & context = instance->m_context;
pinyin_option_t options = context->m_options;
+ gint16 distance = 0;
int pinyin_len = strlen(onepinyin);
bool retval = context->m_full_pinyin_parser->parse_one_key
- (options, *onekey, onepinyin, pinyin_len);
+ (options, *onekey, distance, onepinyin, pinyin_len);
return retval;
}
@@ -1229,6 +1320,7 @@ size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
key_rests, pinyins, strlen(pinyins));
instance->m_parsed_len = parsed_len;
+ instance->m_parsed_key_len = keys->len;
fill_matrix(&matrix, keys, key_rests, parsed_len);
@@ -1249,9 +1341,10 @@ bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
pinyin_context_t * & context = instance->m_context;
pinyin_option_t options = context->m_options;
+ gint16 distance = 0;
int pinyin_len = strlen(onepinyin);
bool retval = context->m_double_pinyin_parser->parse_one_key
- (options, *onekey, onepinyin, pinyin_len);
+ (options, *onekey, distance, onepinyin, pinyin_len);
return retval;
}
@@ -1270,6 +1363,7 @@ size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
key_rests, pinyins, strlen(pinyins));
instance->m_parsed_len = parsed_len;
+ instance->m_parsed_key_len = keys->len;
fill_matrix(&matrix, keys, key_rests, parsed_len);
@@ -1289,9 +1383,10 @@ bool pinyin_parse_chewing(pinyin_instance_t * instance,
/* disable the zhuyin correction options. */
options &= ~ZHUYIN_CORRECT_ALL;
+ gint16 distance = 0;
int chewing_len = strlen(onechewing);
bool retval = context->m_chewing_parser->parse_one_key
- (options, *onekey, onechewing, chewing_len );
+ (options, *onekey, distance, onechewing, chewing_len );
return retval;
}
@@ -1313,6 +1408,7 @@ size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
key_rests, chewings, strlen(chewings));
instance->m_parsed_len = parsed_len;
+ instance->m_parsed_key_len = keys->len;
fill_matrix(&matrix, keys, key_rests, parsed_len);
@@ -1341,6 +1437,7 @@ bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
static bool _token_get_phrase(FacadePhraseIndex * phrase_index,
phrase_token_t token,
+ guint begin,
guint * len,
gchar ** utf8_str) {
PhraseItem item;
@@ -1353,9 +1450,9 @@ static bool _token_get_phrase(FacadePhraseIndex * phrase_index,
item.get_phrase_string(buffer);
guint length = item.get_phrase_length();
if (len)
- *len = length;
+ *len = length - begin;
if (utf8_str)
- *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
+ *utf8_str = g_ucs4_to_utf8(buffer + begin, length - begin, NULL, NULL, NULL);
return true;
}
@@ -1370,7 +1467,6 @@ static gint compare_item_with_token(gconstpointer lhs,
return (token_lhs - token_rhs);
}
-#endif
static gint compare_item_with_phrase_length_and_frequency(gconstpointer lhs,
gconstpointer rhs) {
@@ -1388,28 +1484,39 @@ static gint compare_item_with_phrase_length_and_frequency(gconstpointer lhs,
return -(freq_lhs - freq_rhs); /* in descendant order */
}
+#endif
-static gint compare_item_with_phrase_length_and_pinyin_length_and_frequency
-(gconstpointer lhs, gconstpointer rhs) {
+static gint compare_item_with_sort_option
+(gconstpointer lhs, gconstpointer rhs, gpointer user_data) {
lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
+ guint sort_option = GPOINTER_TO_UINT(user_data);
- guint8 len_lhs = item_lhs->m_phrase_length;
- guint8 len_rhs = item_rhs->m_phrase_length;
+ if (sort_option & SORT_BY_PHRASE_LENGTH) {
+ guint8 len_lhs = item_lhs->m_phrase_length;
+ guint8 len_rhs = item_rhs->m_phrase_length;
- if (len_lhs != len_rhs)
- return -(len_lhs - len_rhs); /* in descendant order */
+ if (len_lhs != len_rhs)
+ return -(len_lhs - len_rhs); /* in descendant order */
+ }
- len_lhs = item_lhs->m_end - item_lhs->m_begin;
- len_rhs = item_rhs->m_end - item_rhs->m_begin;
+ if (sort_option & SORT_BY_PINYIN_LENGTH) {
+ guint8 len_lhs = item_lhs->m_end - item_lhs->m_begin;
+ guint8 len_rhs = item_rhs->m_end - item_rhs->m_begin;
- if (len_lhs != len_rhs)
- return -(len_lhs - len_rhs); /* in descendant order */
+ if (len_lhs != len_rhs)
+ return -(len_lhs - len_rhs); /* in descendant order */
+ }
- guint32 freq_lhs = item_lhs->m_freq;
- guint32 freq_rhs = item_rhs->m_freq;
+ if (sort_option & SORT_BY_FREQUENCY) {
+ guint32 freq_lhs = item_lhs->m_freq;
+ guint32 freq_rhs = item_rhs->m_freq;
- return -(freq_lhs - freq_rhs); /* in descendant order */
+ if (freq_lhs != freq_rhs)
+ return -(freq_lhs - freq_rhs); /* in descendant order */
+ }
+
+ return 0;
}
static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
@@ -1452,7 +1559,7 @@ static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
/* use the first candidate. */
MatchResult result = NULL;
- assert(results.get_result(0, result));
+ check_result(results.get_result(0, result));
phrase_token_t cur_token = g_array_index
(result, phrase_token_t, offset);
@@ -1513,7 +1620,23 @@ static void _compute_frequency_of_items(pinyin_context_t * context,
gfloat lambda = context->m_system_table_info.get_lambda();
- /* handle addon candidates first. */
+ /* handle prefix candidates. */
+ if (PREDICTED_PREFIX_CANDIDATE == item->m_candidate_type) {
+ total_freq = context->m_phrase_index->
+ get_phrase_index_total_freq();
+
+ context->m_phrase_index->get_phrase_item
+ (token, cached_item);
+
+ /* Note: possibility value <= 1.0. */
+ guint32 freq = ((1 - lambda) *
+ cached_item.get_unigram_frequency() /
+ (gfloat) total_freq) * 256 * 256 * 256;
+ item->m_freq = freq;
+ continue;
+ }
+
+ /* handle addon candidates. */
if (ADDON_CANDIDATE == item->m_candidate_type) {
total_freq = context->m_phrase_index->
get_phrase_index_total_freq();
@@ -1555,6 +1678,70 @@ static void _compute_frequency_of_items(pinyin_context_t * context,
}
}
+/**
+ * _prepend_longer_candidates:
+ * @instance: the pinyin instance holding the parsed key matrix.
+ * @candidates: the candidate vector to prepend to.
+ * @returns: true when a LONGER_CANDIDATE was prepended, false when no
+ * token qualified.
+ *
+ * Collect the tokens found by search_suggestion_with_matrix() for the
+ * parsed keys, skip phrases longer than prefix_len * 2 + 1, and prepend
+ * the remaining token with the highest unigram frequency.
+ */
+static bool _prepend_longer_candidates(pinyin_instance_t * instance,
+                                       CandidateVector candidates) {
+
+    pinyin_context_t * & context = instance->m_context;
+    FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+    PhoneticKeyMatrix & matrix = instance->m_matrix;
+    size_t prefix_len = instance->m_parsed_key_len;
+
+    GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(tokens));
+    phrase_index->prepare_tokens(tokens);
+    search_suggestion_with_matrix
+        (context->m_pinyin_table, &matrix, prefix_len, tokens);
+    reduce_tokens(tokens, tokenarray, false);
+    phrase_index->destroy_tokens(tokens);
+
+    phrase_token_t longer_token = null_token;
+    PhraseItem longer_item, item;
+    for (guint i = 0; i < tokenarray->len; ++i) {
+        phrase_token_t token = g_array_index(tokenarray, phrase_token_t, i);
+
+        if (ERROR_OK != phrase_index->get_phrase_item(token, item))
+            continue;
+
+        /* skip the phrase longer than prefix_len * 2 + 1 */
+        if (item.get_phrase_length() > (prefix_len * 2 + 1))
+            continue;
+
+        /* take the first usable token, then only strictly more
+           frequent ones; longer_item is re-fetched for the winner. */
+        if (longer_token == null_token ||
+            item.get_unigram_frequency() >
+            longer_item.get_unigram_frequency()) {
+            longer_token = token;
+            phrase_index->get_phrase_item(longer_token, longer_item);
+        }
+    }
+
+    /* the token array is not needed past this point; release it before
+       any return so the early exit below does not leak it. */
+    g_array_free(tokenarray, TRUE);
+
+    if (longer_token == null_token)
+        return false;
+
+    /* compute the unigram frequency. */
+    gfloat lambda = context->m_system_table_info.get_lambda();
+    guint32 total_freq = phrase_index->get_phrase_index_total_freq();
+    guint32 freq = ((1 - lambda) *
+                    longer_item.get_unigram_frequency() /
+                    (gfloat) total_freq) * 256 * 256 * 256;
+
+    /* prepend longer candidate to candidates. */
+    lookup_candidate_t candidate;
+    candidate.m_candidate_type = LONGER_CANDIDATE;
+    candidate.m_token = longer_token;
+    candidate.m_freq = freq;
+    g_array_prepend_val(candidates, candidate);
+
+    return true;
+}
+
+
static bool _prepend_sentence_candidates(pinyin_instance_t * instance,
CandidateVector candidates) {
const size_t size = instance->m_nbest_results.size();
@@ -1588,20 +1775,26 @@ static bool _compute_phrase_length(pinyin_context_t * context,
switch(candidate->m_candidate_type) {
case NBEST_MATCH_CANDIDATE:
- assert(FALSE);
+ abort();
case NORMAL_CANDIDATE:
- case PREDICTED_CANDIDATE: {
+ case PREDICTED_BIGRAM_CANDIDATE: {
phrase_index->get_phrase_item(candidate->m_token, item);
candidate->m_phrase_length = item.get_phrase_length();
break;
}
+ case PREDICTED_PREFIX_CANDIDATE: {
+ phrase_index->get_phrase_item(candidate->m_token, item);
+ candidate->m_phrase_length =
+ item.get_phrase_length() - candidate->m_begin;
+ break;
+ }
case ADDON_CANDIDATE: {
addon_phrase_index->get_phrase_item(candidate->m_token, item);
candidate->m_phrase_length = item.get_phrase_length();
break;
}
case ZOMBIE_CANDIDATE:
- assert(FALSE);
+ abort();
}
}
@@ -1624,20 +1817,27 @@ static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
break;
}
case NORMAL_CANDIDATE:
- case PREDICTED_CANDIDATE:
+ case LONGER_CANDIDATE:
+ case PREDICTED_BIGRAM_CANDIDATE:
+ _token_get_phrase
+ (instance->m_context->m_phrase_index,
+ candidate->m_token, 0, NULL,
+ &(candidate->m_phrase_string));
+ break;
+ case PREDICTED_PREFIX_CANDIDATE:
_token_get_phrase
(instance->m_context->m_phrase_index,
- candidate->m_token, NULL,
+ candidate->m_token, candidate->m_begin, NULL,
&(candidate->m_phrase_string));
break;
case ADDON_CANDIDATE:
_token_get_phrase
(instance->m_context->m_addon_phrase_index,
- candidate->m_token, NULL,
+ candidate->m_token, 0, NULL,
&(candidate->m_phrase_string));
break;
case ZOMBIE_CANDIDATE:
- assert(FALSE);
+ abort();
}
}
@@ -1662,8 +1862,7 @@ static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
static bool _remove_duplicated_items_by_phrase_string
-(pinyin_instance_t * instance,
- CandidateVector candidates) {
+(pinyin_instance_t * instance, CandidateVector candidates) {
size_t i;
/* create the GArray of indexed item */
GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
@@ -1690,6 +1889,22 @@ static bool _remove_duplicated_items_by_phrase_string
cur_item->m_phrase_string)) {
/* found duplicated candidates */
+ /* a longer candidate is longer than the pinyin input,
+ so it can only be equal to another longer candidate. */
+
+ if (LONGER_CANDIDATE == saved_item->m_candidate_type &&
+ LONGER_CANDIDATE == cur_item->m_candidate_type) {
+ /* keep the higher possibility one */
+ if (saved_item->m_freq < cur_item->m_freq) {
+ cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ } else {
+ saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ saved_item = cur_item;
+ }
+
+ continue;
+ }
+
/* both are nbest match candidate */
if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type &&
NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
@@ -1772,7 +1987,7 @@ static bool _check_offset(PhoneticKeyMatrix & matrix, size_t offset) {
bool pinyin_guess_candidates(pinyin_instance_t * instance,
size_t offset,
- sort_option_t sort_option) {
+ guint sort_option) {
pinyin_context_t * & context = instance->m_context;
pinyin_option_t & options = context->m_options;
@@ -1784,6 +1999,8 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance,
if (0 == matrix.size())
return false;
+ instance->m_sort_option = sort_option;
+
/* lookup the previous token here. */
phrase_token_t prev_token = null_token;
@@ -1870,20 +2087,17 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance,
_compute_frequency_of_items(context, prev_token, &merged_gram, candidates);
/* sort the candidates. */
- switch (sort_option) {
- case SORT_BY_PHRASE_LENGTH_AND_FREQUENCY:
- g_array_sort(candidates,
- compare_item_with_phrase_length_and_frequency);
- break;
- case SORT_BY_PHRASE_LENGTH_AND_PINYIN_LENGTH_AND_FREQUENCY:
- g_array_sort(candidates,
- compare_item_with_phrase_length_and_pinyin_length_and_frequency);
- break;
- }
+ g_array_sort_with_data
+ (candidates, compare_item_with_sort_option,
+ GUINT_TO_POINTER(sort_option));
/* post process to remove duplicated candidates */
- _prepend_sentence_candidates(instance, instance->m_candidates);
+ if (!(sort_option & SORT_WITHOUT_LONGER_CANDIDATE))
+ _prepend_longer_candidates(instance, instance->m_candidates);
+
+ if (!(sort_option & SORT_WITHOUT_SENTENCE_CANDIDATE))
+ _prepend_sentence_candidates(instance, instance->m_candidates);
_compute_phrase_strings_of_items(instance, instance->m_candidates);
@@ -1910,6 +2124,7 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
_free_candidates(candidates);
+ /* search bigram candidate. */
g_array_set_size(instance->m_prefixes, 0);
_compute_prefixes(instance, prefix);
@@ -1925,45 +2140,79 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
context->m_user_bigram->load(prev_token, user_gram);
merge_single_gram(&merged_gram, NULL, user_gram);
+ if (user_gram)
+ delete user_gram;
+
if (merged_gram.get_length())
break;
}
- if (0 == merged_gram.get_length())
- return false;
+ if (0 != merged_gram.get_length()) {
- /* retrieve all items. */
- BigramPhraseWithCountArray tokens = g_array_new
- (FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
- merged_gram.retrieve_all(tokens);
+ /* retrieve all items. */
+ BigramPhraseWithCountArray tokens = g_array_new
+ (FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+ merged_gram.retrieve_all(tokens);
- /* sort the longer word first. */
- PhraseItem cached_item;
- for (ssize_t len = length; len > 0; --len) {
- /* append items. */
- for (size_t k = 0; k < tokens->len; ++k){
- BigramPhraseItemWithCount * phrase_item = &g_array_index
- (tokens, BigramPhraseItemWithCount, k);
+ /* sort the longer word first. */
+ PhraseItem cached_item;
+ for (ssize_t len = length; len > 0; --len) {
+ /* append items. */
+ for (size_t k = 0; k < tokens->len; ++k){
+ BigramPhraseItemWithCount * phrase_item = &g_array_index
+ (tokens, BigramPhraseItemWithCount, k);
- if (phrase_item->m_count < filter)
- continue;
+ if (phrase_item->m_count < filter)
+ continue;
- int result = phrase_index->get_phrase_item
- (phrase_item->m_token, cached_item);
- if (ERROR_NO_SUB_PHRASE_INDEX == result)
- continue;
+ int result = phrase_index->get_phrase_item
+ (phrase_item->m_token, cached_item);
+ if (ERROR_NO_SUB_PHRASE_INDEX == result)
+ continue;
- if (len != cached_item.get_phrase_length())
- continue;
+ if (len != cached_item.get_phrase_length())
+ continue;
- lookup_candidate_t item;
- item.m_candidate_type = PREDICTED_CANDIDATE;
- item.m_token = phrase_item->m_token;
- g_array_append_val(candidates, item);
+ lookup_candidate_t item;
+ item.m_candidate_type = PREDICTED_BIGRAM_CANDIDATE;
+ item.m_token = phrase_item->m_token;
+ g_array_append_val(candidates, item);
+ }
}
+ }
+
+ /* search prefix candidate. */
+ GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+ PhraseTokens phrase_tokens;
+ memset(phrase_tokens, 0, sizeof(phrase_tokens));
+ phrase_index->prepare_tokens(phrase_tokens);
+ int result = context->m_phrase_table->search_suggestion
+ (instance->m_prefix_len, instance->m_prefix_ucs4, phrase_tokens);
+ int num = reduce_tokens(phrase_tokens, tokenarray, false);
+ phrase_index->destroy_tokens(phrase_tokens);
+
+ PhraseItem item;
+ for (size_t i = 0; i < tokenarray->len; ++i) {
+ phrase_token_t token = g_array_index(tokenarray, phrase_token_t, i);
+
+ phrase_index->get_phrase_item(token, item);
+ /* skip the phrase longer than prefix_len * 2 + 1 */
+ if (item.get_phrase_length() > (instance->m_prefix_len * 2 + 1))
+ continue;
+
+ lookup_candidate_t template_item;
+ template_item.m_candidate_type = PREDICTED_PREFIX_CANDIDATE;
+ template_item.m_token = token;
+ template_item.m_begin = instance->m_prefix_len;
+ /* The prefix candidate only uses the m_begin variable. */
+ template_item.m_end = 0;
+ g_array_append_val(candidates, template_item);
}
+ g_array_free(tokenarray, TRUE);
+
/* post process to sort the candidates */
_compute_phrase_length(context, candidates);
@@ -1971,7 +2220,11 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
_compute_frequency_of_items(context, prev_token, &merged_gram, candidates);
/* sort the candidates by phrase length and frequency. */
- g_array_sort(candidates, compare_item_with_phrase_length_and_frequency);
+ guint sort_option = SORT_BY_PHRASE_LENGTH | SORT_BY_FREQUENCY;
+
+ g_array_sort_with_data
+ (candidates, compare_item_with_sort_option,
+ GUINT_TO_POINTER(sort_option));
/* post process to remove duplicated candidates */
@@ -1979,16 +2232,17 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
_remove_duplicated_items_by_phrase_string(instance, instance->m_candidates);
- if (user_gram)
- delete user_gram;
-
return true;
}
int pinyin_choose_candidate(pinyin_instance_t * instance,
size_t offset,
lookup_candidate_t * candidate){
- assert(PREDICTED_CANDIDATE != candidate->m_candidate_type);
+ const guint32 initial_seed = 23 * 3;
+ const guint32 unigram_factor = 7;
+
+ assert(PREDICTED_BIGRAM_CANDIDATE != candidate->m_candidate_type &&
+ PREDICTED_PREFIX_CANDIDATE != candidate->m_candidate_type);
pinyin_context_t * context = instance->m_context;
PhoneticKeyMatrix & matrix = instance->m_matrix;
@@ -1997,12 +2251,23 @@ int pinyin_choose_candidate(pinyin_instance_t * instance,
if (NBEST_MATCH_CANDIDATE == candidate->m_candidate_type) {
MatchResult best = NULL, other = NULL;
- assert(results.get_result(0, best));
- assert(results.get_result(candidate->m_nbest_index, other));
+ check_result(results.get_result(0, best));
+ check_result(results.get_result(candidate->m_nbest_index, other));
constraints->diff_result(best, other);
return matrix.size() - 1;
}
+ if (LONGER_CANDIDATE == candidate->m_candidate_type) {
+ /* only train uni-gram for longer candidate. */
+ phrase_token_t token = candidate->m_token;
+ int error = context->m_phrase_index->add_unigram_frequency
+ (token, initial_seed * unigram_factor);
+ if (ERROR_INTEGER_OVERFLOW == error)
+ return false;
+
+ return true;
+ }
+
if (ADDON_CANDIDATE == candidate->m_candidate_type) {
PhraseItem item;
context->m_addon_phrase_index->get_phrase_item
@@ -2034,6 +2299,19 @@ int pinyin_choose_candidate(pinyin_instance_t * instance,
candidate->m_token = token;
}
+ if (instance->m_sort_option & SORT_WITHOUT_SENTENCE_CANDIDATE) {
+ assert(0 == offset);
+
+ /* only train uni-gram. */
+ phrase_token_t token = candidate->m_token;
+ int error = context->m_phrase_index->add_unigram_frequency
+ (token, initial_seed * unigram_factor);
+ if (ERROR_INTEGER_OVERFLOW == error)
+ return false;
+
+ return true;
+ }
+
/* sync m_constraints to the length of m_pinyin_keys. */
bool retval = constraints->validate_constraint(&matrix);
@@ -2049,7 +2327,8 @@ int pinyin_choose_candidate(pinyin_instance_t * instance,
bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance,
lookup_candidate_t * candidate){
- assert(PREDICTED_CANDIDATE == candidate->m_candidate_type);
+ assert(PREDICTED_BIGRAM_CANDIDATE == candidate->m_candidate_type ||
+ PREDICTED_PREFIX_CANDIDATE == candidate->m_candidate_type);
const guint32 initial_seed = 23 * 3;
const guint32 unigram_factor = 7;
@@ -2064,6 +2343,10 @@ bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance,
if (ERROR_INTEGER_OVERFLOW == error)
return false;
+ /* The prefix candidate only trains uni-gram frequency. */
+ if (PREDICTED_PREFIX_CANDIDATE == candidate->m_candidate_type)
+ return true;
+
phrase_token_t prev_token = _get_previous_token(instance, 0);
if (null_token == prev_token)
return false;
@@ -2076,14 +2359,14 @@ bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance,
/* train bi-gram */
guint32 total_freq = 0;
- assert(user_gram->get_total_freq(total_freq));
+ check_result(user_gram->get_total_freq(total_freq));
guint32 freq = 0;
if (!user_gram->get_freq(token, freq)) {
- assert(user_gram->insert_freq(token, initial_seed));
+ check_result(user_gram->insert_freq(token, initial_seed));
} else {
- assert(user_gram->set_freq(token, freq + initial_seed));
+ check_result(user_gram->set_freq(token, freq + initial_seed));
}
- assert(user_gram->set_total_freq(total_freq + initial_seed));
+ check_result(user_gram->set_total_freq(total_freq + initial_seed));
context->m_user_bigram->store(prev_token, user_gram);
delete user_gram;
return true;
@@ -2131,7 +2414,7 @@ bool pinyin_train(pinyin_instance_t * instance, guint8 index){
MatchResult result = NULL;
assert(index < results.size());
- assert(results.get_result(index, result));
+ check_result(results.get_result(index, result));
bool retval = context->m_pinyin_lookup->train_result3
(&matrix, instance->m_constraints, result);
@@ -2229,7 +2512,7 @@ bool pinyin_token_get_phrase(pinyin_instance_t * instance,
pinyin_context_t * & context = instance->m_context;
return _token_get_phrase(context->m_phrase_index,
- token, len, utf8_str);
+ token, 0, len, utf8_str);
}
bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance,
@@ -2769,7 +3052,7 @@ static gchar * _get_aux_text_prefix(pinyin_instance_t * instance,
else if (IS_ZHUYIN == options)
str = key.get_zhuyin_string();
else
- assert(FALSE);
+ abort();
gchar * newprefix = g_strconcat(prefix, str, " ", NULL);
@@ -2813,7 +3096,7 @@ static gchar * _get_aux_text_postfix(pinyin_instance_t * instance,
else if (IS_ZHUYIN == options)
str = key.get_zhuyin_string();
else
- assert(FALSE);
+ abort();
gchar * newpostfix = g_strconcat(postfix, str, " ", NULL);
@@ -2934,7 +3217,7 @@ bool pinyin_get_double_pinyin_auxiliary_text(pinyin_instance_t * instance,
middle = g_strconcat(shengmu, yunmu, "|", NULL);
break;
default:
- assert(FALSE);
+ abort();
}
g_free(shengmu);
@@ -3046,8 +3329,10 @@ static bool _remember_phrase_recur(pinyin_instance_t * instance,
return false;
/* as cached_keys and phrase has the same length. */
- assert(cached_keys->len > 0);
- assert(cached_keys->len <= MAX_PHRASE_LENGTH);
+ if (cached_keys->len <= 0)
+ return false;
+ if (cached_keys->len > MAX_PHRASE_LENGTH)
+ return false;
return _add_phrase(context, index, cached_keys,
phrase, phrase_length, count);
@@ -3055,7 +3340,8 @@ static bool _remember_phrase_recur(pinyin_instance_t * instance,
const size_t size = matrix.get_column_size(start);
/* assume pinyin parsers will filter invalid keys. */
- assert(size > 0);
+ if (size <= 0)
+ return false;
bool result = false;
@@ -3069,18 +3355,22 @@ static bool _remember_phrase_recur(pinyin_instance_t * instance,
const ChewingKey zero_key;
if (zero_key == key) {
/* assume only one key here for "'" or the last key. */
- assert(1 == size);
+ if (1 != size)
+ return false;
+
return _remember_phrase_recur
(instance, cached_keys, cached_tokens,
newstart, phrase, count);
}
+#if 0
/* meet in-complete pinyin */
if (CHEWING_ZERO_MIDDLE == key.m_middle &&
CHEWING_ZERO_FINAL == key.m_final) {
assert(CHEWING_ZERO_TONE == key.m_tone);
return false;
}
+#endif
/* check pronunciation */
if (cached_keys->len >= phrase_length)
@@ -3136,7 +3426,8 @@ bool pinyin_remember_user_input(pinyin_instance_t * instance,
return false;
}
- assert(cached_tokens->len == phrase_length);
+ if (cached_tokens->len != phrase_length)
+ return false;
ChewingKeyVector cached_keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey));
@@ -3152,7 +3443,8 @@ bool pinyin_remember_user_input(pinyin_instance_t * instance,
bool pinyin_is_user_candidate(pinyin_instance_t * instance,
lookup_candidate_t * candidate) {
- if (NORMAL_CANDIDATE != candidate->m_candidate_type)
+ if (NORMAL_CANDIDATE != candidate->m_candidate_type &&
+ LONGER_CANDIDATE != candidate->m_candidate_type)
return false;
phrase_token_t token = candidate->m_token;
diff --git a/src/pinyin.h b/src/pinyin.h
index 6328e1d..bf3ac38 100644
--- a/src/pinyin.h
+++ b/src/pinyin.h
@@ -43,13 +43,24 @@ typedef enum _lookup_candidate_type_t{
NBEST_MATCH_CANDIDATE = 1,
NORMAL_CANDIDATE,
ZOMBIE_CANDIDATE,
- PREDICTED_CANDIDATE,
+ PREDICTED_BIGRAM_CANDIDATE,
+ PREDICTED_PREFIX_CANDIDATE,
ADDON_CANDIDATE,
+ LONGER_CANDIDATE,
} lookup_candidate_type_t;
typedef enum _sort_option_t{
- SORT_BY_PHRASE_LENGTH_AND_FREQUENCY = 1,
- SORT_BY_PHRASE_LENGTH_AND_PINYIN_LENGTH_AND_FREQUENCY,
+ /* The sort order is phrase length, pinyin length, frequency. */
+ SORT_WITHOUT_SENTENCE_CANDIDATE = 0x1,
+ SORT_WITHOUT_LONGER_CANDIDATE = 0x2,
+ SORT_BY_PHRASE_LENGTH = 0x4,
+ SORT_BY_PINYIN_LENGTH = 0x8,
+ SORT_BY_FREQUENCY = 0x10,
+ /* For compatibility. */
+ SORT_BY_PHRASE_LENGTH_AND_FREQUENCY =
+ SORT_WITHOUT_LONGER_CANDIDATE | SORT_BY_PHRASE_LENGTH | SORT_BY_FREQUENCY,
+ SORT_BY_PHRASE_LENGTH_AND_PINYIN_LENGTH_AND_FREQUENCY =
+ SORT_WITHOUT_LONGER_CANDIDATE | SORT_BY_PHRASE_LENGTH | SORT_BY_PINYIN_LENGTH | SORT_BY_FREQUENCY,
} sort_option_t;
/**
@@ -483,7 +494,7 @@ bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
*/
bool pinyin_guess_candidates(pinyin_instance_t * instance,
size_t offset,
- sort_option_t sort_option);
+ guint sort_option);
/**
* pinyin_choose_candidate:
@@ -600,20 +611,6 @@ bool pinyin_get_luoma_pinyin_string(pinyin_instance_t * instance,
gchar ** utf8_str);
/**
- * pinyin_get_luoma_pinyin_string:
- * @instance: the pinyin instance.
- * @key: the pinyin key.
- * @utf8_str: the luoma pinyin string.
- * @returns: whether the get operation is successful.
- *
- * Get the luoma pinyin string of the key.
- *
- */
-bool pinyin_get_luoma_pinyin_string(pinyin_instance_t * instance,
- ChewingKey * key,
- gchar ** utf8_str);
-
-/**
* pinyin_get_secondary_zhuyin_string:
* @instance: the pinyin instance.
* @key: the pinyin key.
diff --git a/src/pinyin_internal.h b/src/pinyin_internal.h
index 082360b..44b455f 100644
--- a/src/pinyin_internal.h
+++ b/src/pinyin_internal.h
@@ -27,6 +27,7 @@
#include "memory_chunk.h"
#include "pinyin_custom2.h"
#include "chewing_key.h"
+#include "pinyin_utils.h"
#include "pinyin_parser2.h"
#include "zhuyin_parser2.h"
#include "phonetic_key_matrix.h"
diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am
index ba9d4f4..b64b9ea 100644
--- a/src/storage/Makefile.am
+++ b/src/storage/Makefile.am
@@ -14,7 +14,7 @@
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
-INCLUDES = -I$(top_srcdir)/src/include \
+AM_CPPFLAGS = -I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
@GLIB2_CFLAGS@
@@ -26,7 +26,7 @@ if ENABLE_LIBZHUYIN
libpinyininclude_HEADERS += zhuyin_custom2.h
endif
-noinst_HEADERS = chewing_enum.h \
+noinst_HEADERS = chewing_enum.h \
chewing_key.h \
pinyin_parser2.h \
zhuyin_parser2.h \
@@ -64,13 +64,9 @@ noinst_HEADERS = chewing_enum.h \
kyotodb_utils.h
-noinst_LTLIBRARIES = libstorage.la
+noinst_LIBRARIES = libstorage.a
-libstorage_la_CXXFLAGS = "-fPIC"
-
-libstorage_la_LDFLAGS = -static
-
-libstorage_la_SOURCES = phrase_index.cpp \
+libstorage_a_SOURCES = phrase_index.cpp \
phrase_large_table2.cpp \
phrase_large_table3.cpp \
ngram.cpp \
@@ -84,13 +80,13 @@ libstorage_la_SOURCES = phrase_index.cpp \
table_info.cpp
if BERKELEYDB
-libstorage_la_SOURCES += ngram_bdb.cpp \
+libstorage_a_SOURCES += ngram_bdb.cpp \
phrase_large_table3_bdb.cpp \
chewing_large_table2_bdb.cpp
endif
if KYOTOCABINET
-libstorage_la_SOURCES += ngram_kyotodb.cpp \
+libstorage_a_SOURCES += ngram_kyotodb.cpp \
phrase_large_table3_kyotodb.cpp \
chewing_large_table2_kyotodb.cpp
endif
diff --git a/src/storage/bdb_utils.h b/src/storage/bdb_utils.h
index b1c5832..7a83793 100644
--- a/src/storage/bdb_utils.h
+++ b/src/storage/bdb_utils.h
@@ -59,6 +59,10 @@ inline bool copy_bdb(DB * srcdb, DB * destdb) {
while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
ret = destdb->put(destdb, NULL, &key, &data, 0);
assert(0 == ret);
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
}
assert(DB_NOTFOUND == ret);
diff --git a/src/storage/chewing_large_table.cpp b/src/storage/chewing_large_table.cpp
index 5716c5c..bd76e9b 100644
--- a/src/storage/chewing_large_table.cpp
+++ b/src/storage/chewing_large_table.cpp
@@ -344,7 +344,7 @@ ChewingLengthIndexLevel::~ChewingLengthIndexLevel() {
CASE(14);
CASE(15);
default:
- assert(false);
+ abort();
}
}
#undef CASE
@@ -389,7 +389,7 @@ int ChewingLengthIndexLevel::search(pinyin_option_t options, int phrase_length,
CASE(14);
CASE(15);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -545,7 +545,7 @@ int ChewingLengthIndexLevel::add_index(int phrase_length,
CASE(14);
CASE(15);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -599,7 +599,7 @@ int ChewingLengthIndexLevel::remove_index(int phrase_length,
CASE(14);
CASE(15);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -669,8 +669,13 @@ bool ChewingLargeTable::load_text(FILE * infile, TABLE_PHONETIC_TYPE type) {
size_t freq;
while (!feof(infile)) {
+#ifdef __APPLE__
+ int num = fscanf(infile, "%255s %255[^ \t] %u %ld",
+ pinyin, phrase, &token, &freq);
+#else
int num = fscanf(infile, "%255s %255s %u %ld",
pinyin, phrase, &token, &freq);
+#endif
if (4 != num)
continue;
@@ -799,7 +804,7 @@ bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk,
bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
table_offset_t end) {
char * begin = (char *) chunk->begin();
- guint32 nindex = *((guint32 *)(begin + offset)); /* number of index */
+ guint32 nindex = chunk->get_content<guint32>(offset); /* number of index */
table_offset_t * index = (table_offset_t *)
(begin + offset + sizeof(guint32));
@@ -845,7 +850,7 @@ bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
CASE(14);
CASE(15);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -905,7 +910,7 @@ bool ChewingLengthIndexLevel::store(MemoryChunk * new_chunk,
CASE(14);
CASE(15);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -1029,7 +1034,7 @@ bool ChewingLengthIndexLevel::mask_out(phrase_token_t mask,
CASE(14);
CASE(15);
default:
- assert(false);
+ abort();
}
}
#undef CASE
diff --git a/src/storage/chewing_large_table.h b/src/storage/chewing_large_table.h
index d836769..21158e1 100644
--- a/src/storage/chewing_large_table.h
+++ b/src/storage/chewing_large_table.h
@@ -26,6 +26,7 @@
#include "novel_types.h"
#include "memory_chunk.h"
#include "chewing_key.h"
+#include "pinyin_utils.h"
#include "table_info.h"
namespace pinyin{
diff --git a/src/storage/chewing_large_table2.cpp b/src/storage/chewing_large_table2.cpp
index c8f9b06..7ac1398 100644
--- a/src/storage/chewing_large_table2.cpp
+++ b/src/storage/chewing_large_table2.cpp
@@ -56,7 +56,7 @@ void ChewingLargeTable2::init_entries() {
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
}
@@ -97,7 +97,7 @@ void ChewingLargeTable2::fini_entries() {
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
}
@@ -115,8 +115,13 @@ bool ChewingLargeTable2::load_text(FILE * infile, TABLE_PHONETIC_TYPE type) {
size_t freq;
while (!feof(infile)) {
+#ifdef __APPLE__
+ int num = fscanf(infile, "%255s %255[^ \t] %u %ld",
+ pinyin, phrase, &token, &freq);
+#else
int num = fscanf(infile, "%255s %255s %u %ld",
pinyin, phrase, &token, &freq);
+#endif
if (4 != num)
continue;
@@ -149,7 +154,7 @@ bool ChewingLargeTable2::load_text(FILE * infile, TABLE_PHONETIC_TYPE type) {
};
if (len != keys->len) {
- fprintf(stderr, "ChewingLargeTable::load_text:%s\t%s\t%u\t%ld\n",
+ fprintf(stderr, "ChewingLargeTable2::load_text:%s\t%s\t%u\t%ld\n",
pinyin, phrase, token, freq);
continue;
}
diff --git a/src/storage/chewing_large_table2.h b/src/storage/chewing_large_table2.h
index ebf6114..d37ed7c 100644
--- a/src/storage/chewing_large_table2.h
+++ b/src/storage/chewing_large_table2.h
@@ -38,6 +38,32 @@ namespace pinyin{
class MaskOutVisitor2;
+/* Less-than functor over PinyinIndexItem2<phrase_length>: two items are
+   compared with pinyin_compare_with_tones(), passing the prefix length
+   given at construction instead of the full phrase length. */
+template<int phrase_length>
+class PrefixLessThanWithTones{
+protected:
+    int m_prefix_len;
+
+public:
+    PrefixLessThanWithTones(int prefix_len) {
+        m_prefix_len = prefix_len;
+    }
+
+    ~PrefixLessThanWithTones() {
+        m_prefix_len = 0;
+    }
+
+    /* three-way comparison of the keys of both items. */
+    int prefix_compare_with_tones(const PinyinIndexItem2<phrase_length> &lhs,
+                                  const PinyinIndexItem2<phrase_length> &rhs) {
+        return pinyin_compare_with_tones((ChewingKey *) lhs.m_keys,
+                                         (ChewingKey *) rhs.m_keys,
+                                         m_prefix_len);
+    }
+
+    /* true when lhs sorts before rhs. */
+    bool operator () (const PinyinIndexItem2<phrase_length> &lhs,
+                      const PinyinIndexItem2<phrase_length> &rhs) {
+        return prefix_compare_with_tones(lhs, rhs) < 0;
+    }
+};
+
+
/* As this is a template class, the code will be in the header file. */
template<int phrase_length>
class ChewingTableEntry{
@@ -57,8 +83,8 @@ public:
/* convert method. */
/* compress consecutive tokens */
int convert(const ChewingKey keys[],
- const IndexItem * begin, const IndexItem * end,
- PhraseIndexRanges ranges) const {
+ const IndexItem * begin, const IndexItem * end,
+ PhraseIndexRanges ranges) const {
const IndexItem * iter = NULL;
PhraseIndexRange cursor;
GArray * head, * cursor_head = NULL;
@@ -120,6 +146,58 @@ public:
return convert(keys, range.first, range.second, ranges);
}
+    /* Walk [begin, end) and append the token of every item for which
+       pinyin_compare_with_tones() against prefix_keys returns 0 to the
+       array selected by PHRASE_INDEX_LIBRARY_INDEX(token).
+       Returns SEARCH_OK if at least one token was appended,
+       SEARCH_NONE otherwise. */
+    int convert_suggestion(int prefix_len,
+                           const ChewingKey prefix_keys[],
+                           const IndexItem * begin, const IndexItem * end,
+                           PhraseTokens tokens) const {
+        assert(prefix_len < phrase_length);
+
+        int status = SEARCH_NONE;
+        for (const IndexItem * cur = begin; cur != end; ++cur) {
+            const bool matched = (0 == pinyin_compare_with_tones
+                                  (prefix_keys, cur->m_keys, prefix_len));
+            if (!matched)
+                continue;
+
+            phrase_token_t token = cur->m_token;
+            GArray * bucket = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)];
+            /* skip tokens whose slot in tokens is NULL. */
+            if (NULL == bucket)
+                continue;
+
+            g_array_append_val(bucket, token);
+            status |= SEARCH_OK;
+        }
+
+        return status;
+    }
+
+
+    /* search_suggestion method */
+    /* Build a lookup item from prefix_keys, narrow the chunk to the items
+       that are equal under PrefixLessThanWithTones, then let
+       convert_suggestion() collect the matching tokens. */
+    int search_suggestion(int prefix_len,
+                          /* in */ const ChewingKey prefix_keys[],
+                          /* out */ PhraseTokens tokens) const {
+        /* Usually suggestion candidates will have at least two characters,
+           use PhraseTokens instead of PhraseIndexRanges. */
+        assert(prefix_len < phrase_length);
+
+        /* only the first prefix_len keys of the probe are filled in. */
+        IndexItem probe;
+        if (!contains_incomplete_pinyin(prefix_keys, prefix_len)) {
+            compute_chewing_index(prefix_keys, probe.m_keys, prefix_len);
+        } else {
+            compute_incomplete_chewing_index
+                (prefix_keys, probe.m_keys, prefix_len);
+        }
+
+        const IndexItem * first = (IndexItem *) m_chunk.begin();
+        const IndexItem * last = (IndexItem *) m_chunk.end();
+
+        PrefixLessThanWithTones<phrase_length> prefix_less(prefix_len);
+        std_lite::pair<const IndexItem *, const IndexItem *> matched =
+            std_lite::equal_range(first, last, probe, prefix_less);
+
+        return convert_suggestion(prefix_len, prefix_keys,
+                                  matched.first, matched.second, tokens);
+    }
+
/* add/remove index method */
int add_index(/* in */ const ChewingKey keys[],
/* in */ phrase_token_t token) {
diff --git a/src/storage/chewing_large_table2_bdb.cpp b/src/storage/chewing_large_table2_bdb.cpp
index 6741ffd..a90685e 100644
--- a/src/storage/chewing_large_table2_bdb.cpp
+++ b/src/storage/chewing_large_table2_bdb.cpp
@@ -24,6 +24,31 @@
namespace pinyin{
+/* keep dbm key compare function inside the corresponding dbm file
+   to get more flexibility. */
+
+/* Tell the cursor walk in search_suggestion whether to go on.
+   dbt1 is the prefix key and dbt2 the key just read from the dbm.
+   Returns true only when dbt2 holds more ChewingKeys than dbt1 and
+   pinyin_exact_compare2() reports 0 over the length of dbt1. */
+static bool bdb_chewing_continue_search(const DBT *dbt1,
+                                        const DBT *dbt2) {
+    const int prefix_length = dbt1->size / sizeof(ChewingKey);
+    const int entry_length = dbt2->size / sizeof(ChewingKey);
+
+    /* the key in dbm must be strictly longer than the prefix key. */
+    if (entry_length <= prefix_length)
+        return false;
+
+    ChewingKey * prefix_keys = (ChewingKey *) dbt1->data;
+    ChewingKey * entry_keys = (ChewingKey *) dbt2->data;
+
+    /* continue the longer chewing search only while the prefix matches. */
+    return 0 == pinyin_exact_compare2(prefix_keys, entry_keys, prefix_length);
+}
+
ChewingLargeTable2::ChewingLargeTable2() {
/* create in-memory db. */
m_db = NULL;
@@ -195,7 +220,7 @@ int ChewingLargeTable2::search_internal(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -203,6 +228,64 @@ int ChewingLargeTable2::search_internal(int phrase_length,
return SEARCH_NONE;
}
+/* Point the table entry kept for phrase_length at the record in db_data
+   and run its search_suggestion() on the prefix. */
+template<int phrase_length>
+int ChewingLargeTable2::search_suggestion_internal
+(/* in */ const DBT & db_data,
+ int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const {
+    typedef ChewingTableEntry<phrase_length> Entry;
+
+    Entry * table_entry = (Entry *) g_ptr_array_index(m_entries, phrase_length);
+    assert(NULL != table_entry);
+
+    /* the chunk is set with a NULL free function. */
+    table_entry->m_chunk.set_chunk(db_data.data, db_data.size, NULL);
+
+    const int found =
+        table_entry->search_suggestion(prefix_len, prefix_keys, tokens);
+
+    return found | SEARCH_NONE;
+}
+
+
+/* Run-time dispatcher for the templated search_suggestion_internal<>:
+ maps phrase_length (1..16) onto the matching template instance and
+ returns its result. Any other phrase_length calls abort(). */
+int ChewingLargeTable2::search_suggestion_internal
+(int phrase_length,
+ /* in */ const DBT & db_data,
+ int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const {
+ assert(prefix_len < phrase_length);
+
+/* each CASE returns directly from the function. */
+#define CASE(len) case len: \
+ { \
+ return search_suggestion_internal<len> \
+ (db_data, prefix_len, prefix_keys, tokens); \
+ }
+ switch(phrase_length) {
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ abort();
+ }
+
+#undef CASE
+
+ /* not reached: every switch branch returns or aborts. */
+ return SEARCH_NONE;
+}
template<int phrase_length>
int ChewingLargeTable2::add_index_internal(/* in */ const ChewingKey index[],
@@ -302,7 +385,7 @@ int ChewingLargeTable2::add_index_internal(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -376,7 +459,7 @@ int ChewingLargeTable2::remove_index_internal(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -443,11 +526,14 @@ bool ChewingLargeTable2::mask_out(phrase_token_t mask,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
+ /* Initialize our DBTs. */
+ memset(&db_key, 0, sizeof(DBT));
+ memset(&db_data, 0, sizeof(DBT));
}
assert(ret == DB_NOTFOUND);
@@ -460,4 +546,68 @@ bool ChewingLargeTable2::mask_out(phrase_token_t mask,
return true;
}
+/* search_suggestion method */
+/* Collect tokens of phrases whose keys start with prefix_keys.
+ *
+ * The prefix is converted to its chewing index, positioned with DB_SET,
+ * and the cursor is then advanced with DB_NEXT for as long as
+ * bdb_chewing_continue_search() accepts the key that was read.  Every
+ * accepted record is handed to search_suggestion_internal().
+ *
+ * Returns the OR of the per-record results, or SEARCH_NONE when there is
+ * no database, prefix_len is out of range, or the prefix key is absent.
+ * The cursor is closed on every path after it has been opened.
+ */
+int ChewingLargeTable2::search_suggestion
+(int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const {
+    ChewingKey index[MAX_PHRASE_LENGTH];
+    int result = SEARCH_NONE;
+
+    if (NULL == m_db)
+        return result;
+
+    /* prefix_len entries are written into index[] and used to size the
+       DBT key below; reject values that would overrun the buffer. */
+    if (prefix_len < 0 || prefix_len > (int) MAX_PHRASE_LENGTH)
+        return result;
+
+    if (contains_incomplete_pinyin(prefix_keys, prefix_len))
+        compute_incomplete_chewing_index(prefix_keys, index, prefix_len);
+    else
+        compute_chewing_index(prefix_keys, index, prefix_len);
+
+    DBC * cursorp = NULL;
+    /* Get a cursor */
+    int ret = m_db->cursor(m_db, NULL, &cursorp, 0);
+    if (ret != 0)
+        return result;
+
+    DBT db_key1;
+    memset(&db_key1, 0, sizeof(DBT));
+    db_key1.data = (void *) index;
+    db_key1.size = prefix_len * sizeof(ChewingKey);
+
+    DBT db_data;
+    memset(&db_data, 0, sizeof(DBT));
+    /* Get the prefix entry */
+    ret = cursorp->c_get(cursorp, &db_key1, &db_data, DB_SET);
+    if (ret != 0) {
+        cursorp->c_close(cursorp);
+        return result;
+    }
+
+    /* Get the next entry */
+    DBT db_key2;
+    memset(&db_key2, 0, sizeof(DBT));
+    memset(&db_data, 0, sizeof(DBT));
+    ret = cursorp->c_get(cursorp, &db_key2, &db_data, DB_NEXT);
+    if (ret != 0) {
+        cursorp->c_close(cursorp);
+        return result;
+    }
+
+    while(bdb_chewing_continue_search(&db_key1, &db_key2)) {
+        int phrase_length = db_key2.size / sizeof(ChewingKey);
+        result = search_suggestion_internal
+            (phrase_length, db_data, prefix_len, prefix_keys, tokens) | result;
+
+        /* reset the DBTs before reading the next record. */
+        memset(&db_key2, 0, sizeof(DBT));
+        memset(&db_data, 0, sizeof(DBT));
+        ret = cursorp->c_get(cursorp, &db_key2, &db_data, DB_NEXT);
+        if (ret != 0) {
+            cursorp->c_close(cursorp);
+            return result;
+        }
+    }
+
+    cursorp->c_close(cursorp);
+    return result;
+}
+
};
diff --git a/src/storage/chewing_large_table2_bdb.h b/src/storage/chewing_large_table2_bdb.h
index e12855e..a0da787 100644
--- a/src/storage/chewing_large_table2_bdb.h
+++ b/src/storage/chewing_large_table2_bdb.h
@@ -59,6 +59,18 @@ protected:
/* out */ PhraseIndexRanges ranges) const;
template<int phrase_length>
+ int search_suggestion_internal(/* in */ const DBT & db_data,
+ int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const;
+
+ int search_suggestion_internal(int phrase_length,
+ /* in */ const DBT & db_data,
+ int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const;
+
+ template<int phrase_length>
int add_index_internal(/* in */ const ChewingKey index[],
/* in */ const ChewingKey keys[],
/* in */ phrase_token_t token);
@@ -101,6 +113,11 @@ public:
int search(int phrase_length, /* in */ const ChewingKey keys[],
/* out */ PhraseIndexRanges ranges) const;
+ /* search_suggestion method */
+ int search_suggestion(int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const;
+
/* add/remove index method */
int add_index(int phrase_length, /* in */ const ChewingKey keys[],
/* in */ phrase_token_t token);
diff --git a/src/storage/chewing_large_table2_kyotodb.cpp b/src/storage/chewing_large_table2_kyotodb.cpp
index b2bc9fc..d5aeeb9 100644
--- a/src/storage/chewing_large_table2_kyotodb.cpp
+++ b/src/storage/chewing_large_table2_kyotodb.cpp
@@ -20,17 +20,43 @@
#include "chewing_large_table2.h"
#include <kchashdb.h>
-#include <kcprotodb.h>
+#include <kccachedb.h>
+#include "pinyin_utils.h"
#include "kyotodb_utils.h"
using namespace kyotocabinet;
namespace pinyin{
+/* keep dbm key compare function inside the corresponding dbm file
+ to get more flexibility. */
+
+bool kyotodb_chewing_continue_search(const char* akbuf, size_t aksiz,
+                                     const char* bkbuf, size_t bksiz) {
+    /* key lengths measured in ChewingKey units. */
+    const int lhs_count = aksiz / sizeof(ChewingKey);
+    const int rhs_count = bksiz / sizeof(ChewingKey);
+
+    /* only continue when the second key is strictly longer than the first. */
+    if (rhs_count <= lhs_count)
+        return false;
+
+    ChewingKey * lhs_keys = (ChewingKey *) akbuf;
+    ChewingKey * rhs_keys = (ChewingKey *) bkbuf;
+
+    /* continue the longer chewing search only while the leading
+       lhs_count keys compare equal. */
+    return 0 == pinyin_exact_compare2(lhs_keys, rhs_keys, lhs_count);
+}
+
ChewingLargeTable2::ChewingLargeTable2() {
/* create in-memory db. */
m_db = new ProtoTreeDB;
- assert(m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE));
+ check_result(m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE));
m_entries = NULL;
init_entries();
@@ -76,6 +102,10 @@ bool ChewingLargeTable2::load_db(const char * filename) {
if (!m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE))
return false;
+ if (!m_db->load_snapshot(filename, NULL))
+ return false;
+
+#if 0
/* load db into memory. */
BasicDB * tmp_db = new TreeDB;
if (!tmp_db->open(filename, BasicDB::OREADER))
@@ -86,6 +116,7 @@ bool ChewingLargeTable2::load_db(const char * filename) {
tmp_db->close();
delete tmp_db;
+#endif
return true;
}
@@ -95,6 +126,10 @@ bool ChewingLargeTable2::store_db(const char * new_filename) {
if ( ret != 0 && errno != ENOENT)
return false;
+ if (!m_db->dump_snapshot(new_filename, NULL))
+ return false;
+
+#if 0
BasicDB * tmp_db = new TreeDB;
if (!tmp_db->open(new_filename, BasicDB::OWRITER|BasicDB::OCREATE))
return false;
@@ -105,6 +140,7 @@ bool ChewingLargeTable2::store_db(const char * new_filename) {
tmp_db->synchronize();
tmp_db->close();
delete tmp_db;
+#endif
return true;
}
@@ -134,8 +170,8 @@ int ChewingLargeTable2::search_internal(/* in */ const ChewingKey index[],
entry->m_chunk.set_size(vsiz);
/* m_chunk may re-allocate here. */
char * vbuf = (char *) entry->m_chunk.begin();
- assert(vsiz == m_db->get(kbuf, phrase_length * sizeof(ChewingKey),
- vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, phrase_length * sizeof(ChewingKey),
+ vbuf, vsiz));
result = entry->search(keys, ranges) | result;
@@ -169,7 +205,68 @@ int ChewingLargeTable2::search_internal(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
+ }
+
+#undef CASE
+
+ return SEARCH_NONE;
+}
+
+/* Run the suggestion search for one record of the given phrase length:
+   the entry object cached in m_entries for phrase_length is pointed at the
+   record data in chunk, queried, and then reset to size zero. */
+template<int phrase_length>
+int ChewingLargeTable2::search_suggestion_internal
+(/* in */ const MemoryChunk & chunk,
+ int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const {
+    typedef ChewingTableEntry<phrase_length> Entry;
+
+    Entry * entry = (Entry *) g_ptr_array_index(m_entries, phrase_length);
+    assert(NULL != entry);
+
+    entry->m_chunk.set_chunk(chunk.begin(), chunk.size(), NULL);
+
+    const int found =
+        entry->search_suggestion(prefix_len, prefix_keys, tokens);
+
+    entry->m_chunk.set_size(0);
+
+    return found | SEARCH_NONE;
+}
+
+int ChewingLargeTable2::search_suggestion_internal
+(int phrase_length,
+ /* in */ const MemoryChunk & chunk,
+ int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const {
+
+#define CASE(len) case len: \
+ { \
+ return search_suggestion_internal<len> \
+ (chunk, prefix_len, prefix_keys, tokens); \
+ }
+
+ switch(phrase_length) {
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ abort();
}
#undef CASE
@@ -226,7 +323,7 @@ int ChewingLargeTable2::add_index_internal(/* in */ const ChewingKey index[],
entry->m_chunk.set_size(vsiz);
/* m_chunk may re-allocate here. */
vbuf = (char *) entry->m_chunk.begin();
- assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
int result = entry->add_index(keys, token);
@@ -267,7 +364,7 @@ int ChewingLargeTable2::add_index_internal(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -295,7 +392,7 @@ int ChewingLargeTable2::remove_index_internal(/* in */ const ChewingKey index[],
entry->m_chunk.set_size(vsiz);
/* m_chunk may re-allocate here. */
vbuf = (char *) entry->m_chunk.begin();
- assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
int result = entry->remove_index(keys, token);
if (ERROR_OK != result)
@@ -338,7 +435,7 @@ int ChewingLargeTable2::remove_index_internal(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -399,12 +496,12 @@ public:
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
- assert(false);
+ abort();
return NOP;
}
@@ -423,4 +520,68 @@ bool ChewingLargeTable2::mask_out(phrase_token_t mask,
return true;
}
+/* search_suggestion method */
+/* Kyoto Cabinet flavour: jump a cursor to the record stored under the prefix
+ * index, then visit the records that follow it for as long as
+ * kyotodb_chewing_continue_search() accepts their keys, OR-ing the
+ * per-record search results together.
+ *
+ * @prefix_len: number of keys in prefix_keys.
+ * @prefix_keys: the chewing keys of the prefix.
+ * @tokens: receives the matched phrase tokens.
+ * @returns: the OR of the per-record results, SEARCH_NONE if nothing matched
+ *           or the prefix itself is not stored.
+ *
+ * Cursor::get_key() and Cursor::get_value() return buffers allocated with
+ * new[]; both are released here with delete[] on every path.
+ */
+int ChewingLargeTable2::search_suggestion
+(int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const {
+    ChewingKey index[MAX_PHRASE_LENGTH];
+    int result = SEARCH_NONE;
+
+    if (NULL == m_db)
+        return result;
+
+    if (contains_incomplete_pinyin(prefix_keys, prefix_len))
+        compute_incomplete_chewing_index(prefix_keys, index, prefix_len);
+    else
+        compute_chewing_index(prefix_keys, index, prefix_len);
+
+    const char * akbuf = (char *) index;
+    const size_t aksiz = prefix_len * sizeof(ChewingKey);
+    const int32_t vsiz = m_db->check(akbuf, aksiz);
+    /* -1 on failure. */
+    if (-1 == vsiz)
+        return result;
+
+    BasicDB::Cursor * cursor = m_db->cursor();
+    bool retval = cursor->jump(akbuf, aksiz);
+    if (!retval) {
+        delete cursor;
+        return result;
+    }
+
+    /* Get the next entry */
+    retval = cursor->step();
+    if (!retval) {
+        delete cursor;
+        return result;
+    }
+
+    size_t bksiz = 0;
+    char * bkbuf = cursor->get_key(&bksiz);
+    MemoryChunk chunk;
+    while(kyotodb_chewing_continue_search(akbuf, aksiz, bkbuf, bksiz)) {
+        int phrase_length = bksiz / sizeof(ChewingKey);
+        size_t bvsiz = 0;
+        char * bvbuf = cursor->get_value(&bvsiz);
+        chunk.set_chunk(bvbuf, bvsiz, NULL);
+        result = search_suggestion_internal
+            (phrase_length, chunk, prefix_len, prefix_keys, tokens) | result;
+        chunk.set_size(0);
+        delete [] bvbuf;
+
+        /* the key of the visited record is no longer needed. */
+        delete [] bkbuf;
+        bkbuf = NULL;
+
+        retval = cursor->step();
+        if (!retval) {
+            delete cursor;
+            return result;
+        }
+
+        bksiz = 0;
+        bkbuf = cursor->get_key(&bksiz);
+    }
+
+    /* release the key that ended the loop (delete[] of NULL is a no-op). */
+    delete [] bkbuf;
+    delete cursor;
+    return result;
+}
+
};
diff --git a/src/storage/chewing_large_table2_kyotodb.h b/src/storage/chewing_large_table2_kyotodb.h
index 92f317b..fcfee83 100644
--- a/src/storage/chewing_large_table2_kyotodb.h
+++ b/src/storage/chewing_large_table2_kyotodb.h
@@ -59,6 +59,18 @@ protected:
/* out */ PhraseIndexRanges ranges) const;
template<int phrase_length>
+ int search_suggestion_internal(/* in */ const MemoryChunk & chunk,
+ int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const;
+
+ int search_suggestion_internal(int phrase_length,
+ /* in */ const MemoryChunk & chunk,
+ int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const;
+
+ template<int phrase_length>
int add_index_internal(/* in */ const ChewingKey index[],
/* in */ const ChewingKey keys[],
/* in */ phrase_token_t token);
@@ -100,6 +112,11 @@ public:
int search(int phrase_length, /* in */ const ChewingKey keys[],
/* out */ PhraseIndexRanges ranges) const;
+ /* search_suggestion method */
+ int search_suggestion(int prefix_len,
+ /* in */ const ChewingKey prefix_keys[],
+ /* out */ PhraseTokens tokens) const;
+
/* add/remove index method */
int add_index(int phrase_length, /* in */ const ChewingKey keys[],
/* in */ phrase_token_t token);
diff --git a/src/storage/facade_chewing_table2.h b/src/storage/facade_chewing_table2.h
index 1cf2a1f..0d22b5d 100644
--- a/src/storage/facade_chewing_table2.h
+++ b/src/storage/facade_chewing_table2.h
@@ -129,6 +129,32 @@ public:
}
/**
+ * FacadeChewingTable2::search_suggestion:
+ * @prefix_len: the length of the prefix to be searched.
+ * @prefix_keys: the pinyin key of the prefix to be searched.
+ * @tokens: the array of GArrays to store the matched prefix token.
+ * @returns: the search result of enum SearchResult.
+ *
+ * Search the phrase tokens according to the prefix pinyin keys.
+ *
+ */
+    int search_suggestion(int prefix_len,
+                          /* in */ const ChewingKey prefix_keys[],
+                          /* out */ PhraseTokens tokens) const {
+        /* query the system table first, when present. */
+        const int system_result = (NULL == m_system_chewing_table) ?
+            SEARCH_NONE :
+            m_system_chewing_table->search_suggestion
+            (prefix_len, prefix_keys, tokens);
+
+        /* then the user table, when present. */
+        const int user_result = (NULL == m_user_chewing_table) ?
+            SEARCH_NONE :
+            m_user_chewing_table->search_suggestion
+            (prefix_len, prefix_keys, tokens);
+
+        /* both results are OR-ed into the returned value. */
+        return SEARCH_NONE | system_result | user_result;
+    }
+
+ /**
* FacadeChewingTable2::add_index:
* @phrase_length: the length of the phrase to be added.
* @keys: the pinyin keys of the phrase to be added.
diff --git a/src/storage/facade_phrase_table3.h b/src/storage/facade_phrase_table3.h
index 3f71421..9ad9e85 100644
--- a/src/storage/facade_phrase_table3.h
+++ b/src/storage/facade_phrase_table3.h
@@ -129,6 +129,31 @@ public:
}
/**
+ * FacadePhraseTable3::search_suggestion:
+ * @phrase_length: the length of the prefix to be searched.
+ * @phrase: the ucs4 characters of the prefix to be searched.
+ * @tokens: the GArray of tokens to store the matched phrases.
+ * @returns: the search result of enum SearchResult.
+ *
+ * Search the phrase tokens according to the ucs4 prefix characters.
+ *
+ */
+    int search_suggestion(int phrase_length, /* in */ const ucs4_t phrase[],
+                          /* out */ PhraseTokens tokens) const {
+        /* query the system table first, when present. */
+        const int system_result = (NULL == m_system_phrase_table) ?
+            SEARCH_NONE :
+            m_system_phrase_table->search_suggestion
+            (phrase_length, phrase, tokens);
+
+        /* then the user table, when present. */
+        const int user_result = (NULL == m_user_phrase_table) ?
+            SEARCH_NONE :
+            m_user_phrase_table->search_suggestion
+            (phrase_length, phrase, tokens);
+
+        /* both results are OR-ed into the returned value. */
+        return SEARCH_NONE | system_result | user_result;
+    }
+
+ /**
* FacadePhraseTable3::add_index:
* @phrase_length: the length of the phrase to be added.
* @phrase: the ucs4 characters of the phrase to be added.
diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h
index f6b0161..a80baa3 100644
--- a/src/storage/flexible_ngram.h
+++ b/src/storage/flexible_ngram.h
@@ -28,6 +28,7 @@
* struct MagicHeader, ArrayHeader, ArrayItem.
*/
+#include "pinyin_utils.h"
#include "flexible_single_gram.h"
#ifdef HAVE_BERKELEY_DB
diff --git a/src/storage/flexible_ngram_bdb.h b/src/storage/flexible_ngram_bdb.h
index 786cb76..3453752 100644
--- a/src/storage/flexible_ngram_bdb.h
+++ b/src/storage/flexible_ngram_bdb.h
@@ -105,7 +105,7 @@ public:
int ret = db_create(&m_db, NULL, 0);
if ( ret != 0 )
- assert(false);
+ abort();
ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
if ( ret != 0 && (flags & ATTACH_CREATE) ) {
@@ -270,6 +270,10 @@ public:
}
phrase_token_t * token = (phrase_token_t *) key.data;
g_array_append_val(items, *token);
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
}
if ( ret != DB_NOTFOUND ){
diff --git a/src/storage/flexible_ngram_kyotodb.h b/src/storage/flexible_ngram_kyotodb.h
index ad84b78..b12fa42 100644
--- a/src/storage/flexible_ngram_kyotodb.h
+++ b/src/storage/flexible_ngram_kyotodb.h
@@ -171,7 +171,7 @@ public:
m_chunk.set_size(vsiz);
char * vbuf = (char *) m_chunk.begin();
- assert (vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
if ( memcmp(vbuf, m_magic_number,
sizeof(m_magic_number)) == 0 )
@@ -206,8 +206,8 @@ public:
m_chunk.set_size(vsiz);
char * vbuf = (char *) m_chunk.begin();
- assert (vsiz == m_db->get(kbuf, sizeof(phrase_token_t),
- vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, sizeof(phrase_token_t),
+ vbuf, vsiz));
single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem>
(m_chunk.begin(), vsiz, copy);
@@ -397,7 +397,7 @@ public:
} else { /* found */
m_chunk.set_size(vsiz);
char * vbuf = (char *) m_chunk.begin();
- assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
}
m_chunk.set_content(0, &header, sizeof(ArrayHeader));
diff --git a/src/storage/kyotodb_utils.h b/src/storage/kyotodb_utils.h
index 5e39212..4562007 100644
--- a/src/storage/kyotodb_utils.h
+++ b/src/storage/kyotodb_utils.h
@@ -49,6 +49,7 @@ inline uint32_t attach_options(guint32 flags) {
/* Kyoto Cabinet requires non-NULL pointer for zero length value. */
static const char * empty_vbuf = (char *)UINTPTR_MAX;
+#if 0
class CopyVisitor : public DB::Visitor {
private:
BasicDB * m_db;
@@ -68,6 +69,7 @@ public:
return NOP;
}
};
+#endif
};
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
index b8347d9..e4bfe8f 100644
--- a/src/storage/ngram.cpp
+++ b/src/storage/ngram.cpp
@@ -46,14 +46,12 @@ SingleGram::SingleGram(void * buffer, size_t length, bool copy){
}
bool SingleGram::get_total_freq(guint32 & total) const{
- char * buf_begin = (char *)m_chunk.begin();
- total = *((guint32 *)buf_begin);
+ total = m_chunk.get_content<guint32>(0);
return true;
}
bool SingleGram::set_total_freq(guint32 total){
- char * buf_begin = (char *)m_chunk.begin();
- *((guint32 *)buf_begin) = total;
+ m_chunk.set_content<guint32>(0, total);
return true;
}
@@ -68,7 +66,7 @@ guint32 SingleGram::get_length(){
if (0 == length) {
/* no items here, total freq should be zero. */
guint32 total_freq = 0;
- assert(get_total_freq(total_freq));
+ check_result(get_total_freq(total_freq));
assert(0 == total_freq);
}
@@ -79,7 +77,7 @@ guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
guint32 removed_items = 0;
guint32 total_freq = 0;
- assert(get_total_freq(total_freq));
+ check_result(get_total_freq(total_freq));
const SingleGramItem * begin = (const SingleGramItem *)
((const char *)(m_chunk.begin()) + sizeof(guint32));
@@ -100,12 +98,12 @@ guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
--cur;
}
- assert(set_total_freq(total_freq));
+ check_result(set_total_freq(total_freq));
return removed_items;
}
bool SingleGram::prune(){
- assert(false);
+ abort();
#if 0
SingleGramItem * begin = (SingleGramItem *)
((const char *)(m_chunk.begin()) + sizeof(guint32));
@@ -122,8 +120,8 @@ bool SingleGram::prune(){
}
}
guint32 total_freq;
- assert(get_total_freq(total_freq));
- assert(set_total_freq(total_freq - nitem));
+ check_result(get_total_freq(total_freq));
+ check_result(set_total_freq(total_freq - nitem));
#endif
return true;
}
@@ -140,7 +138,7 @@ bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
guint32 total_freq;
BigramPhraseItemWithCount bigram_item_with_count;
- assert(get_total_freq(total_freq));
+ check_result(get_total_freq(total_freq));
for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
bigram_item_with_count.m_token = cur_item->m_token;
@@ -164,14 +162,14 @@ bool SingleGram::search(/* in */ PhraseIndexRange * range,
guint32 total_freq;
BigramPhraseItem bigram_item;
- assert(get_total_freq(total_freq));
+ check_result(get_total_freq(total_freq));
for ( ; cur_item != end; ++cur_item){
- if ( cur_item->m_token >= range->m_range_end )
- break;
- bigram_item.m_token = cur_item->m_token;
- bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
- g_array_append_val(array, bigram_item);
+ if ( cur_item->m_token >= range->m_range_end )
+ break;
+ bigram_item.m_token = cur_item->m_token;
+ bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
+ g_array_append_val(array, bigram_item);
}
return true;
@@ -283,15 +281,17 @@ bool merge_single_gram(SingleGram * merged, const SingleGram * system,
MemoryChunk & merged_chunk = merged->m_chunk;
+ merged_chunk.set_size(0);
+
if (NULL == system) {
- merged_chunk.set_chunk(user->m_chunk.begin(),
- user->m_chunk.size(), NULL);
+ merged_chunk.set_content(0, user->m_chunk.begin(),
+ user->m_chunk.size());
return true;
}
if (NULL == user) {
- merged_chunk.set_chunk(system->m_chunk.begin(),
- system->m_chunk.size(), NULL);
+ merged_chunk.set_content(0, system->m_chunk.begin(),
+ system->m_chunk.size());
return true;
}
@@ -300,8 +300,8 @@ bool merge_single_gram(SingleGram * merged, const SingleGram * system,
/* merge the origin info and delta info */
guint32 system_total, user_total;
- assert(system->get_total_freq(system_total));
- assert(user->get_total_freq(user_total));
+ check_result(system->get_total_freq(system_total));
+ check_result(user->get_total_freq(user_total));
const guint32 merged_total = system_total + user_total;
merged_chunk.set_content(0, &merged_total, sizeof(guint32));
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
index 7f7a653..57979b4 100644
--- a/src/storage/ngram.h
+++ b/src/storage/ngram.h
@@ -24,6 +24,7 @@
#include "config.h"
#include <glib.h>
#include "novel_types.h"
+#include "pinyin_utils.h"
#ifdef HAVE_BERKELEY_DB
#include "ngram_bdb.h"
diff --git a/src/storage/ngram_bdb.cpp b/src/storage/ngram_bdb.cpp
index a13b431..8910f9d 100644
--- a/src/storage/ngram_bdb.cpp
+++ b/src/storage/ngram_bdb.cpp
@@ -199,6 +199,10 @@ bool Bigram::get_all_items(GArray * items){
assert(key.size == sizeof(phrase_token_t));
phrase_token_t * token = (phrase_token_t *)key.data;
g_array_append_val(items, *token);
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
}
assert (ret == DB_NOTFOUND);
@@ -223,12 +227,12 @@ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
phrase_token_t index = g_array_index(items, phrase_token_t, i);
if ((index & mask) == value) {
- assert(remove(index));
+ check_result(remove(index));
continue;
}
SingleGram * gram = NULL;
- assert(load(index, gram));
+ check_result(load(index, gram));
int num = gram->mask_out(mask, value);
if (0 == num) {
@@ -237,9 +241,9 @@ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
}
if (0 == gram->get_length()) {
- assert(remove(index));
+ check_result(remove(index));
} else {
- assert(store(index, gram));
+ check_result(store(index, gram));
}
delete gram;
diff --git a/src/storage/ngram_kyotodb.cpp b/src/storage/ngram_kyotodb.cpp
index 98f2f59..560e196 100644
--- a/src/storage/ngram_kyotodb.cpp
+++ b/src/storage/ngram_kyotodb.cpp
@@ -22,7 +22,7 @@
#include <assert.h>
#include <errno.h>
#include <kchashdb.h>
-#include <kcprotodb.h>
+#include <kcstashdb.h>
#include "kyotodb_utils.h"
@@ -50,16 +50,20 @@ void Bigram::reset(){
}
-/* Use ProtoHashDB for load_db/save_db methods. */
+/* Use StashDB for load_db/save_db methods. */
bool Bigram::load_db(const char * dbfile){
reset();
/* create in-memory db. */
- m_db = new ProtoHashDB;
+ m_db = new StashDB;
if ( !m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE) )
return false;
+ if (!m_db->load_snapshot(dbfile, NULL))
+ return false;
+
+#if 0
/* load db into memory. */
BasicDB * tmp_db = new HashDB;
if (!tmp_db->open(dbfile, BasicDB::OREADER))
@@ -70,6 +74,7 @@ bool Bigram::load_db(const char * dbfile){
tmp_db->close();
delete tmp_db;
+#endif
return true;
}
@@ -80,6 +85,10 @@ bool Bigram::save_db(const char * dbfile){
if ( ret != 0 && errno != ENOENT)
return false;
+ if (!m_db->dump_snapshot(dbfile, NULL))
+ return false;
+
+#if 0
BasicDB * tmp_db = new HashDB;
if ( !tmp_db->open(dbfile, BasicDB::OWRITER|BasicDB::OCREATE) )
@@ -91,6 +100,7 @@ bool Bigram::save_db(const char * dbfile){
tmp_db->synchronize();
tmp_db->close();
delete tmp_db;
+#endif
return true;
}
@@ -123,7 +133,7 @@ bool Bigram::load(phrase_token_t index, SingleGram * & single_gram,
m_chunk.set_size(vsiz);
char * vbuf = (char *) m_chunk.begin();
- assert (vsiz == m_db->get(kbuf, sizeof(phrase_token_t),
+ check_result (vsiz == m_db->get(kbuf, sizeof(phrase_token_t),
vbuf, vsiz));
single_gram = new SingleGram(m_chunk.begin(), vsiz, copy);
@@ -196,12 +206,12 @@ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
phrase_token_t index = g_array_index(items, phrase_token_t, i);
if ((index & mask) == value) {
- assert(remove(index));
+ check_result(remove(index));
continue;
}
SingleGram * gram = NULL;
- assert(load(index, gram));
+ check_result(load(index, gram));
int num = gram->mask_out(mask, value);
if (0 == num) {
@@ -210,9 +220,9 @@ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
}
if (0 == gram->get_length()) {
- assert(remove(index));
+ check_result(remove(index));
} else {
- assert(store(index, gram));
+ check_result(store(index, gram));
}
delete gram;
diff --git a/src/storage/phonetic_key_matrix.cpp b/src/storage/phonetic_key_matrix.cpp
index 058c2e4..ab7e879 100644
--- a/src/storage/phonetic_key_matrix.cpp
+++ b/src/storage/phonetic_key_matrix.cpp
@@ -437,6 +437,100 @@ int search_matrix(const FacadeChewingTable2 * table,
return result;
}
+/* Recursive helper of search_suggestion_with_matrix.
+ Walks the matrix from column 'start' towards 'end': every non-zero key
+ found in the current column is appended to cached_keys, the walk continues
+ from that key's m_raw_end, and the key is removed again afterwards.
+ When 'start' reaches 'end' the accumulated keys are passed to
+ table->search_suggestion. Returns the OR of all search results,
+ or SEARCH_NONE when nothing was searched. */
+int search_suggestion_with_matrix_recur(GArray * cached_keys,
+ const FacadeChewingTable2 * table,
+ const PhoneticKeyMatrix * matrix,
+ size_t prefix_len,
+ size_t start, size_t end,
+ PhraseTokens tokens) {
+ /* walked past the end column: nothing to search. */
+ if (start > end)
+ return SEARCH_NONE;
+
+ /* only do chewing table search with 'start' and 'end'. */
+ if (start == end) {
+ /* exceed the maximum phrase length. */
+ if (cached_keys->len > MAX_PHRASE_LENGTH)
+ return SEARCH_NONE;
+
+ /* skip when more than prefix_len * 2 keys are cached,
+ i.e. a length of prefix_len * 2 + 1 or above;
+ use the m_parsed_key_len variable for the prefix_len. */
+ if (cached_keys->len > prefix_len * 2)
+ return SEARCH_NONE;
+
+ /* only "'" here. */
+ if (0 == cached_keys->len)
+ return SEARCH_NONE;
+
+#if 0
+ printf("search table for suggestion candidate:%d\n", cached_keys->len);
+#endif
+ return table->search_suggestion
+ (cached_keys->len, (ChewingKey *)cached_keys->data, tokens);
+ }
+
+ int result = SEARCH_NONE;
+
+ const size_t size = matrix->get_column_size(start);
+ /* assume pinyin parsers will filter invalid keys. */
+ assert(size > 0);
+
+ for (size_t i = 0; i < size; ++i) {
+ ChewingKey key; ChewingKeyRest key_rest;
+ matrix->get_item(start, i, key, key_rest);
+
+ /* the next column to visit is where this key ends. */
+ const size_t newstart = key_rest.m_raw_end;
+
+ /* a default-constructed key: it is skipped without being cached. */
+ const ChewingKey zero_key;
+ if (zero_key == key) {
+ /* assume only one key here for "'" or the last key. */
+ assert(1 == size);
+ return search_suggestion_with_matrix_recur
+ (cached_keys, table, matrix, prefix_len, newstart, end, tokens);
+ }
+
+ /* push value */
+ g_array_append_val(cached_keys, key);
+
+ result |= search_suggestion_with_matrix_recur
+ (cached_keys, table, matrix, prefix_len, newstart, end, tokens);
+
+ /* pop value */
+ g_array_set_size(cached_keys, cached_keys->len - 1);
+ }
+
+ return result;
+}
+
+/* Search suggestion candidates for the keys stored in the whole matrix.
+ *
+ * @table: the chewing table facade to query.
+ * @matrix: the phonetic key matrix; searched from column 0 to its last column.
+ * @prefix_len: forwarded to the recursive search to limit the number of keys.
+ * @tokens: receives the matched phrase tokens.
+ * @returns: the OR of all search results, or SEARCH_NONE when the search is
+ *           skipped (prefix too long, empty matrix, or an empty first/last
+ *           column).
+ */
+int search_suggestion_with_matrix(const FacadeChewingTable2 * table,
+                                  const PhoneticKeyMatrix * matrix,
+                                  size_t prefix_len,
+                                  PhraseTokens tokens) {
+    int result = SEARCH_NONE;
+
+    /* skip the prefix phrase is equal or longer than MAX_PHRASE_LENGTH,
+       as the prefix phrase candidate will always longer than prefix_len. */
+    if (prefix_len >= MAX_PHRASE_LENGTH)
+        return result;
+
+    /* an empty matrix has no columns to search; without this guard
+       "size() - 1" would wrap around to a huge column index. */
+    const size_t ncolumns = matrix->size();
+    if (0 == ncolumns)
+        return result;
+
+    size_t start = 0, end = ncolumns - 1;
+
+    const size_t start_len = matrix->get_column_size(start);
+    if (0 == start_len)
+        return result;
+
+    const size_t end_len = matrix->get_column_size(end);
+    if (0 == end_len)
+        return result;
+
+    /* scratch array holding the keys of the path currently explored. */
+    GArray * cached_keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey));
+
+    result = search_suggestion_with_matrix_recur
+        (cached_keys, table, matrix, prefix_len, start, end, tokens);
+
+    g_array_free(cached_keys, TRUE);
+    return result;
+}
+
gfloat compute_pronunciation_possibility_recur(const PhoneticKeyMatrix * matrix,
size_t start, size_t end,
GArray * cached_keys,
diff --git a/src/storage/phonetic_key_matrix.h b/src/storage/phonetic_key_matrix.h
index ca7aa84..0b80a96 100644
--- a/src/storage/phonetic_key_matrix.h
+++ b/src/storage/phonetic_key_matrix.h
@@ -212,6 +212,11 @@ int search_matrix(const FacadeChewingTable2 * table,
size_t start, size_t end,
PhraseIndexRanges ranges);
+int search_suggestion_with_matrix(const FacadeChewingTable2 * table,
+ const PhoneticKeyMatrix * matrix,
+ size_t prefix_len,
+ PhraseTokens tokens);
+
gfloat compute_pronunciation_possibility(const PhoneticKeyMatrix * matrix,
size_t start, size_t end,
GArray * cached_keys,
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp
index 06b613f..bb98251 100644
--- a/src/storage/phrase_index.cpp
+++ b/src/storage/phrase_index.cpp
@@ -20,6 +20,7 @@
#include "phrase_index.h"
#include "pinyin_custom2.h"
+#include "unaligned_memory.h"
namespace pinyin{
@@ -61,10 +62,12 @@ bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
for (int i = 0; i < npron; ++i) {
char * chewing_begin = buf_begin + offset +
i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
- guint32 * freq = (guint32 *)(chewing_begin +
- phrase_length * sizeof(ChewingKey));
- total_freq += *freq;
+ guint32 * pfreq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+ guint32 freq = UnalignedMemory<guint32>::load(pfreq);
+
+ total_freq += freq;
if (0 == pinyin_exact_compare2
(keys, (ChewingKey *)chewing_begin, phrase_length)) {
@@ -74,8 +77,9 @@ bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
if (delta > 0 && total_freq > total_freq + delta)
return false;
- *freq += delta;
+ freq += delta;
total_freq += delta;
+ UnalignedMemory<guint32>::store(freq, pfreq);
return true;
}
}
@@ -117,9 +121,11 @@ void PhraseItem::increase_pronunciation_possibility(ChewingKey * keys,
for (int i = 0; i < npron; ++i) {
char * chewing_begin = buf_begin + offset +
i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
- guint32 * freq = (guint32 *)(chewing_begin +
- phrase_length * sizeof(ChewingKey));
- total_freq += *freq;
+
+ guint32 * pfreq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+ guint32 freq = UnalignedMemory<guint32>::load(pfreq);
+ total_freq += freq;
if (0 == pinyin_compare_with_tones(keys, (ChewingKey *)chewing_begin,
phrase_length)) {
@@ -128,8 +134,9 @@ void PhraseItem::increase_pronunciation_possibility(ChewingKey * keys,
if (delta > 0 && total_freq > total_freq + delta)
return;
- *freq += delta;
+ freq += delta;
total_freq += delta;
+ UnalignedMemory<guint32>::store(freq, pfreq);
}
}
}
@@ -505,7 +512,7 @@ bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
break;
}
default:
- assert(false);
+ abort();
}
}
return true;
@@ -527,8 +534,13 @@ bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile,
phrase_token_t cur_token = 0;
while (!feof(infile)){
+#ifdef __APPLE__
+ int num = fscanf(infile, "%255s %255[^ \t] %u %ld",
+ pinyin, phrase, &token, &freq);
+#else
int num = fscanf(infile, "%255s %255s %u %ld",
pinyin, phrase, &token, &freq);
+#endif
if (4 != num)
continue;
@@ -796,7 +808,7 @@ bool _compute_new_header(PhraseIndexLogger * logger,
break;
}
default:
- assert(false);
+ abort();
}
}
diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h
index 83dfb51..f97ac65 100644
--- a/src/storage/phrase_index.h
+++ b/src/storage/phrase_index.h
@@ -31,6 +31,7 @@
#include "memory_chunk.h"
#include "phrase_index_logger.h"
#include "table_info.h"
+#include "unaligned_memory.h"
/**
* Phrase Index File Format
@@ -121,8 +122,7 @@ public:
*
*/
guint32 get_unigram_frequency(){
- char * buf_begin = (char *)m_chunk.begin();
- return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
+ return m_chunk.get_content<guint32>(sizeof(guint8) + sizeof(guint8));
}
/**
@@ -142,12 +142,13 @@ public:
for ( int i = 0 ; i < npron ; ++i){
char * chewing_begin = buf_begin + offset +
i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
- guint32 * freq = (guint32 *)(chewing_begin +
- phrase_length * sizeof(ChewingKey));
- total_freq += *freq;
+
+ guint32 freq = UnalignedMemory<guint32>::load(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+ total_freq += freq;
if ( 0 == pinyin_compare_with_tones(keys, (ChewingKey *)chewing_begin,
phrase_length) ){
- matched += *freq;
+ matched += freq;
}
}
diff --git a/src/storage/phrase_index_logger.h b/src/storage/phrase_index_logger.h
index cffd937..d1e42b6 100644
--- a/src/storage/phrase_index_logger.h
+++ b/src/storage/phrase_index_logger.h
@@ -294,7 +294,7 @@ public:
break;
}
default:
- assert(false);
+ abort();
}
/* store log record. */
diff --git a/src/storage/phrase_large_table2.cpp b/src/storage/phrase_large_table2.cpp
index 38dafb3..28eb313 100644
--- a/src/storage/phrase_large_table2.cpp
+++ b/src/storage/phrase_large_table2.cpp
@@ -184,7 +184,7 @@ PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
}
g_array_free(m_phrase_array_indexes, TRUE);
@@ -228,7 +228,7 @@ int PhraseLengthIndexLevel2::search(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
}
@@ -348,7 +348,7 @@ int PhraseLengthIndexLevel2::add_index(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
@@ -402,7 +402,7 @@ int PhraseLengthIndexLevel2::remove_index(int phrase_length,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
}
@@ -472,8 +472,13 @@ bool PhraseLargeTable2::load_text(FILE * infile){
size_t freq;
while (!feof(infile)) {
+#ifdef __APPLE__
+ int num = fscanf(infile, "%255s %255[^ \t] %u %ld",
+ pinyin, phrase, &token, &freq);
+#else
int num = fscanf(infile, "%255s %255s %u %ld",
pinyin, phrase, &token, &freq);
+#endif
if (4 != num)
continue;
@@ -556,7 +561,8 @@ bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk,
table_offset_t offset,
table_offset_t end) {
char * buf_begin = (char *) chunk->begin();
- guint32 nindex = *((guint32 *)(buf_begin + offset));
+ guint32 nindex = chunk->get_content<guint32>(offset);
+
table_offset_t * index = (table_offset_t *)
(buf_begin + offset + sizeof(guint32));
@@ -600,7 +606,7 @@ bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
#undef CASE
}
@@ -656,7 +662,7 @@ bool PhraseLengthIndexLevel2::store(MemoryChunk * new_chunk,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
//add '#'
new_chunk->set_content(offset, &c_separate, sizeof(char));
@@ -776,7 +782,7 @@ bool PhraseLengthIndexLevel2::mask_out(phrase_token_t mask,
CASE(15);
CASE(16);
default:
- assert(false);
+ abort();
}
}
/* shrink self array. */
diff --git a/src/storage/phrase_large_table3.cpp b/src/storage/phrase_large_table3.cpp
index 696c612..1e8fd76 100644
--- a/src/storage/phrase_large_table3.cpp
+++ b/src/storage/phrase_large_table3.cpp
@@ -128,8 +128,13 @@ bool PhraseLargeTable3::load_text(FILE * infile){
size_t freq;
while (!feof(infile)) {
+#ifdef __APPLE__
+ int num = fscanf(infile, "%255s %255[^ \t] %u %ld",
+ pinyin, phrase, &token, &freq);
+#else
int num = fscanf(infile, "%255s %255s %u %ld",
pinyin, phrase, &token, &freq);
+#endif
if (4 != num)
continue;
diff --git a/src/storage/phrase_large_table3.h b/src/storage/phrase_large_table3.h
index 2774767..b4da01d 100644
--- a/src/storage/phrase_large_table3.h
+++ b/src/storage/phrase_large_table3.h
@@ -24,6 +24,7 @@
#include <stdio.h>
#include "novel_types.h"
#include "memory_chunk.h"
+#include "pinyin_utils.h"
#ifdef HAVE_BERKELEY_DB
#include "phrase_large_table3_bdb.h"
@@ -68,7 +69,8 @@ public:
static inline int reduce_tokens(const PhraseTokens tokens,
- TokenVector tokenarray) {
+ TokenVector tokenarray,
+ bool validate = true) {
int num = 0;
g_array_set_size(tokenarray, 0);
@@ -82,8 +84,9 @@ static inline int reduce_tokens(const PhraseTokens tokens,
g_array_append_vals(tokenarray, array->data, array->len);
}
- /* the following line will be removed in future after code are verified. */
- assert(0 <= num && num <= 4);
+ /* the following lines will be removed in future after code are verified. */
+ if (validate)
+ assert(0 <= num && num <= 4);
return num;
}
diff --git a/src/storage/phrase_large_table3_bdb.cpp b/src/storage/phrase_large_table3_bdb.cpp
index 03632ae..9074170 100644
--- a/src/storage/phrase_large_table3_bdb.cpp
+++ b/src/storage/phrase_large_table3_bdb.cpp
@@ -24,6 +24,44 @@
namespace pinyin{
+/* compare the first phrase_length characters of two phrase keys:
+   returns zero when all of them match, otherwise the difference of
+   the first mismatching pair.
+   keep this function synced between dbm implementations
+   for consistent phrase key compare. */
+
+inline int compare_phrase(ucs4_t * lhs, ucs4_t * rhs, int phrase_length) {
+    for (int pos = 0; pos < phrase_length; ++pos) {
+        const int diff = lhs[pos] - rhs[pos];
+        if (diff)
+            return diff;
+    }
+
+    return 0;
+}
+
+/* keep dbm key compare function inside the corresponding dbm file
+   to get more flexibility. */
+
+/* dbt1 holds the key from the application (the prefix phrase),
+   dbt2 holds the key read from the dbm.
+   returns true only when the dbm key is strictly longer than the
+   application key and starts with it. */
+static bool bdb_phrase_continue_search(const DBT *dbt1,
+                                       const DBT *dbt2) {
+    const int prefix_length = dbt1->size / sizeof(ucs4_t);
+    const int stored_length = dbt2->size / sizeof(ucs4_t);
+
+    /* stop unless the key in dbm is longer than the key in application. */
+    if (stored_length <= prefix_length)
+        return false;
+
+    ucs4_t * prefix = (ucs4_t *) dbt1->data;
+    ucs4_t * stored = (ucs4_t *) dbt2->data;
+
+    /* continue the longer phrase search only while the prefix matches. */
+    return 0 == compare_phrase (prefix, stored, prefix_length);
+}
+
+
PhraseLargeTable3::PhraseLargeTable3() {
/* create in-memory db. */
m_db = NULL;
@@ -169,6 +207,64 @@ int PhraseLargeTable3::search(int phrase_length,
return result;
}
+/* collect into tokens the entries whose keys follow the given phrase
+   in cursor order and extend it as a prefix; the entry stored under
+   the phrase itself is only used to position the cursor.
+   returns the OR of the per-entry search results, or SEARCH_NONE. */
+int PhraseLargeTable3::search_suggestion(int phrase_length,
+                                         /* in */ const ucs4_t phrase[],
+                                         /* out */ PhraseTokens tokens) const {
+    int result = SEARCH_NONE;
+
+    if (NULL == m_db)
+        return result;
+    assert(NULL != m_entry);
+
+    /* Get a cursor */
+    DBC * cursorp = NULL;
+    if (0 != m_db->cursor(m_db, NULL, &cursorp, 0))
+        return result;
+
+    /* The phrase from the application is the prefix key. */
+    DBT prefix_key;
+    memset(&prefix_key, 0, sizeof(DBT));
+    prefix_key.data = (void *) phrase;
+    prefix_key.size = phrase_length * sizeof(ucs4_t);
+
+    DBT next_key;
+    DBT next_data;
+    memset(&next_data, 0, sizeof(DBT));
+
+    /* Get the prefix entry */
+    int ret = cursorp->c_get(cursorp, &prefix_key, &next_data, DB_SET);
+
+    while (0 == ret) {
+        /* Get the next entry */
+        memset(&next_key, 0, sizeof(DBT));
+        memset(&next_data, 0, sizeof(DBT));
+        ret = cursorp->c_get(cursorp, &next_key, &next_data, DB_NEXT);
+        if (0 != ret)
+            break;
+
+        /* stop at the first key which no longer extends the prefix. */
+        if (!bdb_phrase_continue_search(&prefix_key, &next_key))
+            break;
+
+        m_entry->m_chunk.set_chunk(next_data.data, next_data.size, NULL);
+        result |= m_entry->search(tokens);
+        m_entry->m_chunk.set_size(0);
+    }
+
+    /* single exit: the cursor is closed on every path past its creation. */
+    cursorp->c_close(cursorp);
+    return result;
+}
+
/* add_index/remove_index method */
int PhraseLargeTable3::add_index(int phrase_length,
/* in */ const ucs4_t phrase[],
@@ -302,6 +398,10 @@ bool PhraseLargeTable3::mask_out(phrase_token_t mask,
db_data.size = entry.m_chunk.size();
int ret = cursorp->put(cursorp, &db_key, &db_data, DB_CURRENT);
assert(ret == 0);
+
+ /* Initialize our DBTs. */
+ memset(&db_key, 0, sizeof(DBT));
+ memset(&db_data, 0, sizeof(DBT));
}
assert(ret == DB_NOTFOUND);
diff --git a/src/storage/phrase_large_table3_bdb.h b/src/storage/phrase_large_table3_bdb.h
index 73f7625..da8f199 100644
--- a/src/storage/phrase_large_table3_bdb.h
+++ b/src/storage/phrase_large_table3_bdb.h
@@ -58,6 +58,8 @@ public:
/* search method */
int search(int phrase_length, /* in */ const ucs4_t phrase[],
/* out */ PhraseTokens tokens) const;
+ int search_suggestion(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* out */ PhraseTokens tokens) const;
/* add_index/remove_index method */
int add_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token);
diff --git a/src/storage/phrase_large_table3_kyotodb.cpp b/src/storage/phrase_large_table3_kyotodb.cpp
index 529aa65..f4b6faa 100644
--- a/src/storage/phrase_large_table3_kyotodb.cpp
+++ b/src/storage/phrase_large_table3_kyotodb.cpp
@@ -20,7 +20,7 @@
#include "phrase_large_table3.h"
#include <kchashdb.h>
-#include <kcprotodb.h>
+#include <kccachedb.h>
#include "kyotodb_utils.h"
@@ -28,10 +28,47 @@ using namespace kyotocabinet;
namespace pinyin{
+/* compare the first phrase_length characters of two phrase keys:
+   returns zero when all of them match, otherwise the difference of
+   the first mismatching pair.
+   keep this function synced between dbm implementations
+   for consistent phrase key compare. */
+inline int compare_phrase(ucs4_t * lhs, ucs4_t * rhs, int phrase_length) {
+    for (int pos = 0; pos < phrase_length; ++pos) {
+        const int diff = lhs[pos] - rhs[pos];
+        if (diff)
+            return diff;
+    }
+
+    return 0;
+}
+
+/* keep dbm key compare function inside the corresponding dbm file
+   to get more flexibility. */
+
+/* akbuf/aksiz hold the key from the application (the prefix phrase),
+   bkbuf/bksiz hold the key read from the dbm.
+   returns true only when the dbm key is strictly longer than the
+   application key and starts with it. */
+bool kyotodb_phrase_continue_search(const char* akbuf, size_t aksiz,
+                                    const char* bkbuf, size_t bksiz) {
+    const int prefix_length = aksiz / sizeof(ucs4_t);
+    const int stored_length = bksiz / sizeof(ucs4_t);
+
+    /* stop unless the key in dbm is longer than the key in application. */
+    if (stored_length <= prefix_length)
+        return false;
+
+    ucs4_t * prefix = (ucs4_t *) akbuf;
+    ucs4_t * stored = (ucs4_t *) bkbuf;
+
+    /* continue the longer phrase search only while the prefix matches. */
+    return 0 == compare_phrase (prefix, stored, prefix_length);
+}
+
+
PhraseLargeTable3::PhraseLargeTable3() {
/* create in-memory db. */
m_db = new ProtoTreeDB;
- assert(m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE));
+ check_result(m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE));
m_entry = new PhraseTableEntry;
}
@@ -80,6 +117,10 @@ bool PhraseLargeTable3::load_db(const char * filename) {
if (!m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE))
return false;
+ if (!m_db->load_snapshot(filename, NULL))
+ return false;
+
+#if 0
/* load db into memory. */
BasicDB * tmp_db = new TreeDB;
if (!tmp_db->open(filename, BasicDB::OREADER))
@@ -90,6 +131,7 @@ bool PhraseLargeTable3::load_db(const char * filename) {
tmp_db->close();
delete tmp_db;
+#endif
return true;
}
@@ -99,6 +141,10 @@ bool PhraseLargeTable3::store_db(const char * new_filename){
if ( ret != 0 && errno != ENOENT)
return false;
+ if (!m_db->dump_snapshot(new_filename, NULL))
+ return false;
+
+#if 0
BasicDB * tmp_db = new TreeDB;
if (!tmp_db->open(new_filename, BasicDB::OWRITER|BasicDB::OCREATE))
return false;
@@ -109,6 +155,7 @@ bool PhraseLargeTable3::store_db(const char * new_filename){
tmp_db->synchronize();
tmp_db->close();
delete tmp_db;
+#endif
return true;
}
@@ -137,14 +184,68 @@ int PhraseLargeTable3::search(int phrase_length,
m_entry->m_chunk.set_size(vsiz);
/* m_chunk may re-allocate here. */
char * vbuf = (char *) m_entry->m_chunk.begin();
- assert (vsiz == m_db->get(kbuf, phrase_length * sizeof(ucs4_t),
- vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, phrase_length * sizeof(ucs4_t),
+ vbuf, vsiz));
result = m_entry->search(tokens) | result;
return result;
}
+/* collect into tokens the entries whose keys follow the given phrase
+   in cursor order and extend it as a prefix; the entry stored under
+   the phrase itself is only used to position the cursor.
+   returns the OR of the per-entry search results, or SEARCH_NONE when
+   the db is not opened, the phrase is not stored, or nothing matches. */
+int PhraseLargeTable3::search_suggestion(int phrase_length,
+                                         /* in */ const ucs4_t phrase[],
+                                         /* out */ PhraseTokens tokens) const {
+    int result = SEARCH_NONE;
+
+    if (NULL == m_db)
+        return result;
+    assert(NULL != m_entry);
+
+    const char * akbuf = (char *) phrase;
+    const size_t aksiz = phrase_length * sizeof(ucs4_t);
+    const int32_t vsiz = m_db->check(akbuf, aksiz);
+    /* -1 on failure. */
+    if (-1 == vsiz)
+        return result;
+
+    BasicDB::Cursor * cursor = m_db->cursor();
+    bool retval = cursor->jump(akbuf, aksiz);
+    if (!retval) {
+        delete cursor;
+        return result;
+    }
+
+    /* Get the next entry */
+    retval = cursor->step();
+    while (retval) {
+        size_t bksiz = 0;
+        const char * bkbuf = cursor->get_key(&bksiz);
+        const bool matched =
+            kyotodb_phrase_continue_search(akbuf, aksiz, bkbuf, bksiz);
+        /* get_key() hands over a new[] buffer: release it on every pass,
+           it was leaked before. */
+        delete [] bkbuf;
+        if (!matched)
+            break;
+
+        size_t bvsiz = 0;
+        char * bvbuf = cursor->get_value(&bvsiz);
+        m_entry->m_chunk.set_chunk(bvbuf, bvsiz, NULL);
+        result = m_entry->search(tokens) | result;
+        /* detach the borrowed buffer from m_chunk before freeing it. */
+        m_entry->m_chunk.set_size(0);
+        delete [] bvbuf;
+
+        retval = cursor->step();
+    }
+
+    /* single exit: the cursor is released on every path past the jump. */
+    delete cursor;
+    return result;
+}
+
/* add_index/remove_index method */
int PhraseLargeTable3::add_index(int phrase_length,
/* in */ const ucs4_t phrase[],
@@ -192,7 +293,7 @@ int PhraseLargeTable3::add_index(int phrase_length,
m_entry->m_chunk.set_size(vsiz);
/* m_chunk may re-allocate here. */
vbuf = (char *) m_entry->m_chunk.begin();
- assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
int result = m_entry->add_index(token);
@@ -223,7 +324,7 @@ int PhraseLargeTable3::remove_index(int phrase_length,
m_entry->m_chunk.set_size(vsiz);
/* m_chunk may re-allocate here. */
vbuf = (char *) m_entry->m_chunk.begin();
- assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
+ check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz));
int result = m_entry->remove_index(token);
if (ERROR_OK != result)
diff --git a/src/storage/phrase_large_table3_kyotodb.h b/src/storage/phrase_large_table3_kyotodb.h
index d122de0..c7f3b87 100644
--- a/src/storage/phrase_large_table3_kyotodb.h
+++ b/src/storage/phrase_large_table3_kyotodb.h
@@ -60,6 +60,8 @@ public:
/* search method */
int search(int phrase_length, /* in */ const ucs4_t phrase[],
/* out */ PhraseTokens tokens) const;
+ int search_suggestion(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* out */ PhraseTokens tokens) const;
/* add_index/remove_index method */
int add_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token);
diff --git a/src/storage/pinyin_custom2.h b/src/storage/pinyin_custom2.h
index fe2e3bd..320e846 100644
--- a/src/storage/pinyin_custom2.h
+++ b/src/storage/pinyin_custom2.h
@@ -28,7 +28,7 @@ G_BEGIN_DECLS
/**
* PinyinTableFlag:
*/
-enum PinyinTableFlag{
+typedef enum{
IS_PINYIN = 1U << 1,
IS_ZHUYIN = 1U << 2,
PINYIN_INCOMPLETE = 1U << 3,
@@ -38,7 +38,7 @@ enum PinyinTableFlag{
USE_DIVIDED_TABLE = 1U << 7,
USE_RESPLIT_TABLE = 1U << 8,
DYNAMIC_ADJUST = 1U << 9
-};
+} PinyinTableFlag;
/**
* PinyinAmbiguity2:
@@ -46,7 +46,7 @@ enum PinyinTableFlag{
* The enums of pinyin ambiguities.
*
*/
-enum PinyinAmbiguity2{
+typedef enum{
PINYIN_AMB_C_CH = 1U << 10,
PINYIN_AMB_S_SH = 1U << 11,
PINYIN_AMB_Z_ZH = 1U << 12,
@@ -58,7 +58,7 @@ enum PinyinAmbiguity2{
PINYIN_AMB_EN_ENG = 1U << 18,
PINYIN_AMB_IN_ING = 1U << 19,
PINYIN_AMB_ALL = 0x3FFU << 10
-};
+} PinyinAmbiguity2;
/**
* PinyinCorrection2:
@@ -67,7 +67,7 @@ enum PinyinAmbiguity2{
*
*/
-enum PinyinCorrection2{
+typedef enum{
PINYIN_CORRECT_GN_NG = 1U << 21,
PINYIN_CORRECT_MG_NG = 1U << 22,
PINYIN_CORRECT_IOU_IU = 1U << 23,
@@ -77,37 +77,35 @@ enum PinyinCorrection2{
PINYIN_CORRECT_V_U = 1U << 27,
PINYIN_CORRECT_ON_ONG = 1U << 28,
PINYIN_CORRECT_ALL = 0xFFU << 21
-};
+} PinyinCorrection2;
/**
- * PinyinCorrection2:
+ * ZhuyinCorrection2:
*
- * The enums of pinyin corrections.
+ * The enums of zhuyin corrections.
*
*/
-enum ZhuyinCorrection2{
+typedef enum{
ZHUYIN_CORRECT_HSU = 1U << 29,
ZHUYIN_CORRECT_ETEN26 = 1U << 30,
ZHUYIN_CORRECT_SHUFFLE = 1U << 31,
ZHUYIN_CORRECT_ALL = 0x7U << 29
-};
+} ZhuyinCorrection2;
/**
* @brief enums of Full Pinyin Schemes.
*/
-enum FullPinyinScheme
-{
+typedef enum{
FULL_PINYIN_HANYU = 1,
FULL_PINYIN_LUOMA = 2,
FULL_PINYIN_SECONDARY_ZHUYIN = 3,
FULL_PINYIN_DEFAULT = FULL_PINYIN_HANYU
-};
+} FullPinyinScheme;
/**
* @brief enums of Double Pinyin Schemes.
*/
-enum DoublePinyinScheme
-{
+typedef enum{
DOUBLE_PINYIN_ZRM = 1,
DOUBLE_PINYIN_MS = 2,
DOUBLE_PINYIN_ZIGUANG = 3,
@@ -116,13 +114,12 @@ enum DoublePinyinScheme
DOUBLE_PINYIN_XHE = 6,
DOUBLE_PINYIN_CUSTOMIZED = 30, /* for user's keyboard */
DOUBLE_PINYIN_DEFAULT = DOUBLE_PINYIN_MS
-};
+} DoublePinyinScheme;
/**
* @brief enums of Zhuyin Schemes.
*/
-enum ZhuyinScheme
-{
+typedef enum{
ZHUYIN_STANDARD = 1,
ZHUYIN_HSU = 2,
ZHUYIN_IBM = 3,
@@ -133,7 +130,7 @@ enum ZhuyinScheme
ZHUYIN_HSU_DVORAK = 8,
ZHUYIN_DACHEN_CP26 = 9,
ZHUYIN_DEFAULT = ZHUYIN_STANDARD
-};
+} ZhuyinScheme;
G_END_DECLS
diff --git a/src/storage/pinyin_custom3.h b/src/storage/pinyin_custom3.h
deleted file mode 100644
index 37e89bb..0000000
--- a/src/storage/pinyin_custom3.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * libpinyin
- * Library to deal with pinyin.
- *
- * Copyright (C) 2016 Peng Wu <alexepico@gmail.com>
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#include <glib.h>
-
-
-G_BEGIN_DECLS
-
-
-/**
- * PinyinStandardOption: type pinyin_option_t.
- *
- * The enums of pinyin standard option.
- *
- */
-enum PinyinStandardOption{
- IS_PINYIN = 1U << 1,
- IS_ZHUYIN = 1U << 2,
- PINYIN_INCOMPLETE = 1U << 3,
- ZHUYIN_INCOMPLETE = 1U << 4,
- USE_TONE = 1U << 5,
- FORCE_TONE = 1U << 6,
- USE_DIVIDED_TABLE = 1U << 7,
- USE_RESPLIT_TABLE = 1U << 8,
- DYNAMIC_ADJUST = 1U << 9,
- FULL_PINYIN_SUPPORT_QUOTATION = 1U << 10,
-};
-
-/**
- * PinyinFuzzyOption: type pinyin_option_t.
- *
- * The enums of pinyin fuzzy option.
- *
- */
-enum PinyinFuzzyOption{
- PINYIN_FUZZY_C_CH = 1U << 10,
- PINYIN_FUZZY_S_SH = 1U << 11,
- PINYIN_FUZZY_Z_ZH = 1U << 12,
- PINYIN_FUZZY_F_H = 1U << 13,
- PINYIN_FUZZY_G_K = 1U << 14,
- PINYIN_FUZZY_L_N = 1U << 15,
- PINYIN_FUZZY_L_R = 1U << 16,
- PINYIN_FUZZY_AN_ANG = 1U << 17,
- PINYIN_FUZZY_EN_ENG = 1U << 18,
- PINYIN_FUZZY_IN_ING = 1U << 19,
- PINYIN_FUZZY_ALL = 0x3FFU << 10
-};
-
-/**
- * PinyinCorrectOption: type pinyin_option_t.
- *
- * The enums of pinyin correct option.
- *
- */
-enum PinyinCorrectOption{
- PINYIN_CORRECT_GN_NG = 1U << 21,
- PINYIN_CORRECT_MG_NG = 1U << 22,
- PINYIN_CORRECT_IOU_IU = 1U << 23,
- PINYIN_CORRECT_UEI_UI = 1U << 24,
- PINYIN_CORRECT_UEN_UN = 1U << 25,
- PINYIN_CORRECT_UE_VE = 1U << 26,
- PINYIN_CORRECT_V_U = 1U << 27,
- PINYIN_CORRECT_ON_ONG = 1U << 28,
- PINYIN_CORRECT_ALL = 0xFFU << 21,
-};
-
-/**
- * ZhuyinCorrectOption: type pinyin_option_t.
- *
- * The enums of zhuyin correct option.
- *
- */
-enum ZhuyinCorrectOption{
- ZHUYIN_CORRECT_HSU = 1U << 29,
- ZHUYIN_CORRECT_ETEN26 = 1U << 30,
- ZHUYIN_CORRECT_SHUFFLE = 1U << 31,
- ZHUYIN_CORRECT_ALL = 0x7U << 29
-};
-
-/**
- * @brief enums of Full Pinyin Schemes.
- */
-enum FullPinyinScheme
-{
- FULL_PINYIN_HANYU = 1,
- FULL_PINYIN_LUOMA = 2,
- FULL_PINYIN_SECONDARY_BOPOMOFO = 3,
- FULL_PINYIN_DEFAULT = FULL_PINYIN_HANYU
-};
-
-/**
- * @brief enums of Double Pinyin Schemes.
- */
-enum DoublePinyinScheme
-{
- DOUBLE_PINYIN_ZRM = 1,
- DOUBLE_PINYIN_MS = 2,
- DOUBLE_PINYIN_ZIGUANG = 3,
- DOUBLE_PINYIN_ABC = 4,
- DOUBLE_PINYIN_PYJJ = 5,
- DOUBLE_PINYIN_XHE = 6,
- DOUBLE_PINYIN_CUSTOMIZED = 30, /* for user's keyboard */
- DOUBLE_PINYIN_DEFAULT = DOUBLE_PINYIN_MS
-};
-
-/**
- * @brief enums of Zhuyin Schemes.
- */
-enum ZhuyinScheme
-{
- ZHUYIN_STANDARD = 1,
- ZHUYIN_HSU = 2,
- ZHUYIN_IBM = 3,
- ZHUYIN_GINYIEH = 4,
- ZHUYIN_ETEN = 5,
- ZHUYIN_ETEN26 = 6,
- ZHUYIN_STANDARD_DVORAK = 7,
- ZHUYIN_HSU_DVORAK = 8,
- ZHUYIN_DACHEN_CP26 = 9,
- ZHUYIN_DEFAULT = ZHUYIN_STANDARD
-};
-
-
-G_END_DECLS
diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp
index da32455..ce640c3 100644
--- a/src/storage/pinyin_parser2.cpp
+++ b/src/storage/pinyin_parser2.cpp
@@ -66,6 +66,7 @@ struct parse_value_t{
ChewingKeyRest m_key_rest;
gint16 m_num_keys;
gint16 m_parsed_len;
+ gint16 m_distance;
gint16 m_last_step;
/* constructor */
@@ -73,6 +74,7 @@ public:
parse_value_t(){
m_num_keys = 0;
m_parsed_len = 0;
+ m_distance = 0;
m_last_step = -1;
}
};
@@ -121,7 +123,8 @@ static inline bool search_pinyin_index2(pinyin_option_t options,
const pinyin_index_item_t * index,
size_t len,
const char * pinyin,
- ChewingKey & key){
+ ChewingKey & key,
+ gint16 & distance){
pinyin_index_item_t item;
memset(&item, 0, sizeof(item));
item.m_pinyin_input = pinyin;
@@ -141,6 +144,7 @@ static inline bool search_pinyin_index2(pinyin_option_t options,
return false;
key = content_table[index->m_table_index].m_chewing_key;
+ distance = index->m_distance;
assert(key.get_table_index() == index->m_table_index);
return true;
}
@@ -159,6 +163,7 @@ FullPinyinParser2::FullPinyinParser2 (){
bool FullPinyinParser2::parse_one_key (pinyin_option_t options,
ChewingKey & key,
+ gint16 & distance,
const char * pinyin, int len) const {
/* "'" are not accepted in parse_one_key. */
gchar * input = g_strndup(pinyin, len);
@@ -189,7 +194,7 @@ bool FullPinyinParser2::parse_one_key (pinyin_option_t options,
/* Note: optimize here? */
input[parsed_len] = '\0';
if (!search_pinyin_index2(options, m_pinyin_index, m_pinyin_index_len,
- input, key)) {
+ input, key, distance)) {
g_free(input);
return false;
}
@@ -239,6 +244,7 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
nextstep->m_key_rest = ChewingKeyRest();
nextstep->m_num_keys = curstep->m_num_keys;
nextstep->m_parsed_len = curstep->m_parsed_len + 1;
+ nextstep->m_distance = curstep->m_distance;
nextstep->m_last_step = i;
next_sep = 0;
continue;
@@ -265,13 +271,14 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
nextstep = &g_array_index(m_parse_steps, parse_value_t, n);
/* gen next step */
+ gint16 distance = 0;
const char * onepinyin = input + m;
gint16 onepinyinlen = n - m;
value = parse_value_t();
ChewingKey key; ChewingKeyRest rest;
bool parsed = parse_one_key
- (options, key, onepinyin, onepinyinlen);
+ (options, key, distance, onepinyin, onepinyinlen);
rest.m_raw_begin = m; rest.m_raw_end = n;
if (!parsed)
continue;
@@ -281,6 +288,7 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
value.m_key = key; value.m_key_rest = rest;
value.m_num_keys = curstep->m_num_keys + 1;
value.m_parsed_len = curstep->m_parsed_len + onepinyinlen;
+ value.m_distance = curstep->m_distance + distance;
value.m_last_step = m;
/* save next step */
@@ -297,21 +305,26 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
/* handle with the same pinyin length and the number of keys */
if (value.m_parsed_len == nextstep->m_parsed_len &&
- value.m_num_keys == nextstep->m_num_keys) {
+ value.m_num_keys == nextstep->m_num_keys &&
+ value.m_distance < nextstep->m_distance)
+ *nextstep = value;
#if 0
- /* prefer the 'a' at the end of clause,
- * ex: "zheyanga$" -> "zhe'yang'a$".
- */
- if (value.m_parsed_len == len &&
- (nextstep->m_key.m_initial != CHEWING_ZERO_INITIAL &&
- nextstep->m_key.m_final == CHEWING_A) &&
- (value.m_key.m_initial == CHEWING_ZERO_INITIAL &&
- value.m_key.m_middle == CHEWING_ZERO_MIDDLE &&
- value.m_key.m_final == CHEWING_A))
- *nextstep = value;
+ /* prefer the 'a' at the end of clause,
+ * ex: "zheyanga$" -> "zhe'yang'a$".
+ */
+ if (value.m_parsed_len == len &&
+ (value.m_parsed_len == nextstep->m_parsed_len &&
+ value.m_num_keys == nextstep->m_num_keys &&
+ value.m_distance == nextstep->m_distance) &&
+ (nextstep->m_key.m_initial != CHEWING_ZERO_INITIAL &&
+ nextstep->m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+ nextstep->m_key.m_final == CHEWING_A) &&
+ (value.m_key.m_initial == CHEWING_ZERO_INITIAL &&
+ value.m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+ value.m_key.m_final == CHEWING_A))
+ *nextstep = value;
#endif
- }
}
}
}
@@ -382,7 +395,7 @@ bool FullPinyinParser2::set_scheme(FullPinyinScheme scheme){
m_pinyin_index_len = G_N_ELEMENTS(secondary_zhuyin_index);
break;
default:
- assert(false);
+ abort();
}
return true;
}
@@ -391,6 +404,7 @@ bool FullPinyinParser2::set_scheme(FullPinyinScheme scheme){
bool DoublePinyinParser2::parse_one_key(pinyin_option_t options,
ChewingKey & key,
+ gint16 & distance,
const char *str, int len) const {
options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL);
@@ -537,9 +551,10 @@ int DoublePinyinParser2::parse(pinyin_option_t options, ChewingKeyVector & keys,
i = std_lite::min(maximum_len - parsed_len,
(int)max_double_pinyin_length);
+ gint16 distance = 0;
ChewingKey key; ChewingKeyRest key_rest;
for (; i > 0; --i) {
- bool success = parse_one_key(options, key, cur_str, i);
+ bool success = parse_one_key(options, key, distance, cur_str, i);
if (success)
break;
}
@@ -593,7 +608,7 @@ bool DoublePinyinParser2::set_scheme(DoublePinyinScheme scheme) {
m_fallback_table = double_pinyin_xhe_fallback;
return true;
case DOUBLE_PINYIN_CUSTOMIZED:
- assert(FALSE);
+ abort();
};
return false; /* no such scheme. */
@@ -607,6 +622,7 @@ PinyinDirectParser2::PinyinDirectParser2 (){
bool PinyinDirectParser2::parse_one_key(pinyin_option_t options,
ChewingKey & key,
+ gint16 & distance,
const char *str, int len) const {
/* "'" are not accepted in parse_one_key. */
gchar * input = g_strndup(str, len);
@@ -637,7 +653,7 @@ bool PinyinDirectParser2::parse_one_key(pinyin_option_t options,
/* Note: optimize here? */
input[parsed_len] = '\0';
if (!search_pinyin_index2(options, m_pinyin_index, m_pinyin_index_len,
- input, key)) {
+ input, key, distance)) {
g_free(input);
return false;
}
@@ -663,8 +679,6 @@ int PinyinDirectParser2::parse(pinyin_option_t options,
g_array_set_size(keys, 0);
g_array_set_size(key_rests, 0);
- ChewingKey key; ChewingKeyRest key_rest;
-
int parsed_len = 0;
int i = 0, cur = 0, next = 0;
while (cur < len) {
@@ -675,7 +689,9 @@ int PinyinDirectParser2::parse(pinyin_option_t options,
}
next = i;
- if (parse_one_key(options, key, str + cur, next - cur)) {
+ gint16 distance = 0;
+ ChewingKey key; ChewingKeyRest key_rest;
+ if (parse_one_key(options, key, distance, str + cur, next - cur)) {
key_rest.m_raw_begin = cur; key_rest.m_raw_end = next;
/* save the pinyin. */
diff --git a/src/storage/pinyin_parser2.h b/src/storage/pinyin_parser2.h
index 4a2f0f2..790c0cb 100644
--- a/src/storage/pinyin_parser2.h
+++ b/src/storage/pinyin_parser2.h
@@ -42,6 +42,7 @@ typedef struct {
const char * m_pinyin_input;
guint32 m_flags;
guint16 m_table_index;
+ guint16 m_distance;
} pinyin_index_item_t;
typedef struct {
@@ -114,7 +115,7 @@ public:
* Parse only one struct ChewingKey from a string.
*
*/
- virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const = 0;
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const = 0;
/**
* PhoneticParser2::parse:
@@ -156,7 +157,7 @@ public:
g_array_free(m_parse_steps, TRUE);
}
- virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const;
/* Note:
* the parse method will use dynamic programming to drive parse_one_key.
@@ -195,7 +196,7 @@ public:
virtual ~DoublePinyinParser2() {}
- virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const;
virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
@@ -215,7 +216,7 @@ public:
virtual ~PinyinDirectParser2() {}
- virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const;
virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
};
diff --git a/src/storage/pinyin_parser_table.h b/src/storage/pinyin_parser_table.h
index ad8bf7c..031a629 100644
--- a/src/storage/pinyin_parser_table.h
+++ b/src/storage/pinyin_parser_table.h
@@ -7,666 +7,666 @@
namespace pinyin{
const pinyin_index_item_t pinyin_index[] = {
-{"a", IS_ZHUYIN|IS_PINYIN, 1},
-{"agn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 4},
-{"ai", IS_ZHUYIN|IS_PINYIN, 2},
-{"amg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 4},
-{"an", IS_ZHUYIN|IS_PINYIN, 3},
-{"ang", IS_ZHUYIN|IS_PINYIN, 4},
-{"ao", IS_ZHUYIN|IS_PINYIN, 5},
-{"b", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 6},
-{"ba", IS_ZHUYIN|IS_PINYIN, 7},
-{"bagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 10},
-{"bai", IS_ZHUYIN|IS_PINYIN, 8},
-{"bamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 10},
-{"ban", IS_ZHUYIN|IS_PINYIN, 9},
-{"bang", IS_ZHUYIN|IS_PINYIN, 10},
-{"bao", IS_ZHUYIN|IS_PINYIN, 11},
-{"begn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 14},
-{"bei", IS_ZHUYIN|IS_PINYIN, 12},
-{"bemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 14},
-{"ben", IS_ZHUYIN|IS_PINYIN, 13},
-{"beng", IS_ZHUYIN|IS_PINYIN, 14},
-{"bi", IS_ZHUYIN|IS_PINYIN, 15},
-{"bian", IS_ZHUYIN|IS_PINYIN, 16},
-{"biao", IS_ZHUYIN|IS_PINYIN, 17},
-{"bie", IS_ZHUYIN|IS_PINYIN, 18},
-{"bign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 20},
-{"bimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 20},
-{"bin", IS_ZHUYIN|IS_PINYIN, 19},
-{"bing", IS_ZHUYIN|IS_PINYIN, 20},
-{"bo", IS_ZHUYIN|IS_PINYIN, 21},
-{"bu", IS_ZHUYIN|IS_PINYIN, 22},
-{"c", IS_PINYIN|PINYIN_INCOMPLETE, 23},
-{"ca", IS_ZHUYIN|IS_PINYIN, 24},
-{"cagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 27},
-{"cai", IS_ZHUYIN|IS_PINYIN, 25},
-{"camg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 27},
-{"can", IS_ZHUYIN|IS_PINYIN, 26},
-{"cang", IS_ZHUYIN|IS_PINYIN, 27},
-{"cao", IS_ZHUYIN|IS_PINYIN, 28},
-{"ce", IS_ZHUYIN|IS_PINYIN, 29},
-{"cegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 31},
-{"cemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 31},
-{"cen", IS_ZHUYIN|IS_PINYIN, 30},
-{"ceng", IS_ZHUYIN|IS_PINYIN, 31},
-{"ch", IS_PINYIN|PINYIN_INCOMPLETE, 32},
-{"cha", IS_ZHUYIN|IS_PINYIN, 33},
-{"chagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 36},
-{"chai", IS_ZHUYIN|IS_PINYIN, 34},
-{"chamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 36},
-{"chan", IS_ZHUYIN|IS_PINYIN, 35},
-{"chang", IS_ZHUYIN|IS_PINYIN, 36},
-{"chao", IS_ZHUYIN|IS_PINYIN, 37},
-{"che", IS_ZHUYIN|IS_PINYIN, 38},
-{"chegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 40},
-{"chemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 40},
-{"chen", IS_ZHUYIN|IS_PINYIN, 39},
-{"cheng", IS_ZHUYIN|IS_PINYIN, 40},
-{"chi", IS_ZHUYIN|IS_PINYIN, 41},
-{"chogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 42},
-{"chomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 42},
-{"chon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 42},
-{"chong", IS_ZHUYIN|IS_PINYIN, 42},
-{"chou", IS_ZHUYIN|IS_PINYIN, 43},
-{"chu", IS_ZHUYIN|IS_PINYIN, 44},
-{"chuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 48},
-{"chuai", IS_ZHUYIN|IS_PINYIN, 46},
-{"chuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 48},
-{"chuan", IS_ZHUYIN|IS_PINYIN, 47},
-{"chuang", IS_ZHUYIN|IS_PINYIN, 48},
-{"chuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 49},
-{"chuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 50},
-{"chui", IS_ZHUYIN|IS_PINYIN, 49},
-{"chun", IS_ZHUYIN|IS_PINYIN, 50},
-{"chuo", IS_ZHUYIN|IS_PINYIN, 51},
-{"ci", IS_ZHUYIN|IS_PINYIN, 52},
-{"cogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 53},
-{"comg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 53},
-{"con", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 53},
-{"cong", IS_ZHUYIN|IS_PINYIN, 53},
-{"cou", IS_ZHUYIN|IS_PINYIN, 54},
-{"cu", IS_ZHUYIN|IS_PINYIN, 55},
-{"cuan", IS_ZHUYIN|IS_PINYIN, 56},
-{"cuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 57},
-{"cuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 58},
-{"cui", IS_ZHUYIN|IS_PINYIN, 57},
-{"cun", IS_ZHUYIN|IS_PINYIN, 58},
-{"cuo", IS_ZHUYIN|IS_PINYIN, 59},
-{"d", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 60},
-{"da", IS_ZHUYIN|IS_PINYIN, 61},
-{"dagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 64},
-{"dai", IS_ZHUYIN|IS_PINYIN, 62},
-{"damg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 64},
-{"dan", IS_ZHUYIN|IS_PINYIN, 63},
-{"dang", IS_ZHUYIN|IS_PINYIN, 64},
-{"dao", IS_ZHUYIN|IS_PINYIN, 65},
-{"de", IS_ZHUYIN|IS_PINYIN, 66},
-{"degn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 69},
-{"dei", IS_ZHUYIN|IS_PINYIN, 67},
-{"demg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 69},
-{"deng", IS_ZHUYIN|IS_PINYIN, 69},
-{"di", IS_ZHUYIN|IS_PINYIN, 70},
-{"dia", IS_ZHUYIN|IS_PINYIN, 71},
-{"dian", IS_ZHUYIN|IS_PINYIN, 72},
-{"diao", IS_ZHUYIN|IS_PINYIN, 73},
-{"die", IS_ZHUYIN|IS_PINYIN, 74},
-{"dign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 76},
-{"dimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 76},
-{"ding", IS_ZHUYIN|IS_PINYIN, 76},
-{"diou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 77},
-{"diu", IS_ZHUYIN|IS_PINYIN, 77},
-{"dogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 78},
-{"domg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 78},
-{"don", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 78},
-{"dong", IS_ZHUYIN|IS_PINYIN, 78},
-{"dou", IS_ZHUYIN|IS_PINYIN, 79},
-{"du", IS_ZHUYIN|IS_PINYIN, 80},
-{"duan", IS_ZHUYIN|IS_PINYIN, 81},
-{"duei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 82},
-{"duen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 83},
-{"dui", IS_ZHUYIN|IS_PINYIN, 82},
-{"dun", IS_ZHUYIN|IS_PINYIN, 83},
-{"duo", IS_ZHUYIN|IS_PINYIN, 84},
-{"e", IS_ZHUYIN|IS_PINYIN, 85},
-{"egn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 88},
-{"ei", IS_ZHUYIN|IS_PINYIN, 86},
-{"emg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 88},
-{"en", IS_ZHUYIN|IS_PINYIN, 87},
-{"er", IS_ZHUYIN|IS_PINYIN, 89},
-{"f", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 90},
-{"fa", IS_ZHUYIN|IS_PINYIN, 91},
-{"fagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 93},
-{"famg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 93},
-{"fan", IS_ZHUYIN|IS_PINYIN, 92},
-{"fang", IS_ZHUYIN|IS_PINYIN, 93},
-{"fegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 97},
-{"fei", IS_ZHUYIN|IS_PINYIN, 95},
-{"femg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 97},
-{"fen", IS_ZHUYIN|IS_PINYIN, 96},
-{"feng", IS_ZHUYIN|IS_PINYIN, 97},
-{"fo", IS_ZHUYIN|IS_PINYIN, 98},
-{"fou", IS_ZHUYIN|IS_PINYIN, 99},
-{"fu", IS_ZHUYIN|IS_PINYIN, 100},
-{"g", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 101},
-{"ga", IS_ZHUYIN|IS_PINYIN, 102},
-{"gagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 105},
-{"gai", IS_ZHUYIN|IS_PINYIN, 103},
-{"gamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 105},
-{"gan", IS_ZHUYIN|IS_PINYIN, 104},
-{"gang", IS_ZHUYIN|IS_PINYIN, 105},
-{"gao", IS_ZHUYIN|IS_PINYIN, 106},
-{"ge", IS_ZHUYIN|IS_PINYIN, 107},
-{"gegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 110},
-{"gei", IS_ZHUYIN|IS_PINYIN, 108},
-{"gemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 110},
-{"gen", IS_ZHUYIN|IS_PINYIN, 109},
-{"geng", IS_ZHUYIN|IS_PINYIN, 110},
-{"gogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 111},
-{"gomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 111},
-{"gon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 111},
-{"gong", IS_ZHUYIN|IS_PINYIN, 111},
-{"gou", IS_ZHUYIN|IS_PINYIN, 112},
-{"gu", IS_ZHUYIN|IS_PINYIN, 113},
-{"gua", IS_ZHUYIN|IS_PINYIN, 114},
-{"guagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 117},
-{"guai", IS_ZHUYIN|IS_PINYIN, 115},
-{"guamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 117},
-{"guan", IS_ZHUYIN|IS_PINYIN, 116},
-{"guang", IS_ZHUYIN|IS_PINYIN, 117},
-{"guei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 118},
-{"guen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 119},
-{"gui", IS_ZHUYIN|IS_PINYIN, 118},
-{"gun", IS_ZHUYIN|IS_PINYIN, 119},
-{"guo", IS_ZHUYIN|IS_PINYIN, 120},
-{"h", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 121},
-{"ha", IS_ZHUYIN|IS_PINYIN, 122},
-{"hagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 125},
-{"hai", IS_ZHUYIN|IS_PINYIN, 123},
-{"hamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 125},
-{"han", IS_ZHUYIN|IS_PINYIN, 124},
-{"hang", IS_ZHUYIN|IS_PINYIN, 125},
-{"hao", IS_ZHUYIN|IS_PINYIN, 126},
-{"he", IS_ZHUYIN|IS_PINYIN, 127},
-{"hegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 130},
-{"hei", IS_ZHUYIN|IS_PINYIN, 128},
-{"hemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 130},
-{"hen", IS_ZHUYIN|IS_PINYIN, 129},
-{"heng", IS_ZHUYIN|IS_PINYIN, 130},
-{"hogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 131},
-{"homg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 131},
-{"hon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 131},
-{"hong", IS_ZHUYIN|IS_PINYIN, 131},
-{"hou", IS_ZHUYIN|IS_PINYIN, 132},
-{"hu", IS_ZHUYIN|IS_PINYIN, 133},
-{"hua", IS_ZHUYIN|IS_PINYIN, 134},
-{"huagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 137},
-{"huai", IS_ZHUYIN|IS_PINYIN, 135},
-{"huamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 137},
-{"huan", IS_ZHUYIN|IS_PINYIN, 136},
-{"huang", IS_ZHUYIN|IS_PINYIN, 137},
-{"huei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 138},
-{"huen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 139},
-{"hui", IS_ZHUYIN|IS_PINYIN, 138},
-{"hun", IS_ZHUYIN|IS_PINYIN, 139},
-{"huo", IS_ZHUYIN|IS_PINYIN, 140},
-{"j", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 141},
-{"ji", IS_ZHUYIN|IS_PINYIN, 142},
-{"jia", IS_ZHUYIN|IS_PINYIN, 143},
-{"jiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 145},
-{"jiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 145},
-{"jian", IS_ZHUYIN|IS_PINYIN, 144},
-{"jiang", IS_ZHUYIN|IS_PINYIN, 145},
-{"jiao", IS_ZHUYIN|IS_PINYIN, 146},
-{"jie", IS_ZHUYIN|IS_PINYIN, 147},
-{"jign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 149},
-{"jimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 149},
-{"jin", IS_ZHUYIN|IS_PINYIN, 148},
-{"jing", IS_ZHUYIN|IS_PINYIN, 149},
-{"jiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 150},
-{"jiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 150},
-{"jion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 150},
-{"jiong", IS_ZHUYIN|IS_PINYIN, 150},
-{"jiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 151},
-{"jiu", IS_ZHUYIN|IS_PINYIN, 151},
-{"ju", IS_ZHUYIN|IS_PINYIN, 152},
-{"juan", IS_ZHUYIN|IS_PINYIN, 153},
-{"jue", IS_ZHUYIN|IS_PINYIN, 154},
-{"juen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 155},
-{"jun", IS_ZHUYIN|IS_PINYIN, 155},
-{"jv", IS_PINYIN|PINYIN_CORRECT_V_U, 152},
-{"jvan", IS_PINYIN|PINYIN_CORRECT_V_U, 153},
-{"jve", IS_PINYIN|PINYIN_CORRECT_V_U, 154},
-{"jvn", IS_PINYIN|PINYIN_CORRECT_V_U, 155},
-{"k", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 156},
-{"ka", IS_ZHUYIN|IS_PINYIN, 157},
-{"kagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 160},
-{"kai", IS_ZHUYIN|IS_PINYIN, 158},
-{"kamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 160},
-{"kan", IS_ZHUYIN|IS_PINYIN, 159},
-{"kang", IS_ZHUYIN|IS_PINYIN, 160},
-{"kao", IS_ZHUYIN|IS_PINYIN, 161},
-{"ke", IS_ZHUYIN|IS_PINYIN, 162},
-{"kegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 165},
-{"kemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 165},
-{"ken", IS_ZHUYIN|IS_PINYIN, 164},
-{"keng", IS_ZHUYIN|IS_PINYIN, 165},
-{"kogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 166},
-{"komg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 166},
-{"kon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 166},
-{"kong", IS_ZHUYIN|IS_PINYIN, 166},
-{"kou", IS_ZHUYIN|IS_PINYIN, 167},
-{"ku", IS_ZHUYIN|IS_PINYIN, 168},
-{"kua", IS_ZHUYIN|IS_PINYIN, 169},
-{"kuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 172},
-{"kuai", IS_ZHUYIN|IS_PINYIN, 170},
-{"kuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 172},
-{"kuan", IS_ZHUYIN|IS_PINYIN, 171},
-{"kuang", IS_ZHUYIN|IS_PINYIN, 172},
-{"kuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 173},
-{"kuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 174},
-{"kui", IS_ZHUYIN|IS_PINYIN, 173},
-{"kun", IS_ZHUYIN|IS_PINYIN, 174},
-{"kuo", IS_ZHUYIN|IS_PINYIN, 175},
-{"l", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 176},
-{"la", IS_ZHUYIN|IS_PINYIN, 177},
-{"lagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 180},
-{"lai", IS_ZHUYIN|IS_PINYIN, 178},
-{"lamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 180},
-{"lan", IS_ZHUYIN|IS_PINYIN, 179},
-{"lang", IS_ZHUYIN|IS_PINYIN, 180},
-{"lao", IS_ZHUYIN|IS_PINYIN, 181},
-{"le", IS_ZHUYIN|IS_PINYIN, 182},
-{"legn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 185},
-{"lei", IS_ZHUYIN|IS_PINYIN, 183},
-{"lemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 185},
-{"leng", IS_ZHUYIN|IS_PINYIN, 185},
-{"li", IS_ZHUYIN|IS_PINYIN, 186},
-{"lia", IS_ZHUYIN|IS_PINYIN, 187},
-{"liagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 189},
-{"liamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 189},
-{"lian", IS_ZHUYIN|IS_PINYIN, 188},
-{"liang", IS_ZHUYIN|IS_PINYIN, 189},
-{"liao", IS_ZHUYIN|IS_PINYIN, 190},
-{"lie", IS_ZHUYIN|IS_PINYIN, 191},
-{"lign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 193},
-{"limg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 193},
-{"lin", IS_ZHUYIN|IS_PINYIN, 192},
-{"ling", IS_ZHUYIN|IS_PINYIN, 193},
-{"liou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 194},
-{"liu", IS_ZHUYIN|IS_PINYIN, 194},
-{"lo", IS_ZHUYIN|IS_PINYIN, 195},
-{"logn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 196},
-{"lomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 196},
-{"lon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 196},
-{"long", IS_ZHUYIN|IS_PINYIN, 196},
-{"lou", IS_ZHUYIN|IS_PINYIN, 197},
-{"lu", IS_ZHUYIN|IS_PINYIN, 198},
-{"luan", IS_ZHUYIN|IS_PINYIN, 199},
-{"lue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 203},
-{"luen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 200},
-{"lun", IS_ZHUYIN|IS_PINYIN, 200},
-{"luo", IS_ZHUYIN|IS_PINYIN, 201},
-{"lv", IS_ZHUYIN|IS_PINYIN, 202},
-{"lve", IS_ZHUYIN|IS_PINYIN, 203},
-{"m", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 204},
-{"ma", IS_ZHUYIN|IS_PINYIN, 205},
-{"magn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 208},
-{"mai", IS_ZHUYIN|IS_PINYIN, 206},
-{"mamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 208},
-{"man", IS_ZHUYIN|IS_PINYIN, 207},
-{"mang", IS_ZHUYIN|IS_PINYIN, 208},
-{"mao", IS_ZHUYIN|IS_PINYIN, 209},
-{"me", IS_ZHUYIN|IS_PINYIN, 210},
-{"megn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 213},
-{"mei", IS_ZHUYIN|IS_PINYIN, 211},
-{"memg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 213},
-{"men", IS_ZHUYIN|IS_PINYIN, 212},
-{"meng", IS_ZHUYIN|IS_PINYIN, 213},
-{"mi", IS_ZHUYIN|IS_PINYIN, 214},
-{"mian", IS_ZHUYIN|IS_PINYIN, 215},
-{"miao", IS_ZHUYIN|IS_PINYIN, 216},
-{"mie", IS_ZHUYIN|IS_PINYIN, 217},
-{"mign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 219},
-{"mimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 219},
-{"min", IS_ZHUYIN|IS_PINYIN, 218},
-{"ming", IS_ZHUYIN|IS_PINYIN, 219},
-{"miou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 220},
-{"miu", IS_ZHUYIN|IS_PINYIN, 220},
-{"mo", IS_ZHUYIN|IS_PINYIN, 221},
-{"mou", IS_ZHUYIN|IS_PINYIN, 222},
-{"mu", IS_ZHUYIN|IS_PINYIN, 223},
-{"n", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 224},
-{"na", IS_ZHUYIN|IS_PINYIN, 225},
-{"nagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 228},
-{"nai", IS_ZHUYIN|IS_PINYIN, 226},
-{"namg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 228},
-{"nan", IS_ZHUYIN|IS_PINYIN, 227},
-{"nang", IS_ZHUYIN|IS_PINYIN, 228},
-{"nao", IS_ZHUYIN|IS_PINYIN, 229},
-{"ne", IS_ZHUYIN|IS_PINYIN, 230},
-{"negn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 233},
-{"nei", IS_ZHUYIN|IS_PINYIN, 231},
-{"nemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 233},
-{"nen", IS_ZHUYIN|IS_PINYIN, 232},
-{"neng", IS_ZHUYIN|IS_PINYIN, 233},
-{"ng", IS_ZHUYIN|IS_PINYIN, 234},
-{"ni", IS_ZHUYIN|IS_PINYIN, 235},
-{"niagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 238},
-{"niamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 238},
-{"nian", IS_ZHUYIN|IS_PINYIN, 237},
-{"niang", IS_ZHUYIN|IS_PINYIN, 238},
-{"niao", IS_ZHUYIN|IS_PINYIN, 239},
-{"nie", IS_ZHUYIN|IS_PINYIN, 240},
-{"nign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 242},
-{"nimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 242},
-{"nin", IS_ZHUYIN|IS_PINYIN, 241},
-{"ning", IS_ZHUYIN|IS_PINYIN, 242},
-{"niou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 243},
-{"niu", IS_ZHUYIN|IS_PINYIN, 243},
-{"nogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 244},
-{"nomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 244},
-{"non", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 244},
-{"nong", IS_ZHUYIN|IS_PINYIN, 244},
-{"nou", IS_ZHUYIN|IS_PINYIN, 245},
-{"nu", IS_ZHUYIN|IS_PINYIN, 246},
-{"nuan", IS_ZHUYIN|IS_PINYIN, 247},
-{"nue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 251},
-{"nuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 248},
-{"nuo", IS_ZHUYIN|IS_PINYIN, 249},
-{"nv", IS_ZHUYIN|IS_PINYIN, 250},
-{"nve", IS_ZHUYIN|IS_PINYIN, 251},
-{"o", IS_ZHUYIN|IS_PINYIN, 252},
-{"ou", IS_ZHUYIN|IS_PINYIN, 253},
-{"p", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 254},
-{"pa", IS_ZHUYIN|IS_PINYIN, 255},
-{"pagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 258},
-{"pai", IS_ZHUYIN|IS_PINYIN, 256},
-{"pamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 258},
-{"pan", IS_ZHUYIN|IS_PINYIN, 257},
-{"pang", IS_ZHUYIN|IS_PINYIN, 258},
-{"pao", IS_ZHUYIN|IS_PINYIN, 259},
-{"pegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 262},
-{"pei", IS_ZHUYIN|IS_PINYIN, 260},
-{"pemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 262},
-{"pen", IS_ZHUYIN|IS_PINYIN, 261},
-{"peng", IS_ZHUYIN|IS_PINYIN, 262},
-{"pi", IS_ZHUYIN|IS_PINYIN, 263},
-{"pian", IS_ZHUYIN|IS_PINYIN, 264},
-{"piao", IS_ZHUYIN|IS_PINYIN, 265},
-{"pie", IS_ZHUYIN|IS_PINYIN, 266},
-{"pign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 268},
-{"pimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 268},
-{"pin", IS_ZHUYIN|IS_PINYIN, 267},
-{"ping", IS_ZHUYIN|IS_PINYIN, 268},
-{"po", IS_ZHUYIN|IS_PINYIN, 269},
-{"pou", IS_ZHUYIN|IS_PINYIN, 270},
-{"pu", IS_ZHUYIN|IS_PINYIN, 271},
-{"q", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 272},
-{"qi", IS_ZHUYIN|IS_PINYIN, 273},
-{"qia", IS_ZHUYIN|IS_PINYIN, 274},
-{"qiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 276},
-{"qiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 276},
-{"qian", IS_ZHUYIN|IS_PINYIN, 275},
-{"qiang", IS_ZHUYIN|IS_PINYIN, 276},
-{"qiao", IS_ZHUYIN|IS_PINYIN, 277},
-{"qie", IS_ZHUYIN|IS_PINYIN, 278},
-{"qign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 280},
-{"qimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 280},
-{"qin", IS_ZHUYIN|IS_PINYIN, 279},
-{"qing", IS_ZHUYIN|IS_PINYIN, 280},
-{"qiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 281},
-{"qiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 281},
-{"qion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 281},
-{"qiong", IS_ZHUYIN|IS_PINYIN, 281},
-{"qiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 282},
-{"qiu", IS_ZHUYIN|IS_PINYIN, 282},
-{"qu", IS_ZHUYIN|IS_PINYIN, 283},
-{"quan", IS_ZHUYIN|IS_PINYIN, 284},
-{"que", IS_ZHUYIN|IS_PINYIN, 285},
-{"quen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 286},
-{"qun", IS_ZHUYIN|IS_PINYIN, 286},
-{"qv", IS_PINYIN|PINYIN_CORRECT_V_U, 283},
-{"qvan", IS_PINYIN|PINYIN_CORRECT_V_U, 284},
-{"qve", IS_PINYIN|PINYIN_CORRECT_V_U, 285},
-{"qvn", IS_PINYIN|PINYIN_CORRECT_V_U, 286},
-{"r", IS_PINYIN|PINYIN_INCOMPLETE, 287},
-{"ragn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 289},
-{"ramg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 289},
-{"ran", IS_ZHUYIN|IS_PINYIN, 288},
-{"rang", IS_ZHUYIN|IS_PINYIN, 289},
-{"rao", IS_ZHUYIN|IS_PINYIN, 290},
-{"re", IS_ZHUYIN|IS_PINYIN, 291},
-{"regn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 293},
-{"remg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 293},
-{"ren", IS_ZHUYIN|IS_PINYIN, 292},
-{"reng", IS_ZHUYIN|IS_PINYIN, 293},
-{"ri", IS_ZHUYIN|IS_PINYIN, 294},
-{"rogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 295},
-{"romg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 295},
-{"ron", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 295},
-{"rong", IS_ZHUYIN|IS_PINYIN, 295},
-{"rou", IS_ZHUYIN|IS_PINYIN, 296},
-{"ru", IS_ZHUYIN|IS_PINYIN, 297},
-{"ruan", IS_ZHUYIN|IS_PINYIN, 299},
-{"ruei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 300},
-{"ruen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 301},
-{"rui", IS_ZHUYIN|IS_PINYIN, 300},
-{"run", IS_ZHUYIN|IS_PINYIN, 301},
-{"ruo", IS_ZHUYIN|IS_PINYIN, 302},
-{"s", IS_PINYIN|PINYIN_INCOMPLETE, 303},
-{"sa", IS_ZHUYIN|IS_PINYIN, 304},
-{"sagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 307},
-{"sai", IS_ZHUYIN|IS_PINYIN, 305},
-{"samg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 307},
-{"san", IS_ZHUYIN|IS_PINYIN, 306},
-{"sang", IS_ZHUYIN|IS_PINYIN, 307},
-{"sao", IS_ZHUYIN|IS_PINYIN, 308},
-{"se", IS_ZHUYIN|IS_PINYIN, 309},
-{"segn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 311},
-{"semg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 311},
-{"sen", IS_ZHUYIN|IS_PINYIN, 310},
-{"seng", IS_ZHUYIN|IS_PINYIN, 311},
-{"sh", IS_PINYIN|PINYIN_INCOMPLETE, 312},
-{"sha", IS_ZHUYIN|IS_PINYIN, 313},
-{"shagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 316},
-{"shai", IS_ZHUYIN|IS_PINYIN, 314},
-{"shamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 316},
-{"shan", IS_ZHUYIN|IS_PINYIN, 315},
-{"shang", IS_ZHUYIN|IS_PINYIN, 316},
-{"shao", IS_ZHUYIN|IS_PINYIN, 317},
-{"she", IS_ZHUYIN|IS_PINYIN, 318},
-{"shegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 321},
-{"shei", IS_ZHUYIN|IS_PINYIN, 319},
-{"shemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 321},
-{"shen", IS_ZHUYIN|IS_PINYIN, 320},
-{"sheng", IS_ZHUYIN|IS_PINYIN, 321},
-{"shi", IS_ZHUYIN|IS_PINYIN, 322},
-{"shou", IS_ZHUYIN|IS_PINYIN, 323},
-{"shu", IS_ZHUYIN|IS_PINYIN, 324},
-{"shua", IS_ZHUYIN|IS_PINYIN, 325},
-{"shuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 328},
-{"shuai", IS_ZHUYIN|IS_PINYIN, 326},
-{"shuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 328},
-{"shuan", IS_ZHUYIN|IS_PINYIN, 327},
-{"shuang", IS_ZHUYIN|IS_PINYIN, 328},
-{"shuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 329},
-{"shuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 330},
-{"shui", IS_ZHUYIN|IS_PINYIN, 329},
-{"shun", IS_ZHUYIN|IS_PINYIN, 330},
-{"shuo", IS_ZHUYIN|IS_PINYIN, 331},
-{"si", IS_ZHUYIN|IS_PINYIN, 332},
-{"sogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 333},
-{"somg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 333},
-{"son", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 333},
-{"song", IS_ZHUYIN|IS_PINYIN, 333},
-{"sou", IS_ZHUYIN|IS_PINYIN, 334},
-{"su", IS_ZHUYIN|IS_PINYIN, 335},
-{"suan", IS_ZHUYIN|IS_PINYIN, 336},
-{"suei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 337},
-{"suen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 338},
-{"sui", IS_ZHUYIN|IS_PINYIN, 337},
-{"sun", IS_ZHUYIN|IS_PINYIN, 338},
-{"suo", IS_ZHUYIN|IS_PINYIN, 339},
-{"t", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 340},
-{"ta", IS_ZHUYIN|IS_PINYIN, 341},
-{"tagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 344},
-{"tai", IS_ZHUYIN|IS_PINYIN, 342},
-{"tamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 344},
-{"tan", IS_ZHUYIN|IS_PINYIN, 343},
-{"tang", IS_ZHUYIN|IS_PINYIN, 344},
-{"tao", IS_ZHUYIN|IS_PINYIN, 345},
-{"te", IS_ZHUYIN|IS_PINYIN, 346},
-{"tegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 347},
-{"temg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 347},
-{"teng", IS_ZHUYIN|IS_PINYIN, 347},
-{"ti", IS_ZHUYIN|IS_PINYIN, 348},
-{"tian", IS_ZHUYIN|IS_PINYIN, 349},
-{"tiao", IS_ZHUYIN|IS_PINYIN, 350},
-{"tie", IS_ZHUYIN|IS_PINYIN, 351},
-{"tign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 352},
-{"timg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 352},
-{"ting", IS_ZHUYIN|IS_PINYIN, 352},
-{"togn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 353},
-{"tomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 353},
-{"ton", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 353},
-{"tong", IS_ZHUYIN|IS_PINYIN, 353},
-{"tou", IS_ZHUYIN|IS_PINYIN, 354},
-{"tu", IS_ZHUYIN|IS_PINYIN, 355},
-{"tuan", IS_ZHUYIN|IS_PINYIN, 356},
-{"tuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 357},
-{"tuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 358},
-{"tui", IS_ZHUYIN|IS_PINYIN, 357},
-{"tun", IS_ZHUYIN|IS_PINYIN, 358},
-{"tuo", IS_ZHUYIN|IS_PINYIN, 359},
-{"w", IS_PINYIN|PINYIN_INCOMPLETE, 360},
-{"wa", IS_ZHUYIN|IS_PINYIN, 361},
-{"wagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 364},
-{"wai", IS_ZHUYIN|IS_PINYIN, 362},
-{"wamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 364},
-{"wan", IS_ZHUYIN|IS_PINYIN, 363},
-{"wang", IS_ZHUYIN|IS_PINYIN, 364},
-{"wegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 367},
-{"wei", IS_ZHUYIN|IS_PINYIN, 365},
-{"wemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 367},
-{"wen", IS_ZHUYIN|IS_PINYIN, 366},
-{"weng", IS_ZHUYIN|IS_PINYIN, 367},
-{"wo", IS_ZHUYIN|IS_PINYIN, 368},
-{"wu", IS_ZHUYIN|IS_PINYIN, 369},
-{"x", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 370},
-{"xi", IS_ZHUYIN|IS_PINYIN, 371},
-{"xia", IS_ZHUYIN|IS_PINYIN, 372},
-{"xiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 374},
-{"xiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 374},
-{"xian", IS_ZHUYIN|IS_PINYIN, 373},
-{"xiang", IS_ZHUYIN|IS_PINYIN, 374},
-{"xiao", IS_ZHUYIN|IS_PINYIN, 375},
-{"xie", IS_ZHUYIN|IS_PINYIN, 376},
-{"xign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 378},
-{"ximg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 378},
-{"xin", IS_ZHUYIN|IS_PINYIN, 377},
-{"xing", IS_ZHUYIN|IS_PINYIN, 378},
-{"xiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 379},
-{"xiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 379},
-{"xion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 379},
-{"xiong", IS_ZHUYIN|IS_PINYIN, 379},
-{"xiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 380},
-{"xiu", IS_ZHUYIN|IS_PINYIN, 380},
-{"xu", IS_ZHUYIN|IS_PINYIN, 381},
-{"xuan", IS_ZHUYIN|IS_PINYIN, 382},
-{"xue", IS_ZHUYIN|IS_PINYIN, 383},
-{"xuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 384},
-{"xun", IS_ZHUYIN|IS_PINYIN, 384},
-{"xv", IS_PINYIN|PINYIN_CORRECT_V_U, 381},
-{"xvan", IS_PINYIN|PINYIN_CORRECT_V_U, 382},
-{"xve", IS_PINYIN|PINYIN_CORRECT_V_U, 383},
-{"xvn", IS_PINYIN|PINYIN_CORRECT_V_U, 384},
-{"y", IS_PINYIN|PINYIN_INCOMPLETE, 385},
-{"ya", IS_ZHUYIN|IS_PINYIN, 386},
-{"yagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 389},
-{"yamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 389},
-{"yan", IS_ZHUYIN|IS_PINYIN, 388},
-{"yang", IS_ZHUYIN|IS_PINYIN, 389},
-{"yao", IS_ZHUYIN|IS_PINYIN, 390},
-{"ye", IS_ZHUYIN|IS_PINYIN, 391},
-{"yi", IS_ZHUYIN|IS_PINYIN, 392},
-{"yign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 394},
-{"yimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 394},
-{"yin", IS_ZHUYIN|IS_PINYIN, 393},
-{"ying", IS_ZHUYIN|IS_PINYIN, 394},
-{"yo", IS_ZHUYIN|IS_PINYIN, 395},
-{"yogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 396},
-{"yomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 396},
-{"yon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 396},
-{"yong", IS_ZHUYIN|IS_PINYIN, 396},
-{"you", IS_ZHUYIN|IS_PINYIN, 397},
-{"yu", IS_ZHUYIN|IS_PINYIN, 398},
-{"yuan", IS_ZHUYIN|IS_PINYIN, 399},
-{"yue", IS_ZHUYIN|IS_PINYIN, 400},
-{"yuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 401},
-{"yun", IS_ZHUYIN|IS_PINYIN, 401},
-{"yv", IS_PINYIN|PINYIN_CORRECT_V_U, 398},
-{"yvan", IS_PINYIN|PINYIN_CORRECT_V_U, 399},
-{"yve", IS_PINYIN|PINYIN_CORRECT_V_U, 400},
-{"yvn", IS_PINYIN|PINYIN_CORRECT_V_U, 401},
-{"z", IS_PINYIN|PINYIN_INCOMPLETE, 402},
-{"za", IS_ZHUYIN|IS_PINYIN, 403},
-{"zagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 406},
-{"zai", IS_ZHUYIN|IS_PINYIN, 404},
-{"zamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 406},
-{"zan", IS_ZHUYIN|IS_PINYIN, 405},
-{"zang", IS_ZHUYIN|IS_PINYIN, 406},
-{"zao", IS_ZHUYIN|IS_PINYIN, 407},
-{"ze", IS_ZHUYIN|IS_PINYIN, 408},
-{"zegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 411},
-{"zei", IS_ZHUYIN|IS_PINYIN, 409},
-{"zemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 411},
-{"zen", IS_ZHUYIN|IS_PINYIN, 410},
-{"zeng", IS_ZHUYIN|IS_PINYIN, 411},
-{"zh", IS_PINYIN|PINYIN_INCOMPLETE, 412},
-{"zha", IS_ZHUYIN|IS_PINYIN, 413},
-{"zhagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 416},
-{"zhai", IS_ZHUYIN|IS_PINYIN, 414},
-{"zhamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 416},
-{"zhan", IS_ZHUYIN|IS_PINYIN, 415},
-{"zhang", IS_ZHUYIN|IS_PINYIN, 416},
-{"zhao", IS_ZHUYIN|IS_PINYIN, 417},
-{"zhe", IS_ZHUYIN|IS_PINYIN, 418},
-{"zhegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 421},
-{"zhemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 421},
-{"zhen", IS_ZHUYIN|IS_PINYIN, 420},
-{"zheng", IS_ZHUYIN|IS_PINYIN, 421},
-{"zhi", IS_ZHUYIN|IS_PINYIN, 422},
-{"zhogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 423},
-{"zhomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 423},
-{"zhon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 423},
-{"zhong", IS_ZHUYIN|IS_PINYIN, 423},
-{"zhou", IS_ZHUYIN|IS_PINYIN, 424},
-{"zhu", IS_ZHUYIN|IS_PINYIN, 425},
-{"zhua", IS_ZHUYIN|IS_PINYIN, 426},
-{"zhuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 429},
-{"zhuai", IS_ZHUYIN|IS_PINYIN, 427},
-{"zhuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 429},
-{"zhuan", IS_ZHUYIN|IS_PINYIN, 428},
-{"zhuang", IS_ZHUYIN|IS_PINYIN, 429},
-{"zhuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 430},
-{"zhuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 431},
-{"zhui", IS_ZHUYIN|IS_PINYIN, 430},
-{"zhun", IS_ZHUYIN|IS_PINYIN, 431},
-{"zhuo", IS_ZHUYIN|IS_PINYIN, 432},
-{"zi", IS_ZHUYIN|IS_PINYIN, 433},
-{"zogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 434},
-{"zomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 434},
-{"zon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 434},
-{"zong", IS_ZHUYIN|IS_PINYIN, 434},
-{"zou", IS_ZHUYIN|IS_PINYIN, 435},
-{"zu", IS_ZHUYIN|IS_PINYIN, 436},
-{"zuan", IS_ZHUYIN|IS_PINYIN, 437},
-{"zuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 438},
-{"zuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 439},
-{"zui", IS_ZHUYIN|IS_PINYIN, 438},
-{"zun", IS_ZHUYIN|IS_PINYIN, 439},
-{"zuo", IS_ZHUYIN|IS_PINYIN, 440}
+{"a", IS_ZHUYIN|IS_PINYIN, 1, 0},
+{"agn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 4, 1},
+{"ai", IS_ZHUYIN|IS_PINYIN, 2, 0},
+{"amg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 4, 1},
+{"an", IS_ZHUYIN|IS_PINYIN, 3, 0},
+{"ang", IS_ZHUYIN|IS_PINYIN, 4, 0},
+{"ao", IS_ZHUYIN|IS_PINYIN, 5, 0},
+{"b", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 6, 0},
+{"ba", IS_ZHUYIN|IS_PINYIN, 7, 0},
+{"bagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 10, 1},
+{"bai", IS_ZHUYIN|IS_PINYIN, 8, 0},
+{"bamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 10, 1},
+{"ban", IS_ZHUYIN|IS_PINYIN, 9, 0},
+{"bang", IS_ZHUYIN|IS_PINYIN, 10, 0},
+{"bao", IS_ZHUYIN|IS_PINYIN, 11, 0},
+{"begn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 14, 1},
+{"bei", IS_ZHUYIN|IS_PINYIN, 12, 0},
+{"bemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 14, 1},
+{"ben", IS_ZHUYIN|IS_PINYIN, 13, 0},
+{"beng", IS_ZHUYIN|IS_PINYIN, 14, 0},
+{"bi", IS_ZHUYIN|IS_PINYIN, 15, 0},
+{"bian", IS_ZHUYIN|IS_PINYIN, 16, 0},
+{"biao", IS_ZHUYIN|IS_PINYIN, 17, 0},
+{"bie", IS_ZHUYIN|IS_PINYIN, 18, 0},
+{"bign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 20, 1},
+{"bimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 20, 1},
+{"bin", IS_ZHUYIN|IS_PINYIN, 19, 0},
+{"bing", IS_ZHUYIN|IS_PINYIN, 20, 0},
+{"bo", IS_ZHUYIN|IS_PINYIN, 21, 0},
+{"bu", IS_ZHUYIN|IS_PINYIN, 22, 0},
+{"c", IS_PINYIN|PINYIN_INCOMPLETE, 23, 0},
+{"ca", IS_ZHUYIN|IS_PINYIN, 24, 0},
+{"cagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 27, 1},
+{"cai", IS_ZHUYIN|IS_PINYIN, 25, 0},
+{"camg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 27, 1},
+{"can", IS_ZHUYIN|IS_PINYIN, 26, 0},
+{"cang", IS_ZHUYIN|IS_PINYIN, 27, 0},
+{"cao", IS_ZHUYIN|IS_PINYIN, 28, 0},
+{"ce", IS_ZHUYIN|IS_PINYIN, 29, 0},
+{"cegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 31, 1},
+{"cemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 31, 1},
+{"cen", IS_ZHUYIN|IS_PINYIN, 30, 0},
+{"ceng", IS_ZHUYIN|IS_PINYIN, 31, 0},
+{"ch", IS_PINYIN|PINYIN_INCOMPLETE, 32, 0},
+{"cha", IS_ZHUYIN|IS_PINYIN, 33, 0},
+{"chagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 36, 1},
+{"chai", IS_ZHUYIN|IS_PINYIN, 34, 0},
+{"chamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 36, 1},
+{"chan", IS_ZHUYIN|IS_PINYIN, 35, 0},
+{"chang", IS_ZHUYIN|IS_PINYIN, 36, 0},
+{"chao", IS_ZHUYIN|IS_PINYIN, 37, 0},
+{"che", IS_ZHUYIN|IS_PINYIN, 38, 0},
+{"chegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 40, 1},
+{"chemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 40, 1},
+{"chen", IS_ZHUYIN|IS_PINYIN, 39, 0},
+{"cheng", IS_ZHUYIN|IS_PINYIN, 40, 0},
+{"chi", IS_ZHUYIN|IS_PINYIN, 41, 0},
+{"chogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 42, 1},
+{"chomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 42, 1},
+{"chon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 42, 1},
+{"chong", IS_ZHUYIN|IS_PINYIN, 42, 0},
+{"chou", IS_ZHUYIN|IS_PINYIN, 43, 0},
+{"chu", IS_ZHUYIN|IS_PINYIN, 44, 0},
+{"chuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 48, 1},
+{"chuai", IS_ZHUYIN|IS_PINYIN, 46, 0},
+{"chuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 48, 1},
+{"chuan", IS_ZHUYIN|IS_PINYIN, 47, 0},
+{"chuang", IS_ZHUYIN|IS_PINYIN, 48, 0},
+{"chuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 49, 1},
+{"chuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 50, 1},
+{"chui", IS_ZHUYIN|IS_PINYIN, 49, 0},
+{"chun", IS_ZHUYIN|IS_PINYIN, 50, 0},
+{"chuo", IS_ZHUYIN|IS_PINYIN, 51, 0},
+{"ci", IS_ZHUYIN|IS_PINYIN, 52, 0},
+{"cogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 53, 1},
+{"comg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 53, 1},
+{"con", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 53, 1},
+{"cong", IS_ZHUYIN|IS_PINYIN, 53, 0},
+{"cou", IS_ZHUYIN|IS_PINYIN, 54, 0},
+{"cu", IS_ZHUYIN|IS_PINYIN, 55, 0},
+{"cuan", IS_ZHUYIN|IS_PINYIN, 56, 0},
+{"cuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 57, 1},
+{"cuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 58, 1},
+{"cui", IS_ZHUYIN|IS_PINYIN, 57, 0},
+{"cun", IS_ZHUYIN|IS_PINYIN, 58, 0},
+{"cuo", IS_ZHUYIN|IS_PINYIN, 59, 0},
+{"d", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 60, 0},
+{"da", IS_ZHUYIN|IS_PINYIN, 61, 0},
+{"dagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 64, 1},
+{"dai", IS_ZHUYIN|IS_PINYIN, 62, 0},
+{"damg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 64, 1},
+{"dan", IS_ZHUYIN|IS_PINYIN, 63, 0},
+{"dang", IS_ZHUYIN|IS_PINYIN, 64, 0},
+{"dao", IS_ZHUYIN|IS_PINYIN, 65, 0},
+{"de", IS_ZHUYIN|IS_PINYIN, 66, 0},
+{"degn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 69, 1},
+{"dei", IS_ZHUYIN|IS_PINYIN, 67, 0},
+{"demg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 69, 1},
+{"deng", IS_ZHUYIN|IS_PINYIN, 69, 0},
+{"di", IS_ZHUYIN|IS_PINYIN, 70, 0},
+{"dia", IS_ZHUYIN|IS_PINYIN, 71, 0},
+{"dian", IS_ZHUYIN|IS_PINYIN, 72, 0},
+{"diao", IS_ZHUYIN|IS_PINYIN, 73, 0},
+{"die", IS_ZHUYIN|IS_PINYIN, 74, 0},
+{"dign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 76, 1},
+{"dimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 76, 1},
+{"ding", IS_ZHUYIN|IS_PINYIN, 76, 0},
+{"diou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 77, 1},
+{"diu", IS_ZHUYIN|IS_PINYIN, 77, 0},
+{"dogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 78, 1},
+{"domg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 78, 1},
+{"don", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 78, 1},
+{"dong", IS_ZHUYIN|IS_PINYIN, 78, 0},
+{"dou", IS_ZHUYIN|IS_PINYIN, 79, 0},
+{"du", IS_ZHUYIN|IS_PINYIN, 80, 0},
+{"duan", IS_ZHUYIN|IS_PINYIN, 81, 0},
+{"duei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 82, 1},
+{"duen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 83, 1},
+{"dui", IS_ZHUYIN|IS_PINYIN, 82, 0},
+{"dun", IS_ZHUYIN|IS_PINYIN, 83, 0},
+{"duo", IS_ZHUYIN|IS_PINYIN, 84, 0},
+{"e", IS_ZHUYIN|IS_PINYIN, 85, 0},
+{"egn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 88, 1},
+{"ei", IS_ZHUYIN|IS_PINYIN, 86, 0},
+{"emg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 88, 1},
+{"en", IS_ZHUYIN|IS_PINYIN, 87, 0},
+{"er", IS_ZHUYIN|IS_PINYIN, 89, 0},
+{"f", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 90, 0},
+{"fa", IS_ZHUYIN|IS_PINYIN, 91, 0},
+{"fagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 93, 1},
+{"famg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 93, 1},
+{"fan", IS_ZHUYIN|IS_PINYIN, 92, 0},
+{"fang", IS_ZHUYIN|IS_PINYIN, 93, 0},
+{"fegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 97, 1},
+{"fei", IS_ZHUYIN|IS_PINYIN, 95, 0},
+{"femg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 97, 1},
+{"fen", IS_ZHUYIN|IS_PINYIN, 96, 0},
+{"feng", IS_ZHUYIN|IS_PINYIN, 97, 0},
+{"fo", IS_ZHUYIN|IS_PINYIN, 98, 0},
+{"fou", IS_ZHUYIN|IS_PINYIN, 99, 0},
+{"fu", IS_ZHUYIN|IS_PINYIN, 100, 0},
+{"g", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 101, 0},
+{"ga", IS_ZHUYIN|IS_PINYIN, 102, 0},
+{"gagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 105, 1},
+{"gai", IS_ZHUYIN|IS_PINYIN, 103, 0},
+{"gamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 105, 1},
+{"gan", IS_ZHUYIN|IS_PINYIN, 104, 0},
+{"gang", IS_ZHUYIN|IS_PINYIN, 105, 0},
+{"gao", IS_ZHUYIN|IS_PINYIN, 106, 0},
+{"ge", IS_ZHUYIN|IS_PINYIN, 107, 0},
+{"gegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 110, 1},
+{"gei", IS_ZHUYIN|IS_PINYIN, 108, 0},
+{"gemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 110, 1},
+{"gen", IS_ZHUYIN|IS_PINYIN, 109, 0},
+{"geng", IS_ZHUYIN|IS_PINYIN, 110, 0},
+{"gogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 111, 1},
+{"gomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 111, 1},
+{"gon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 111, 1},
+{"gong", IS_ZHUYIN|IS_PINYIN, 111, 0},
+{"gou", IS_ZHUYIN|IS_PINYIN, 112, 0},
+{"gu", IS_ZHUYIN|IS_PINYIN, 113, 0},
+{"gua", IS_ZHUYIN|IS_PINYIN, 114, 0},
+{"guagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 117, 1},
+{"guai", IS_ZHUYIN|IS_PINYIN, 115, 0},
+{"guamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 117, 1},
+{"guan", IS_ZHUYIN|IS_PINYIN, 116, 0},
+{"guang", IS_ZHUYIN|IS_PINYIN, 117, 0},
+{"guei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 118, 1},
+{"guen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 119, 1},
+{"gui", IS_ZHUYIN|IS_PINYIN, 118, 0},
+{"gun", IS_ZHUYIN|IS_PINYIN, 119, 0},
+{"guo", IS_ZHUYIN|IS_PINYIN, 120, 0},
+{"h", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 121, 0},
+{"ha", IS_ZHUYIN|IS_PINYIN, 122, 0},
+{"hagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 125, 1},
+{"hai", IS_ZHUYIN|IS_PINYIN, 123, 0},
+{"hamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 125, 1},
+{"han", IS_ZHUYIN|IS_PINYIN, 124, 0},
+{"hang", IS_ZHUYIN|IS_PINYIN, 125, 0},
+{"hao", IS_ZHUYIN|IS_PINYIN, 126, 0},
+{"he", IS_ZHUYIN|IS_PINYIN, 127, 0},
+{"hegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 130, 1},
+{"hei", IS_ZHUYIN|IS_PINYIN, 128, 0},
+{"hemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 130, 1},
+{"hen", IS_ZHUYIN|IS_PINYIN, 129, 0},
+{"heng", IS_ZHUYIN|IS_PINYIN, 130, 0},
+{"hogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 131, 1},
+{"homg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 131, 1},
+{"hon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 131, 1},
+{"hong", IS_ZHUYIN|IS_PINYIN, 131, 0},
+{"hou", IS_ZHUYIN|IS_PINYIN, 132, 0},
+{"hu", IS_ZHUYIN|IS_PINYIN, 133, 0},
+{"hua", IS_ZHUYIN|IS_PINYIN, 134, 0},
+{"huagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 137, 1},
+{"huai", IS_ZHUYIN|IS_PINYIN, 135, 0},
+{"huamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 137, 1},
+{"huan", IS_ZHUYIN|IS_PINYIN, 136, 0},
+{"huang", IS_ZHUYIN|IS_PINYIN, 137, 0},
+{"huei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 138, 1},
+{"huen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 139, 1},
+{"hui", IS_ZHUYIN|IS_PINYIN, 138, 0},
+{"hun", IS_ZHUYIN|IS_PINYIN, 139, 0},
+{"huo", IS_ZHUYIN|IS_PINYIN, 140, 0},
+{"j", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 141, 0},
+{"ji", IS_ZHUYIN|IS_PINYIN, 142, 0},
+{"jia", IS_ZHUYIN|IS_PINYIN, 143, 0},
+{"jiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 145, 1},
+{"jiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 145, 1},
+{"jian", IS_ZHUYIN|IS_PINYIN, 144, 0},
+{"jiang", IS_ZHUYIN|IS_PINYIN, 145, 0},
+{"jiao", IS_ZHUYIN|IS_PINYIN, 146, 0},
+{"jie", IS_ZHUYIN|IS_PINYIN, 147, 0},
+{"jign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 149, 1},
+{"jimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 149, 1},
+{"jin", IS_ZHUYIN|IS_PINYIN, 148, 0},
+{"jing", IS_ZHUYIN|IS_PINYIN, 149, 0},
+{"jiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 150, 1},
+{"jiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 150, 1},
+{"jion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 150, 1},
+{"jiong", IS_ZHUYIN|IS_PINYIN, 150, 0},
+{"jiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 151, 1},
+{"jiu", IS_ZHUYIN|IS_PINYIN, 151, 0},
+{"ju", IS_ZHUYIN|IS_PINYIN, 152, 0},
+{"juan", IS_ZHUYIN|IS_PINYIN, 153, 0},
+{"jue", IS_ZHUYIN|IS_PINYIN, 154, 0},
+{"juen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 155, 1},
+{"jun", IS_ZHUYIN|IS_PINYIN, 155, 0},
+{"jv", IS_PINYIN|PINYIN_CORRECT_V_U, 152, 1},
+{"jvan", IS_PINYIN|PINYIN_CORRECT_V_U, 153, 1},
+{"jve", IS_PINYIN|PINYIN_CORRECT_V_U, 154, 1},
+{"jvn", IS_PINYIN|PINYIN_CORRECT_V_U, 155, 1},
+{"k", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 156, 0},
+{"ka", IS_ZHUYIN|IS_PINYIN, 157, 0},
+{"kagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 160, 1},
+{"kai", IS_ZHUYIN|IS_PINYIN, 158, 0},
+{"kamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 160, 1},
+{"kan", IS_ZHUYIN|IS_PINYIN, 159, 0},
+{"kang", IS_ZHUYIN|IS_PINYIN, 160, 0},
+{"kao", IS_ZHUYIN|IS_PINYIN, 161, 0},
+{"ke", IS_ZHUYIN|IS_PINYIN, 162, 0},
+{"kegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 165, 1},
+{"kemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 165, 1},
+{"ken", IS_ZHUYIN|IS_PINYIN, 164, 0},
+{"keng", IS_ZHUYIN|IS_PINYIN, 165, 0},
+{"kogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 166, 1},
+{"komg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 166, 1},
+{"kon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 166, 1},
+{"kong", IS_ZHUYIN|IS_PINYIN, 166, 0},
+{"kou", IS_ZHUYIN|IS_PINYIN, 167, 0},
+{"ku", IS_ZHUYIN|IS_PINYIN, 168, 0},
+{"kua", IS_ZHUYIN|IS_PINYIN, 169, 0},
+{"kuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 172, 1},
+{"kuai", IS_ZHUYIN|IS_PINYIN, 170, 0},
+{"kuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 172, 1},
+{"kuan", IS_ZHUYIN|IS_PINYIN, 171, 0},
+{"kuang", IS_ZHUYIN|IS_PINYIN, 172, 0},
+{"kuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 173, 1},
+{"kuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 174, 1},
+{"kui", IS_ZHUYIN|IS_PINYIN, 173, 0},
+{"kun", IS_ZHUYIN|IS_PINYIN, 174, 0},
+{"kuo", IS_ZHUYIN|IS_PINYIN, 175, 0},
+{"l", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 176, 0},
+{"la", IS_ZHUYIN|IS_PINYIN, 177, 0},
+{"lagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 180, 1},
+{"lai", IS_ZHUYIN|IS_PINYIN, 178, 0},
+{"lamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 180, 1},
+{"lan", IS_ZHUYIN|IS_PINYIN, 179, 0},
+{"lang", IS_ZHUYIN|IS_PINYIN, 180, 0},
+{"lao", IS_ZHUYIN|IS_PINYIN, 181, 0},
+{"le", IS_ZHUYIN|IS_PINYIN, 182, 0},
+{"legn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 185, 1},
+{"lei", IS_ZHUYIN|IS_PINYIN, 183, 0},
+{"lemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 185, 1},
+{"leng", IS_ZHUYIN|IS_PINYIN, 185, 0},
+{"li", IS_ZHUYIN|IS_PINYIN, 186, 0},
+{"lia", IS_ZHUYIN|IS_PINYIN, 187, 0},
+{"liagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 189, 1},
+{"liamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 189, 1},
+{"lian", IS_ZHUYIN|IS_PINYIN, 188, 0},
+{"liang", IS_ZHUYIN|IS_PINYIN, 189, 0},
+{"liao", IS_ZHUYIN|IS_PINYIN, 190, 0},
+{"lie", IS_ZHUYIN|IS_PINYIN, 191, 0},
+{"lign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 193, 1},
+{"limg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 193, 1},
+{"lin", IS_ZHUYIN|IS_PINYIN, 192, 0},
+{"ling", IS_ZHUYIN|IS_PINYIN, 193, 0},
+{"liou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 194, 1},
+{"liu", IS_ZHUYIN|IS_PINYIN, 194, 0},
+{"lo", IS_ZHUYIN|IS_PINYIN, 195, 0},
+{"logn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 196, 1},
+{"lomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 196, 1},
+{"lon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 196, 1},
+{"long", IS_ZHUYIN|IS_PINYIN, 196, 0},
+{"lou", IS_ZHUYIN|IS_PINYIN, 197, 0},
+{"lu", IS_ZHUYIN|IS_PINYIN, 198, 0},
+{"luan", IS_ZHUYIN|IS_PINYIN, 199, 0},
+{"lue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 203, 1},
+{"luen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 200, 1},
+{"lun", IS_ZHUYIN|IS_PINYIN, 200, 0},
+{"luo", IS_ZHUYIN|IS_PINYIN, 201, 0},
+{"lv", IS_ZHUYIN|IS_PINYIN, 202, 0},
+{"lve", IS_ZHUYIN|IS_PINYIN, 203, 0},
+{"m", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 204, 0},
+{"ma", IS_ZHUYIN|IS_PINYIN, 205, 0},
+{"magn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 208, 1},
+{"mai", IS_ZHUYIN|IS_PINYIN, 206, 0},
+{"mamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 208, 1},
+{"man", IS_ZHUYIN|IS_PINYIN, 207, 0},
+{"mang", IS_ZHUYIN|IS_PINYIN, 208, 0},
+{"mao", IS_ZHUYIN|IS_PINYIN, 209, 0},
+{"me", IS_ZHUYIN|IS_PINYIN, 210, 0},
+{"megn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 213, 1},
+{"mei", IS_ZHUYIN|IS_PINYIN, 211, 0},
+{"memg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 213, 1},
+{"men", IS_ZHUYIN|IS_PINYIN, 212, 0},
+{"meng", IS_ZHUYIN|IS_PINYIN, 213, 0},
+{"mi", IS_ZHUYIN|IS_PINYIN, 214, 0},
+{"mian", IS_ZHUYIN|IS_PINYIN, 215, 0},
+{"miao", IS_ZHUYIN|IS_PINYIN, 216, 0},
+{"mie", IS_ZHUYIN|IS_PINYIN, 217, 0},
+{"mign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 219, 1},
+{"mimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 219, 1},
+{"min", IS_ZHUYIN|IS_PINYIN, 218, 0},
+{"ming", IS_ZHUYIN|IS_PINYIN, 219, 0},
+{"miou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 220, 1},
+{"miu", IS_ZHUYIN|IS_PINYIN, 220, 0},
+{"mo", IS_ZHUYIN|IS_PINYIN, 221, 0},
+{"mou", IS_ZHUYIN|IS_PINYIN, 222, 0},
+{"mu", IS_ZHUYIN|IS_PINYIN, 223, 0},
+{"n", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 224, 0},
+{"na", IS_ZHUYIN|IS_PINYIN, 225, 0},
+{"nagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 228, 1},
+{"nai", IS_ZHUYIN|IS_PINYIN, 226, 0},
+{"namg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 228, 1},
+{"nan", IS_ZHUYIN|IS_PINYIN, 227, 0},
+{"nang", IS_ZHUYIN|IS_PINYIN, 228, 0},
+{"nao", IS_ZHUYIN|IS_PINYIN, 229, 0},
+{"ne", IS_ZHUYIN|IS_PINYIN, 230, 0},
+{"negn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 233, 1},
+{"nei", IS_ZHUYIN|IS_PINYIN, 231, 0},
+{"nemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 233, 1},
+{"nen", IS_ZHUYIN|IS_PINYIN, 232, 0},
+{"neng", IS_ZHUYIN|IS_PINYIN, 233, 0},
+{"ng", IS_ZHUYIN|IS_PINYIN, 234, 0},
+{"ni", IS_ZHUYIN|IS_PINYIN, 235, 0},
+{"niagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 238, 1},
+{"niamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 238, 1},
+{"nian", IS_ZHUYIN|IS_PINYIN, 237, 0},
+{"niang", IS_ZHUYIN|IS_PINYIN, 238, 0},
+{"niao", IS_ZHUYIN|IS_PINYIN, 239, 0},
+{"nie", IS_ZHUYIN|IS_PINYIN, 240, 0},
+{"nign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 242, 1},
+{"nimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 242, 1},
+{"nin", IS_ZHUYIN|IS_PINYIN, 241, 0},
+{"ning", IS_ZHUYIN|IS_PINYIN, 242, 0},
+{"niou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 243, 1},
+{"niu", IS_ZHUYIN|IS_PINYIN, 243, 0},
+{"nogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 244, 1},
+{"nomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 244, 1},
+{"non", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 244, 1},
+{"nong", IS_ZHUYIN|IS_PINYIN, 244, 0},
+{"nou", IS_ZHUYIN|IS_PINYIN, 245, 0},
+{"nu", IS_ZHUYIN|IS_PINYIN, 246, 0},
+{"nuan", IS_ZHUYIN|IS_PINYIN, 247, 0},
+{"nue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 251, 1},
+{"nuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 248, 1},
+{"nuo", IS_ZHUYIN|IS_PINYIN, 249, 0},
+{"nv", IS_ZHUYIN|IS_PINYIN, 250, 0},
+{"nve", IS_ZHUYIN|IS_PINYIN, 251, 0},
+{"o", IS_ZHUYIN|IS_PINYIN, 252, 0},
+{"ou", IS_ZHUYIN|IS_PINYIN, 253, 0},
+{"p", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 254, 0},
+{"pa", IS_ZHUYIN|IS_PINYIN, 255, 0},
+{"pagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 258, 1},
+{"pai", IS_ZHUYIN|IS_PINYIN, 256, 0},
+{"pamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 258, 1},
+{"pan", IS_ZHUYIN|IS_PINYIN, 257, 0},
+{"pang", IS_ZHUYIN|IS_PINYIN, 258, 0},
+{"pao", IS_ZHUYIN|IS_PINYIN, 259, 0},
+{"pegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 262, 1},
+{"pei", IS_ZHUYIN|IS_PINYIN, 260, 0},
+{"pemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 262, 1},
+{"pen", IS_ZHUYIN|IS_PINYIN, 261, 0},
+{"peng", IS_ZHUYIN|IS_PINYIN, 262, 0},
+{"pi", IS_ZHUYIN|IS_PINYIN, 263, 0},
+{"pian", IS_ZHUYIN|IS_PINYIN, 264, 0},
+{"piao", IS_ZHUYIN|IS_PINYIN, 265, 0},
+{"pie", IS_ZHUYIN|IS_PINYIN, 266, 0},
+{"pign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 268, 1},
+{"pimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 268, 1},
+{"pin", IS_ZHUYIN|IS_PINYIN, 267, 0},
+{"ping", IS_ZHUYIN|IS_PINYIN, 268, 0},
+{"po", IS_ZHUYIN|IS_PINYIN, 269, 0},
+{"pou", IS_ZHUYIN|IS_PINYIN, 270, 0},
+{"pu", IS_ZHUYIN|IS_PINYIN, 271, 0},
+{"q", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 272, 0},
+{"qi", IS_ZHUYIN|IS_PINYIN, 273, 0},
+{"qia", IS_ZHUYIN|IS_PINYIN, 274, 0},
+{"qiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 276, 1},
+{"qiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 276, 1},
+{"qian", IS_ZHUYIN|IS_PINYIN, 275, 0},
+{"qiang", IS_ZHUYIN|IS_PINYIN, 276, 0},
+{"qiao", IS_ZHUYIN|IS_PINYIN, 277, 0},
+{"qie", IS_ZHUYIN|IS_PINYIN, 278, 0},
+{"qign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 280, 1},
+{"qimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 280, 1},
+{"qin", IS_ZHUYIN|IS_PINYIN, 279, 0},
+{"qing", IS_ZHUYIN|IS_PINYIN, 280, 0},
+{"qiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 281, 1},
+{"qiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 281, 1},
+{"qion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 281, 1},
+{"qiong", IS_ZHUYIN|IS_PINYIN, 281, 0},
+{"qiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 282, 1},
+{"qiu", IS_ZHUYIN|IS_PINYIN, 282, 0},
+{"qu", IS_ZHUYIN|IS_PINYIN, 283, 0},
+{"quan", IS_ZHUYIN|IS_PINYIN, 284, 0},
+{"que", IS_ZHUYIN|IS_PINYIN, 285, 0},
+{"quen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 286, 1},
+{"qun", IS_ZHUYIN|IS_PINYIN, 286, 0},
+{"qv", IS_PINYIN|PINYIN_CORRECT_V_U, 283, 1},
+{"qvan", IS_PINYIN|PINYIN_CORRECT_V_U, 284, 1},
+{"qve", IS_PINYIN|PINYIN_CORRECT_V_U, 285, 1},
+{"qvn", IS_PINYIN|PINYIN_CORRECT_V_U, 286, 1},
+{"r", IS_PINYIN|PINYIN_INCOMPLETE, 287, 0},
+{"ragn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 289, 1},
+{"ramg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 289, 1},
+{"ran", IS_ZHUYIN|IS_PINYIN, 288, 0},
+{"rang", IS_ZHUYIN|IS_PINYIN, 289, 0},
+{"rao", IS_ZHUYIN|IS_PINYIN, 290, 0},
+{"re", IS_ZHUYIN|IS_PINYIN, 291, 0},
+{"regn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 293, 1},
+{"remg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 293, 1},
+{"ren", IS_ZHUYIN|IS_PINYIN, 292, 0},
+{"reng", IS_ZHUYIN|IS_PINYIN, 293, 0},
+{"ri", IS_ZHUYIN|IS_PINYIN, 294, 0},
+{"rogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 295, 1},
+{"romg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 295, 1},
+{"ron", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 295, 1},
+{"rong", IS_ZHUYIN|IS_PINYIN, 295, 0},
+{"rou", IS_ZHUYIN|IS_PINYIN, 296, 0},
+{"ru", IS_ZHUYIN|IS_PINYIN, 297, 0},
+{"ruan", IS_ZHUYIN|IS_PINYIN, 299, 0},
+{"ruei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 300, 1},
+{"ruen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 301, 1},
+{"rui", IS_ZHUYIN|IS_PINYIN, 300, 0},
+{"run", IS_ZHUYIN|IS_PINYIN, 301, 0},
+{"ruo", IS_ZHUYIN|IS_PINYIN, 302, 0},
+{"s", IS_PINYIN|PINYIN_INCOMPLETE, 303, 0},
+{"sa", IS_ZHUYIN|IS_PINYIN, 304, 0},
+{"sagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 307, 1},
+{"sai", IS_ZHUYIN|IS_PINYIN, 305, 0},
+{"samg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 307, 1},
+{"san", IS_ZHUYIN|IS_PINYIN, 306, 0},
+{"sang", IS_ZHUYIN|IS_PINYIN, 307, 0},
+{"sao", IS_ZHUYIN|IS_PINYIN, 308, 0},
+{"se", IS_ZHUYIN|IS_PINYIN, 309, 0},
+{"segn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 311, 1},
+{"semg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 311, 1},
+{"sen", IS_ZHUYIN|IS_PINYIN, 310, 0},
+{"seng", IS_ZHUYIN|IS_PINYIN, 311, 0},
+{"sh", IS_PINYIN|PINYIN_INCOMPLETE, 312, 0},
+{"sha", IS_ZHUYIN|IS_PINYIN, 313, 0},
+{"shagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 316, 1},
+{"shai", IS_ZHUYIN|IS_PINYIN, 314, 0},
+{"shamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 316, 1},
+{"shan", IS_ZHUYIN|IS_PINYIN, 315, 0},
+{"shang", IS_ZHUYIN|IS_PINYIN, 316, 0},
+{"shao", IS_ZHUYIN|IS_PINYIN, 317, 0},
+{"she", IS_ZHUYIN|IS_PINYIN, 318, 0},
+{"shegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 321, 1},
+{"shei", IS_ZHUYIN|IS_PINYIN, 319, 0},
+{"shemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 321, 1},
+{"shen", IS_ZHUYIN|IS_PINYIN, 320, 0},
+{"sheng", IS_ZHUYIN|IS_PINYIN, 321, 0},
+{"shi", IS_ZHUYIN|IS_PINYIN, 322, 0},
+{"shou", IS_ZHUYIN|IS_PINYIN, 323, 0},
+{"shu", IS_ZHUYIN|IS_PINYIN, 324, 0},
+{"shua", IS_ZHUYIN|IS_PINYIN, 325, 0},
+{"shuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 328, 1},
+{"shuai", IS_ZHUYIN|IS_PINYIN, 326, 0},
+{"shuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 328, 1},
+{"shuan", IS_ZHUYIN|IS_PINYIN, 327, 0},
+{"shuang", IS_ZHUYIN|IS_PINYIN, 328, 0},
+{"shuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 329, 1},
+{"shuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 330, 1},
+{"shui", IS_ZHUYIN|IS_PINYIN, 329, 0},
+{"shun", IS_ZHUYIN|IS_PINYIN, 330, 0},
+{"shuo", IS_ZHUYIN|IS_PINYIN, 331, 0},
+{"si", IS_ZHUYIN|IS_PINYIN, 332, 0},
+{"sogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 333, 1},
+{"somg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 333, 1},
+{"son", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 333, 1},
+{"song", IS_ZHUYIN|IS_PINYIN, 333, 0},
+{"sou", IS_ZHUYIN|IS_PINYIN, 334, 0},
+{"su", IS_ZHUYIN|IS_PINYIN, 335, 0},
+{"suan", IS_ZHUYIN|IS_PINYIN, 336, 0},
+{"suei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 337, 1},
+{"suen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 338, 1},
+{"sui", IS_ZHUYIN|IS_PINYIN, 337, 0},
+{"sun", IS_ZHUYIN|IS_PINYIN, 338, 0},
+{"suo", IS_ZHUYIN|IS_PINYIN, 339, 0},
+{"t", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 340, 0},
+{"ta", IS_ZHUYIN|IS_PINYIN, 341, 0},
+{"tagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 344, 1},
+{"tai", IS_ZHUYIN|IS_PINYIN, 342, 0},
+{"tamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 344, 1},
+{"tan", IS_ZHUYIN|IS_PINYIN, 343, 0},
+{"tang", IS_ZHUYIN|IS_PINYIN, 344, 0},
+{"tao", IS_ZHUYIN|IS_PINYIN, 345, 0},
+{"te", IS_ZHUYIN|IS_PINYIN, 346, 0},
+{"tegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 347, 1},
+{"temg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 347, 1},
+{"teng", IS_ZHUYIN|IS_PINYIN, 347, 0},
+{"ti", IS_ZHUYIN|IS_PINYIN, 348, 0},
+{"tian", IS_ZHUYIN|IS_PINYIN, 349, 0},
+{"tiao", IS_ZHUYIN|IS_PINYIN, 350, 0},
+{"tie", IS_ZHUYIN|IS_PINYIN, 351, 0},
+{"tign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 352, 1},
+{"timg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 352, 1},
+{"ting", IS_ZHUYIN|IS_PINYIN, 352, 0},
+{"togn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 353, 1},
+{"tomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 353, 1},
+{"ton", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 353, 1},
+{"tong", IS_ZHUYIN|IS_PINYIN, 353, 0},
+{"tou", IS_ZHUYIN|IS_PINYIN, 354, 0},
+{"tu", IS_ZHUYIN|IS_PINYIN, 355, 0},
+{"tuan", IS_ZHUYIN|IS_PINYIN, 356, 0},
+{"tuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 357, 1},
+{"tuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 358, 1},
+{"tui", IS_ZHUYIN|IS_PINYIN, 357, 0},
+{"tun", IS_ZHUYIN|IS_PINYIN, 358, 0},
+{"tuo", IS_ZHUYIN|IS_PINYIN, 359, 0},
+{"w", IS_PINYIN|PINYIN_INCOMPLETE, 360, 0},
+{"wa", IS_ZHUYIN|IS_PINYIN, 361, 0},
+{"wagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 364, 1},
+{"wai", IS_ZHUYIN|IS_PINYIN, 362, 0},
+{"wamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 364, 1},
+{"wan", IS_ZHUYIN|IS_PINYIN, 363, 0},
+{"wang", IS_ZHUYIN|IS_PINYIN, 364, 0},
+{"wegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 367, 1},
+{"wei", IS_ZHUYIN|IS_PINYIN, 365, 0},
+{"wemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 367, 1},
+{"wen", IS_ZHUYIN|IS_PINYIN, 366, 0},
+{"weng", IS_ZHUYIN|IS_PINYIN, 367, 0},
+{"wo", IS_ZHUYIN|IS_PINYIN, 368, 0},
+{"wu", IS_ZHUYIN|IS_PINYIN, 369, 0},
+{"x", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 370, 0},
+{"xi", IS_ZHUYIN|IS_PINYIN, 371, 0},
+{"xia", IS_ZHUYIN|IS_PINYIN, 372, 0},
+{"xiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 374, 1},
+{"xiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 374, 1},
+{"xian", IS_ZHUYIN|IS_PINYIN, 373, 0},
+{"xiang", IS_ZHUYIN|IS_PINYIN, 374, 0},
+{"xiao", IS_ZHUYIN|IS_PINYIN, 375, 0},
+{"xie", IS_ZHUYIN|IS_PINYIN, 376, 0},
+{"xign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 378, 1},
+{"ximg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 378, 1},
+{"xin", IS_ZHUYIN|IS_PINYIN, 377, 0},
+{"xing", IS_ZHUYIN|IS_PINYIN, 378, 0},
+{"xiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 379, 1},
+{"xiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 379, 1},
+{"xion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 379, 1},
+{"xiong", IS_ZHUYIN|IS_PINYIN, 379, 0},
+{"xiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 380, 1},
+{"xiu", IS_ZHUYIN|IS_PINYIN, 380, 0},
+{"xu", IS_ZHUYIN|IS_PINYIN, 381, 0},
+{"xuan", IS_ZHUYIN|IS_PINYIN, 382, 0},
+{"xue", IS_ZHUYIN|IS_PINYIN, 383, 0},
+{"xuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 384, 1},
+{"xun", IS_ZHUYIN|IS_PINYIN, 384, 0},
+{"xv", IS_PINYIN|PINYIN_CORRECT_V_U, 381, 1},
+{"xvan", IS_PINYIN|PINYIN_CORRECT_V_U, 382, 1},
+{"xve", IS_PINYIN|PINYIN_CORRECT_V_U, 383, 1},
+{"xvn", IS_PINYIN|PINYIN_CORRECT_V_U, 384, 1},
+{"y", IS_PINYIN|PINYIN_INCOMPLETE, 385, 0},
+{"ya", IS_ZHUYIN|IS_PINYIN, 386, 0},
+{"yagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 389, 1},
+{"yamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 389, 1},
+{"yan", IS_ZHUYIN|IS_PINYIN, 388, 0},
+{"yang", IS_ZHUYIN|IS_PINYIN, 389, 0},
+{"yao", IS_ZHUYIN|IS_PINYIN, 390, 0},
+{"ye", IS_ZHUYIN|IS_PINYIN, 391, 0},
+{"yi", IS_ZHUYIN|IS_PINYIN, 392, 0},
+{"yign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 394, 1},
+{"yimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 394, 1},
+{"yin", IS_ZHUYIN|IS_PINYIN, 393, 0},
+{"ying", IS_ZHUYIN|IS_PINYIN, 394, 0},
+{"yo", IS_ZHUYIN|IS_PINYIN, 395, 0},
+{"yogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 396, 1},
+{"yomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 396, 1},
+{"yon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 396, 1},
+{"yong", IS_ZHUYIN|IS_PINYIN, 396, 0},
+{"you", IS_ZHUYIN|IS_PINYIN, 397, 0},
+{"yu", IS_ZHUYIN|IS_PINYIN, 398, 0},
+{"yuan", IS_ZHUYIN|IS_PINYIN, 399, 0},
+{"yue", IS_ZHUYIN|IS_PINYIN, 400, 0},
+{"yuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 401, 1},
+{"yun", IS_ZHUYIN|IS_PINYIN, 401, 0},
+{"yv", IS_PINYIN|PINYIN_CORRECT_V_U, 398, 1},
+{"yvan", IS_PINYIN|PINYIN_CORRECT_V_U, 399, 1},
+{"yve", IS_PINYIN|PINYIN_CORRECT_V_U, 400, 1},
+{"yvn", IS_PINYIN|PINYIN_CORRECT_V_U, 401, 1},
+{"z", IS_PINYIN|PINYIN_INCOMPLETE, 402, 0},
+{"za", IS_ZHUYIN|IS_PINYIN, 403, 0},
+{"zagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 406, 1},
+{"zai", IS_ZHUYIN|IS_PINYIN, 404, 0},
+{"zamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 406, 1},
+{"zan", IS_ZHUYIN|IS_PINYIN, 405, 0},
+{"zang", IS_ZHUYIN|IS_PINYIN, 406, 0},
+{"zao", IS_ZHUYIN|IS_PINYIN, 407, 0},
+{"ze", IS_ZHUYIN|IS_PINYIN, 408, 0},
+{"zegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 411, 1},
+{"zei", IS_ZHUYIN|IS_PINYIN, 409, 0},
+{"zemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 411, 1},
+{"zen", IS_ZHUYIN|IS_PINYIN, 410, 0},
+{"zeng", IS_ZHUYIN|IS_PINYIN, 411, 0},
+{"zh", IS_PINYIN|PINYIN_INCOMPLETE, 412, 0},
+{"zha", IS_ZHUYIN|IS_PINYIN, 413, 0},
+{"zhagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 416, 1},
+{"zhai", IS_ZHUYIN|IS_PINYIN, 414, 0},
+{"zhamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 416, 1},
+{"zhan", IS_ZHUYIN|IS_PINYIN, 415, 0},
+{"zhang", IS_ZHUYIN|IS_PINYIN, 416, 0},
+{"zhao", IS_ZHUYIN|IS_PINYIN, 417, 0},
+{"zhe", IS_ZHUYIN|IS_PINYIN, 418, 0},
+{"zhegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 421, 1},
+{"zhemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 421, 1},
+{"zhen", IS_ZHUYIN|IS_PINYIN, 420, 0},
+{"zheng", IS_ZHUYIN|IS_PINYIN, 421, 0},
+{"zhi", IS_ZHUYIN|IS_PINYIN, 422, 0},
+{"zhogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 423, 1},
+{"zhomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 423, 1},
+{"zhon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 423, 1},
+{"zhong", IS_ZHUYIN|IS_PINYIN, 423, 0},
+{"zhou", IS_ZHUYIN|IS_PINYIN, 424, 0},
+{"zhu", IS_ZHUYIN|IS_PINYIN, 425, 0},
+{"zhua", IS_ZHUYIN|IS_PINYIN, 426, 0},
+{"zhuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 429, 1},
+{"zhuai", IS_ZHUYIN|IS_PINYIN, 427, 0},
+{"zhuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 429, 1},
+{"zhuan", IS_ZHUYIN|IS_PINYIN, 428, 0},
+{"zhuang", IS_ZHUYIN|IS_PINYIN, 429, 0},
+{"zhuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 430, 1},
+{"zhuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 431, 1},
+{"zhui", IS_ZHUYIN|IS_PINYIN, 430, 0},
+{"zhun", IS_ZHUYIN|IS_PINYIN, 431, 0},
+{"zhuo", IS_ZHUYIN|IS_PINYIN, 432, 0},
+{"zi", IS_ZHUYIN|IS_PINYIN, 433, 0},
+{"zogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 434, 1},
+{"zomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 434, 1},
+{"zon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 434, 1},
+{"zong", IS_ZHUYIN|IS_PINYIN, 434, 0},
+{"zou", IS_ZHUYIN|IS_PINYIN, 435, 0},
+{"zu", IS_ZHUYIN|IS_PINYIN, 436, 0},
+{"zuan", IS_ZHUYIN|IS_PINYIN, 437, 0},
+{"zuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 438, 1},
+{"zuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 439, 1},
+{"zui", IS_ZHUYIN|IS_PINYIN, 438, 0},
+{"zun", IS_ZHUYIN|IS_PINYIN, 439, 0},
+{"zuo", IS_ZHUYIN|IS_PINYIN, 440, 0}
};
const pinyin_index_item_t luoma_pinyin_index[] = {
diff --git a/src/storage/pinyin_phrase3.h b/src/storage/pinyin_phrase3.h
index d2e75ad..d23c657 100644
--- a/src/storage/pinyin_phrase3.h
+++ b/src/storage/pinyin_phrase3.h
@@ -223,8 +223,8 @@ inline int phrase_compare_with_tones(const PinyinIndexItem2<phrase_length> &lhs,
}
template<size_t phrase_length>
-inline int phrase_less_than_with_tones(const PinyinIndexItem2<phrase_length> &lhs,
- const PinyinIndexItem2<phrase_length> &rhs)
+inline bool phrase_less_than_with_tones(const PinyinIndexItem2<phrase_length> &lhs,
+ const PinyinIndexItem2<phrase_length> &rhs) /* "less than" predicate: true only when phrase_compare_with_tones(lhs, rhs) is negative */
{
return 0 > phrase_compare_with_tones<phrase_length>(lhs, rhs);
}
diff --git a/src/storage/special_table.h b/src/storage/special_table.h
index 7916a65..8263236 100644
--- a/src/storage/special_table.h
+++ b/src/storage/special_table.h
@@ -41,6 +41,7 @@ const resplit_table_item_t resplit_table[] = {
{{"chu", "nan"}, {ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ZERO_FINAL), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"chun", "an"}, {ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100},
{{"dan", "gan"}, {ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 0, {"dang", "an"}, {ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100},
{{"e", "nai"}, {ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"en", "ai"}, {ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100},
+{{"e", "nen"}, {ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, 100, {"en", "en"}, {ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, 300},
{{"fa", "nan"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"fan", "an"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100},
{{"fan", "gai"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"fang", "ai"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100},
{{"fan", "gan"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"fang", "an"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100},
@@ -55,6 +56,7 @@ const resplit_table_item_t resplit_table[] = {
{{"ji", "nou"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ZERO_FINAL), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 0, {"jin", "ou"}, {ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_IN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 100},
{{"jia", "nai"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"jian", "ai"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100},
{{"jia", "nan"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"jian", "an"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100},
+{{"jia", "nao"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, 100, {"jian", "ao"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, 300},
{{"jia", "ne"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_E)}, 0, {"jian", "e"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E)}, 100},
{{"jia", "nou"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 0, {"jian", "ou"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 100},
{{"jian", "gan"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"jiang", "an"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100},
@@ -84,6 +86,7 @@ const resplit_table_item_t resplit_table[] = {
{{"qia", "ne"}, {ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_E)}, 0, {"qian", "e"}, {ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E)}, 100},
{{"qin", "gai"}, {ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_IN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"qing", "ai"}, {ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ING), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100},
{{"qin", "gan"}, {ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_IN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 0, {"qing", "an"}, {ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ING), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100},
+{{"qu", "na"}, {ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ZERO_FINAL), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_A)}, 300, {"qun", "a"}, {ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_A)}, 100},
{{"re", "nai"}, {ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_E), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"ren", "ai"}, {ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100},
{{"re", "nan"}, {ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_E), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 0, {"ren", "an"}, {ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100},
{{"san", "gou"}, {ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 0, {"sang", "ou"}, {ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 100},
diff --git a/src/storage/table_info.cpp b/src/storage/table_info.cpp
index 9fe3865..41c7da0 100644
--- a/src/storage/table_info.cpp
+++ b/src/storage/table_info.cpp
@@ -113,7 +113,17 @@ static TABLE_PHONETIC_TYPE to_table_phonetic_type(const char * str) {
if (0 == strcmp("zhuyin", str))
return ZHUYIN_TABLE;
- assert(FALSE);
+ abort();
+}
+
+static TABLE_DATABASE_FORMAT_TYPE to_table_database_format_type(const char * str) { /* Parse a database format name: "BerkeleyDB" or "KyotoCabinet", compared exactly with strcmp. */
+ if (0 == strcmp("BerkeleyDB", str))
+ return BERKELEY_DB_FORMAT;
+
+ if (0 == strcmp("KyotoCabinet", str))
+ return KYOTO_CABINET_FORMAT;
+
+ abort(); /* any other string is treated as fatal: the process is aborted instead of returning a value */
}
static TABLE_TARGET to_table_target(const char * str) {
@@ -123,7 +133,7 @@ static TABLE_TARGET to_table_target(const char * str) {
if (0 == strcmp("addon", str))
return ADDON_TABLE;
- assert(FALSE);
+ abort();
}
static guint8 to_index_of_default_tables(const char * str) {
@@ -137,7 +147,7 @@ static guint8 to_index_of_default_tables(const char * str) {
HANDLE(NETWORK_DICTIONARY);
HANDLE(USER_DICTIONARY);
- assert(FALSE);
+ abort();
}
static guint8 to_index_of_addon_tables(const char * str) {
@@ -156,11 +166,21 @@ static PHRASE_FILE_TYPE to_file_type(const char * str) {
HANDLE(DICTIONARY);
HANDLE(USER_FILE);
- assert(FALSE);
+ abort();
}
#undef HANDLE
+static const char * from_table_database_format_type(const TABLE_DATABASE_FORMAT_TYPE format) { /* Return the string literal naming a database format: "BerkeleyDB" or "KyotoCabinet". */
+ if (format == BERKELEY_DB_FORMAT)
+ return "BerkeleyDB";
+
+ if (format == KYOTO_CABINET_FORMAT)
+ return "KyotoCabinet";
+
+ abort(); /* NOTE(review): UNKNOWN_FORMAT also lands here and terminates the process; confirm callers never pass it */
+}
+
bool SystemTableInfo2::load(const char * filename) {
reset();
@@ -196,9 +216,13 @@ bool SystemTableInfo2::load(const char * filename) {
TABLE_PHONETIC_TYPE type = PINYIN_TABLE;
char str[256];
- num = fscanf(input, "source table format:%255s", str);
+ num = fscanf(input, "source table format:%255s\n", str);
type = to_table_phonetic_type(str);
+ TABLE_DATABASE_FORMAT_TYPE format = UNKNOWN_FORMAT;
+ num = fscanf(input, "database format:%255s\n", str);
+ format = to_table_database_format_type (str);
+
#if 0
printf("binver:%d modelver:%d lambda:%f\n", binver, modelver, lambda);
printf("type:%d\n", type);
@@ -211,6 +235,8 @@ bool SystemTableInfo2::load(const char * filename) {
/* Note: support pinyin and zhuyin table now. */
assert(PINYIN_TABLE == type || ZHUYIN_TABLE == type);
m_table_phonetic_type = type;
+ assert(BERKELEY_DB_FORMAT == format || KYOTO_CABINET_FORMAT == format);
+ m_table_database_format_type = format;
int index = 0;
char tableinfo[256], dictstr[256];
@@ -309,12 +335,19 @@ bool UserTableInfo::load(const char * filename) {
return false;
}
+ TABLE_DATABASE_FORMAT_TYPE format = UNKNOWN_FORMAT;
+ char str[256];
+ num = fscanf(input, "database format:%255s\n", str);
+ if (EOF != num)
+ format = to_table_database_format_type (str);
+
#if 0
printf("binver:%d modelver:%d\n", binver, modelver);
#endif
m_binary_format_version = binver;
m_model_data_version = modelver;
+ m_table_database_format_type = format;
fclose(input);
@@ -334,6 +367,8 @@ bool UserTableInfo::save(const char * filename) {
fprintf(output, "binary format version:%d\n", m_binary_format_version);
fprintf(output, "model data version:%d\n", m_model_data_version);
+ fprintf(output, "database format:%s\n",
+ from_table_database_format_type (m_table_database_format_type));
fclose(output);
@@ -349,11 +384,15 @@ bool UserTableInfo::is_conform(const SystemTableInfo2 * sysinfo) {
if (sysinfo->m_model_data_version != m_model_data_version)
return false;
+ if (sysinfo->m_table_database_format_type != m_table_database_format_type)
+ return false;
+
return true;
}
bool UserTableInfo::make_conform(const SystemTableInfo2 * sysinfo) { /* Overwrite this object's version and database format fields with the values held by sysinfo. */
m_binary_format_version = sysinfo->m_binary_format_version;
m_model_data_version = sysinfo->m_model_data_version;
+ m_table_database_format_type = sysinfo->m_table_database_format_type;
return true; /* unconditionally reports success */
}
diff --git a/src/storage/table_info.h b/src/storage/table_info.h
index bc3837f..4b8f252 100644
--- a/src/storage/table_info.h
+++ b/src/storage/table_info.h
@@ -32,6 +32,12 @@ typedef enum {
} TABLE_PHONETIC_TYPE;
typedef enum {
+ UNKNOWN_FORMAT, /* initial value used before a "database format:" line has been parsed */
+ BERKELEY_DB_FORMAT, /* written as "BerkeleyDB" in table info files */
+ KYOTO_CABINET_FORMAT, /* written as "KyotoCabinet" in table info files */
+} TABLE_DATABASE_FORMAT_TYPE;
+
+typedef enum {
DEFAULT_TABLE,
ADDON_TABLE,
} TABLE_TARGET;
@@ -62,6 +68,7 @@ private:
gfloat m_lambda;
TABLE_PHONETIC_TYPE m_table_phonetic_type;
+ TABLE_DATABASE_FORMAT_TYPE m_table_database_format_type;
pinyin_table_info_t m_default_tables[PHRASE_INDEX_LIBRARY_COUNT];
@@ -90,6 +97,7 @@ class UserTableInfo{
private:
int m_binary_format_version;
int m_model_data_version;
+ TABLE_DATABASE_FORMAT_TYPE m_table_database_format_type;
private:
void reset();
diff --git a/src/storage/zhuyin_parser2.cpp b/src/storage/zhuyin_parser2.cpp
index 99ae4d2..3f14eab 100644
--- a/src/storage/zhuyin_parser2.cpp
+++ b/src/storage/zhuyin_parser2.cpp
@@ -160,6 +160,7 @@ static int search_chewing_symbols2(const zhuyin_symbol_item_t * symbol_table,
bool ZhuyinSimpleParser2::parse_one_key(pinyin_option_t options,
ChewingKey & key,
+ gint16 & distance,
const char * str, int len) const {
options &= ~PINYIN_AMB_ALL;
unsigned char tone = CHEWING_ZERO_TONE;
@@ -240,9 +241,10 @@ int ZhuyinSimpleParser2::parse(pinyin_option_t options,
i = std_lite::min(maximum_len - parsed_len,
(int)max_chewing_length);
+ gint16 distance = 0;
ChewingKey key; ChewingKeyRest key_rest;
for (; i > 0; --i) {
- bool success = parse_one_key(options, key, cur_str, i);
+ bool success = parse_one_key(options, key, distance, cur_str, i);
if (success)
break;
}
@@ -289,7 +291,7 @@ bool ZhuyinSimpleParser2::set_scheme(ZhuyinScheme scheme) {
m_symbol_table = chewing_standard_dvorak_symbols;
m_tone_table = chewing_standard_dvorak_tones;
default:
- assert(FALSE);
+ abort();
}
return false;
@@ -331,6 +333,7 @@ bool ZhuyinSimpleParser2::in_chewing_scheme(pinyin_option_t options,
bool ZhuyinDiscreteParser2::parse_one_key(pinyin_option_t options,
ChewingKey & key,
+ gint16 & distance,
const char * str, int len) const {
if (0 == len)
return false;
@@ -430,9 +433,10 @@ int ZhuyinDiscreteParser2::parse(pinyin_option_t options,
i = std_lite::min(maximum_len - parsed_len,
(int)max_chewing_length);
+ gint16 distance = 0;
ChewingKey key; ChewingKeyRest key_rest;
for (; i > 0; --i) {
- bool success = parse_one_key(options, key, cur_str, i);
+ bool success = parse_one_key(options, key, distance, cur_str, i);
if (success)
break;
}
@@ -480,7 +484,7 @@ bool ZhuyinDiscreteParser2::set_scheme(ZhuyinScheme scheme) {
INIT_PARSER(hsu_zhuyin_index, hsu_dvorak);
break;
default:
- assert(FALSE);
+ abort();
}
#undef INIT_PARSER
@@ -567,6 +571,7 @@ static int count_same_chars(const char * str, int len) {
bool ZhuyinDaChenCP26Parser2::parse_one_key(pinyin_option_t options,
ChewingKey & key,
+ gint16 & distance,
const char *str, int len) const {
if (0 == len)
return false;
@@ -731,15 +736,16 @@ int ZhuyinDaChenCP26Parser2::parse(pinyin_option_t options,
/* maximum forward match for chewing. */
int parsed_len = 0;
const char * cur_str = NULL;
- ChewingKey key; ChewingKeyRest key_rest;
while (parsed_len < maximum_len) {
cur_str = str + parsed_len;
i = std_lite::min(maximum_len - parsed_len,
(int)max_chewing_dachen26_length);
+ gint16 distance = 0;
+ ChewingKey key; ChewingKeyRest key_rest;
for (; i > 0; --i) {
- bool success = parse_one_key(options, key, cur_str, i);
+ bool success = parse_one_key(options, key, distance, cur_str, i);
if (success)
break;
}
@@ -851,6 +857,7 @@ ZhuyinDirectParser2::ZhuyinDirectParser2 (){
bool ZhuyinDirectParser2::parse_one_key(pinyin_option_t options,
ChewingKey & key,
+ gint16 & distance,
const char *str, int len) const {
options &= ~PINYIN_AMB_ALL;
/* by default, chewing will use the first tone. */
@@ -910,8 +917,6 @@ int ZhuyinDirectParser2::parse(pinyin_option_t options,
g_array_set_size(keys, 0);
g_array_set_size(key_rests, 0);
- ChewingKey key; ChewingKeyRest key_rest;
-
int parsed_len = 0;
int i = 0, cur = 0, next = 0;
while (cur < len) {
@@ -922,7 +927,9 @@ int ZhuyinDirectParser2::parse(pinyin_option_t options,
}
next = i;
- if (parse_one_key(options, key, str + cur, next - cur)) {
+ gint16 distance = 0;
+ ChewingKey key; ChewingKeyRest key_rest;
+ if (parse_one_key(options, key, distance, str + cur, next - cur)) {
#if 0
/* as direct parser handles data source,
assume the data is correct when loading. */
diff --git a/src/storage/zhuyin_parser2.h b/src/storage/zhuyin_parser2.h
index 45af804..8a9c550 100644
--- a/src/storage/zhuyin_parser2.h
+++ b/src/storage/zhuyin_parser2.h
@@ -98,7 +98,7 @@ public:
virtual ~ZhuyinSimpleParser2() {}
- virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const;
virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
@@ -142,7 +142,7 @@ public:
virtual ~ZhuyinDiscreteParser2() {}
- virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const;
virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
@@ -167,7 +167,7 @@ public:
virtual ~ZhuyinDaChenCP26Parser2() {}
- virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const;
virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
@@ -187,7 +187,7 @@ public:
virtual ~ZhuyinDirectParser2() {}
- virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const;
virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
};
diff --git a/src/zhuyin.cpp b/src/zhuyin.cpp
index cb3ce70..1aeecf5 100644
--- a/src/zhuyin.cpp
+++ b/src/zhuyin.cpp
@@ -733,7 +733,7 @@ bool zhuyin_set_chewing_scheme(zhuyin_context_t * context,
context->m_chewing_parser = new ZhuyinDaChenCP26Parser2();
break;
default:
- assert(FALSE);
+ abort();
}
return true;
}
@@ -989,7 +989,7 @@ bool zhuyin_get_sentence(zhuyin_instance_t * instance,
return false;
MatchResult result = NULL;
- assert(results.get_result(0, result));
+ check_result(results.get_result(0, result));
bool retval = pinyin::convert_to_utf8
(context->m_phrase_index, result,
@@ -1007,9 +1007,10 @@ bool zhuyin_parse_full_pinyin(zhuyin_instance_t * instance,
/* disable the pinyin correction options. */
options &= ~PINYIN_CORRECT_ALL;
+ gint16 distance = 0;
int pinyin_len = strlen(onepinyin);
bool retval = context->m_full_pinyin_parser->parse_one_key
- (options, *onekey, onepinyin, pinyin_len);
+ (options, *onekey, distance, onepinyin, pinyin_len);
return retval;
}
@@ -1047,9 +1048,10 @@ bool zhuyin_parse_chewing(zhuyin_instance_t * instance,
zhuyin_context_t * & context = instance->m_context;
zhuyin_option_t options = context->m_options;
+ gint16 distance = 0;
int chewing_len = strlen(onechewing);
bool retval = context->m_chewing_parser->parse_one_key
- (options, *onekey, onechewing, chewing_len );
+ (options, *onekey, distance, onechewing, chewing_len);
return retval;
}
@@ -1181,7 +1183,7 @@ static phrase_token_t _get_previous_token(zhuyin_instance_t * instance,
/* use the first candidate. */
MatchResult result = NULL;
- assert(results.get_result(0, result));
+ check_result(results.get_result(0, result));
phrase_token_t cur_token = g_array_index
(result, phrase_token_t, offset);
@@ -1298,7 +1300,7 @@ static bool _compute_phrase_length(zhuyin_context_t * context,
switch(candidate->m_candidate_type) {
case BEST_MATCH_CANDIDATE:
- assert(FALSE);
+ abort();
case NORMAL_CANDIDATE_AFTER_CURSOR:
case NORMAL_CANDIDATE_BEFORE_CURSOR: {
phrase_index->get_phrase_item(candidate->m_token, item);
@@ -1306,7 +1308,7 @@ static bool _compute_phrase_length(zhuyin_context_t * context,
break;
}
case ZOMBIE_CANDIDATE:
- assert(FALSE);
+ abort();
}
}
@@ -1336,7 +1338,7 @@ static bool _compute_phrase_strings_of_items(zhuyin_instance_t * instance,
&(candidate->m_phrase_string));
break;
case ZOMBIE_CANDIDATE:
- assert(FALSE);
+ abort();
}
}
@@ -1705,7 +1707,7 @@ bool zhuyin_train(zhuyin_instance_t * instance){
context->m_modified = true;
MatchResult result = NULL;
- assert(results.get_result(0, result));
+ check_result(results.get_result(0, result));
bool retval = context->m_pinyin_lookup->train_result3
(&matrix, instance->m_constraints, result);
@@ -1944,6 +1946,28 @@ bool zhuyin_get_zhuyin_key_rest_length(zhuyin_instance_t * instance,
return true;
}
+/* Walk backwards from offset - 1 and move offset to the earliest of the consecutive columns that each hold exactly one item equal to the zero ChewingKey "'"; return the adjusted offset. */
+static size_t _compute_zero_start(PhoneticKeyMatrix & matrix, size_t offset) {
+ ChewingKey key; ChewingKeyRest key_rest;
+ const ChewingKey zero_key; /* default-constructed key that each column's item is compared against */
+
+ ssize_t index = offset - 1;
+ for (; index > 0; --index) { /* NOTE(review): column 0 is never examined by this loop; confirm that is intended */
+ const size_t size = matrix.get_column_size(index);
+
+ if (1 != size) /* stop unless the column holds exactly one item */
+ break;
+
+ matrix.get_item(index, 0, key, key_rest);
+ if (zero_key == key)
+ offset = index;
+ else
+ break; /* the first non-zero key ends the run */
+ }
+
+ return offset;
+}
+
/* when lookup offset:
get the previous non-zero ChewingKey. */
bool zhuyin_get_zhuyin_offset(zhuyin_instance_t * instance,
@@ -1960,6 +1984,7 @@ bool zhuyin_get_zhuyin_offset(zhuyin_instance_t * instance,
break;
}
+ offset = _compute_zero_start(matrix, offset);
_check_offset(matrix, offset);
*poffset = offset;
@@ -1991,6 +2016,7 @@ bool zhuyin_get_left_zhuyin_offset(zhuyin_instance_t * instance,
break;
}
+ offset = _compute_zero_start(matrix, offset);
_check_offset(matrix, left);
*pleft = left;
@@ -2007,6 +2033,7 @@ bool zhuyin_get_right_zhuyin_offset(zhuyin_instance_t * instance,
size_t right = offset;
ChewingKey key; ChewingKeyRest key_rest;
+ const ChewingKey zero_key;
for (size_t index = right; index < matrix.size() - 1; ++index) {
const size_t size = matrix.get_column_size(index);
@@ -2014,7 +2041,10 @@ bool zhuyin_get_right_zhuyin_offset(zhuyin_instance_t * instance,
break;
matrix.get_item(index, 0, key, key_rest);
- break;
+ if (zero_key == key)
+ right = index + 1;
+ else
+ break;
}
if (0 == matrix.get_column_size(right))
diff --git a/src/zhuyin.h b/src/zhuyin.h
index e627001..22f7eee 100644
--- a/src/zhuyin.h
+++ b/src/zhuyin.h
@@ -594,7 +594,7 @@ bool zhuyin_get_n_zhuyin(zhuyin_instance_t * instance,
*
*/
bool zhuyin_get_zhuyin_key(zhuyin_instance_t * instance,
- guint index,
+ size_t index,
ChewingKey ** key);
/**
@@ -608,7 +608,7 @@ bool zhuyin_get_zhuyin_key(zhuyin_instance_t * instance,
*
*/
bool zhuyin_get_zhuyin_key_rest(zhuyin_instance_t * instance,
- guint index,
+ size_t index,
ChewingKeyRest ** key_rest);
/**
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 34da2d0..5cf6d32 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -23,7 +23,7 @@ CLEANFILES = *.bak
ACLOCAL = aclocal -I $(ac_aux_dir)
-INCLUDES = -I$(top_srcdir)/src \
+AM_CPPFLAGS = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
@@ -40,20 +40,20 @@ noinst_PROGRAMS = test_pinyin \
test_pinyin_SOURCES = test_pinyin.cpp
-test_pinyin_LDADD = ../src/libpinyin.la
+test_pinyin_LDADD = ../src/libpinyin.la @GLIB2_LIBS@
test_phrase_SOURCES = test_phrase.cpp
-test_phrase_LDADD = ../src/libpinyin.la
+test_phrase_LDADD = ../src/libpinyin.la @GLIB2_LIBS@
test_chewing_SOURCES = test_chewing.cpp
-test_chewing_LDADD = ../src/libpinyin.la
+test_chewing_LDADD = ../src/libpinyin.la @GLIB2_LIBS@
if ENABLE_LIBZHUYIN
noinst_PROGRAMS += test_zhuyin
test_zhuyin_SOURCES = test_zhuyin.cpp
-test_zhuyin_LDADD = ../src/libzhuyin.la
+test_zhuyin_LDADD = ../src/libzhuyin.la @GLIB2_LIBS@
endif
diff --git a/tests/include/CMakeLists.txt b/tests/include/CMakeLists.txt
index 3ad956c..43c029f 100644
--- a/tests/include/CMakeLists.txt
+++ b/tests/include/CMakeLists.txt
@@ -7,3 +7,5 @@ target_link_libraries(
test_memory_chunk
pinyin
)
+
+add_test(NAME memory_chunk COMMAND test_memory_chunk)
diff --git a/tests/include/Makefile.am b/tests/include/Makefile.am
index 4a22aea..a140ff0 100644
--- a/tests/include/Makefile.am
+++ b/tests/include/Makefile.am
@@ -14,7 +14,7 @@
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
-INCLUDES = -I$(top_srcdir)/src \
+AM_CPPFLAGS = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
diff --git a/tests/include/test_memory_chunk.cpp b/tests/include/test_memory_chunk.cpp
index 5042882..f496e4d 100644
--- a/tests/include/test_memory_chunk.cpp
+++ b/tests/include/test_memory_chunk.cpp
@@ -55,13 +55,13 @@ int main(int argc, char * argv[]){
printf("%d\t%d\n", *p3, *(p3+1));
int tmp;
- assert(chunk->get_content(sizeof(int), &tmp, sizeof(int)));
+ check_result(chunk->get_content(sizeof(int), &tmp, sizeof(int)));
printf("%d\n", tmp);
- assert(chunk->save("/tmp/test.bin"));
- assert(chunk->load("/tmp/test.bin"));
+ check_result(chunk->save("/tmp/test.bin"));
+ check_result(chunk->load("/tmp/test.bin"));
#ifdef LIBPINYIN_USE_MMAP
- assert(chunk->mmap("/tmp/test.bin"));
+ check_result(chunk->mmap("/tmp/test.bin"));
#endif
delete chunk;
diff --git a/tests/lookup/Makefile.am b/tests/lookup/Makefile.am
index 10c5301..027298b 100644
--- a/tests/lookup/Makefile.am
+++ b/tests/lookup/Makefile.am
@@ -14,14 +14,19 @@
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
-INCLUDES = -I$(top_srcdir)/src \
+AM_CPPFLAGS = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
-I$(top_srcdir)/tests \
@GLIB2_CFLAGS@
-LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+LDADD = \
+ ../../src/libpinyin_internal.a \
+ ../../src/storage/libstorage.a \
+ ../../src/lookup/liblookup.a \
+ @GLIB2_LIBS@ \
+ $(NULL)
noinst_PROGRAMS = test_pinyin_lookup \
test_phrase_lookup
diff --git a/tests/lookup/test_pinyin_lookup.cpp b/tests/lookup/test_pinyin_lookup.cpp
index c1552b4..9cc9f86 100644
--- a/tests/lookup/test_pinyin_lookup.cpp
+++ b/tests/lookup/test_pinyin_lookup.cpp
@@ -115,7 +115,7 @@ int main( int argc, char * argv[]){
for (size_t i = 0; i < results.size(); ++i) {
MatchResult result = NULL;
- assert(results.get_result(i, result));
+ check_result(results.get_result(i, result));
for (size_t j = 0; j < result->len; ++j){
phrase_token_t * token = &g_array_index(result, phrase_token_t, j);
diff --git a/tests/storage/CMakeLists.txt b/tests/storage/CMakeLists.txt
index 378e134..4b3a60f 100644
--- a/tests/storage/CMakeLists.txt
+++ b/tests/storage/CMakeLists.txt
@@ -40,6 +40,8 @@ target_link_libraries(
pinyin
)
+add_test(NAME phrase_index_logger COMMAND test_phrase_index_logger)
+
add_executable(
test_phrase_table
test_phrase_table.cpp
@@ -60,6 +62,8 @@ target_link_libraries(
pinyin
)
+add_test(NAME ngram COMMAND test_ngram)
+
add_executable(
test_flexible_ngram
test_flexible_ngram.cpp
@@ -69,3 +73,5 @@ target_link_libraries(
test_flexible_ngram
pinyin
)
+
+add_test(NAME flexible_ngram COMMAND test_flexible_ngram)
diff --git a/tests/storage/Makefile.am b/tests/storage/Makefile.am
index 6f75534..6b05918 100644
--- a/tests/storage/Makefile.am
+++ b/tests/storage/Makefile.am
@@ -14,14 +14,19 @@
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
-INCLUDES = -I$(top_srcdir)/src \
+AM_CPPFLAGS = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
-I$(top_srcdir)/tests \
@GLIB2_CFLAGS@
-LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+LDADD = \
+ ../../src/libpinyin_internal.a \
+ ../../src/storage/libstorage.a \
+ ../../src/lookup/liblookup.a \
+ @GLIB2_LIBS@ \
+ $(NULL)
TESTS = test_phrase_index_logger \
test_ngram \
diff --git a/tests/storage/test_flexible_ngram.cpp b/tests/storage/test_flexible_ngram.cpp
index 0aaf15c..28b1d70 100644
--- a/tests/storage/test_flexible_ngram.cpp
+++ b/tests/storage/test_flexible_ngram.cpp
@@ -26,7 +26,7 @@ int main(int argc, char * argv[]) {
typedef FlexibleSingleGram<guint32, guint32>::ArrayItemWithToken array_item_t;
const guint32 total_freq = 16;
- assert(single_gram.set_array_header(total_freq));
+ check_result(single_gram.set_array_header(total_freq));
phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3 };
guint32 freqs[6] = { 1, 2, 4, 8, 16, 32};
@@ -35,9 +35,9 @@ int main(int argc, char * argv[]) {
for ( size_t i = 0; i < G_N_ELEMENTS(tokens); ++i ){
if ( single_gram.get_array_item(tokens[i], freq) )
- assert(single_gram.set_array_item(tokens[i], freqs[i]));
+ check_result(single_gram.set_array_item(tokens[i], freqs[i]));
else
- assert(single_gram.insert_array_item(tokens[i], freqs[i]));
+ check_result(single_gram.insert_array_item(tokens[i], freqs[i]));
}
single_gram.get_array_item(3, freq);
@@ -53,16 +53,16 @@ int main(int argc, char * argv[]) {
printf("item:%d:%d\n", item->m_token, item->m_item);
}
- assert(single_gram.get_array_header(freq));
+ check_result(single_gram.get_array_header(freq));
assert(freq == total_freq);
FlexibleBigram<guint32, guint32, guint32> bigram("TEST");
- assert(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE));
+ check_result(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE));
bigram.store(1, &single_gram);
- assert(single_gram.insert_array_item(5, 8));
- assert(single_gram.remove_array_item(1, freq));
- assert(single_gram.set_array_header(32));
- assert(single_gram.get_array_header(freq));
+ check_result(single_gram.insert_array_item(5, 8));
+ check_result(single_gram.remove_array_item(1, freq));
+ check_result(single_gram.set_array_header(32));
+ check_result(single_gram.get_array_header(freq));
printf("new array header:%d\n", freq);
bigram.store(2, &single_gram);
@@ -122,7 +122,7 @@ int main(int argc, char * argv[]) {
delete train_gram;
}
- assert(bigram.remove(1));
+ check_result(bigram.remove(1));
bigram.get_all_items(items);
printf("-----------------------items----------------------------\n");
diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp
index 0313f1c..3070ee9 100644
--- a/tests/storage/test_ngram.cpp
+++ b/tests/storage/test_ngram.cpp
@@ -6,7 +6,7 @@ int main(int argc, char * argv[]){
SingleGram single_gram;
const guint32 total_freq = 16;
- assert(single_gram.set_total_freq(total_freq));
+ check_result(single_gram.set_total_freq(total_freq));
phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3};
guint32 freqs[6] = { 1, 2, 4, 8, 16, 32};
@@ -15,9 +15,9 @@ int main(int argc, char * argv[]){
for(size_t i = 0; i < 6 ;++i){
if ( single_gram.get_freq(tokens[i], freq))
- assert(single_gram.set_freq(tokens[i], freqs[i]));
+ check_result(single_gram.set_freq(tokens[i], freqs[i]));
else
- assert(single_gram.insert_freq(tokens[i], freqs[i]));
+ check_result(single_gram.insert_freq(tokens[i], freqs[i]));
}
single_gram.get_freq(3, freq);
@@ -33,14 +33,14 @@ int main(int argc, char * argv[]){
printf("item:%d:%f\n", item->m_token, item->m_freq);
}
- assert(single_gram.get_total_freq(freq));
+ check_result(single_gram.get_total_freq(freq));
assert(freq == total_freq);
Bigram bigram;
- assert(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE));
+ check_result(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE));
bigram.store(1, &single_gram);
- assert(single_gram.insert_freq(5, 8));
- assert(single_gram.remove_freq(1, freq));
+ check_result(single_gram.insert_freq(5, 8));
+ check_result(single_gram.remove_freq(1, freq));
single_gram.set_total_freq(32);
bigram.store(2, &single_gram);
@@ -61,7 +61,7 @@ int main(int argc, char * argv[]){
}
printf("--------------------------------------------------------\n");
- assert(single_gram.get_total_freq(freq));
+ check_result(single_gram.get_total_freq(freq));
printf("total_freq:%d\n", freq);
g_array_free(array, TRUE);
@@ -75,8 +75,8 @@ int main(int argc, char * argv[]){
printf("item:%d\n", *token);
}
- assert(bigram.load_db("/tmp/test.db"));
- assert(bigram.save_db("/tmp/test.db"));
+ check_result(bigram.save_db("/tmp/snapshot.db"));
+ check_result(bigram.load_db("/tmp/snapshot.db"));
g_array_free(items, TRUE);
diff --git a/tests/storage/test_phrase_index.cpp b/tests/storage/test_phrase_index.cpp
index fa0721a..0fa6a29 100644
--- a/tests/storage/test_phrase_index.cpp
+++ b/tests/storage/test_phrase_index.cpp
@@ -38,11 +38,11 @@ int main(int argc, char * argv[]){
assert(string1 == string2);
FacadePhraseIndex phrase_index_test;
- assert(!phrase_index_test.add_phrase_item(1, &phrase_item));
+ check_result(!phrase_index_test.add_phrase_item(1, &phrase_item));
MemoryChunk* chunk = new MemoryChunk;
- assert(phrase_index_test.store(0, chunk));
- assert(phrase_index_test.load(0, chunk));
+ check_result(phrase_index_test.store(0, chunk));
+ check_result(phrase_index_test.load(0, chunk));
PhraseItem item2;
guint32 time = record_time();
diff --git a/tests/storage/test_phrase_index_logger.cpp b/tests/storage/test_phrase_index_logger.cpp
index 4f77964..452b162 100644
--- a/tests/storage/test_phrase_index_logger.cpp
+++ b/tests/storage/test_phrase_index_logger.cpp
@@ -30,7 +30,7 @@ int main(int argc, char * argv[]){
phrase_index.load(1, chunk);
PhraseIndexRange range;
- assert(ERROR_OK == phrase_index.get_range(1, range));
+ check_result(ERROR_OK == phrase_index.get_range(1, range));
for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) {
phrase_index.add_unigram_frequency(i, 1);
}
@@ -45,7 +45,7 @@ int main(int argc, char * argv[]){
chunk = new MemoryChunk;
chunk->load("../../data/gb_char.bin");
new_chunk = new MemoryChunk;
- assert(phrase_index.diff(1, chunk, new_chunk));
+ check_result(phrase_index.diff(1, chunk, new_chunk));
new_chunk->save("/tmp/gb_char.dbin");
delete new_chunk;
@@ -54,7 +54,7 @@ int main(int argc, char * argv[]){
phrase_index.load(1, chunk);
new_chunk = new MemoryChunk;
new_chunk->load("/tmp/gb_char.dbin");
- assert(phrase_index.merge(1, new_chunk));
+ check_result(phrase_index.merge(1, new_chunk));
chunk = new MemoryChunk;
phrase_index.store(1, chunk);
chunk->save("/tmp/gb_char2.bin");
diff --git a/tests/storage/test_table_info.cpp b/tests/storage/test_table_info.cpp
index e2e4893..27933bc 100644
--- a/tests/storage/test_table_info.cpp
+++ b/tests/storage/test_table_info.cpp
@@ -44,7 +44,7 @@ void dump_table_info(const pinyin_table_info_t * table_info) {
break;
default:
- assert(false);
+ abort();
}
}
@@ -90,8 +90,8 @@ int main(int argc, char * argv[]) {
retval = user_table_info.is_conform(&system_table_info);
assert(retval);
- assert(user_table_info.save("/tmp/user.conf"));
- assert(user_table_info.load("/tmp/user.conf"));
+ check_result(user_table_info.save("/tmp/user.conf"));
+ check_result(user_table_info.load("/tmp/user.conf"));
retval = user_table_info.is_conform(&system_table_info);
assert(retval);
diff --git a/tests/test_pinyin.cpp b/tests/test_pinyin.cpp
index 8eadf89..316cf1e 100644
--- a/tests/test_pinyin.cpp
+++ b/tests/test_pinyin.cpp
@@ -69,8 +69,8 @@ int main(int argc, char * argv[]){
size_t len = pinyin_parse_more_full_pinyins(instance, linebuf);
pinyin_guess_sentence_with_prefix(instance, prefixbuf);
- pinyin_guess_candidates(instance, 0,
- SORT_BY_PHRASE_LENGTH_AND_FREQUENCY);
+ guint sort_option = SORT_BY_PHRASE_LENGTH | SORT_BY_FREQUENCY;
+ pinyin_guess_candidates(instance, 0, sort_option);
size_t i = 0;
for (i = 0; i <= len; ++i) {
diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am
index 4354107..72e6743 100644
--- a/utils/segment/Makefile.am
+++ b/utils/segment/Makefile.am
@@ -16,14 +16,17 @@
MAINTAINERCLEANFILES = Makefile.in
-INCLUDES = -I$(top_srcdir)/src \
- -I$(top_srcdir)/src/include \
- -I$(top_srcdir)/src/storage \
- -I$(top_srcdir)/src/lookup \
- -I$(top_srcdir)/utils \
- @GLIB2_CFLAGS@
+AM_CPPFLAGS = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
-LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+LDADD = ../../src/libpinyin_internal.a \
+ ../../src/storage/libstorage.a \
+ ../../src/lookup/liblookup.a \
+ @GLIB2_LIBS@
noinst_PROGRAMS = spseg ngseg mergeseq
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
index f5326a1..256386b 100644
--- a/utils/storage/Makefile.am
+++ b/utils/storage/Makefile.am
@@ -14,14 +14,19 @@
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
-INCLUDES = -I$(top_srcdir)/src \
+AM_CPPFLAGS = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
-I$(top_srcdir)/utils \
@GLIB2_CFLAGS@
-LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+LDADD = \
+ ../../src/libpinyin_internal.a \
+ ../../src/storage/libstorage.a \
+ ../../src/lookup/liblookup.a \
+ @GLIB2_LIBS@ \
+ $(NULL)
bin_PROGRAMS = gen_binary_files \
import_interpolation
diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp
index fe73d75..c4620c4 100644
--- a/utils/storage/gen_binary_files.cpp
+++ b/utils/storage/gen_binary_files.cpp
@@ -95,16 +95,16 @@ int main(int argc, char * argv[]){
g_print("option parsing failed:%s\n", error->message);
exit(EINVAL);
}
+ g_option_context_free(context);
SystemTableInfo2 system_table_info;
- gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ const gchar * filename = SYSTEM_TABLE_INFO;
bool retval = system_table_info.load(filename);
if (!retval) {
fprintf(stderr, "load table.conf failed.\n");
exit(ENOENT);
}
- g_free(filename);
const pinyin_table_info_t * phrase_files =
system_table_info.get_default_tables();
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 041c85c..a07ed47 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -76,7 +76,7 @@ static ssize_t my_getline(FILE * input){
bool parse_headline(){
/* enter "\data" line */
- assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
+ check_result(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
/* read "\data" line */
if ( !taglib_read(linebuf, line_type, values, required) ) {
@@ -99,13 +99,13 @@ bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table,
Bigram * bigram){
taglib_push_state();
- assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
- assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
- assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+ check_result(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ check_result(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ check_result(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
do {
retry:
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch(line_type) {
case END_LINE:
goto end;
@@ -118,7 +118,7 @@ bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table,
parse_bigram(input, phrase_table, phrase_index, bigram);
goto retry;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1) ;
@@ -131,17 +131,17 @@ bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table,
FacadePhraseIndex * phrase_index){
taglib_push_state();
- assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", ""));
+ check_result(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", ""));
do {
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch (line_type) {
case GRAM_1_ITEM_LINE:{
/* handle \item in \1-gram */
TAGLIB_GET_TOKEN(token, 0);
TAGLIB_GET_PHRASE_STRING(word, 1);
- assert(taglib_validate_token_with_string
- (phrase_index, token, word));
+ check_result(taglib_validate_token_with_string
+ (phrase_index, token, word));
TAGLIB_GET_TAGVALUE(glong, count, atol);
phrase_index->add_unigram_frequency(token, count);
@@ -152,7 +152,7 @@ bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table,
case GRAM_2_LINE:
goto end;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1);
@@ -166,24 +166,24 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table,
Bigram * bigram){
taglib_push_state();
- assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", ""));
+ check_result(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", ""));
phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL;
do {
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch (line_type) {
case GRAM_2_ITEM_LINE:{
/* handle \item in \2-gram */
/* two tokens */
TAGLIB_GET_TOKEN(token1, 0);
TAGLIB_GET_PHRASE_STRING(word1, 1);
- assert(taglib_validate_token_with_string
- (phrase_index, token1, word1));
+ check_result(taglib_validate_token_with_string
+ (phrase_index, token1, word1));
TAGLIB_GET_TOKEN(token2, 2);
TAGLIB_GET_PHRASE_STRING(word2, 3);
- assert(taglib_validate_token_with_string
- (phrase_index, token2, word2));
+ check_result(taglib_validate_token_with_string
+ (phrase_index, token2, word2));
TAGLIB_GET_TAGVALUE(glong, count, atol);
@@ -209,10 +209,10 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table,
/* save the freq */
assert(NULL != last_single_gram);
guint32 total_freq = 0;
- assert(last_single_gram->get_total_freq(total_freq));
- assert(last_single_gram->insert_freq(token2, count));
+ check_result(last_single_gram->get_total_freq(total_freq));
+ check_result(last_single_gram->insert_freq(token2, count));
total_freq += count;
- assert(last_single_gram->set_total_freq(total_freq));
+ check_result(last_single_gram->set_total_freq(total_freq));
break;
}
case END_LINE:
@@ -220,7 +220,7 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table,
case GRAM_2_LINE:
goto end;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1);
@@ -255,13 +255,12 @@ int main(int argc, char * argv[]){
SystemTableInfo2 system_table_info;
- gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ const gchar * filename = SYSTEM_TABLE_INFO;
bool retval = system_table_info.load(filename);
if (!retval) {
fprintf(stderr, "load table.conf failed.\n");
exit(ENOENT);
}
- g_free(filename);
PhraseLargeTable3 phrase_table;
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
index a70945a..d82cacf 100644
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -16,14 +16,17 @@
MAINTAINERCLEANFILES = Makefile.in
-INCLUDES = -I$(top_srcdir)/src \
- -I$(top_srcdir)/src/include \
- -I$(top_srcdir)/src/storage \
- -I$(top_srcdir)/src/lookup \
- -I$(top_srcdir)/utils \
- @GLIB2_CFLAGS@
-
-LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+AM_CPPFLAGS = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
+
+LDADD = ../../src/libpinyin_internal.a \
+ ../../src/storage/libstorage.a \
+ ../../src/lookup/liblookup.a \
+ @GLIB2_LIBS@
noinst_HEADERS = k_mixture_model.h
diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp
index a0e1dc6..089eadc 100644
--- a/utils/training/estimate_interpolation.cpp
+++ b/utils/training/estimate_interpolation.cpp
@@ -55,7 +55,7 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram,
parameter_t elem_poss = 0;
if (bigram && bigram->get_freq(token, freq)){
guint32 total_freq;
- assert(bigram->get_total_freq(total_freq));
+ check_result(bigram->get_total_freq(total_freq));
assert(0 != total_freq);
elem_poss = freq / (parameter_t) total_freq;
}
@@ -78,7 +78,7 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram,
next_lambda += deleted_count * (numerator / (numerator + part_of_denominator));
}
- assert(deleted_bigram->get_total_freq(table_num));
+ check_result(deleted_bigram->get_total_freq(table_num));
next_lambda /= table_num;
g_array_free(array, TRUE);
diff --git a/utils/training/estimate_k_mixture_model.cpp b/utils/training/estimate_k_mixture_model.cpp
index 779f67d..f5c5777 100644
--- a/utils/training/estimate_k_mixture_model.cpp
+++ b/utils/training/estimate_k_mixture_model.cpp
@@ -41,7 +41,7 @@ parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram,
parameter_t epsilon = 0.001;
KMixtureModelMagicHeader magic_header;
- assert(unigram->get_magic_header(magic_header));
+ check_result(unigram->get_magic_header(magic_header));
assert(0 != magic_header.m_total_freq);
while (fabs(lambda - next_lambda) > epsilon){
@@ -64,7 +64,7 @@ parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram,
KMixtureModelArrayHeader array_header;
KMixtureModelArrayItem array_item;
if ( bigram && bigram->get_array_item(token, array_item) ){
- assert(bigram->get_array_header(array_header));
+ check_result(bigram->get_array_header(array_header));
assert(0 != array_header.m_WC);
elem_poss = array_item.m_WC / (parameter_t) array_header.m_WC;
}
@@ -85,7 +85,7 @@ parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram,
next_lambda += deleted_count * (numerator / (numerator + part_of_denominator));
}
KMixtureModelArrayHeader header;
- assert(deleted_bigram->get_array_header(header));
+ check_result(deleted_bigram->get_array_header(header));
assert(0 != header.m_WC);
next_lambda /= header.m_WC;
@@ -131,9 +131,9 @@ int main(int argc, char * argv[]){
KMixtureModelArrayHeader array_header;
if (single_gram)
- assert(single_gram->get_array_header(array_header));
+ check_result(single_gram->get_array_header(array_header));
KMixtureModelArrayHeader deleted_array_header;
- assert(deleted_single_gram->get_array_header(deleted_array_header));
+ check_result(deleted_single_gram->get_array_header(deleted_array_header));
if ( 0 != deleted_array_header.m_WC ) {
parameter_t lambda = compute_interpolation(deleted_single_gram, &bigram, single_gram);
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp
index aa33e6b..d795c0d 100644
--- a/utils/training/eval_correction_rate.cpp
+++ b/utils/training/eval_correction_rate.cpp
@@ -45,14 +45,14 @@ bool get_possible_pinyin(FacadePhraseIndex * phrase_index,
key_index = 0; max_freq = 0;
for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) {
freq = 0;
- assert(item.get_nth_pronunciation(m, buffer, freq));
+ check_result(item.get_nth_pronunciation(m, buffer, freq));
if ( freq > max_freq ) {
key_index = m;
max_freq = freq;
}
}
- assert(item.get_nth_pronunciation(key_index, buffer, freq));
+ check_result(item.get_nth_pronunciation(key_index, buffer, freq));
assert(max_freq == freq);
guint8 len = item.get_phrase_length();
g_array_append_vals(keys, buffer, len);
@@ -105,7 +105,7 @@ bool do_one_test(PhoneticLookup<1, 1> * pinyin_lookup,
get_best_match(phrase_index, pinyin_lookup, &matrix, &results);
assert(1 == results.size());
- assert(results.get_result(0, guessed_tokens));
+ check_result(results.get_result(0, guessed_tokens));
/* compare the results */
char * sentence = NULL; char * guessed_sentence = NULL;
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
index fe11cb6..8466d3b 100644
--- a/utils/training/export_k_mixture_model.cpp
+++ b/utils/training/export_k_mixture_model.cpp
@@ -55,7 +55,7 @@ bool print_k_mixture_model_array_headers(FILE * output,
for (size_t i = 0; i < items->len; ++i) {
phrase_token_t token = g_array_index(items, phrase_token_t, i);
KMixtureModelArrayHeader array_header;
- assert(bigram->get_array_header(token, array_header));
+ check_result(bigram->get_array_header(token, array_header));
char * phrase = taglib_token_to_string(phrase_index, token);
if ( phrase )
fprintf(output, "\\item %d %s count %d freq %d\n",
@@ -76,7 +76,7 @@ bool print_k_mixture_model_array_items(FILE * output,
for (size_t i = 0; i < items->len; ++i) {
phrase_token_t token = g_array_index(items, phrase_token_t, i);
KMixtureModelSingleGram * single_gram = NULL;
- assert(bigram->load(token, single_gram));
+ check_result(bigram->load(token, single_gram));
FlexibleBigramPhraseArray array = g_array_new
(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
single_gram->retrieve_all(array);
diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp
index 9f61bd7..074c198 100644
--- a/utils/training/gen_deleted_ngram.cpp
+++ b/utils/training/gen_deleted_ngram.cpp
@@ -109,9 +109,9 @@ int main(int argc, char * argv[]){
guint32 freq, total_freq;
//increase freq
if (single_gram->get_freq(cur_token, freq))
- assert(single_gram->set_freq(cur_token, freq + 1));
+ check_result(single_gram->set_freq(cur_token, freq + 1));
else
- assert(single_gram->insert_freq(cur_token, 1));
+ check_result(single_gram->insert_freq(cur_token, 1));
//increase total freq
single_gram->get_total_freq(total_freq);
single_gram->set_total_freq(total_freq + 1);
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
index b4066d0..94969a4 100644
--- a/utils/training/gen_k_mixture_model.cpp
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -156,19 +156,19 @@ static void train_word_pair(HashofUnigram hash_of_unigram,
*/
if ( count > maximum_occurs_allowed ){
gpointer value = NULL;
- assert( g_hash_table_lookup_extended
- (hash_of_unigram, GUINT_TO_POINTER(token2),
- NULL, &value) );
+ check_result(g_hash_table_lookup_extended
+ (hash_of_unigram, GUINT_TO_POINTER(token2),
+ NULL, &value));
guint32 freq = GPOINTER_TO_UINT(value);
freq -= count;
if ( freq > 0 ) {
g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2),
GUINT_TO_POINTER(freq));
} else if ( freq == 0 ) {
- assert(g_hash_table_steal(hash_of_unigram,
- GUINT_TO_POINTER(token2)));
+ check_result(g_hash_table_steal(hash_of_unigram,
+ GUINT_TO_POINTER(token2)));
} else {
- assert(false);
+ abort();
}
return;
}
@@ -178,24 +178,24 @@ static void train_word_pair(HashofUnigram hash_of_unigram,
if ( 1 == count )
array_item.m_n_1 ++;
array_item.m_Mr = std_lite::max(array_item.m_Mr, count);
- assert(single_gram->set_array_item(token2, array_item));
+ check_result(single_gram->set_array_item(token2, array_item));
} else { /* item doesn't exist. */
/* the same as above. */
if ( count > g_maximum_occurs ){
gpointer value = NULL;
- assert( g_hash_table_lookup_extended
- (hash_of_unigram, GUINT_TO_POINTER(token2),
- NULL, &value) );
+ check_result(g_hash_table_lookup_extended
+ (hash_of_unigram, GUINT_TO_POINTER(token2),
+ NULL, &value));
guint32 freq = GPOINTER_TO_UINT(value);
freq -= count;
if ( freq > 0 ) {
g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2),
GUINT_TO_POINTER(freq));
} else if ( freq == 0 ) {
- assert(g_hash_table_steal(hash_of_unigram,
- GUINT_TO_POINTER(token2)));
+ check_result(g_hash_table_steal(hash_of_unigram,
+ GUINT_TO_POINTER(token2)));
} else {
- assert(false);
+ abort();
}
return;
}
@@ -206,7 +206,7 @@ static void train_word_pair(HashofUnigram hash_of_unigram,
if ( 1 == count )
array_item.m_n_1 = 1;
array_item.m_Mr = count;
- assert(single_gram->insert_array_item(token2, array_item));
+ check_result(single_gram->insert_array_item(token2, array_item));
}
/* save delta in the array header. */
@@ -224,14 +224,14 @@ bool train_single_gram(HashofUnigram hash_of_unigram,
assert(NULL != single_gram);
delta = 0; /* delta in WC of single_gram. */
KMixtureModelArrayHeader array_header;
- assert(single_gram->get_array_header(array_header));
+ check_result(single_gram->get_array_header(array_header));
guint32 saved_array_header_WC = array_header.m_WC;
HashofSecondWord hash_of_second_word = NULL;
gpointer key, value = NULL;
- assert(g_hash_table_lookup_extended
- (hash_of_document, GUINT_TO_POINTER(token1),
- NULL, &value));
+ check_result(g_hash_table_lookup_extended
+ (hash_of_document, GUINT_TO_POINTER(token1),
+ NULL, &value));
hash_of_second_word = (HashofSecondWord) value;
assert(NULL != hash_of_second_word);
@@ -244,7 +244,7 @@ bool train_single_gram(HashofUnigram hash_of_unigram,
train_word_pair(hash_of_unigram, single_gram, token2, count);
}
- assert(single_gram->get_array_header(array_header));
+ check_result(single_gram->get_array_header(array_header));
delta = array_header.m_WC - saved_array_header_WC;
return true;
}
@@ -268,7 +268,7 @@ static bool train_second_word(HashofUnigram hash_of_unigram,
}
/* save the single gram. */
- assert(bigram->store(token1, single_gram));
+ check_result(bigram->store(token1, single_gram));
delete single_gram;
KMixtureModelMagicHeader magic_header;
@@ -282,7 +282,7 @@ static bool train_second_word(HashofUnigram hash_of_unigram,
return false;
}
magic_header.m_WC += delta;
- assert(bigram->set_magic_header(magic_header));
+ check_result(bigram->set_magic_header(magic_header));
return true;
}
@@ -306,13 +306,13 @@ static bool post_processing_unigram(KMixtureModelBigram * bigram,
}
KMixtureModelMagicHeader magic_header;
- assert(bigram->get_magic_header(magic_header));
+ check_result(bigram->get_magic_header(magic_header));
if ( magic_header.m_total_freq + total_freq < magic_header.m_total_freq ){
fprintf(stderr, "the m_total_freq in magic header overflows.\n");
return false;
}
magic_header.m_total_freq += total_freq;
- assert(bigram->set_magic_header(magic_header));
+ check_result(bigram->set_magic_header(magic_header));
return true;
}
@@ -369,8 +369,8 @@ int main(int argc, char * argv[]){
HashofUnigram hash_of_unigram = g_hash_table_new
(g_direct_hash, g_direct_equal);
- assert(read_document(&phrase_table, &phrase_index, document,
- hash_of_document, hash_of_unigram));
+ check_result(read_document(&phrase_table, &phrase_index, document,
+ hash_of_document, hash_of_unigram));
fclose(document);
document = NULL;
@@ -386,9 +386,9 @@ int main(int argc, char * argv[]){
}
KMixtureModelMagicHeader magic_header;
- assert(bigram.get_magic_header(magic_header));
+ check_result(bigram.get_magic_header(magic_header));
magic_header.m_N ++;
- assert(bigram.set_magic_header(magic_header));
+ check_result(bigram.set_magic_header(magic_header));
post_processing_unigram(&bigram, hash_of_unigram);
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 9d9c643..3e73cbe 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -113,9 +113,9 @@ int main(int argc, char * argv[]){
guint32 freq, total_freq;
/* increase freq */
if (single_gram->get_freq(cur_token, freq))
- assert(single_gram->set_freq(cur_token, freq + 1));
+ check_result(single_gram->set_freq(cur_token, freq + 1));
else
- assert(single_gram->insert_freq(cur_token, 1));
+ check_result(single_gram->insert_freq(cur_token, 1));
/* increase total freq */
single_gram->get_total_freq(total_freq);
single_gram->set_total_freq(total_freq + 1);
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
index f399408..b50db3c 100644
--- a/utils/training/gen_unigram.cpp
+++ b/utils/training/gen_unigram.cpp
@@ -93,13 +93,12 @@ int main(int argc, char * argv[]){
SystemTableInfo2 system_table_info;
- gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ const gchar * filename = SYSTEM_TABLE_INFO;
bool retval = system_table_info.load(filename);
if (!retval) {
fprintf(stderr, "load table.conf failed.\n");
exit(ENOENT);
}
- g_free(filename);
const pinyin_table_info_t * phrase_files =
system_table_info.get_default_tables();
diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp
index 1a34871..45f8d5e 100644
--- a/utils/training/import_k_mixture_model.cpp
+++ b/utils/training/import_k_mixture_model.cpp
@@ -77,7 +77,7 @@ static ssize_t my_getline(FILE * input){
bool parse_headline(KMixtureModelBigram * bigram){
/* enter "\data" line */
- assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", ""));
+ check_result(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", ""));
/* read "\data" line */
if ( !taglib_read(linebuf, line_type, values, required) ) {
@@ -111,13 +111,13 @@ bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table,
KMixtureModelBigram * bigram){
taglib_push_state();
- assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
- assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
- assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+ check_result(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ check_result(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ check_result(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
do {
retry:
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch(line_type) {
case END_LINE:
goto end;
@@ -130,7 +130,7 @@ bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table,
parse_bigram(input, phrase_table, phrase_index, bigram);
goto retry;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1) ;
@@ -144,17 +144,17 @@ bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table,
KMixtureModelBigram * bigram){
taglib_push_state();
- assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", ""));
+ check_result(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", ""));
do {
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch (line_type) {
case GRAM_1_ITEM_LINE:{
/* handle \item in \1-gram */
TAGLIB_GET_TOKEN(token, 0);
TAGLIB_GET_PHRASE_STRING(word, 1);
- assert(taglib_validate_token_with_string
- (phrase_index, token, word));
+ check_result(taglib_validate_token_with_string
+ (phrase_index, token, word));
TAGLIB_GET_TAGVALUE(glong, count, atol);
TAGLIB_GET_TAGVALUE(glong, freq, atol);
@@ -170,7 +170,7 @@ bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table,
case GRAM_2_LINE:
goto end;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1);
@@ -184,26 +184,26 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table,
KMixtureModelBigram * bigram){
taglib_push_state();
- assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
- "count:T:N_n_0:n_1:Mr", ""));
+ check_result(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+ "count:T:N_n_0:n_1:Mr", ""));
phrase_token_t last_token = null_token;
KMixtureModelSingleGram * last_single_gram = NULL;
do {
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch (line_type) {
case GRAM_2_ITEM_LINE:{
/* handle \item in \2-gram */
/* two tokens */
TAGLIB_GET_TOKEN(token1, 0);
TAGLIB_GET_PHRASE_STRING(word1, 1);
- assert(taglib_validate_token_with_string
- (phrase_index, token1, word1));
+ check_result(taglib_validate_token_with_string
+ (phrase_index, token1, word1));
TAGLIB_GET_TOKEN(token2, 2);
TAGLIB_GET_PHRASE_STRING(word2, 3);
- assert(taglib_validate_token_with_string
- (phrase_index, token2, word2));
+ check_result(taglib_validate_token_with_string
+ (phrase_index, token2, word2));
TAGLIB_GET_TAGVALUE(glong, count, atol);
TAGLIB_GET_TAGVALUE(glong, T, atol);
@@ -236,7 +236,7 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table,
}
assert(NULL != last_single_gram);
- assert(last_single_gram->insert_array_item(token2, array_item));
+ check_result(last_single_gram->insert_array_item(token2, array_item));
break;
}
case END_LINE:
@@ -244,7 +244,7 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table,
case GRAM_2_LINE:
goto end;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1);
diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h
index 5b7bfde..3fa1dc9 100644
--- a/utils/training/k_mixture_model.h
+++ b/utils/training/k_mixture_model.h
@@ -79,7 +79,7 @@ static inline parameter_t compute_Pr_G_3(corpus_count_t k,
return (alpha * gamma / (B - 1)) * pow((1 - 1 / (B - 1)) , k - 2);
}
- assert(false);
+ abort();
}
static inline parameter_t compute_Pr_G_3_with_count(corpus_count_t k,
diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp
index 4879ac7..3a549a6 100644
--- a/utils/training/k_mixture_model_to_interpolation.cpp
+++ b/utils/training/k_mixture_model_to_interpolation.cpp
@@ -58,8 +58,8 @@ static ssize_t my_getline(FILE * input){
bool parse_headline(FILE * input, FILE * output) {
/* enter "\data" line */
- assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model",
- "count:N:total_freq"));
+ check_result(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model",
+ "count:N:total_freq"));
/* read "\data" line */
if ( !taglib_read(linebuf, line_type, values, required) ) {
@@ -83,13 +83,13 @@ bool parse_headline(FILE * input, FILE * output) {
bool parse_body(FILE * input, FILE * output){
taglib_push_state();
- assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
- assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
- assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+ check_result(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ check_result(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ check_result(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
do {
retry:
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch(line_type) {
case END_LINE:
fprintf(output, "\\end\n");
@@ -105,7 +105,7 @@ bool parse_body(FILE * input, FILE * output){
parse_bigram(input, output);
goto retry;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1);
@@ -117,10 +117,10 @@ bool parse_body(FILE * input, FILE * output){
bool parse_unigram(FILE * input, FILE * output){
taglib_push_state();
- assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count"));
+ check_result(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count"));
do {
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch(line_type) {
case GRAM_1_ITEM_LINE: {
/* handle \item in \1-gram */
@@ -143,7 +143,7 @@ bool parse_unigram(FILE * input, FILE * output){
case GRAM_2_LINE:
goto end;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1);
@@ -155,11 +155,11 @@ bool parse_unigram(FILE * input, FILE * output){
bool parse_bigram(FILE * input, FILE * output){
taglib_push_state();
- assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
- "count", "T:N_n_0:n_1:Mr"));
+ check_result(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+ "count", "T:N_n_0:n_1:Mr"));
do {
- assert(taglib_read(linebuf, line_type, values, required));
+ check_result(taglib_read(linebuf, line_type, values, required));
switch (line_type) {
case GRAM_2_ITEM_LINE:{
/* handle \item in \2-gram */
@@ -180,7 +180,7 @@ bool parse_bigram(FILE * input, FILE * output){
case GRAM_2_LINE:
goto end;
default:
- assert(false);
+ abort();
}
} while (my_getline(input) != -1);
diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp
index 9554505..b27b4fd 100644
--- a/utils/training/merge_k_mixture_model.cpp
+++ b/utils/training/merge_k_mixture_model.cpp
@@ -104,7 +104,7 @@ static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target,
if (!target->get_magic_header(target_magic_header)) {
memset(&target_magic_header, 0, sizeof(KMixtureModelMagicHeader));
}
- assert(new_one->get_magic_header(new_magic_header));
+ check_result(new_one->get_magic_header(new_magic_header));
if ( target_magic_header.m_WC + new_magic_header.m_WC <
std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){
fprintf(stderr, "the m_WC integer in magic header overflows.\n");
@@ -124,7 +124,7 @@ static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target,
merged_magic_header.m_total_freq = target_magic_header.m_total_freq +
new_magic_header.m_total_freq;
- assert(target->set_magic_header(merged_magic_header));
+ check_result(target->set_magic_header(merged_magic_header));
return true;
}
@@ -139,7 +139,7 @@ static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
KMixtureModelSingleGram * target_single_gram = NULL;
KMixtureModelSingleGram * new_single_gram = NULL;
- assert(new_one->load(*token, new_single_gram));
+ check_result(new_one->load(*token, new_single_gram));
bool exists_in_target = target->load(*token, target_single_gram);
if ( !exists_in_target ){
target->store(*token, new_single_gram);
@@ -152,8 +152,8 @@ static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
KMixtureModelArrayHeader new_array_header;
KMixtureModelArrayHeader merged_array_header;
- assert(new_one->get_array_header(*token, new_array_header));
- assert(target->get_array_header(*token, target_array_header));
+ check_result(new_one->get_array_header(*token, new_array_header));
+ check_result(target->get_array_header(*token, target_array_header));
memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader));
merged_array_header.m_WC = target_array_header.m_WC +
@@ -176,7 +176,7 @@ static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
FlexibleBigramPhraseArray merged_array =
g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
- assert(merge_two_phrase_array(target_array, new_array, merged_array));
+ check_result(merge_two_phrase_array(target_array, new_array, merged_array));
g_array_free(target_array, TRUE);
g_array_free(new_array, TRUE);
@@ -189,8 +189,8 @@ static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
merged_single_gram->insert_array_item(item->m_token, item->m_item);
}
- assert(merged_single_gram->set_array_header(merged_array_header));
- assert(target->store(*token, merged_single_gram));
+ check_result(merged_single_gram->set_array_header(merged_array_header));
+ check_result(target->store(*token, merged_single_gram));
delete merged_single_gram;
g_array_free(merged_array, TRUE);
}
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp
index 988bf1a..5434908 100644
--- a/utils/training/prune_k_mixture_model.cpp
+++ b/utils/training/prune_k_mixture_model.cpp
@@ -162,10 +162,10 @@ int main(int argc, char * argv[]){
&g_array_index(removed_array,
KMixtureModelArrayItemWithToken, m);
KMixtureModelArrayHeader array_header;
- assert(bigram.get_array_header(item->m_token, array_header));
+ check_result(bigram.get_array_header(item->m_token, array_header));
array_header.m_freq -= item->m_item.m_WC;
assert(array_header.m_freq >= 0);
- assert(bigram.set_array_header(item->m_token, array_header));
+ check_result(bigram.set_array_header(item->m_token, array_header));
}
g_array_free(removed_array, TRUE);
@@ -180,9 +180,9 @@ int main(int argc, char * argv[]){
KMixtureModelArrayHeader array_header;
for ( size_t i = 0; i < items->len; ++i ){
phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
- assert(bigram.get_array_header(*token, array_header));
+ check_result(bigram.get_array_header(*token, array_header));
if ( 0 == array_header.m_WC && 0 == array_header.m_freq )
- assert(bigram.remove(*token));
+ check_result(bigram.remove(*token));
}
g_array_free(items, TRUE);
diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp
index 91a4b2c..be4352b 100644
--- a/utils/training/validate_k_mixture_model.cpp
+++ b/utils/training/validate_k_mixture_model.cpp
@@ -55,7 +55,7 @@ bool validate_unigram(KMixtureModelBigram * bigram){
for (size_t i = 0; i < items->len; ++i) {
phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
KMixtureModelArrayHeader array_header;
- assert(bigram->get_array_header(*token, array_header));
+ check_result(bigram->get_array_header(*token, array_header));
word_count += array_header.m_WC;
total_freq += array_header.m_freq;
}
@@ -88,14 +88,14 @@ bool validate_bigram(KMixtureModelBigram * bigram){
for (size_t i = 0; i < items->len; ++i) {
phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
KMixtureModelSingleGram * single_gram = NULL;
- assert(bigram->load(*token, single_gram));
+ check_result(bigram->load(*token, single_gram));
FlexibleBigramPhraseArray array = g_array_new
(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
single_gram->retrieve_all(array);
KMixtureModelArrayHeader array_header;
- assert(single_gram->get_array_header(array_header));
+ check_result(single_gram->get_array_header(array_header));
guint32 expected_sum = array_header.m_WC;
guint32 freq = array_header.m_freq;
diff --git a/utils/utils_helper.h b/utils/utils_helper.h
index d44d129..385415f 100644
--- a/utils/utils_helper.h
+++ b/utils/utils_helper.h
@@ -22,6 +22,7 @@
#ifndef UTILS_HELPER_H
#define UTILS_HELPER_H
+#include "pinyin_utils.h"
#define TAGLIB_GET_TOKEN(var, index) \
phrase_token_t var = null_token; \
@@ -42,8 +43,8 @@
type var; \
{ \
gpointer value = NULL; \
- assert(g_hash_table_lookup_extended \
- (required, #var, NULL, &value)); \
+ check_result(g_hash_table_lookup_extended \
+ (required, #var, NULL, &value)); \
var = conv((const char *)value); \
}
@@ -55,13 +56,13 @@
\
gchar ** strs = g_strsplit_set(line, " \t", 2); \
if (2 != g_strv_length(strs)) \
- assert(false); \
+ abort(); \
\
phrase_token_t _token = atoi(strs[0]); \
const char * phrase = strs[1]; \
if (null_token != _token) \
- assert(taglib_validate_token_with_string \
- (phrase_index, _token, phrase)); \
+ check_result(taglib_validate_token_with_string \
+ (phrase_index, _token, phrase)); \
\
var = _token; \
\