diff options
108 files changed, 3138 insertions, 1595 deletions
diff --git a/.github/workflows/make-check.yml b/.github/workflows/make-check.yml new file mode 100644 index 0000000..47a83b6 --- /dev/null +++ b/.github/workflows/make-check.yml @@ -0,0 +1,30 @@ +name: C/C++ CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ubuntu-latest + + container: + image: fedora:rawhide + + steps: + - uses: actions/checkout@v2 + - name: install build dependency + run: sudo dnf install -y gcc-c++ libdb-devel glib2-devel make gnome-common wget + - name: autoconf + run: ./autogen.sh + - name: configure + run: ./configure --with-dbm=BerkeleyDB + - name: make + run: make V=1 VERBOSE=1 + - name: make check + run: make check + - name: make distcheck + run: make distcheck diff --git a/CMakeLists.txt b/CMakeLists.txt index 78761aa..4f62a9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,7 @@ if (DB_FOUND) include_directories ( ${DB_INCLUDE_DIR} ) SET (LIBS ${LIBS} ${DB_LIBRARIES}) SET (HAVE_BERKELEY_DB 1) + SET (DATABASE_FORMAT "BerkeleyDB") SET (CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} ${LIBS}") endif (DB_FOUND) @@ -82,6 +83,7 @@ if (NOT HAVE_BERKELEY_DB) include_directories ( ${KyotoCabinet_INCLUDE_PATH} ) SET (LIBS ${LIBS} ${KyotoCabinet_LIBRARY}) SET (HAVE_KYOTO_CABINET 1) + SET (DATABASE_FORMAT "KyotoCabinet") SET (CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} ${LIBS}") endif (KyotoCabinet_FOUND) @@ -123,7 +125,7 @@ if (DEFINED SYSCONF_INSTALL_DIR) endif (DEFINED SYSCONF_INSTALL_DIR) set (DIR_SHARE_LIBPINYIN ${DIR_SHARE}/libpinyin) -set (DIR_INCLUDE_LIBPINYIN ${DIR_INCLUDE}/libpinyin-${LIBPINYIN_BINARY_VERSION}) +set (DIR_INCLUDE_LIBPINYIN ${DIR_INCLUDE}/libpinyin-${VERSION}) ######## Configuration @@ -162,6 +164,7 @@ if (CMAKE_BUILD_TYPE MATCHES Debug) endif (CMAKE_BUILD_TYPE MATCHES Debug) include_directories( + ${CMAKE_BINARY_DIR} ${GLIB2_INCLUDE_DIR} ${PROJECT_SOURCE_DIR}/ ${PROJECT_SOURCE_DIR}/src @@ -1,144 +0,0 @@ -version 2.2.2 -* minor fixes - 
-version 2.2.1 -* fixes predicted candidates - -version 2.2.0 -* bug fixes - -version 2.1.91 -* fixes zhuyin parsers; - -version 2.1.0 -* support sort option in pinyin_guess_candidates function; - -version 2.0.92 -* reduce memory consumption after imported user dictionary; - -version 2.0.91 -* merge libzhuyin code; - -version 2.0.0 -* the first official release of 2.0.x; -* fixes autoconf; - -version 1.9.92 -* fixes crash in double pinyin; - -version 1.9.91 -* multiple sentence candidates; - -version 1.7.0 -* fixes build on FreeBSD; -* update cmake files; - -version 1.6.91 -* change license to GPLv3+; -* import open-gram dictionary and remove pinyin tones; -* add some checks when load data from file; - -version 1.6.0 -* bug fixes. - -version 1.5.91 -* change pinyin/phrase tables to use dbm. -* enhance pinyin key representation and pinyin parsers. - -version 1.2.0 -* bug fixes. - -version 1.1.91 -* support Kyoto Cabinet as alternative to Berkeley DB. -* improve multiple dictionaries support feature. - -version 1.1.0 -* support to export user phrases. - -version 1.0.0 -* the first official release of 1.0.x. - -version 0.9.94 -* bug fixes. - -version 0.9.93 -* fixes libpinyin issues from coverity scan report. - -version 0.9.92 -* bug fixes. - -version 0.9.91 -* code re-factor. - -version 0.9.0 -* the first official release of 0.9.x. -* fixes import dictionary. - -version 0.8.93 -* add back pinyin_clear_constraint. - -version 0.8.92 -* fixes model data. - -version 0.8.91 -* multiple dictioniares and user dictionary support. - -version 0.8.1 -* bug fixes. - -version 0.8.0 -* the first official release of 0.8.x. - -version 0.7.92 -* re-factor PhraseLookup class. -* all tests passed simple valgrind memory check. - -version 0.7.91 -* simplify PinyinLookup class. - -version 0.7.1 -* add API to lookup pinyin for characters. - -version 0.7.0 -* the first official release of 0.7.x. - -version 0.6.92 -* draft support for multiple professional phrase libraries. 
- -version 0.6.91 -* support ucs4 characters. -* support guess sentence with prefix. -* initially support fuzzy pinyin segment. - -version 0.6.0 -* the first official release of 0.6.x. - -version 0.5.92 -* fixes new parsers and chewing large table. -* improves pinyin_save. - -version 0.5.91 -* some code re-factor and simplify. -* fixes the self-learning work around. - -version 0.5.0 -* the first official release of 0.5.x. - -version 0.4.93 -* fixes some bugs in new parsers. - -version 0.4.92 -* enable parallel make. - -version 0.4.91 -* New parsers for full pinyin/double pinyin/chewing. - * libpinyin now fully supports all pinyin auto corrections of -ibus-pinyin. - * libpinyin now better supports an/ang, en/eng, in/ing fuzzy -pinyin match. - -version 0.3.0 -* the first official release of 0.3.x. - -version 0.2.99 -* import from pinyin. diff --git a/Makefile.am b/Makefile.am index 6266b2d..fbaefed 100644 --- a/Makefile.am +++ b/Makefile.am @@ -31,3 +31,8 @@ pkgconfig_DATA = libpinyin.pc if ENABLE_LIBZHUYIN pkgconfig_DATA += libzhuyin.pc endif + +dist-hook: + if test -d .git ; then \ + git log --name-status --date=iso > $(distdir)/ChangeLog ; \ + fi @@ -0,0 +1,177 @@ +version 2.8.1 +* bug fixes + +version 2.8.0 +* bug fixes for ARMv7 + +version 2.7.92 +* bug fixes + +version 2.7.91 +* improve suggestion candidates +* support longer candidates + +version 2.6.2 +* bug fixes + +version 2.6.1 +* bug fixes + +version 2.6.0 +* bug fixes + +version 2.4.92 +* update pinyin data +* bug fixes + +version 2.4.91 +* improve full pinyin auto correction +* bug fixes + +version 2.3.0 +* update pinyin data + +version 2.2.2 +* minor fixes + +version 2.2.1 +* fixes predicted candidates + +version 2.2.0 +* bug fixes + +version 2.1.91 +* fixes zhuyin parsers; + +version 2.1.0 +* support sort option in pinyin_guess_candidates function; + +version 2.0.92 +* reduce memory consumption after imported user dictionary; + +version 2.0.91 +* merge libzhuyin code; + +version 2.0.0 +* the first 
official release of 2.0.x; +* fixes autoconf; + +version 1.9.92 +* fixes crash in double pinyin; + +version 1.9.91 +* multiple sentence candidates; + +version 1.7.0 +* fixes build on FreeBSD; +* update cmake files; + +version 1.6.91 +* change license to GPLv3+; +* import open-gram dictionary and remove pinyin tones; +* add some checks when load data from file; + +version 1.6.0 +* bug fixes. + +version 1.5.91 +* change pinyin/phrase tables to use dbm. +* enhance pinyin key representation and pinyin parsers. + +version 1.2.0 +* bug fixes. + +version 1.1.91 +* support Kyoto Cabinet as alternative to Berkeley DB. +* improve multiple dictionaries support feature. + +version 1.1.0 +* support to export user phrases. + +version 1.0.0 +* the first official release of 1.0.x. + +version 0.9.94 +* bug fixes. + +version 0.9.93 +* fixes libpinyin issues from coverity scan report. + +version 0.9.92 +* bug fixes. + +version 0.9.91 +* code re-factor. + +version 0.9.0 +* the first official release of 0.9.x. +* fixes import dictionary. + +version 0.8.93 +* add back pinyin_clear_constraint. + +version 0.8.92 +* fixes model data. + +version 0.8.91 +* multiple dictioniares and user dictionary support. + +version 0.8.1 +* bug fixes. + +version 0.8.0 +* the first official release of 0.8.x. + +version 0.7.92 +* re-factor PhraseLookup class. +* all tests passed simple valgrind memory check. + +version 0.7.91 +* simplify PinyinLookup class. + +version 0.7.1 +* add API to lookup pinyin for characters. + +version 0.7.0 +* the first official release of 0.7.x. + +version 0.6.92 +* draft support for multiple professional phrase libraries. + +version 0.6.91 +* support ucs4 characters. +* support guess sentence with prefix. +* initially support fuzzy pinyin segment. + +version 0.6.0 +* the first official release of 0.6.x. + +version 0.5.92 +* fixes new parsers and chewing large table. +* improves pinyin_save. + +version 0.5.91 +* some code re-factor and simplify. 
+* fixes the self-learning work around. + +version 0.5.0 +* the first official release of 0.5.x. + +version 0.4.93 +* fixes some bugs in new parsers. + +version 0.4.92 +* enable parallel make. + +version 0.4.91 +* New parsers for full pinyin/double pinyin/chewing. + * libpinyin now fully supports all pinyin auto corrections of +ibus-pinyin. + * libpinyin now better supports an/ang, en/eng, in/ing fuzzy +pinyin match. + +version 0.3.0 +* the first official release of 0.3.x. + +version 0.2.99 +* import from pinyin. diff --git a/configure.ac b/configure.ac index cb39888..80c1040 100644 --- a/configure.ac +++ b/configure.ac @@ -5,12 +5,12 @@ # if not 1, append datestamp to the version number. m4_define([libpinyin_released], [1]) m4_define([libpinyin_major_version], [2]) -m4_define([libpinyin_minor_version], [3]) -m4_define([libpinyin_micro_version], [0]) +m4_define([libpinyin_minor_version], [8]) +m4_define([libpinyin_micro_version], [1]) m4_define(libpinyin_maybe_datestamp, m4_esyscmd([if test x]libpinyin_released[ != x1; then date +.%Y%m%d | tr -d '\n\r'; fi])) -m4_define([libpinyin_abi_current], [13]) +m4_define([libpinyin_abi_current], [15]) m4_define([libpinyin_abi_revision], [0]) m4_define([libpinyin_version], @@ -44,11 +44,19 @@ AC_PROG_CPP AC_PROG_INSTALL AC_PROG_LN_S AC_PROG_MAKE_SET +AC_PROG_LD AC_GNU_SOURCE AX_CXX_COMPILE_STDCXX([11]) +# Detect whether LLVM ld is being used +using_lld=no +if `$LD -v 2>&1 | grep 'LLVM' >/dev/null 2>&1` ; then + using_lld=yes +fi +AM_CONDITIONAL([LLVMLD], [test "$using_lld" = "yes"]) + # Init libtool AC_PROG_LIBTOOL AC_SUBST(LIBTOOL_DEPS) @@ -92,9 +100,9 @@ AC_ARG_WITH(dbm, if test x"$DBM" = x"BerkeleyDB"; then # Check Berkeley DB - AC_CHECK_HEADER([db.h], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4])) + AC_CHECK_HEADER([db.h], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 5])) - AC_SEARCH_LIBS([db_create], [db], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4])) + 
AC_SEARCH_LIBS([db_create], [db], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 5])) AC_DEFINE([HAVE_BERKELEY_DB], [], [Have Berkeley DB.]) fi @@ -112,6 +120,9 @@ fi AM_CONDITIONAL([KYOTOCABINET], [test x"$DBM" = x"KyotoCabinet"]) +DATABASE_FORMAT="$DBM" +AC_SUBST(DATABASE_FORMAT) + # --enable-libzhuyin AC_ARG_ENABLE(libzhuyin, AC_HELP_STRING([--enable-libzhuyin], @@ -128,6 +139,7 @@ AC_CONFIG_FILES([libpinyin.pc Makefile doc/Makefile data/Makefile + data/table.conf src/Makefile src/include/Makefile src/storage/Makefile diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index 2c7d09b..d3df1d0 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -45,12 +45,13 @@ add_custom_command( ${CMAKE_SOURCE_DIR}/data/gb_char.table ${CMAKE_SOURCE_DIR}/data/gbk_char.table ${CMAKE_SOURCE_DIR}/data/interpolation2.text + ${CMAKE_SOURCE_DIR}/data/table.conf COMMENT "Downloading textual model data..." COMMAND - wget http://downloads.sourceforge.net/libpinyin/models/model17.text.tar.gz + wget http://downloads.sourceforge.net/libpinyin/models/model19.text.tar.gz COMMAND - tar xvf model17.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data + tar xvf model19.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data ) add_custom_command( @@ -67,9 +68,9 @@ add_custom_command( ${gen_binary_files_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data DEPENDS gen_binary_files - ${CMAKE_SOURCE_DIR}/data/gb_char.table - ${CMAKE_SOURCE_DIR}/data/gbk_char.table - ${CMAKE_SOURCE_DIR}/data/table.conf + ${CMAKE_SOURCE_DIR}/data/gb_char.table + ${CMAKE_SOURCE_DIR}/data/gbk_char.table + ${CMAKE_SOURCE_DIR}/data/table.conf ) add_custom_command( @@ -83,7 +84,11 @@ add_custom_command( ${gen_unigram_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data DEPENDS import_interpolation - ${CMAKE_SOURCE_DIR}/data/interpolation2.text + ${CMAKE_SOURCE_DIR}/data/interpolation2.text + ${CMAKE_BINARY_DIR}/data/gb_char.bin + ${CMAKE_BINARY_DIR}/data/gbk_char.bin + ${CMAKE_BINARY_DIR}/data/phrase_index.bin + 
${CMAKE_BINARY_DIR}/data/pinyin_index.bin ) install( @@ -98,3 +103,9 @@ set_directory_properties( ADDITIONAL_MAKE_CLEAN_FILES ${BINARY_MODEL_DATA_FILES} ) + +configure_file( + table.conf.in + table.conf + @ONLY +) diff --git a/data/Makefile.am b/data/Makefile.am index 2e1d40d..b8c6716 100644 --- a/data/Makefile.am +++ b/data/Makefile.am @@ -38,7 +38,7 @@ binary_model_data = phrase_index.bin pinyin_index.bin \ MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = $(textual_model_data) \ - table.conf + table.conf.in libpinyin_db_DATA = $(binary_model_data) \ table.conf @@ -48,8 +48,8 @@ libpinyin_dbdir = $(libdir)/libpinyin/data CLEANFILES = $(binary_model_data) interpolation2.text: - wget http://downloads.sourceforge.net/libpinyin/models/model17.text.tar.gz - tar xvf model17.text.tar.gz -C $(top_srcdir)/data + wget http://downloads.sourceforge.net/libpinyin/models/model19.text.tar.gz + tar xvf model19.text.tar.gz -C $(top_srcdir)/data $(tablefiles) table.conf: interpolation2.text diff --git a/data/table.conf.in b/data/table.conf.in new file mode 100644 index 0000000..26a12b0 --- /dev/null +++ b/data/table.conf.in @@ -0,0 +1,29 @@ +binary format version:7 +model data version:14 +lambda parameter:0.312699 + +source table format:pinyin +database format:@DATABASE_FORMAT@ + +default RESERVED NULL NULL NULL NOT_USED +default GB_DICTIONARY gb_char.table gb_char.bin gb_char.dbin SYSTEM_FILE +default GBK_DICTIONARY gbk_char.table gbk_char.bin gbk_char.dbin SYSTEM_FILE +default OPENGRAM_DICTIONARY opengram.table opengram.bin opengram.dbin SYSTEM_FILE +default MERGED_DICTIONARY merged.table merged.bin merged.dbin SYSTEM_FILE +default ADDON_DICTIONARY NULL NULL addon.bin USER_FILE +default NETWORK_DICTIONARY NULL NULL network.bin USER_FILE +default USER_DICTIONARY NULL NULL user.bin USER_FILE + +addon 4 art.table art.bin NULL DICTIONARY +addon 5 culture.table culture.bin NULL DICTIONARY +addon 6 economy.table economy.bin NULL DICTIONARY +addon 7 geology.table geology.bin NULL 
DICTIONARY +addon 8 history.table history.bin NULL DICTIONARY + +addon 9 life.table life.bin NULL DICTIONARY +addon 10 nature.table nature.bin NULL DICTIONARY +addon 11 people.table people.bin NULL DICTIONARY +addon 12 science.table science.bin NULL DICTIONARY +addon 13 society.table society.bin NULL DICTIONARY +addon 14 sport.table sport.bin NULL DICTIONARY +addon 15 technology.table technology.bin NULL DICTIONARY diff --git a/libpinyin.pc.in b/libpinyin.pc.in index ea08282..fef958d 100644 --- a/libpinyin.pc.in +++ b/libpinyin.pc.in @@ -3,6 +3,7 @@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ pkgdatadir=@libdir@/libpinyin +database_format=@DATABASE_FORMAT@ libpinyinincludedir=${includedir}/libpinyin-@VERSION@ libpinyin_binary_version=@LIBPINYIN_BINARY_VERSION@ diff --git a/libzhuyin.pc.in b/libzhuyin.pc.in index 66ae943..f248d1c 100644 --- a/libzhuyin.pc.in +++ b/libzhuyin.pc.in @@ -2,6 +2,7 @@ prefix=@prefix@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ +database_format=@DATABASE_FORMAT@ libzhuyinincludedir=${includedir}/libpinyin-@VERSION@ libzhuyin_binary_version=@LIBPINYIN_BINARY_VERSION@ diff --git a/scripts2/fullpinyintable.py b/scripts2/fullpinyintable.py index 6f39acd..4af94a9 100644 --- a/scripts2/fullpinyintable.py +++ b/scripts2/fullpinyintable.py @@ -100,6 +100,7 @@ def gen_pinyin_list(): def gen_pinyins(): #generate all pinyins + distance = 0 for pinyin in pinyin_list: flags = [] if pinyin in PINYIN_ZHUYIN_MAP.keys(): @@ -113,7 +114,7 @@ def gen_pinyins(): if zhuyin in chewing.CHEWING_ASCII_INITIAL_MAP and \ pinyin not in ZHUYIN_SPECIAL_INITIAL_SET_IN_PINYIN_FORM: flags.append("ZHUYIN_INCOMPLETE") - yield pinyin, pinyin, zhuyin, flags, get_chewing(pinyin) + yield pinyin, pinyin, zhuyin, flags, get_chewing(pinyin), distance def get_shengmu_chewing(shengmu): @@ -127,6 +128,7 @@ def get_shengmu_chewing(shengmu): def gen_shengmu(): #generate all shengmu + distance = 0 for shengmu in shengmu_list: if shengmu in 
pinyin_list: continue @@ -135,12 +137,12 @@ def gen_shengmu(): chewing_initial = chewing_key[0] if chewing_initial in chewing.ASCII_CHEWING_INITIAL_MAP: chewing_initial = chewing.ASCII_CHEWING_INITIAL_MAP[chewing_initial] - yield shengmu, shengmu, chewing_initial, flags, chewing_key + yield shengmu, shengmu, chewing_initial, flags, chewing_key, distance def gen_corrects(): #generate corrections - for correct, wrong in auto_correct: + for correct, wrong, distance in auto_correct: flags = ['IS_PINYIN', 'PINYIN_CORRECT_{0}_{1}'.format(wrong.upper(), correct.upper())] for pinyin in pinyin_list: @@ -149,17 +151,17 @@ def gen_corrects(): zhuyin = PINYIN_ZHUYIN_MAP[pinyin] wrong_pinyin = pinyin.replace(correct, wrong) yield pinyin, wrong_pinyin, zhuyin,\ - flags, get_chewing(pinyin) + flags, get_chewing(pinyin), distance def gen_u_to_v(): #generate U to V - for correct, wrong, flags in auto_correct_ext: + for correct, wrong, flags, distance in auto_correct_ext: #over-ride flags flags = ['IS_PINYIN', 'PINYIN_CORRECT_V_U'] pinyin = correct zhuyin = PINYIN_ZHUYIN_MAP[pinyin] - yield correct, wrong, zhuyin, flags, get_chewing(pinyin) + yield correct, wrong, zhuyin, flags, get_chewing(pinyin), distance #pinyin table @@ -174,7 +176,8 @@ eten26_zhuyin_index = [] def filter_pinyin_list(): - for (correct, wrong, zhuyin, flags, chewing_key) in gen_pinyin_list(): + for (correct, wrong, zhuyin, flags, chewing_key, distance) in \ + gen_pinyin_list(): (luoma, secondary) = (None, None) if zhuyin in ZHUYIN_LUOMA_PINYIN_MAP: @@ -190,7 +193,7 @@ def filter_pinyin_list(): content_table.append((correct, zhuyin, luoma, secondary, chewing_key)) if "IS_PINYIN" in flags: - pinyin_index.append((wrong, flags, correct)) + pinyin_index.append((wrong, flags, correct, distance)) #skip pinyin correct options if correct != wrong: continue @@ -292,9 +295,9 @@ def gen_content_table(): def gen_pinyin_index(): entries = [] - for (wrong, flags, correct) in pinyin_index: + for (wrong, flags, correct, 
distance) in pinyin_index: index = [x[0] for x in content_table].index(correct) - entry = '{{"{0}", {1}, {2}}}'.format(wrong, flags, index) + entry = '{{"{0}", {1}, {2}, {3}}}'.format(wrong, flags, index, distance) entries.append(entry) return ',\n'.join(entries) diff --git a/scripts2/options.py b/scripts2/options.py index fcfb9fd..e4bd01f 100644 --- a/scripts2/options.py +++ b/scripts2/options.py @@ -22,47 +22,47 @@ auto_correct = [ # "correct", "wrong" - ("ng", "gn"), - ("ng", "mg"), - ("iu", "iou"), - ("ui", "uei"), - ("un", "uen"), + ("ng", "gn", 1), + ("ng", "mg", 1), + ("iu", "iou", 1), + ("ui", "uei", 1), + ("un", "uen", 1), # ("ue", "ve"), - ("ve", "ue"), - ("ong", "on"), + ("ve", "ue", 1), + ("ong", "on", 1), ] auto_correct_ext = [ # "correct", "wrong", flag - ("ju", "jv", "PINYIN_CORRECT_V_U"), - ("qu", "qv", "PINYIN_CORRECT_V_U"), - ("xu", "xv", "PINYIN_CORRECT_V_U"), - ("yu", "yv", "PINYIN_CORRECT_V_U"), - - ("jue", "jve", "PINYIN_CORRECT_V_U"), - ("que", "qve", "PINYIN_CORRECT_V_U"), - ("xue", "xve", "PINYIN_CORRECT_V_U"), - ("yue", "yve", "PINYIN_CORRECT_V_U"), - - ("juan", "jvan", "PINYIN_CORRECT_V_U"), - ("quan", "qvan", "PINYIN_CORRECT_V_U"), - ("xuan", "xvan", "PINYIN_CORRECT_V_U"), - ("yuan", "yvan", "PINYIN_CORRECT_V_U"), - - ("jun", "jvn", "PINYIN_CORRECT_V_U"), - ("qun", "qvn", "PINYIN_CORRECT_V_U"), - ("xun", "xvn", "PINYIN_CORRECT_V_U"), - ("yun", "yvn", "PINYIN_CORRECT_V_U"), - -# ("juang", "jvang", "PINYIN_CORRECT_V_U"), -# ("quang", "qvang", "PINYIN_CORRECT_V_U"), -# ("xuang", "xvang", "PINYIN_CORRECT_V_U"), -# ("yuang", "yvang", "PINYIN_CORRECT_V_U"), - -# ("jun", "jven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), -# ("qun", "qven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), -# ("xun", "xven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), -# ("yun", "yven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), + ("ju", "jv", "PINYIN_CORRECT_V_U", 1), + ("qu", "qv", "PINYIN_CORRECT_V_U", 1), + ("xu", "xv", "PINYIN_CORRECT_V_U", 1), + 
("yu", "yv", "PINYIN_CORRECT_V_U", 1), + + ("jue", "jve", "PINYIN_CORRECT_V_U", 1), + ("que", "qve", "PINYIN_CORRECT_V_U", 1), + ("xue", "xve", "PINYIN_CORRECT_V_U", 1), + ("yue", "yve", "PINYIN_CORRECT_V_U", 1), + + ("juan", "jvan", "PINYIN_CORRECT_V_U", 1), + ("quan", "qvan", "PINYIN_CORRECT_V_U", 1), + ("xuan", "xvan", "PINYIN_CORRECT_V_U", 1), + ("yuan", "yvan", "PINYIN_CORRECT_V_U", 1), + + ("jun", "jvn", "PINYIN_CORRECT_V_U", 1), + ("qun", "qvn", "PINYIN_CORRECT_V_U", 1), + ("xun", "xvn", "PINYIN_CORRECT_V_U", 1), + ("yun", "yvn", "PINYIN_CORRECT_V_U", 1), + +# ("juang", "jvang", "PINYIN_CORRECT_V_U", 1), +# ("quang", "qvang", "PINYIN_CORRECT_V_U", 1), +# ("xuang", "xvang", "PINYIN_CORRECT_V_U", 1), +# ("yuang", "yvang", "PINYIN_CORRECT_V_U", 1), + +# ("jun", "jven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U", 1), +# ("qun", "qven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U", 1), +# ("xun", "xven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U", 1), +# ("yun", "yven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U", 1), ] diff --git a/scripts2/specials.txt b/scripts2/specials.txt new file mode 100644 index 0000000..387a384 --- /dev/null +++ b/scripts2/specials.txt @@ -0,0 +1,6 @@ +e'nen 100 +en'en 300 +qun'a 100 +qu'na 300 +jia'nao 100 +jian'ao 300 diff --git a/scripts2/specialtable.py b/scripts2/specialtable.py index 17bd673..e0182b0 100644 --- a/scripts2/specialtable.py +++ b/scripts2/specialtable.py @@ -170,7 +170,7 @@ def gen_resplit_table(): #init code load_phrase("pinyins.txt") -#load_phrase("specials.txt") +load_phrase("specials.txt") divided_list = filter_divided() resplit_list = filter_resplit() sort_all() diff --git a/src/Makefile.am b/src/Makefile.am index c821d04..70419b7 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -14,19 +14,18 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. 
-AUTOMAKE_OPTIONS = gnu -SUBDIRS = include storage lookup +AUTOMAKE_OPTIONS = gnu subdir-objects +SUBDIRS = include storage lookup -EXTRA_DIST = libpinyin.ver \ - libzhuyin.ver +EXTRA_DIST = libpinyin.ver libzhuyin.ver -MAINTAINERCLEANFILES = Makefile.in +MAINTAINERCLEANFILES = Makefile.in -CLEANFILES = *.bak +CLEANFILES = *.bak -ACLOCAL = aclocal -I $(ac_aux_dir) +ACLOCAL = aclocal -I $(ac_aux_dir) -INCLUDES = -I$(top_srcdir)/src \ +AM_CPPFLAGS = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ @@ -40,33 +39,79 @@ if ENABLE_LIBZHUYIN libpinyininclude_HEADERS += zhuyin.h endif +pinyin_SOURCES = \ + storage/phrase_index.cpp \ + storage/phrase_large_table2.cpp \ + storage/phrase_large_table3.cpp \ + storage/ngram.cpp \ + storage/tag_utility.cpp \ + storage/chewing_key.cpp \ + storage/pinyin_parser2.cpp \ + storage/zhuyin_parser2.cpp \ + storage/phonetic_key_matrix.cpp \ + storage/chewing_large_table.cpp \ + storage/chewing_large_table2.cpp \ + storage/table_info.cpp \ + lookup/pinyin_lookup2.cpp \ + lookup/phrase_lookup.cpp \ + lookup/lookup.cpp \ + lookup/phonetic_lookup.cpp \ + $(NULL) + +if BERKELEYDB +pinyin_SOURCES += storage/ngram_bdb.cpp \ + storage/phrase_large_table3_bdb.cpp \ + storage/chewing_large_table2_bdb.cpp +endif + +if KYOTOCABINET +pinyin_SOURCES += storage/ngram_kyotodb.cpp \ + storage/phrase_large_table3_kyotodb.cpp \ + storage/chewing_large_table2_kyotodb.cpp +endif + + noinst_HEADERS = pinyin_internal.h -lib_LTLIBRARIES = libpinyin.la +lib_LTLIBRARIES = libpinyin.la -noinst_LTLIBRARIES = libpinyin_internal.la +noinst_LIBRARIES = libpinyin_internal.a -libpinyin_la_SOURCES = pinyin.cpp +libpinyin_la_SOURCES = $(pinyin_SOURCES) pinyin.cpp -libpinyin_la_LIBADD = storage/libstorage.la lookup/liblookup.la @GLIB2_LIBS@ +libpinyin_la_LIBADD = @GLIB2_LIBS@ -libpinyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libpinyin.ver \ +if LLVMLD +## LLVM linker does not support --version-script, +## 
use -exported_symbols_list instead +libpinyin_la_LDFLAGS = -Wl,-exported_symbols_list,$(srcdir)/libpinyin.exp \ + -version-info @LT_VERSION_INFO@ +else +libpinyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libpinyin.ver \ -version-info @LT_VERSION_INFO@ +endif if ENABLE_LIBZHUYIN lib_LTLIBRARIES += libzhuyin.la -libzhuyin_la_SOURCES = zhuyin.cpp +libzhuyin_la_SOURCES = $(pinyin_SOURCES) zhuyin.cpp -libzhuyin_la_LIBADD = storage/libstorage.la lookup/liblookup.la @GLIB2_LIBS@ +libzhuyin_la_LIBADD = @GLIB2_LIBS@ -libzhuyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libzhuyin.ver \ +if LLVMLD +## LLVM linker does not support --version-script, +## use -exported_symbols_list instead +libzhuyin_la_LDFLAGS = -Wl,-exported_symbols_list,$(srcdir)/libzhuyin.exp \ + -version-info @LT_VERSION_INFO@ +else +libzhuyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libzhuyin.ver \ -version-info @LT_VERSION_INFO@ endif +endif -libpinyin_internal_la_SOURCES = pinyin_internal.cpp +libpinyin_internal_a_SOURCES = pinyin_internal.cpp -libpinyin_internal_la_LIBADD = storage/libstorage.la lookup/liblookup.la +libpinyin_internal_a_LIBADD = storage/libstorage.a lookup/liblookup.a ## Note: ## As libpinyin internal interface will change, only provides static library diff --git a/src/include/Makefile.am b/src/include/Makefile.am index 8f9b417..e81e5d7 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -14,11 +14,13 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. 
-MAINTAINERCLEANFILES = Makefile.in +MAINTAINERCLEANFILES = Makefile.in libpinyinincludedir = $(includedir)/libpinyin-@VERSION@ libpinyininclude_HEADERS= novel_types.h -noinst_HEADERS = memory_chunk.h \ - stl_lite.h +noinst_HEADERS = memory_chunk.h \ + pinyin_utils.h \ + stl_lite.h \ + unaligned_memory.h diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h index 044fa0d..713fb5c 100644 --- a/src/include/memory_chunk.h +++ b/src/include/memory_chunk.h @@ -33,6 +33,7 @@ #define LIBPINYIN_USE_MMAP #endif #include "stl_lite.h" +#include "pinyin_utils.h" namespace pinyin{ @@ -72,7 +73,7 @@ private: munmap(m_data_begin - header, header + capacity()); #endif else - assert(FALSE); + abort(); } @@ -134,7 +135,7 @@ private: /* checksum for aligned parts. */ guint32 index = 0; for (; index < aligns; index += sizeof(guint32)) { - const char * p = data + index; + const unsigned char * p = (const unsigned char *)data + index; /* use little endian here. */ guint32 item = *p | *(p + 1) << 8 | @@ -146,7 +147,7 @@ private: /* checksum for remained parts. */ guint32 shift = 0; for (; index < length; index++) { - const char * p = data + index; + const unsigned char * p = (const unsigned char *)data + index; guint32 item = *p << shift; shift += 8; @@ -290,6 +291,20 @@ public: } /** + * MemoryChunk::set_content: + * @offset: the offset in this MemoryChunk. + * @data: the data to be copied. + * @returns: whether the data is copied successfully. + * + * Data are written directly to the memory area in this MemoryChunk. + * + */ + template <typename T> + bool set_content(size_t offset, T data){ + return set_content(offset, &data, sizeof(T)); + } + + /** * MemoryChunk::append_content: * @data: the begin of the data to be copied. * @len: the length of the data to be copied. @@ -349,7 +364,7 @@ public: * Get the content in this MemoryChunk. 
* */ - bool get_content(size_t offset, void * buffer, size_t length){ + bool get_content(size_t offset, void * buffer, size_t length) const { if ( size() < offset + length ) return false; memcpy( buffer, m_data_begin + offset, length); @@ -357,6 +372,21 @@ public: } /** + * MemoryChunk::get_content: + * @offset: the offset in this MemoryChunk. + * @returns: the content + * + * Get the content in this MemoryChunk. + * + */ + template <typename T> + T get_content(size_t offset) const { + T value; + check_result(get_content(offset, &value, sizeof(T))); + return value; + } + + /** * MemoryChunk::compact_memory: * * Compact memory, reduce the size. diff --git a/src/include/pinyin_utils.h b/src/include/pinyin_utils.h new file mode 100644 index 0000000..9be3bd1 --- /dev/null +++ b/src/include/pinyin_utils.h @@ -0,0 +1,32 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2022 Peng Wu + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef PINYIN_UTILS_H +#define PINYIN_UTILS_H + +#include <assert.h> + +#if defined(NDEBUG) || defined(G_DISABLE_ASSERT) +#define check_result(expr) expr +#else +#define check_result(expr) assert(expr) +#endif + +#endif diff --git a/src/include/unaligned_memory.h b/src/include/unaligned_memory.h new file mode 100644 index 0000000..d748c35 --- /dev/null +++ b/src/include/unaligned_memory.h @@ -0,0 +1,61 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2022 Matias Larsson + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef UNALIGNED_MEMORY_H +#define UNALIGNED_MEMORY_H + +#include <cstring> + +/** + * UnalignedMemory: Safe unaligned memory access. + * + * Some instruction sets, or some instructions in some instruction sets + * require that memory access is aligned to a specific boundary. These + * instructions may trap on unaligned access. + * + * This class provides methods to load and store values at unaligned + * addresses. It ensures that the compiler doesn't generate instructions + * that could trap on the unaligned memory access. + */ + +namespace pinyin{ + template <typename T> + class UnalignedMemory{ + public: + /** + * Read a value from a possibly unaligned memory address. 
+ */ + static T load(const void * src) { + T value; + memcpy(&value, src, sizeof(T)); + return value; + } + + /** + * Store a value into a possibly unaligned memory address. + */ + static void store(T value, void * dest) { + memcpy(dest, &value, sizeof(T)); + } + }; +}; + + +#endif diff --git a/src/libpinyin.exp b/src/libpinyin.exp new file mode 100644 index 0000000..570cf58 --- /dev/null +++ b/src/libpinyin.exp @@ -0,0 +1,74 @@ +_pinyin_init +_pinyin_save +_pinyin_set_full_pinyin_scheme +_pinyin_set_double_pinyin_scheme +_pinyin_set_zhuyin_scheme +_pinyin_load_phrase_library +_pinyin_unload_phrase_library +_pinyin_load_addon_phrase_library +_pinyin_unload_addon_phrase_library +_pinyin_begin_add_phrases +_pinyin_iterator_add_phrase +_pinyin_end_add_phrases +_pinyin_begin_get_phrases +_pinyin_iterator_has_next_phrase +_pinyin_iterator_get_next_phrase +_pinyin_end_get_phrases +_pinyin_fini +_pinyin_mask_out +_pinyin_set_options +_pinyin_alloc_instance +_pinyin_free_instance +_pinyin_get_context +_pinyin_guess_sentence +_pinyin_guess_sentence_with_prefix +_pinyin_guess_predicted_candidates +_pinyin_phrase_segment +_pinyin_get_sentence +_pinyin_parse_full_pinyin +_pinyin_parse_more_full_pinyins +_pinyin_parse_double_pinyin +_pinyin_parse_more_double_pinyins +_pinyin_parse_chewing +_pinyin_parse_more_chewings +_pinyin_get_parsed_input_length +_pinyin_in_chewing_keyboard +_pinyin_guess_candidates +_pinyin_choose_candidate +_pinyin_choose_predicted_candidate +_pinyin_clear_constraint +_pinyin_lookup_tokens +_pinyin_train +_pinyin_reset +_pinyin_get_zhuyin_string +_pinyin_get_pinyin_string +_pinyin_get_luoma_pinyin_string +_pinyin_get_secondary_zhuyin_string +_pinyin_get_pinyin_strings +_pinyin_get_pinyin_is_incomplete +_pinyin_token_get_phrase +_pinyin_token_get_n_pronunciation +_pinyin_token_get_nth_pronunciation +_pinyin_token_get_unigram_frequency +_pinyin_token_add_unigram_frequency +_pinyin_get_n_candidate +_pinyin_get_candidate +_pinyin_get_candidate_type 
+_pinyin_get_candidate_string +_pinyin_get_candidate_nbest_index +_pinyin_get_pinyin_key +_pinyin_get_pinyin_key_rest +_pinyin_get_pinyin_key_rest_positions +_pinyin_get_pinyin_key_rest_length +_pinyin_get_pinyin_offset +_pinyin_get_left_pinyin_offset +_pinyin_get_right_pinyin_offset +_pinyin_get_character_offset +_pinyin_get_n_phrase +_pinyin_get_phrase_token +_pinyin_get_full_pinyin_auxiliary_text +_pinyin_get_double_pinyin_auxiliary_text +_pinyin_get_chewing_auxiliary_text +_pinyin_remember_user_input +_pinyin_is_user_candidate +_pinyin_remove_user_candidate diff --git a/src/libzhuyin.exp b/src/libzhuyin.exp new file mode 100644 index 0000000..fb30130 --- /dev/null +++ b/src/libzhuyin.exp @@ -0,0 +1,52 @@ +_zhuyin_init +_zhuyin_save +_zhuyin_set_chewing_scheme +_zhuyin_set_full_pinyin_scheme +_zhuyin_load_phrase_library +_zhuyin_unload_phrase_library +_zhuyin_begin_add_phrases +_zhuyin_iterator_add_phrase +_zhuyin_end_add_phrases +_zhuyin_fini +_zhuyin_mask_out +_zhuyin_set_options +_zhuyin_alloc_instance +_zhuyin_free_instance +_zhuyin_guess_sentence +_zhuyin_guess_sentence_with_prefix +_zhuyin_phrase_segment +_zhuyin_get_sentence +_zhuyin_parse_full_pinyin +_zhuyin_parse_more_full_pinyins +_zhuyin_parse_chewing +_zhuyin_parse_more_chewings +_zhuyin_get_parsed_input_length +_zhuyin_in_chewing_keyboard +_zhuyin_guess_candidates_after_cursor +_zhuyin_guess_candidates_before_cursor +_zhuyin_choose_candidate +_zhuyin_clear_constraint +_zhuyin_lookup_tokens +_zhuyin_train +_zhuyin_reset +_zhuyin_get__zhuyin_string +_zhuyin_get_pinyin_string +_zhuyin_token_get_phrase +_zhuyin_token_get_n_pronunciation +_zhuyin_token_get_nth_pronunciation +_zhuyin_token_get_unigram_frequency +_zhuyin_token_add_unigram_frequency +_zhuyin_get_n_candidate +_zhuyin_get_candidate +_zhuyin_get_candidate_type +_zhuyin_get_candidate_string +_zhuyin_get_zhuyin_key +_zhuyin_get_zhuyin_key_rest +_zhuyin_get_zhuyin_key_rest_positions +_zhuyin_get_zhuyin_key_rest_length +_zhuyin_get_zhuyin_offset 
+_zhuyin_get_left_zhuyin_offset +_zhuyin_get_right_zhuyin_offset +_zhuyin_get_character_offset +_zhuyin_get_n_phrase +_zhuyin_get_phrase_token diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am index 55c1881..75a7ae5 100644 --- a/src/lookup/Makefile.am +++ b/src/lookup/Makefile.am @@ -14,27 +14,23 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. -MAINTAINERCLEANFILES = Makefile.in +MAINTAINERCLEANFILES = Makefile.in -INCLUDES = -I$(top_srcdir)/src/include \ - -I$(top_srcdir)/src/storage \ - @GLIB2_CFLAGS@ +AM_CPPFLAGS = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CFLAGS@ -noinst_HEADERS = lookup.h \ - pinyin_lookup2.h \ - phrase_lookup.h \ - phonetic_lookup.h \ - phonetic_lookup_linear.h \ - phonetic_lookup_heap.h +noinst_HEADERS = lookup.h \ + pinyin_lookup2.h \ + phrase_lookup.h \ + phonetic_lookup.h \ + phonetic_lookup_linear.h \ + phonetic_lookup_heap.h -noinst_LTLIBRARIES = liblookup.la +noinst_LIBRARIES = liblookup.a -liblookup_la_CXXFLAGS = "-fPIC" - -liblookup_la_LDFLAGS = -static - -liblookup_la_SOURCES = pinyin_lookup2.cpp \ - phrase_lookup.cpp \ - lookup.cpp \ - phonetic_lookup.cpp +liblookup_a_SOURCES = pinyin_lookup2.cpp \ + phrase_lookup.cpp \ + lookup.cpp \ + phonetic_lookup.cpp diff --git a/src/lookup/phonetic_lookup.cpp b/src/lookup/phonetic_lookup.cpp index e658911..4205630 100644 --- a/src/lookup/phonetic_lookup.cpp +++ b/src/lookup/phonetic_lookup.cpp @@ -200,7 +200,7 @@ bool ForwardPhoneticConstraints::diff_result(MatchResult best, } } - assert(add_constraint(pos, next_pos, other_token)); + check_result(add_constraint(pos, next_pos, other_token)); } return changed; diff --git a/src/lookup/phonetic_lookup.h b/src/lookup/phonetic_lookup.h index 78fac1b..72e70a5 100644 --- a/src/lookup/phonetic_lookup.h +++ b/src/lookup/phonetic_lookup.h @@ -22,9 +22,10 @@ #define PHONETIC_LOOKUP_H -#include "novel_types.h" #include 
<limits.h> #include <math.h> +#include "novel_types.h" +#include "pinyin_utils.h" #include "phonetic_key_matrix.h" #include "ngram.h" #include "lookup.h" @@ -250,7 +251,7 @@ public: initial_value.m_handles[1] = token; trellis_node<nstore> initial_node; - assert(initial_node.eval_item(&initial_value)); + check_result(initial_node.eval_item(&initial_value)); LookupStepContent initial_step_content = (LookupStepContent) g_ptr_array_index(m_steps_content, 0); @@ -308,7 +309,7 @@ public: if (!lookup_result) { trellis_node<nstore> node; - assert(node.eval_item(candidate)); + check_result(node.eval_item(candidate)); g_array_append_val(step_content, node); g_hash_table_insert(step_index, GUINT_TO_POINTER(token), GUINT_TO_POINTER(step_content->len - 1)); @@ -321,7 +322,7 @@ public: return node->eval_item(candidate); } - assert(FALSE); + abort(); } /* get tails */ @@ -388,7 +389,7 @@ bool extract_result(const ForwardPhoneticTrellis<nstore, nbest> * trellis, phrase_token_t last_token = tail->m_handles[0]; int sub_index = tail->m_sub_index; - assert(trellis->get_candidate(index, last_token, sub_index, tail)); + check_result(trellis->get_candidate(index, last_token, sub_index, tail)); } /* no need to reverse the result */ @@ -406,7 +407,7 @@ public: /* set tail node */ bool set_tail(const matrix_step<nstore> * tail); /* back trace */ - /* always assume/assert matrix_step.eval_item(...) return true? */ + /* always assume/check_result matrix_step.eval_item(...) return true? 
*/ bool back_trace(const ForwardPhoneticTrellis * trellis); /* extract results */ int extract(/* out */ GPtrArray * arrays); @@ -546,7 +547,7 @@ protected: g_ptr_array_index(topresults, 0); const trellis_constraint_t * constraint = NULL; - assert(m_constraints->get_constraint(start, constraint)); + check_result(m_constraints->get_constraint(start, constraint)); if (CONSTRAINT_ONESTEP == constraint->m_type) { return unigram_gen_next_step(start, constraint->m_constraint_step, @@ -578,7 +579,7 @@ protected: int start, int end, PhraseIndexRanges ranges) { const trellis_constraint_t * constraint = NULL; - assert(m_constraints->get_constraint(start, constraint)); + check_result(m_constraints->get_constraint(start, constraint)); bool found = false; BigramPhraseArray bigram_phrase_items = g_array_new @@ -761,7 +762,7 @@ public: /* begin the viterbi beam search. */ for ( int i = 0; i < nstep - 1; ++i ){ const trellis_constraint_t * cur_constraint = NULL; - assert(m_constraints->get_constraint(i, cur_constraint)); + check_result(m_constraints->get_constraint(i, cur_constraint)); if (CONSTRAINT_NOSEARCH == cur_constraint->m_type) continue; @@ -792,7 +793,7 @@ public: for ( int m = i + 1; m < nstep; ++m ){ const trellis_constraint_t * next_constraint = NULL; - assert(m_constraints->get_constraint(m, next_constraint)); + check_result(m_constraints->get_constraint(m, next_constraint)); if (CONSTRAINT_NOSEARCH == next_constraint->m_type) break; @@ -830,7 +831,7 @@ public: const trellis_value_t * tail = (const trellis_value_t *) g_ptr_array_index(tails, i); - assert(extract_result<nstore>(&m_trellis, tail, result)); + check_result(extract_result<nstore>(&m_trellis, tail, result)); results->add_result(result); } @@ -860,7 +861,7 @@ public: continue; const trellis_constraint_t * constraint = NULL; - assert(constraints->get_constraint(i, constraint)); + check_result(constraints->get_constraint(i, constraint)); if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) { if 
(CONSTRAINT_ONESTEP == constraint->m_type) { @@ -880,12 +881,12 @@ public: if (!user) { user = new SingleGram; } - assert(user->get_total_freq(total_freq)); + check_result(user->get_total_freq(total_freq)); guint32 freq = 0; /* compute train factor */ if (!user->get_freq(token, freq)) { - assert(user->insert_freq(token, 0)); + check_result(user->insert_freq(token, 0)); seed = initial_seed; } else { seed = std_lite::max(freq, initial_seed); @@ -897,10 +898,10 @@ public: if (seed > 0 && total_freq > total_freq + seed) goto next; - assert(user->set_total_freq(total_freq + seed)); + check_result(user->set_total_freq(total_freq + seed)); /* if total_freq is not overflow, then freq won't overflow. */ - assert(user->set_freq(token, freq + seed)); - assert(m_user_bigram->store(last_token, user)); + check_result(user->set_freq(token, freq + seed)); + check_result(m_user_bigram->store(last_token, user)); next: assert(NULL != user); if (user) diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp index bb3d053..8946de3 100644 --- a/src/lookup/pinyin_lookup2.cpp +++ b/src/lookup/pinyin_lookup2.cpp @@ -22,6 +22,7 @@ #include "facade_chewing_table2.h" #include "pinyin_lookup2.h" #include "stl_lite.h" +#include "pinyin_utils.h" using namespace pinyin; @@ -602,12 +603,12 @@ bool PinyinLookup2::train_result2(PhoneticKeyMatrix * matrix, if (!user) { user = new SingleGram; } - assert(user->get_total_freq(total_freq)); + check_result(user->get_total_freq(total_freq)); guint32 freq = 0; /* compute train factor */ if (!user->get_freq(token, freq)) { - assert(user->insert_freq(token, 0)); + check_result(user->insert_freq(token, 0)); seed = initial_seed; } else { seed = std_lite::max(freq, initial_seed); @@ -619,10 +620,10 @@ bool PinyinLookup2::train_result2(PhoneticKeyMatrix * matrix, if (seed > 0 && total_freq > total_freq + seed) goto next; - assert(user->set_total_freq(total_freq + seed)); + check_result(user->set_total_freq(total_freq + seed)); /* if total_freq 
is not overflow, then freq won't overflow. */ - assert(user->set_freq(token, freq + seed)); - assert(m_user_bigram->store(last_token, user)); + check_result(user->set_freq(token, freq + seed)); + check_result(m_user_bigram->store(last_token, user)); next: assert(NULL != user); if (user) diff --git a/src/pinyin.cpp b/src/pinyin.cpp index 75eb41b..9dd784b 100644 --- a/src/pinyin.cpp +++ b/src/pinyin.cpp @@ -71,18 +71,24 @@ struct _pinyin_instance_t{ /* pointer of pinyin_context_t. */ pinyin_context_t * m_context; + ucs4_t * m_prefix_ucs4; + glong m_prefix_len; /* the tokens of phrases before the user input. */ TokenVector m_prefixes; /* cached parsed pinyin keys. */ PhoneticKeyMatrix m_matrix; size_t m_parsed_len; + size_t m_parsed_key_len; /* cached pinyin lookup variables. */ ForwardPhoneticConstraints * m_constraints; NBestMatchResults m_nbest_results; TokenVector m_phrase_result; CandidateVector m_candidates; + + /* cache the sort option here. */ + guint m_sort_option; }; struct _lookup_candidate_t{ @@ -671,7 +677,7 @@ bool pinyin_iterator_get_next_phrase(export_iterator_t * iter, /* fill phrase and pronunciation pair. 
*/ ucs4_t phrase_ucs4[MAX_PHRASE_LENGTH]; guint8 len = item.get_phrase_length(); - assert(item.get_phrase_string(phrase_ucs4)); + check_result(item.get_phrase_string(phrase_ucs4)); gchar * phrase_utf8 = g_ucs4_to_utf8 (phrase_ucs4, len, NULL, NULL, NULL); @@ -681,7 +687,7 @@ bool pinyin_iterator_get_next_phrase(export_iterator_t * iter, assert(nth_pronun < n_pronuns); ChewingKey keys[MAX_PHRASE_LENGTH]; guint32 freq = 0; - assert(item.get_nth_pronunciation(nth_pronun, keys, freq)); + check_result(item.get_nth_pronunciation(nth_pronun, keys, freq)); GPtrArray * array = g_ptr_array_new(); for(size_t i = 0; i < len; ++i) { @@ -732,15 +738,7 @@ void pinyin_end_get_phrases(export_iterator_t * iter){ delete iter; } -bool pinyin_save(pinyin_context_t * context){ - if (!context->m_user_dir) - return false; - - if (!context->m_modified) - return false; - - context->m_phrase_index->compact(); - +static bool _write_files(pinyin_context_t * context){ const pinyin_table_info_t * phrase_files = context->m_system_table_info.get_default_tables(); @@ -788,11 +786,94 @@ bool pinyin_save(pinyin_context_t * context){ gchar * tmppathname = g_build_filename(context->m_user_dir, tmpfilename, NULL); + + log->save(tmppathname); + + g_free(tmpfilename); + g_free(tmppathname); + delete log; + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + MemoryChunk * chunk = new MemoryChunk; + context->m_phrase_index->store(i, chunk); + + const char * userfilename = table_info->m_user_filename; + gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); + gchar * tmppathname = g_build_filename(context->m_user_dir, + tmpfilename, NULL); + + chunk->save(tmppathname); + + g_free(tmpfilename); + g_free(tmppathname); + delete chunk; + } + } + + /* save user pinyin table */ + gchar * tmpfilename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL); + unlink(tmpfilename); + + context->m_pinyin_table->store(tmpfilename); + + g_free(tmpfilename); + + /* 
save user phrase table */ + tmpfilename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL); + unlink(tmpfilename); + + context->m_phrase_table->store(tmpfilename); + + g_free(tmpfilename); + + /* save user bi-gram */ + tmpfilename = g_build_filename + (context->m_user_dir, USER_BIGRAM ".tmp", NULL); + unlink(tmpfilename); + context->m_user_bigram->save_db(tmpfilename); + + g_free(tmpfilename); + + return true; +} + +static bool _rename_files(pinyin_context_t * context){ + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_default_tables(); + + /* skip the reserved zero phrase library. */ + for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(i, range); + + if (ERROR_NO_SUB_PHRASE_INDEX == retval) + continue; + + const pinyin_table_info_t * table_info = phrase_files + i; + + if (NOT_USED == table_info->m_file_type) + continue; + + const char * userfilename = table_info->m_user_filename; + + if (NULL == userfilename) + continue; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + const char * userfilename = table_info->m_user_filename; + gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); + + gchar * tmppathname = g_build_filename(context->m_user_dir, + tmpfilename, NULL); g_free(tmpfilename); gchar * chunkpathname = g_build_filename(context->m_user_dir, userfilename, NULL); - log->save(tmppathname); int result = rename(tmppathname, chunkpathname); if (0 != result) @@ -801,14 +882,9 @@ bool pinyin_save(pinyin_context_t * context){ g_free(chunkpathname); g_free(tmppathname); - delete log; } if (USER_FILE == table_info->m_file_type) { - /* user phrase library */ - MemoryChunk * chunk = new MemoryChunk; - context->m_phrase_index->store(i, chunk); - const char * userfilename = table_info->m_user_filename; gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); gchar * tmppathname = 
g_build_filename(context->m_user_dir, @@ -818,8 +894,6 @@ bool pinyin_save(pinyin_context_t * context){ gchar * chunkpathname = g_build_filename(context->m_user_dir, userfilename, NULL); - chunk->save(tmppathname); - int result = rename(tmppathname, chunkpathname); if (0 != result) fprintf(stderr, "rename %s to %s failed.\n", @@ -827,19 +901,15 @@ bool pinyin_save(pinyin_context_t * context){ g_free(chunkpathname); g_free(tmppathname); - delete chunk; } } /* save user pinyin table */ gchar * tmpfilename = g_build_filename (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL); - unlink(tmpfilename); gchar * filename = g_build_filename (context->m_user_dir, USER_PINYIN_INDEX, NULL); - context->m_pinyin_table->store(tmpfilename); - int result = rename(tmpfilename, filename); if (0 != result) fprintf(stderr, "rename %s to %s failed.\n", @@ -851,12 +921,9 @@ bool pinyin_save(pinyin_context_t * context){ /* save user phrase table */ tmpfilename = g_build_filename (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL); - unlink(tmpfilename); filename = g_build_filename (context->m_user_dir, USER_PHRASE_INDEX, NULL); - context->m_phrase_table->store(tmpfilename); - result = rename(tmpfilename, filename); if (0 != result) fprintf(stderr, "rename %s to %s failed.\n", @@ -868,9 +935,7 @@ bool pinyin_save(pinyin_context_t * context){ /* save user bi-gram */ tmpfilename = g_build_filename (context->m_user_dir, USER_BIGRAM ".tmp", NULL); - unlink(tmpfilename); filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); - context->m_user_bigram->save_db(tmpfilename); result = rename(tmpfilename, filename); if (0 != result) @@ -880,10 +945,24 @@ bool pinyin_save(pinyin_context_t * context){ g_free(tmpfilename); g_free(filename); + return true; +} + +bool pinyin_save(pinyin_context_t * context){ + if (!context->m_user_dir) + return false; + + if (!context->m_modified) + return false; + + context->m_phrase_index->compact(); + + bool retval = _write_files(context) && 
_rename_files(context); + mark_version(context); context->m_modified = false; - return true; + return retval; } bool pinyin_set_full_pinyin_scheme(pinyin_context_t * context, @@ -926,7 +1005,7 @@ bool pinyin_set_zhuyin_scheme(pinyin_context_t * context, context->m_chewing_parser = new ZhuyinDaChenCP26Parser2(); break; default: - assert(FALSE); + abort(); } return true; } @@ -1043,9 +1122,12 @@ pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){ pinyin_instance_t * instance = new pinyin_instance_t; instance->m_context = context; + instance->m_prefix_ucs4 = NULL; + instance->m_prefix_len = 0; instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); instance->m_parsed_len = 0; + instance->m_parsed_key_len = 0; instance->m_constraints = new ForwardPhoneticConstraints (context->m_phrase_index); @@ -1055,6 +1137,9 @@ pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){ instance->m_candidates = g_array_new(TRUE, TRUE, sizeof(lookup_candidate_t)); + instance->m_sort_option = + SORT_BY_PHRASE_LENGTH | SORT_BY_PINYIN_LENGTH | SORT_BY_FREQUENCY; + return instance; } @@ -1071,6 +1156,7 @@ static bool _free_candidates(CandidateVector candidates) { } void pinyin_free_instance(pinyin_instance_t * instance){ + g_free(instance->m_prefix_ucs4); g_array_free(instance->m_prefixes, TRUE); delete instance->m_constraints; g_array_free(instance->m_phrase_result, TRUE); @@ -1116,17 +1202,22 @@ static void _compute_prefixes(pinyin_instance_t * instance, pinyin_context_t * & context = instance->m_context; FacadePhraseIndex * & phrase_index = context->m_phrase_index; - glong len_str = 0; - ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL); GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + g_free (instance->m_prefix_ucs4); + instance->m_prefix_ucs4 = g_utf8_to_ucs4(prefix, -1, NULL, + &(instance->m_prefix_len), NULL); + + const ucs4_t * ucs4_str = instance->m_prefix_ucs4; + const glong len_str = 
instance->m_prefix_len; + if (ucs4_str && len_str) { /* add prefixes. */ for (ssize_t i = 1; i <= len_str; ++i) { if (i > MAX_PHRASE_LENGTH) break; - ucs4_t * start = ucs4_str + len_str - i; + const ucs4_t * start = ucs4_str + len_str - i; PhraseTokens tokens; memset(tokens, 0, sizeof(tokens)); @@ -1141,7 +1232,6 @@ static void _compute_prefixes(pinyin_instance_t * instance, } } g_array_free(tokenarray, TRUE); - g_free(ucs4_str); } bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, @@ -1193,7 +1283,7 @@ bool pinyin_get_sentence(pinyin_instance_t * instance, MatchResult result = NULL; assert(index < results.size()); - assert(results.get_result(index, result)); + check_result(results.get_result(index, result)); bool retval = pinyin::convert_to_utf8 (context->m_phrase_index, result, @@ -1208,9 +1298,10 @@ bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, pinyin_context_t * & context = instance->m_context; pinyin_option_t options = context->m_options; + gint16 distance = 0; int pinyin_len = strlen(onepinyin); bool retval = context->m_full_pinyin_parser->parse_one_key - (options, *onekey, onepinyin, pinyin_len); + (options, *onekey, distance, onepinyin, pinyin_len); return retval; } @@ -1229,6 +1320,7 @@ size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, key_rests, pinyins, strlen(pinyins)); instance->m_parsed_len = parsed_len; + instance->m_parsed_key_len = keys->len; fill_matrix(&matrix, keys, key_rests, parsed_len); @@ -1249,9 +1341,10 @@ bool pinyin_parse_double_pinyin(pinyin_instance_t * instance, pinyin_context_t * & context = instance->m_context; pinyin_option_t options = context->m_options; + gint16 distance = 0; int pinyin_len = strlen(onepinyin); bool retval = context->m_double_pinyin_parser->parse_one_key - (options, *onekey, onepinyin, pinyin_len); + (options, *onekey, distance, onepinyin, pinyin_len); return retval; } @@ -1270,6 +1363,7 @@ size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance, 
key_rests, pinyins, strlen(pinyins)); instance->m_parsed_len = parsed_len; + instance->m_parsed_key_len = keys->len; fill_matrix(&matrix, keys, key_rests, parsed_len); @@ -1289,9 +1383,10 @@ bool pinyin_parse_chewing(pinyin_instance_t * instance, /* disable the zhuyin correction options. */ options &= ~ZHUYIN_CORRECT_ALL; + gint16 distance = 0; int chewing_len = strlen(onechewing); bool retval = context->m_chewing_parser->parse_one_key - (options, *onekey, onechewing, chewing_len ); + (options, *onekey, distance, onechewing, chewing_len ); return retval; } @@ -1313,6 +1408,7 @@ size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, key_rests, chewings, strlen(chewings)); instance->m_parsed_len = parsed_len; + instance->m_parsed_key_len = keys->len; fill_matrix(&matrix, keys, key_rests, parsed_len); @@ -1341,6 +1437,7 @@ bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, static bool _token_get_phrase(FacadePhraseIndex * phrase_index, phrase_token_t token, + guint begin, guint * len, gchar ** utf8_str) { PhraseItem item; @@ -1353,9 +1450,9 @@ static bool _token_get_phrase(FacadePhraseIndex * phrase_index, item.get_phrase_string(buffer); guint length = item.get_phrase_length(); if (len) - *len = length; + *len = length - begin; if (utf8_str) - *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + *utf8_str = g_ucs4_to_utf8(buffer + begin, length - begin, NULL, NULL, NULL); return true; } @@ -1370,7 +1467,6 @@ static gint compare_item_with_token(gconstpointer lhs, return (token_lhs - token_rhs); } -#endif static gint compare_item_with_phrase_length_and_frequency(gconstpointer lhs, gconstpointer rhs) { @@ -1388,28 +1484,39 @@ static gint compare_item_with_phrase_length_and_frequency(gconstpointer lhs, return -(freq_lhs - freq_rhs); /* in descendant order */ } +#endif -static gint compare_item_with_phrase_length_and_pinyin_length_and_frequency -(gconstpointer lhs, gconstpointer rhs) { +static gint compare_item_with_sort_option 
+(gconstpointer lhs, gconstpointer rhs, gpointer user_data) { lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; + guint sort_option = GPOINTER_TO_UINT(user_data); - guint8 len_lhs = item_lhs->m_phrase_length; - guint8 len_rhs = item_rhs->m_phrase_length; + if (sort_option & SORT_BY_PHRASE_LENGTH) { + guint8 len_lhs = item_lhs->m_phrase_length; + guint8 len_rhs = item_rhs->m_phrase_length; - if (len_lhs != len_rhs) - return -(len_lhs - len_rhs); /* in descendant order */ + if (len_lhs != len_rhs) + return -(len_lhs - len_rhs); /* in descendant order */ + } - len_lhs = item_lhs->m_end - item_lhs->m_begin; - len_rhs = item_rhs->m_end - item_rhs->m_begin; + if (sort_option & SORT_BY_PINYIN_LENGTH) { + guint8 len_lhs = item_lhs->m_end - item_lhs->m_begin; + guint8 len_rhs = item_rhs->m_end - item_rhs->m_begin; - if (len_lhs != len_rhs) - return -(len_lhs - len_rhs); /* in descendant order */ + if (len_lhs != len_rhs) + return -(len_lhs - len_rhs); /* in descendant order */ + } - guint32 freq_lhs = item_lhs->m_freq; - guint32 freq_rhs = item_rhs->m_freq; + if (sort_option & SORT_BY_FREQUENCY) { + guint32 freq_lhs = item_lhs->m_freq; + guint32 freq_rhs = item_rhs->m_freq; - return -(freq_lhs - freq_rhs); /* in descendant order */ + if (freq_lhs != freq_rhs) + return -(freq_lhs - freq_rhs); /* in descendant order */ + } + + return 0; } static phrase_token_t _get_previous_token(pinyin_instance_t * instance, @@ -1452,7 +1559,7 @@ static phrase_token_t _get_previous_token(pinyin_instance_t * instance, /* use the first candidate. */ MatchResult result = NULL; - assert(results.get_result(0, result)); + check_result(results.get_result(0, result)); phrase_token_t cur_token = g_array_index (result, phrase_token_t, offset); @@ -1513,7 +1620,23 @@ static void _compute_frequency_of_items(pinyin_context_t * context, gfloat lambda = context->m_system_table_info.get_lambda(); - /* handle addon candidates first. 
*/ + /* handle prefix candidates. */ + if (PREDICTED_PREFIX_CANDIDATE == item->m_candidate_type) { + total_freq = context->m_phrase_index-> + get_phrase_index_total_freq(); + + context->m_phrase_index->get_phrase_item + (token, cached_item); + + /* Note: possibility value <= 1.0. */ + guint32 freq = ((1 - lambda) * + cached_item.get_unigram_frequency() / + (gfloat) total_freq) * 256 * 256 * 256; + item->m_freq = freq; + continue; + } + + /* handle addon candidates. */ if (ADDON_CANDIDATE == item->m_candidate_type) { total_freq = context->m_phrase_index-> get_phrase_index_total_freq(); @@ -1555,6 +1678,70 @@ static void _compute_frequency_of_items(pinyin_context_t * context, } } +static bool _prepend_longer_candidates(pinyin_instance_t * instance, + CandidateVector candidates) { + + pinyin_context_t * & context = instance->m_context; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + PhoneticKeyMatrix & matrix = instance->m_matrix; + size_t prefix_len = instance->m_parsed_key_len; + + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(tokens)); + phrase_index->prepare_tokens(tokens); + int result = search_suggestion_with_matrix + (context->m_pinyin_table, &matrix, prefix_len, tokens); + int num = reduce_tokens(tokens, tokenarray, false); + phrase_index->destroy_tokens(tokens); + + phrase_token_t longer_token = null_token; + PhraseItem longer_item, item; + for (int i = 0; i < tokenarray->len; ++i) { + phrase_token_t token = g_array_index(tokenarray, phrase_token_t, i); + + if (ERROR_OK != phrase_index->get_phrase_item(token, item)) + continue; + + /* skip the phrase longer than prefix_len * 2 + 1 */ + if (item.get_phrase_length() > (prefix_len * 2 + 1)) + continue; + + if (longer_token == null_token) { + longer_token = token; + phrase_index->get_phrase_item(longer_token, longer_item); + continue; + } + + if (item.get_unigram_frequency() > + longer_item.get_unigram_frequency()) { 
+ longer_token = token; + phrase_index->get_phrase_item(longer_token, longer_item); + } + } + + if (longer_token == null_token) + return false; + + /* compute the unigram frequency. */ + gfloat lambda = context->m_system_table_info.get_lambda(); + guint32 total_freq = phrase_index->get_phrase_index_total_freq(); + guint32 freq = ((1 - lambda) * + longer_item.get_unigram_frequency() / + (gfloat) total_freq) * 256 * 256 * 256; + + /* prepend longer candidate to candidates. */ + lookup_candidate_t candidate; + candidate.m_candidate_type = LONGER_CANDIDATE; + candidate.m_token = longer_token; + candidate.m_freq = freq; + g_array_prepend_val(candidates, candidate); + + g_array_free(tokenarray, TRUE); + return true; +} + static bool _prepend_sentence_candidates(pinyin_instance_t * instance, CandidateVector candidates) { const size_t size = instance->m_nbest_results.size(); @@ -1588,20 +1775,26 @@ static bool _compute_phrase_length(pinyin_context_t * context, switch(candidate->m_candidate_type) { case NBEST_MATCH_CANDIDATE: - assert(FALSE); + abort(); case NORMAL_CANDIDATE: - case PREDICTED_CANDIDATE: { + case PREDICTED_BIGRAM_CANDIDATE: { phrase_index->get_phrase_item(candidate->m_token, item); candidate->m_phrase_length = item.get_phrase_length(); break; } + case PREDICTED_PREFIX_CANDIDATE: { + phrase_index->get_phrase_item(candidate->m_token, item); + candidate->m_phrase_length = + item.get_phrase_length() - candidate->m_begin; + break; + } case ADDON_CANDIDATE: { addon_phrase_index->get_phrase_item(candidate->m_token, item); candidate->m_phrase_length = item.get_phrase_length(); break; } case ZOMBIE_CANDIDATE: - assert(FALSE); + abort(); } } @@ -1624,20 +1817,27 @@ static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance, break; } case NORMAL_CANDIDATE: - case PREDICTED_CANDIDATE: + case LONGER_CANDIDATE: + case PREDICTED_BIGRAM_CANDIDATE: + _token_get_phrase + (instance->m_context->m_phrase_index, + candidate->m_token, 0, NULL, + 
&(candidate->m_phrase_string)); + break; + case PREDICTED_PREFIX_CANDIDATE: _token_get_phrase (instance->m_context->m_phrase_index, - candidate->m_token, NULL, + candidate->m_token, candidate->m_begin, NULL, &(candidate->m_phrase_string)); break; case ADDON_CANDIDATE: _token_get_phrase (instance->m_context->m_addon_phrase_index, - candidate->m_token, NULL, + candidate->m_token, 0, NULL, &(candidate->m_phrase_string)); break; case ZOMBIE_CANDIDATE: - assert(FALSE); + abort(); } } @@ -1662,8 +1862,7 @@ static gint compare_indexed_item_with_phrase_string(gconstpointer lhs, static bool _remove_duplicated_items_by_phrase_string -(pinyin_instance_t * instance, - CandidateVector candidates) { +(pinyin_instance_t * instance, CandidateVector candidates) { size_t i; /* create the GArray of indexed item */ GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t)); @@ -1690,6 +1889,22 @@ static bool _remove_duplicated_items_by_phrase_string cur_item->m_phrase_string)) { /* found duplicated candidates */ + /* as the longer candidates are longer than the pinyin input, + then only longer candidates can be equal. 
*/ + + if (LONGER_CANDIDATE == saved_item->m_candidate_type && + LONGER_CANDIDATE == cur_item->m_candidate_type) { + /* keep the high possibility one */ + if (saved_item->m_freq < cur_item->m_freq) { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + } else { + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + } + + continue; + } + /* both are nbest match candidate */ if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type && NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { @@ -1772,7 +1987,7 @@ static bool _check_offset(PhoneticKeyMatrix & matrix, size_t offset) { bool pinyin_guess_candidates(pinyin_instance_t * instance, size_t offset, - sort_option_t sort_option) { + guint sort_option) { pinyin_context_t * & context = instance->m_context; pinyin_option_t & options = context->m_options; @@ -1784,6 +1999,8 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance, if (0 == matrix.size()) return false; + instance->m_sort_option = sort_option; + /* lookup the previous token here. */ phrase_token_t prev_token = null_token; @@ -1870,20 +2087,17 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance, _compute_frequency_of_items(context, prev_token, &merged_gram, candidates); /* sort the candidates. 
*/ - switch (sort_option) { - case SORT_BY_PHRASE_LENGTH_AND_FREQUENCY: - g_array_sort(candidates, - compare_item_with_phrase_length_and_frequency); - break; - case SORT_BY_PHRASE_LENGTH_AND_PINYIN_LENGTH_AND_FREQUENCY: - g_array_sort(candidates, - compare_item_with_phrase_length_and_pinyin_length_and_frequency); - break; - } + g_array_sort_with_data + (candidates, compare_item_with_sort_option, + GUINT_TO_POINTER(sort_option)); /* post process to remove duplicated candidates */ - _prepend_sentence_candidates(instance, instance->m_candidates); + if (!(sort_option & SORT_WITHOUT_LONGER_CANDIDATE)) + _prepend_longer_candidates(instance, instance->m_candidates); + + if (!(sort_option & SORT_WITHOUT_SENTENCE_CANDIDATE)) + _prepend_sentence_candidates(instance, instance->m_candidates); _compute_phrase_strings_of_items(instance, instance->m_candidates); @@ -1910,6 +2124,7 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance, _free_candidates(candidates); + /* search bigram candidate. */ g_array_set_size(instance->m_prefixes, 0); _compute_prefixes(instance, prefix); @@ -1925,45 +2140,79 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance, context->m_user_bigram->load(prev_token, user_gram); merge_single_gram(&merged_gram, NULL, user_gram); + if (user_gram) + delete user_gram; + if (merged_gram.get_length()) break; } - if (0 == merged_gram.get_length()) - return false; + if (0 != merged_gram.get_length()) { - /* retrieve all items. */ - BigramPhraseWithCountArray tokens = g_array_new - (FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); - merged_gram.retrieve_all(tokens); + /* retrieve all items. */ + BigramPhraseWithCountArray tokens = g_array_new + (FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); + merged_gram.retrieve_all(tokens); - /* sort the longer word first. */ - PhraseItem cached_item; - for (ssize_t len = length; len > 0; --len) { - /* append items. 
*/ - for (size_t k = 0; k < tokens->len; ++k){ - BigramPhraseItemWithCount * phrase_item = &g_array_index - (tokens, BigramPhraseItemWithCount, k); + /* sort the longer word first. */ + PhraseItem cached_item; + for (ssize_t len = length; len > 0; --len) { + /* append items. */ + for (size_t k = 0; k < tokens->len; ++k){ + BigramPhraseItemWithCount * phrase_item = &g_array_index + (tokens, BigramPhraseItemWithCount, k); - if (phrase_item->m_count < filter) - continue; + if (phrase_item->m_count < filter) + continue; - int result = phrase_index->get_phrase_item - (phrase_item->m_token, cached_item); - if (ERROR_NO_SUB_PHRASE_INDEX == result) - continue; + int result = phrase_index->get_phrase_item + (phrase_item->m_token, cached_item); + if (ERROR_NO_SUB_PHRASE_INDEX == result) + continue; - if (len != cached_item.get_phrase_length()) - continue; + if (len != cached_item.get_phrase_length()) + continue; - lookup_candidate_t item; - item.m_candidate_type = PREDICTED_CANDIDATE; - item.m_token = phrase_item->m_token; - g_array_append_val(candidates, item); + lookup_candidate_t item; + item.m_candidate_type = PREDICTED_BIGRAM_CANDIDATE; + item.m_token = phrase_item->m_token; + g_array_append_val(candidates, item); + } } + } + + /* search prefix candidate. 
*/ + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + PhraseTokens phrase_tokens; + memset(phrase_tokens, 0, sizeof(phrase_tokens)); + phrase_index->prepare_tokens(phrase_tokens); + int result = context->m_phrase_table->search_suggestion + (instance->m_prefix_len, instance->m_prefix_ucs4, phrase_tokens); + int num = reduce_tokens(phrase_tokens, tokenarray, false); + phrase_index->destroy_tokens(phrase_tokens); + + PhraseItem item; + for (size_t i = 0; i < tokenarray->len; ++i) { + phrase_token_t token = g_array_index(tokenarray, phrase_token_t, i); + + phrase_index->get_phrase_item(token, item); + /* skip the phrase longer than prefix_len * 2 + 1 */ + if (item.get_phrase_length() > (instance->m_prefix_len * 2 + 1)) + continue; + + lookup_candidate_t template_item; + template_item.m_candidate_type = PREDICTED_PREFIX_CANDIDATE; + template_item.m_token = token; + template_item.m_begin = instance->m_prefix_len; + /* The prefix candidate only uses the m_begin variable. */ + template_item.m_end = 0; + g_array_append_val(candidates, template_item); } + g_array_free(tokenarray, TRUE); + /* post process to sort the candidates */ _compute_phrase_length(context, candidates); @@ -1971,7 +2220,11 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance, _compute_frequency_of_items(context, prev_token, &merged_gram, candidates); /* sort the candidates by phrase length and frequency. 
*/ - g_array_sort(candidates, compare_item_with_phrase_length_and_frequency); + guint sort_option = SORT_BY_PHRASE_LENGTH | SORT_BY_FREQUENCY; + + g_array_sort_with_data + (candidates, compare_item_with_sort_option, + GUINT_TO_POINTER(sort_option)); /* post process to remove duplicated candidates */ @@ -1979,16 +2232,17 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance, _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); - if (user_gram) - delete user_gram; - return true; } int pinyin_choose_candidate(pinyin_instance_t * instance, size_t offset, lookup_candidate_t * candidate){ - assert(PREDICTED_CANDIDATE != candidate->m_candidate_type); + const guint32 initial_seed = 23 * 3; + const guint32 unigram_factor = 7; + + assert(PREDICTED_BIGRAM_CANDIDATE != candidate->m_candidate_type && + PREDICTED_PREFIX_CANDIDATE != candidate->m_candidate_type); pinyin_context_t * context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; @@ -1997,12 +2251,23 @@ int pinyin_choose_candidate(pinyin_instance_t * instance, if (NBEST_MATCH_CANDIDATE == candidate->m_candidate_type) { MatchResult best = NULL, other = NULL; - assert(results.get_result(0, best)); - assert(results.get_result(candidate->m_nbest_index, other)); + check_result(results.get_result(0, best)); + check_result(results.get_result(candidate->m_nbest_index, other)); constraints->diff_result(best, other); return matrix.size() - 1; } + if (LONGER_CANDIDATE == candidate->m_candidate_type) { + /* only train uni-gram for longer candidate. 
*/ + phrase_token_t token = candidate->m_token; + int error = context->m_phrase_index->add_unigram_frequency + (token, initial_seed * unigram_factor); + if (ERROR_INTEGER_OVERFLOW == error) + return false; + + return true; + } + if (ADDON_CANDIDATE == candidate->m_candidate_type) { PhraseItem item; context->m_addon_phrase_index->get_phrase_item @@ -2034,6 +2299,19 @@ int pinyin_choose_candidate(pinyin_instance_t * instance, candidate->m_token = token; } + if (instance->m_sort_option & SORT_WITHOUT_SENTENCE_CANDIDATE) { + assert(0 == offset); + + /* only train uni-gram. */ + phrase_token_t token = candidate->m_token; + int error = context->m_phrase_index->add_unigram_frequency + (token, initial_seed * unigram_factor); + if (ERROR_INTEGER_OVERFLOW == error) + return false; + + return true; + } + /* sync m_constraints to the length of m_pinyin_keys. */ bool retval = constraints->validate_constraint(&matrix); @@ -2049,7 +2327,8 @@ int pinyin_choose_candidate(pinyin_instance_t * instance, bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance, lookup_candidate_t * candidate){ - assert(PREDICTED_CANDIDATE == candidate->m_candidate_type); + assert(PREDICTED_BIGRAM_CANDIDATE == candidate->m_candidate_type || + PREDICTED_PREFIX_CANDIDATE == candidate->m_candidate_type); const guint32 initial_seed = 23 * 3; const guint32 unigram_factor = 7; @@ -2064,6 +2343,10 @@ bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance, if (ERROR_INTEGER_OVERFLOW == error) return false; + /* The prefix candidate only trains uni-gram frequency. 
*/ + if (PREDICTED_PREFIX_CANDIDATE == candidate->m_candidate_type) + return true; + phrase_token_t prev_token = _get_previous_token(instance, 0); if (null_token == prev_token) return false; @@ -2076,14 +2359,14 @@ bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance, /* train bi-gram */ guint32 total_freq = 0; - assert(user_gram->get_total_freq(total_freq)); + check_result(user_gram->get_total_freq(total_freq)); guint32 freq = 0; if (!user_gram->get_freq(token, freq)) { - assert(user_gram->insert_freq(token, initial_seed)); + check_result(user_gram->insert_freq(token, initial_seed)); } else { - assert(user_gram->set_freq(token, freq + initial_seed)); + check_result(user_gram->set_freq(token, freq + initial_seed)); } - assert(user_gram->set_total_freq(total_freq + initial_seed)); + check_result(user_gram->set_total_freq(total_freq + initial_seed)); context->m_user_bigram->store(prev_token, user_gram); delete user_gram; return true; @@ -2131,7 +2414,7 @@ bool pinyin_train(pinyin_instance_t * instance, guint8 index){ MatchResult result = NULL; assert(index < results.size()); - assert(results.get_result(index, result)); + check_result(results.get_result(index, result)); bool retval = context->m_pinyin_lookup->train_result3 (&matrix, instance->m_constraints, result); @@ -2229,7 +2512,7 @@ bool pinyin_token_get_phrase(pinyin_instance_t * instance, pinyin_context_t * & context = instance->m_context; return _token_get_phrase(context->m_phrase_index, - token, len, utf8_str); + token, 0, len, utf8_str); } bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, @@ -2769,7 +3052,7 @@ static gchar * _get_aux_text_prefix(pinyin_instance_t * instance, else if (IS_ZHUYIN == options) str = key.get_zhuyin_string(); else - assert(FALSE); + abort(); gchar * newprefix = g_strconcat(prefix, str, " ", NULL); @@ -2813,7 +3096,7 @@ static gchar * _get_aux_text_postfix(pinyin_instance_t * instance, else if (IS_ZHUYIN == options) str = key.get_zhuyin_string(); 
else - assert(FALSE); + abort(); gchar * newpostfix = g_strconcat(postfix, str, " ", NULL); @@ -2934,7 +3217,7 @@ bool pinyin_get_double_pinyin_auxiliary_text(pinyin_instance_t * instance, middle = g_strconcat(shengmu, yunmu, "|", NULL); break; default: - assert(FALSE); + abort(); } g_free(shengmu); @@ -3046,8 +3329,10 @@ static bool _remember_phrase_recur(pinyin_instance_t * instance, return false; /* as cached_keys and phrase has the same length. */ - assert(cached_keys->len > 0); - assert(cached_keys->len <= MAX_PHRASE_LENGTH); + if (cached_keys->len <= 0) + return false; + if (cached_keys->len > MAX_PHRASE_LENGTH) + return false; return _add_phrase(context, index, cached_keys, phrase, phrase_length, count); @@ -3055,7 +3340,8 @@ static bool _remember_phrase_recur(pinyin_instance_t * instance, const size_t size = matrix.get_column_size(start); /* assume pinyin parsers will filter invalid keys. */ - assert(size > 0); + if (size <= 0) + return false; bool result = false; @@ -3069,18 +3355,22 @@ static bool _remember_phrase_recur(pinyin_instance_t * instance, const ChewingKey zero_key; if (zero_key == key) { /* assume only one key here for "'" or the last key. 
*/ - assert(1 == size); + if (1 != size) + return false; + return _remember_phrase_recur (instance, cached_keys, cached_tokens, newstart, phrase, count); } +#if 0 /* meet in-complete pinyin */ if (CHEWING_ZERO_MIDDLE == key.m_middle && CHEWING_ZERO_FINAL == key.m_final) { assert(CHEWING_ZERO_TONE == key.m_tone); return false; } +#endif /* check pronunciation */ if (cached_keys->len >= phrase_length) @@ -3136,7 +3426,8 @@ bool pinyin_remember_user_input(pinyin_instance_t * instance, return false; } - assert(cached_tokens->len == phrase_length); + if (cached_tokens->len != phrase_length) + return false; ChewingKeyVector cached_keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey)); @@ -3152,7 +3443,8 @@ bool pinyin_remember_user_input(pinyin_instance_t * instance, bool pinyin_is_user_candidate(pinyin_instance_t * instance, lookup_candidate_t * candidate) { - if (NORMAL_CANDIDATE != candidate->m_candidate_type) + if (NORMAL_CANDIDATE != candidate->m_candidate_type && + LONGER_CANDIDATE != candidate->m_candidate_type) return false; phrase_token_t token = candidate->m_token; diff --git a/src/pinyin.h b/src/pinyin.h index 6328e1d..bf3ac38 100644 --- a/src/pinyin.h +++ b/src/pinyin.h @@ -43,13 +43,24 @@ typedef enum _lookup_candidate_type_t{ NBEST_MATCH_CANDIDATE = 1, NORMAL_CANDIDATE, ZOMBIE_CANDIDATE, - PREDICTED_CANDIDATE, + PREDICTED_BIGRAM_CANDIDATE, + PREDICTED_PREFIX_CANDIDATE, ADDON_CANDIDATE, + LONGER_CANDIDATE, } lookup_candidate_type_t; typedef enum _sort_option_t{ - SORT_BY_PHRASE_LENGTH_AND_FREQUENCY = 1, - SORT_BY_PHRASE_LENGTH_AND_PINYIN_LENGTH_AND_FREQUENCY, + /* The sort order is phrase length, pinyin length, frequency. */ + SORT_WITHOUT_SENTENCE_CANDIDATE = 0x1, + SORT_WITHOUT_LONGER_CANDIDATE = 0x2, + SORT_BY_PHRASE_LENGTH = 0x4, + SORT_BY_PINYIN_LENGTH = 0x8, + SORT_BY_FREQUENCY = 0x10, + /* For compatibility. 
*/ + SORT_BY_PHRASE_LENGTH_AND_FREQUENCY = + SORT_WITHOUT_LONGER_CANDIDATE | SORT_BY_PHRASE_LENGTH | SORT_BY_FREQUENCY, + SORT_BY_PHRASE_LENGTH_AND_PINYIN_LENGTH_AND_FREQUENCY = + SORT_WITHOUT_LONGER_CANDIDATE | SORT_BY_PHRASE_LENGTH | SORT_BY_PINYIN_LENGTH | SORT_BY_FREQUENCY, } sort_option_t; /** @@ -483,7 +494,7 @@ bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, */ bool pinyin_guess_candidates(pinyin_instance_t * instance, size_t offset, - sort_option_t sort_option); + guint sort_option); /** * pinyin_choose_candidate: @@ -600,20 +611,6 @@ bool pinyin_get_luoma_pinyin_string(pinyin_instance_t * instance, gchar ** utf8_str); /** - * pinyin_get_luoma_pinyin_string: - * @instance: the pinyin instance. - * @key: the pinyin key. - * @utf8_str: the luoma pinyin string. - * @returns: whether the get operation is successful. - * - * Get the luoma pinyin string of the key. - * - */ -bool pinyin_get_luoma_pinyin_string(pinyin_instance_t * instance, - ChewingKey * key, - gchar ** utf8_str); - -/** * pinyin_get_secondary_zhuyin_string: * @instance: the pinyin instance. * @key: the pinyin key. diff --git a/src/pinyin_internal.h b/src/pinyin_internal.h index 082360b..44b455f 100644 --- a/src/pinyin_internal.h +++ b/src/pinyin_internal.h @@ -27,6 +27,7 @@ #include "memory_chunk.h" #include "pinyin_custom2.h" #include "chewing_key.h" +#include "pinyin_utils.h" #include "pinyin_parser2.h" #include "zhuyin_parser2.h" #include "phonetic_key_matrix.h" diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am index ba9d4f4..b64b9ea 100644 --- a/src/storage/Makefile.am +++ b/src/storage/Makefile.am @@ -14,7 +14,7 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. 
-INCLUDES = -I$(top_srcdir)/src/include \ +AM_CPPFLAGS = -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ @GLIB2_CFLAGS@ @@ -26,7 +26,7 @@ if ENABLE_LIBZHUYIN libpinyininclude_HEADERS += zhuyin_custom2.h endif -noinst_HEADERS = chewing_enum.h \ +noinst_HEADERS = chewing_enum.h \ chewing_key.h \ pinyin_parser2.h \ zhuyin_parser2.h \ @@ -64,13 +64,9 @@ noinst_HEADERS = chewing_enum.h \ kyotodb_utils.h -noinst_LTLIBRARIES = libstorage.la +noinst_LIBRARIES = libstorage.a -libstorage_la_CXXFLAGS = "-fPIC" - -libstorage_la_LDFLAGS = -static - -libstorage_la_SOURCES = phrase_index.cpp \ +libstorage_a_SOURCES = phrase_index.cpp \ phrase_large_table2.cpp \ phrase_large_table3.cpp \ ngram.cpp \ @@ -84,13 +80,13 @@ libstorage_la_SOURCES = phrase_index.cpp \ table_info.cpp if BERKELEYDB -libstorage_la_SOURCES += ngram_bdb.cpp \ +libstorage_a_SOURCES += ngram_bdb.cpp \ phrase_large_table3_bdb.cpp \ chewing_large_table2_bdb.cpp endif if KYOTOCABINET -libstorage_la_SOURCES += ngram_kyotodb.cpp \ +libstorage_a_SOURCES += ngram_kyotodb.cpp \ phrase_large_table3_kyotodb.cpp \ chewing_large_table2_kyotodb.cpp endif diff --git a/src/storage/bdb_utils.h b/src/storage/bdb_utils.h index b1c5832..7a83793 100644 --- a/src/storage/bdb_utils.h +++ b/src/storage/bdb_utils.h @@ -59,6 +59,10 @@ inline bool copy_bdb(DB * srcdb, DB * destdb) { while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { ret = destdb->put(destdb, NULL, &key, &data, 0); assert(0 == ret); + + /* Initialize our DBTs. 
*/ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); } assert(DB_NOTFOUND == ret); diff --git a/src/storage/chewing_large_table.cpp b/src/storage/chewing_large_table.cpp index 5716c5c..bd76e9b 100644 --- a/src/storage/chewing_large_table.cpp +++ b/src/storage/chewing_large_table.cpp @@ -344,7 +344,7 @@ ChewingLengthIndexLevel::~ChewingLengthIndexLevel() { CASE(14); CASE(15); default: - assert(false); + abort(); } } #undef CASE @@ -389,7 +389,7 @@ int ChewingLengthIndexLevel::search(pinyin_option_t options, int phrase_length, CASE(14); CASE(15); default: - assert(false); + abort(); } #undef CASE @@ -545,7 +545,7 @@ int ChewingLengthIndexLevel::add_index(int phrase_length, CASE(14); CASE(15); default: - assert(false); + abort(); } #undef CASE @@ -599,7 +599,7 @@ int ChewingLengthIndexLevel::remove_index(int phrase_length, CASE(14); CASE(15); default: - assert(false); + abort(); } #undef CASE @@ -669,8 +669,13 @@ bool ChewingLargeTable::load_text(FILE * infile, TABLE_PHONETIC_TYPE type) { size_t freq; while (!feof(infile)) { +#ifdef __APPLE__ + int num = fscanf(infile, "%255s %255[^ \t] %u %ld", + pinyin, phrase, &token, &freq); +#else int num = fscanf(infile, "%255s %255s %u %ld", pinyin, phrase, &token, &freq); +#endif if (4 != num) continue; @@ -799,7 +804,7 @@ bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk, bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end) { char * begin = (char *) chunk->begin(); - guint32 nindex = *((guint32 *)(begin + offset)); /* number of index */ + guint32 nindex = chunk->get_content<guint32>(offset); /* number of index */ table_offset_t * index = (table_offset_t *) (begin + offset + sizeof(guint32)); @@ -845,7 +850,7 @@ bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, CASE(14); CASE(15); default: - assert(false); + abort(); } #undef CASE @@ -905,7 +910,7 @@ bool ChewingLengthIndexLevel::store(MemoryChunk * new_chunk, CASE(14); 
CASE(15); default: - assert(false); + abort(); } #undef CASE @@ -1029,7 +1034,7 @@ bool ChewingLengthIndexLevel::mask_out(phrase_token_t mask, CASE(14); CASE(15); default: - assert(false); + abort(); } } #undef CASE diff --git a/src/storage/chewing_large_table.h b/src/storage/chewing_large_table.h index d836769..21158e1 100644 --- a/src/storage/chewing_large_table.h +++ b/src/storage/chewing_large_table.h @@ -26,6 +26,7 @@ #include "novel_types.h" #include "memory_chunk.h" #include "chewing_key.h" +#include "pinyin_utils.h" #include "table_info.h" namespace pinyin{ diff --git a/src/storage/chewing_large_table2.cpp b/src/storage/chewing_large_table2.cpp index c8f9b06..7ac1398 100644 --- a/src/storage/chewing_large_table2.cpp +++ b/src/storage/chewing_large_table2.cpp @@ -56,7 +56,7 @@ void ChewingLargeTable2::init_entries() { CASE(15); CASE(16); default: - assert(false); + abort(); } } @@ -97,7 +97,7 @@ void ChewingLargeTable2::fini_entries() { CASE(15); CASE(16); default: - assert(false); + abort(); } } @@ -115,8 +115,13 @@ bool ChewingLargeTable2::load_text(FILE * infile, TABLE_PHONETIC_TYPE type) { size_t freq; while (!feof(infile)) { +#ifdef __APPLE__ + int num = fscanf(infile, "%255s %255[^ \t] %u %ld", + pinyin, phrase, &token, &freq); +#else int num = fscanf(infile, "%255s %255s %u %ld", pinyin, phrase, &token, &freq); +#endif if (4 != num) continue; @@ -149,7 +154,7 @@ bool ChewingLargeTable2::load_text(FILE * infile, TABLE_PHONETIC_TYPE type) { }; if (len != keys->len) { - fprintf(stderr, "ChewingLargeTable::load_text:%s\t%s\t%u\t%ld\n", + fprintf(stderr, "ChewingLargeTable2::load_text:%s\t%s\t%u\t%ld\n", pinyin, phrase, token, freq); continue; } diff --git a/src/storage/chewing_large_table2.h b/src/storage/chewing_large_table2.h index ebf6114..d37ed7c 100644 --- a/src/storage/chewing_large_table2.h +++ b/src/storage/chewing_large_table2.h @@ -38,6 +38,32 @@ namespace pinyin{ class MaskOutVisitor2; +template<int phrase_length> +class 
PrefixLessThanWithTones{ +protected: + int m_prefix_len; + +public: + PrefixLessThanWithTones(int prefix_len) : + m_prefix_len(prefix_len) {} + + ~PrefixLessThanWithTones() { + m_prefix_len = 0; + } + + int prefix_compare_with_tones(const PinyinIndexItem2<phrase_length> &lhs, + const PinyinIndexItem2<phrase_length> &rhs) { + ChewingKey * keys_lhs = (ChewingKey *) lhs.m_keys; + ChewingKey * keys_rhs = (ChewingKey *) rhs.m_keys; + return pinyin_compare_with_tones(keys_lhs, keys_rhs, m_prefix_len); + } + + bool operator () (const PinyinIndexItem2<phrase_length> &lhs, + const PinyinIndexItem2<phrase_length> &rhs) { + return 0 > prefix_compare_with_tones(lhs, rhs); + } +}; + /* As this is a template class, the code will be in the header file. */ template<int phrase_length> class ChewingTableEntry{ @@ -57,8 +83,8 @@ public: /* convert method. */ /* compress consecutive tokens */ int convert(const ChewingKey keys[], - const IndexItem * begin, const IndexItem * end, - PhraseIndexRanges ranges) const { + const IndexItem * begin, const IndexItem * end, + PhraseIndexRanges ranges) const { const IndexItem * iter = NULL; PhraseIndexRange cursor; GArray * head, * cursor_head = NULL; @@ -120,6 +146,58 @@ public: return convert(keys, range.first, range.second, ranges); } + int convert_suggestion(int prefix_len, + const ChewingKey prefix_keys[], + const IndexItem * begin, const IndexItem * end, + PhraseTokens tokens) const { + assert(prefix_len < phrase_length); + const IndexItem * iter = NULL; + GArray * array = NULL; + + int result = SEARCH_NONE; + for (iter = begin; iter != end; ++iter) { + if (0 != pinyin_compare_with_tones + (prefix_keys, iter->m_keys, prefix_len)) + continue; + + phrase_token_t token = iter->m_token; + array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)]; + if (NULL == array) + continue; + + result |= SEARCH_OK; + g_array_append_val(array, token); + } + + return result; + } + + /* search_suggestion method */ + int search_suggestion(int prefix_len, + /* in */ 
const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const { + /* Usually suggestion candidates will have at least two characters, + use PhraseTokens instead of PhraseIndexRanges. */ + assert(prefix_len < phrase_length); + + IndexItem item; + if (contains_incomplete_pinyin(prefix_keys, prefix_len)) { + compute_incomplete_chewing_index + (prefix_keys, item.m_keys, prefix_len); + } else { + compute_chewing_index(prefix_keys, item.m_keys, prefix_len); + } + + const IndexItem * begin = (IndexItem *) m_chunk.begin(); + const IndexItem * end = (IndexItem *) m_chunk.end(); + + PrefixLessThanWithTones<phrase_length> less_than(prefix_len); + std_lite::pair<const IndexItem *, const IndexItem *> range = + std_lite::equal_range(begin, end, item, less_than); + + return convert_suggestion(prefix_len, prefix_keys, range.first, range.second, tokens); + } + /* add/remove index method */ int add_index(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) { diff --git a/src/storage/chewing_large_table2_bdb.cpp b/src/storage/chewing_large_table2_bdb.cpp index 6741ffd..a90685e 100644 --- a/src/storage/chewing_large_table2_bdb.cpp +++ b/src/storage/chewing_large_table2_bdb.cpp @@ -24,6 +24,31 @@ namespace pinyin{ +/* keep dbm key compare function inside the corresponding dbm file + to get more flexibility. */ + +static bool bdb_chewing_continue_search(const DBT *dbt1, + const DBT *dbt2) { + ChewingKey * lhs_chewing = (ChewingKey *) dbt1->data; + int lhs_chewing_length = dbt1->size / sizeof(ChewingKey); + ChewingKey * rhs_chewing = (ChewingKey *) dbt2->data; + int rhs_chewing_length = dbt2->size / sizeof(ChewingKey); + + /* The key in dbm is longer than the key in application. */ + if (lhs_chewing_length >= rhs_chewing_length) + return false; + + int min_chewing_length = lhs_chewing_length; + + int result = pinyin_exact_compare2 + (lhs_chewing, rhs_chewing, min_chewing_length); + if (0 != result) + return false; + + /* continue the longer chewing search. 
*/ + return true; +} + ChewingLargeTable2::ChewingLargeTable2() { /* create in-memory db. */ m_db = NULL; @@ -195,7 +220,7 @@ int ChewingLargeTable2::search_internal(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE @@ -203,6 +228,64 @@ int ChewingLargeTable2::search_internal(int phrase_length, return SEARCH_NONE; } +template<int phrase_length> +int ChewingLargeTable2::search_suggestion_internal +(/* in */ const DBT & db_data, + int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + + ChewingTableEntry<phrase_length> * entry = + (ChewingTableEntry<phrase_length> *) + g_ptr_array_index(m_entries, phrase_length); + assert(NULL != entry); + + entry->m_chunk.set_chunk(db_data.data, db_data.size, NULL); + + result = entry->search_suggestion(prefix_len, prefix_keys, tokens) | result; + + return result; +} + +int ChewingLargeTable2::search_suggestion_internal +(int phrase_length, + /* in */ const DBT & db_data, + int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const { + assert(prefix_len < phrase_length); + +#define CASE(len) case len: \ + { \ + return search_suggestion_internal<len> \ + (db_data, prefix_len, prefix_keys, tokens); \ + } + switch(phrase_length) { + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + abort(); + } + +#undef CASE + + return SEARCH_NONE; +} template<int phrase_length> int ChewingLargeTable2::add_index_internal(/* in */ const ChewingKey index[], @@ -302,7 +385,7 @@ int ChewingLargeTable2::add_index_internal(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE @@ -376,7 +459,7 @@ int ChewingLargeTable2::remove_index_internal(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE 
@@ -443,11 +526,14 @@ bool ChewingLargeTable2::mask_out(phrase_token_t mask, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE + /* Initialize our DBTs. */ + memset(&db_key, 0, sizeof(DBT)); + memset(&db_data, 0, sizeof(DBT)); } assert(ret == DB_NOTFOUND); @@ -460,4 +546,68 @@ bool ChewingLargeTable2::mask_out(phrase_token_t mask, return true; } +/* search_suggestion method */ +int ChewingLargeTable2::search_suggestion +(int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const { + ChewingKey index[MAX_PHRASE_LENGTH]; + int result = SEARCH_NONE; + + if (NULL == m_db) + return result; + + if (contains_incomplete_pinyin(prefix_keys, prefix_len)) + compute_incomplete_chewing_index(prefix_keys, index, prefix_len); + else + compute_chewing_index(prefix_keys, index, prefix_len); + + DBC * cursorp = NULL; + /* Get a cursor */ + int ret = m_db->cursor(m_db, NULL, &cursorp, 0); + if (ret != 0) + return result; + + DBT db_key1; + memset(&db_key1, 0, sizeof(DBT)); + db_key1.data = (void *) index; + db_key1.size = prefix_len * sizeof(ChewingKey); + + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + /* Get the prefix entry */ + ret = cursorp->c_get(cursorp, &db_key1, &db_data, DB_SET); + if (ret != 0) { + cursorp->c_close(cursorp); + return result; + } + + /* Get the next entry */ + DBT db_key2; + memset(&db_key2, 0, sizeof(DBT)); + memset(&db_data, 0, sizeof(DBT)); + ret = cursorp->c_get(cursorp, &db_key2, &db_data, DB_NEXT); + if (ret != 0) { + cursorp->c_close(cursorp); + return result; + } + + while(bdb_chewing_continue_search(&db_key1, &db_key2)) { + int phrase_length = db_key2.size / sizeof(ChewingKey); + result = search_suggestion_internal + (phrase_length, db_data, prefix_len, prefix_keys, tokens) | result; + + memset(&db_key2, 0, sizeof(DBT)); + memset(&db_data, 0, sizeof(DBT)); + ret = cursorp->c_get(cursorp, &db_key2, &db_data, DB_NEXT); + if (ret != 0) { + cursorp->c_close(cursorp); + return result; + } + 
} + + cursorp->c_close(cursorp); + return result; +} + }; diff --git a/src/storage/chewing_large_table2_bdb.h b/src/storage/chewing_large_table2_bdb.h index e12855e..a0da787 100644 --- a/src/storage/chewing_large_table2_bdb.h +++ b/src/storage/chewing_large_table2_bdb.h @@ -59,6 +59,18 @@ protected: /* out */ PhraseIndexRanges ranges) const; template<int phrase_length> + int search_suggestion_internal(/* in */ const DBT & db_data, + int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const; + + int search_suggestion_internal(int phrase_length, + /* in */ const DBT & db_data, + int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const; + + template<int phrase_length> int add_index_internal(/* in */ const ChewingKey index[], /* in */ const ChewingKey keys[], /* in */ phrase_token_t token); @@ -101,6 +113,11 @@ public: int search(int phrase_length, /* in */ const ChewingKey keys[], /* out */ PhraseIndexRanges ranges) const; + /* search_suggestion method */ + int search_suggestion(int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const; + /* add/remove index method */ int add_index(int phrase_length, /* in */ const ChewingKey keys[], /* in */ phrase_token_t token); diff --git a/src/storage/chewing_large_table2_kyotodb.cpp b/src/storage/chewing_large_table2_kyotodb.cpp index b2bc9fc..d5aeeb9 100644 --- a/src/storage/chewing_large_table2_kyotodb.cpp +++ b/src/storage/chewing_large_table2_kyotodb.cpp @@ -20,17 +20,43 @@ #include "chewing_large_table2.h" #include <kchashdb.h> -#include <kcprotodb.h> +#include <kccachedb.h> +#include "pinyin_utils.h" #include "kyotodb_utils.h" using namespace kyotocabinet; namespace pinyin{ +/* keep dbm key compare function inside the corresponding dbm file + to get more flexibility. 
*/ + +bool kyotodb_chewing_continue_search(const char* akbuf, size_t aksiz, + const char* bkbuf, size_t bksiz) { + ChewingKey * lhs_chewing = (ChewingKey *) akbuf; + int lhs_chewing_length = aksiz / sizeof(ChewingKey); + ChewingKey * rhs_chewing = (ChewingKey *) bkbuf; + int rhs_chewing_length = bksiz / sizeof(ChewingKey); + + /* The key in dbm is longer than the key in application. */ + if (lhs_chewing_length >= rhs_chewing_length) + return false; + + int min_chewing_length = lhs_chewing_length; + + int result = pinyin_exact_compare2 + (lhs_chewing, rhs_chewing, min_chewing_length); + if (0 != result) + return false; + + /* continue the longer chewing search. */ + return true; +} + ChewingLargeTable2::ChewingLargeTable2() { /* create in-memory db. */ m_db = new ProtoTreeDB; - assert(m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE)); + check_result(m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE)); m_entries = NULL; init_entries(); @@ -76,6 +102,10 @@ bool ChewingLargeTable2::load_db(const char * filename) { if (!m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE)) return false; + if (!m_db->load_snapshot(filename, NULL)) + return false; + +#if 0 /* load db into memory. 
*/ BasicDB * tmp_db = new TreeDB; if (!tmp_db->open(filename, BasicDB::OREADER)) @@ -86,6 +116,7 @@ bool ChewingLargeTable2::load_db(const char * filename) { tmp_db->close(); delete tmp_db; +#endif return true; } @@ -95,6 +126,10 @@ bool ChewingLargeTable2::store_db(const char * new_filename) { if ( ret != 0 && errno != ENOENT) return false; + if (!m_db->dump_snapshot(new_filename, NULL)) + return false; + +#if 0 BasicDB * tmp_db = new TreeDB; if (!tmp_db->open(new_filename, BasicDB::OWRITER|BasicDB::OCREATE)) return false; @@ -105,6 +140,7 @@ bool ChewingLargeTable2::store_db(const char * new_filename) { tmp_db->synchronize(); tmp_db->close(); delete tmp_db; +#endif return true; } @@ -134,8 +170,8 @@ int ChewingLargeTable2::search_internal(/* in */ const ChewingKey index[], entry->m_chunk.set_size(vsiz); /* m_chunk may re-allocate here. */ char * vbuf = (char *) entry->m_chunk.begin(); - assert(vsiz == m_db->get(kbuf, phrase_length * sizeof(ChewingKey), - vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, phrase_length * sizeof(ChewingKey), + vbuf, vsiz)); result = entry->search(keys, ranges) | result; @@ -169,7 +205,68 @@ int ChewingLargeTable2::search_internal(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); + } + +#undef CASE + + return SEARCH_NONE; +} + +template<int phrase_length> +int ChewingLargeTable2::search_suggestion_internal +(/* in */ const MemoryChunk & chunk, + int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + + ChewingTableEntry<phrase_length> * entry = + (ChewingTableEntry<phrase_length> *) + g_ptr_array_index(m_entries, phrase_length); + assert(NULL != entry); + + entry->m_chunk.set_chunk(chunk.begin(), chunk.size(), NULL); + + result = entry->search_suggestion(prefix_len, prefix_keys, tokens) | result; + + entry->m_chunk.set_size(0); + + return result; +} + +int ChewingLargeTable2::search_suggestion_internal +(int phrase_length, + /* in 
*/ const MemoryChunk & chunk, + int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const { + +#define CASE(len) case len: \ + { \ + return search_suggestion_internal<len> \ + (chunk, prefix_len, prefix_keys, tokens); \ + } + + switch(phrase_length) { + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + abort(); } #undef CASE @@ -226,7 +323,7 @@ int ChewingLargeTable2::add_index_internal(/* in */ const ChewingKey index[], entry->m_chunk.set_size(vsiz); /* m_chunk may re-allocate here. */ vbuf = (char *) entry->m_chunk.begin(); - assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); int result = entry->add_index(keys, token); @@ -267,7 +364,7 @@ int ChewingLargeTable2::add_index_internal(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE @@ -295,7 +392,7 @@ int ChewingLargeTable2::remove_index_internal(/* in */ const ChewingKey index[], entry->m_chunk.set_size(vsiz); /* m_chunk may re-allocate here. 
*/ vbuf = (char *) entry->m_chunk.begin(); - assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); int result = entry->remove_index(keys, token); if (ERROR_OK != result) @@ -338,7 +435,7 @@ int ChewingLargeTable2::remove_index_internal(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE @@ -399,12 +496,12 @@ public: CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE - assert(false); + abort(); return NOP; } @@ -423,4 +520,68 @@ bool ChewingLargeTable2::mask_out(phrase_token_t mask, return true; } +/* search_suggesion method */ +int ChewingLargeTable2::search_suggestion +(int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const { + ChewingKey index[MAX_PHRASE_LENGTH]; + int result = SEARCH_NONE; + + if (NULL == m_db) + return result; + + if (contains_incomplete_pinyin(prefix_keys, prefix_len)) + compute_incomplete_chewing_index(prefix_keys, index, prefix_len); + else + compute_chewing_index(prefix_keys, index, prefix_len); + + const char * akbuf = (char *) index; + const size_t aksiz = prefix_len * sizeof(ChewingKey); + const int32_t vsiz = m_db->check(akbuf, aksiz); + /* -1 on failure. 
*/ + if (-1 == vsiz) + return result; + + BasicDB::Cursor * cursor = m_db->cursor(); + bool retval = cursor->jump(akbuf, aksiz); + if (!retval) { + delete cursor; + return result; + } + + /* Get the next entry */ + retval = cursor->step(); + if (!retval) { + delete cursor; + return result; + } + + size_t bksiz = 0; + const char * bkbuf = cursor->get_key(&bksiz); + MemoryChunk chunk; + while(kyotodb_chewing_continue_search(akbuf, aksiz, bkbuf, bksiz)) { + int phrase_length = bksiz / sizeof(ChewingKey); + size_t bvsiz = 0; + char * bvbuf = cursor->get_value(&bvsiz); + chunk.set_chunk(bvbuf, bvsiz, NULL); + result = search_suggestion_internal + (phrase_length, chunk, prefix_len, prefix_keys, tokens) | result; + chunk.set_size(0); + delete [] bvbuf; + + retval = cursor->step(); + if (!retval) { + delete cursor; + return result; + } + + bksiz = 0; + bkbuf = cursor->get_key(&bksiz); + } + + delete cursor; + return result; +} + }; diff --git a/src/storage/chewing_large_table2_kyotodb.h b/src/storage/chewing_large_table2_kyotodb.h index 92f317b..fcfee83 100644 --- a/src/storage/chewing_large_table2_kyotodb.h +++ b/src/storage/chewing_large_table2_kyotodb.h @@ -59,6 +59,18 @@ protected: /* out */ PhraseIndexRanges ranges) const; template<int phrase_length> + int search_suggestion_internal(/* in */ const MemoryChunk & chunk, + int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const; + + int search_suggestion_internal(int phrase_length, + /* in */ const MemoryChunk & chunk, + int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const; + + template<int phrase_length> int add_index_internal(/* in */ const ChewingKey index[], /* in */ const ChewingKey keys[], /* in */ phrase_token_t token); @@ -100,6 +112,11 @@ public: int search(int phrase_length, /* in */ const ChewingKey keys[], /* out */ PhraseIndexRanges ranges) const; + /* search_suggesion method */ + int search_suggestion(int prefix_len, + /* 
in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const; + /* add/remove index method */ int add_index(int phrase_length, /* in */ const ChewingKey keys[], /* in */ phrase_token_t token); diff --git a/src/storage/facade_chewing_table2.h b/src/storage/facade_chewing_table2.h index 1cf2a1f..0d22b5d 100644 --- a/src/storage/facade_chewing_table2.h +++ b/src/storage/facade_chewing_table2.h @@ -129,6 +129,32 @@ public: } /** + * FacadeChewingTable2::search_suggestion: + * @prefix_len: the length of the prefix to be searched. + * @prefix_keys: the pinyin key of the prefix to be searched. + * @tokens: the array of GArrays to store the matched prefix token. + * @returns: the search result of enum SearchResult. + * + * Search the phrase tokens according to the prefix pinyin keys. + * + */ + int search_suggestion(int prefix_len, + /* in */ const ChewingKey prefix_keys[], + /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + + if (NULL != m_system_chewing_table) + result |= m_system_chewing_table->search_suggestion + (prefix_len, prefix_keys, tokens); + + if (NULL != m_user_chewing_table) + result |= m_user_chewing_table->search_suggestion + (prefix_len, prefix_keys, tokens); + + return result; + } + + /** * FacadeChewingTable2::add_index: * @phrase_length: the length of the phrase to be added. * @keys: the pinyin keys of the phrase to be added. diff --git a/src/storage/facade_phrase_table3.h b/src/storage/facade_phrase_table3.h index 3f71421..9ad9e85 100644 --- a/src/storage/facade_phrase_table3.h +++ b/src/storage/facade_phrase_table3.h @@ -129,6 +129,31 @@ public: } /** + * FacadePhraseTable3::search_suggestion: + * @phrase_length: the length of the prefix to be searched. + * @phrase: the ucs4 characters of the prefix to be searched. + * @tokens: the GArray of tokens to store the matched phrases. + * @returns: the search result of enum SearchResult. + * + * Search the phrase tokens according to the ucs4 prefix characters. 
+ * + */ + int search_suggestion(int phrase_length, /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + + if (NULL != m_system_phrase_table) + result |= m_system_phrase_table->search_suggestion + (phrase_length, phrase, tokens); + + if (NULL != m_user_phrase_table) + result |= m_user_phrase_table->search_suggestion + (phrase_length, phrase, tokens); + + return result; + } + + /** * FacadePhraseTable3::add_index: * @phrase_length: the length of the phrase to be added. * @phrase: the ucs4 characters of the phrase to be added. diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h index f6b0161..a80baa3 100644 --- a/src/storage/flexible_ngram.h +++ b/src/storage/flexible_ngram.h @@ -28,6 +28,7 @@ * struct MagicHeader, ArrayHeader, ArrayItem. */ +#include "pinyin_utils.h" #include "flexible_single_gram.h" #ifdef HAVE_BERKELEY_DB diff --git a/src/storage/flexible_ngram_bdb.h b/src/storage/flexible_ngram_bdb.h index 786cb76..3453752 100644 --- a/src/storage/flexible_ngram_bdb.h +++ b/src/storage/flexible_ngram_bdb.h @@ -105,7 +105,7 @@ public: int ret = db_create(&m_db, NULL, 0); if ( ret != 0 ) - assert(false); + abort(); ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644); if ( ret != 0 && (flags & ATTACH_CREATE) ) { @@ -270,6 +270,10 @@ public: } phrase_token_t * token = (phrase_token_t *) key.data; g_array_append_val(items, *token); + + /* Initialize our DBTs. 
*/ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); } if ( ret != DB_NOTFOUND ){ diff --git a/src/storage/flexible_ngram_kyotodb.h b/src/storage/flexible_ngram_kyotodb.h index ad84b78..b12fa42 100644 --- a/src/storage/flexible_ngram_kyotodb.h +++ b/src/storage/flexible_ngram_kyotodb.h @@ -171,7 +171,7 @@ public: m_chunk.set_size(vsiz); char * vbuf = (char *) m_chunk.begin(); - assert (vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); if ( memcmp(vbuf, m_magic_number, sizeof(m_magic_number)) == 0 ) @@ -206,8 +206,8 @@ public: m_chunk.set_size(vsiz); char * vbuf = (char *) m_chunk.begin(); - assert (vsiz == m_db->get(kbuf, sizeof(phrase_token_t), - vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, sizeof(phrase_token_t), + vbuf, vsiz)); single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem> (m_chunk.begin(), vsiz, copy); @@ -397,7 +397,7 @@ public: } else { /* found */ m_chunk.set_size(vsiz); char * vbuf = (char *) m_chunk.begin(); - assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); } m_chunk.set_content(0, &header, sizeof(ArrayHeader)); diff --git a/src/storage/kyotodb_utils.h b/src/storage/kyotodb_utils.h index 5e39212..4562007 100644 --- a/src/storage/kyotodb_utils.h +++ b/src/storage/kyotodb_utils.h @@ -49,6 +49,7 @@ inline uint32_t attach_options(guint32 flags) { /* Kyoto Cabinet requires non-NULL pointer for zero length value. 
*/ static const char * empty_vbuf = (char *)UINTPTR_MAX; +#if 0 class CopyVisitor : public DB::Visitor { private: BasicDB * m_db; @@ -68,6 +69,7 @@ public: return NOP; } }; +#endif }; diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp index b8347d9..e4bfe8f 100644 --- a/src/storage/ngram.cpp +++ b/src/storage/ngram.cpp @@ -46,14 +46,12 @@ SingleGram::SingleGram(void * buffer, size_t length, bool copy){ } bool SingleGram::get_total_freq(guint32 & total) const{ - char * buf_begin = (char *)m_chunk.begin(); - total = *((guint32 *)buf_begin); + total = m_chunk.get_content<guint32>(0); return true; } bool SingleGram::set_total_freq(guint32 total){ - char * buf_begin = (char *)m_chunk.begin(); - *((guint32 *)buf_begin) = total; + m_chunk.set_content<guint32>(0, total); return true; } @@ -68,7 +66,7 @@ guint32 SingleGram::get_length(){ if (0 == length) { /* no items here, total freq should be zero. */ guint32 total_freq = 0; - assert(get_total_freq(total_freq)); + check_result(get_total_freq(total_freq)); assert(0 == total_freq); } @@ -79,7 +77,7 @@ guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){ guint32 removed_items = 0; guint32 total_freq = 0; - assert(get_total_freq(total_freq)); + check_result(get_total_freq(total_freq)); const SingleGramItem * begin = (const SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); @@ -100,12 +98,12 @@ guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){ --cur; } - assert(set_total_freq(total_freq)); + check_result(set_total_freq(total_freq)); return removed_items; } bool SingleGram::prune(){ - assert(false); + abort(); #if 0 SingleGramItem * begin = (SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); @@ -122,8 +120,8 @@ bool SingleGram::prune(){ } } guint32 total_freq; - assert(get_total_freq(total_freq)); - assert(set_total_freq(total_freq - nitem)); + check_result(get_total_freq(total_freq)); + check_result(set_total_freq(total_freq - nitem)); 
#endif return true; } @@ -140,7 +138,7 @@ bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array) guint32 total_freq; BigramPhraseItemWithCount bigram_item_with_count; - assert(get_total_freq(total_freq)); + check_result(get_total_freq(total_freq)); for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){ bigram_item_with_count.m_token = cur_item->m_token; @@ -164,14 +162,14 @@ bool SingleGram::search(/* in */ PhraseIndexRange * range, guint32 total_freq; BigramPhraseItem bigram_item; - assert(get_total_freq(total_freq)); + check_result(get_total_freq(total_freq)); for ( ; cur_item != end; ++cur_item){ - if ( cur_item->m_token >= range->m_range_end ) - break; - bigram_item.m_token = cur_item->m_token; - bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq; - g_array_append_val(array, bigram_item); + if ( cur_item->m_token >= range->m_range_end ) + break; + bigram_item.m_token = cur_item->m_token; + bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq; + g_array_append_val(array, bigram_item); } return true; @@ -283,15 +281,17 @@ bool merge_single_gram(SingleGram * merged, const SingleGram * system, MemoryChunk & merged_chunk = merged->m_chunk; + merged_chunk.set_size(0); + if (NULL == system) { - merged_chunk.set_chunk(user->m_chunk.begin(), - user->m_chunk.size(), NULL); + merged_chunk.set_content(0, user->m_chunk.begin(), + user->m_chunk.size()); return true; } if (NULL == user) { - merged_chunk.set_chunk(system->m_chunk.begin(), - system->m_chunk.size(), NULL); + merged_chunk.set_content(0, system->m_chunk.begin(), + system->m_chunk.size()); return true; } @@ -300,8 +300,8 @@ bool merge_single_gram(SingleGram * merged, const SingleGram * system, /* merge the origin info and delta info */ guint32 system_total, user_total; - assert(system->get_total_freq(system_total)); - assert(user->get_total_freq(user_total)); + check_result(system->get_total_freq(system_total)); + check_result(user->get_total_freq(user_total)); 
const guint32 merged_total = system_total + user_total; merged_chunk.set_content(0, &merged_total, sizeof(guint32)); diff --git a/src/storage/ngram.h b/src/storage/ngram.h index 7f7a653..57979b4 100644 --- a/src/storage/ngram.h +++ b/src/storage/ngram.h @@ -24,6 +24,7 @@ #include "config.h" #include <glib.h> #include "novel_types.h" +#include "pinyin_utils.h" #ifdef HAVE_BERKELEY_DB #include "ngram_bdb.h" diff --git a/src/storage/ngram_bdb.cpp b/src/storage/ngram_bdb.cpp index a13b431..8910f9d 100644 --- a/src/storage/ngram_bdb.cpp +++ b/src/storage/ngram_bdb.cpp @@ -199,6 +199,10 @@ bool Bigram::get_all_items(GArray * items){ assert(key.size == sizeof(phrase_token_t)); phrase_token_t * token = (phrase_token_t *)key.data; g_array_append_val(items, *token); + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); } assert (ret == DB_NOTFOUND); @@ -223,12 +227,12 @@ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){ phrase_token_t index = g_array_index(items, phrase_token_t, i); if ((index & mask) == value) { - assert(remove(index)); + check_result(remove(index)); continue; } SingleGram * gram = NULL; - assert(load(index, gram)); + check_result(load(index, gram)); int num = gram->mask_out(mask, value); if (0 == num) { @@ -237,9 +241,9 @@ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){ } if (0 == gram->get_length()) { - assert(remove(index)); + check_result(remove(index)); } else { - assert(store(index, gram)); + check_result(store(index, gram)); } delete gram; diff --git a/src/storage/ngram_kyotodb.cpp b/src/storage/ngram_kyotodb.cpp index 98f2f59..560e196 100644 --- a/src/storage/ngram_kyotodb.cpp +++ b/src/storage/ngram_kyotodb.cpp @@ -22,7 +22,7 @@ #include <assert.h> #include <errno.h> #include <kchashdb.h> -#include <kcprotodb.h> +#include <kcstashdb.h> #include "kyotodb_utils.h" @@ -50,16 +50,20 @@ void Bigram::reset(){ } -/* Use ProtoHashDB for load_db/save_db methods. 
*/ +/* Use StashDB for load_db/save_db methods. */ bool Bigram::load_db(const char * dbfile){ reset(); /* create in-memory db. */ - m_db = new ProtoHashDB; + m_db = new StashDB; if ( !m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE) ) return false; + if (!m_db->load_snapshot(dbfile, NULL)) + return false; + +#if 0 /* load db into memory. */ BasicDB * tmp_db = new HashDB; if (!tmp_db->open(dbfile, BasicDB::OREADER)) @@ -70,6 +74,7 @@ bool Bigram::load_db(const char * dbfile){ tmp_db->close(); delete tmp_db; +#endif return true; } @@ -80,6 +85,10 @@ bool Bigram::save_db(const char * dbfile){ if ( ret != 0 && errno != ENOENT) return false; + if (!m_db->dump_snapshot(dbfile, NULL)) + return false; + +#if 0 BasicDB * tmp_db = new HashDB; if ( !tmp_db->open(dbfile, BasicDB::OWRITER|BasicDB::OCREATE) ) @@ -91,6 +100,7 @@ bool Bigram::save_db(const char * dbfile){ tmp_db->synchronize(); tmp_db->close(); delete tmp_db; +#endif return true; } @@ -123,7 +133,7 @@ bool Bigram::load(phrase_token_t index, SingleGram * & single_gram, m_chunk.set_size(vsiz); char * vbuf = (char *) m_chunk.begin(); - assert (vsiz == m_db->get(kbuf, sizeof(phrase_token_t), + check_result (vsiz == m_db->get(kbuf, sizeof(phrase_token_t), vbuf, vsiz)); single_gram = new SingleGram(m_chunk.begin(), vsiz, copy); @@ -196,12 +206,12 @@ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){ phrase_token_t index = g_array_index(items, phrase_token_t, i); if ((index & mask) == value) { - assert(remove(index)); + check_result(remove(index)); continue; } SingleGram * gram = NULL; - assert(load(index, gram)); + check_result(load(index, gram)); int num = gram->mask_out(mask, value); if (0 == num) { @@ -210,9 +220,9 @@ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){ } if (0 == gram->get_length()) { - assert(remove(index)); + check_result(remove(index)); } else { - assert(store(index, gram)); + check_result(store(index, gram)); } delete gram; diff --git 
a/src/storage/phonetic_key_matrix.cpp b/src/storage/phonetic_key_matrix.cpp index 058c2e4..ab7e879 100644 --- a/src/storage/phonetic_key_matrix.cpp +++ b/src/storage/phonetic_key_matrix.cpp @@ -437,6 +437,100 @@ int search_matrix(const FacadeChewingTable2 * table, return result; } +int search_suggestion_with_matrix_recur(GArray * cached_keys, + const FacadeChewingTable2 * table, + const PhoneticKeyMatrix * matrix, + size_t prefix_len, + size_t start, size_t end, + PhraseTokens tokens) { + if (start > end) + return SEARCH_NONE; + + /* only do chewing table search with 'start' and 'end'. */ + if (start == end) { + /* exceed the maximum phrase length. */ + if (cached_keys->len > MAX_PHRASE_LENGTH) + return SEARCH_NONE; + + /* skip the phrase longer than prefix_len * 2 + 1, + use the m_parsed_key_len variable for the prefix_len. */ + if (cached_keys->len > prefix_len * 2) + return SEARCH_NONE; + + /* only "'" here. */ + if (0 == cached_keys->len) + return SEARCH_NONE; + +#if 0 + printf("search table for suggestion candidate:%d\n", cached_keys->len); +#endif + return table->search_suggestion + (cached_keys->len, (ChewingKey *)cached_keys->data, tokens); + } + + int result = SEARCH_NONE; + + const size_t size = matrix->get_column_size(start); + /* assume pinyin parsers will filter invalid keys. */ + assert(size > 0); + + for (size_t i = 0; i < size; ++i) { + ChewingKey key; ChewingKeyRest key_rest; + matrix->get_item(start, i, key, key_rest); + + const size_t newstart = key_rest.m_raw_end; + + const ChewingKey zero_key; + if (zero_key == key) { + /* assume only one key here for "'" or the last key. 
*/ + assert(1 == size); + return search_suggestion_with_matrix_recur + (cached_keys, table, matrix, prefix_len, newstart, end, tokens); + } + + /* push value */ + g_array_append_val(cached_keys, key); + + result |= search_suggestion_with_matrix_recur + (cached_keys, table, matrix, prefix_len, newstart, end, tokens); + + /* pop value */ + g_array_set_size(cached_keys, cached_keys->len - 1); + } + + return result; +} + +int search_suggestion_with_matrix(const FacadeChewingTable2 * table, + const PhoneticKeyMatrix * matrix, + size_t prefix_len, + PhraseTokens tokens) { + int result = SEARCH_NONE; + + /* skip the prefix phrase is equal or longer than MAX_PHRASE_LENGTH, + as the prefix phrase candidate will always longer than prefix_len. */ + if (prefix_len >= MAX_PHRASE_LENGTH) + return result; + + size_t start = 0, end = matrix->size() - 1; + + const size_t start_len = matrix->get_column_size(start); + if (0 == start_len) + return result; + + const size_t end_len = matrix->get_column_size(end); + if (0 == end_len) + return result; + + GArray * cached_keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey)); + + result = search_suggestion_with_matrix_recur + (cached_keys, table, matrix, prefix_len, start, end, tokens); + + g_array_free(cached_keys, TRUE); + return result; +} + gfloat compute_pronunciation_possibility_recur(const PhoneticKeyMatrix * matrix, size_t start, size_t end, GArray * cached_keys, diff --git a/src/storage/phonetic_key_matrix.h b/src/storage/phonetic_key_matrix.h index ca7aa84..0b80a96 100644 --- a/src/storage/phonetic_key_matrix.h +++ b/src/storage/phonetic_key_matrix.h @@ -212,6 +212,11 @@ int search_matrix(const FacadeChewingTable2 * table, size_t start, size_t end, PhraseIndexRanges ranges); +int search_suggestion_with_matrix(const FacadeChewingTable2 * table, + const PhoneticKeyMatrix * matrix, + size_t prefix_len, + PhraseTokens tokens); + gfloat compute_pronunciation_possibility(const PhoneticKeyMatrix * matrix, size_t start, size_t end, GArray * 
cached_keys, diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp index 06b613f..bb98251 100644 --- a/src/storage/phrase_index.cpp +++ b/src/storage/phrase_index.cpp @@ -20,6 +20,7 @@ #include "phrase_index.h" #include "pinyin_custom2.h" +#include "unaligned_memory.h" namespace pinyin{ @@ -61,10 +62,12 @@ bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){ for (int i = 0; i < npron; ++i) { char * chewing_begin = buf_begin + offset + i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); - guint32 * freq = (guint32 *)(chewing_begin + - phrase_length * sizeof(ChewingKey)); - total_freq += *freq; + guint32 * pfreq = (guint32 *)(chewing_begin + + phrase_length * sizeof(ChewingKey)); + guint32 freq = UnalignedMemory<guint32>::load(pfreq); + + total_freq += freq; if (0 == pinyin_exact_compare2 (keys, (ChewingKey *)chewing_begin, phrase_length)) { @@ -74,8 +77,9 @@ bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){ if (delta > 0 && total_freq > total_freq + delta) return false; - *freq += delta; + freq += delta; total_freq += delta; + UnalignedMemory<guint32>::store(freq, pfreq); return true; } } @@ -117,9 +121,11 @@ void PhraseItem::increase_pronunciation_possibility(ChewingKey * keys, for (int i = 0; i < npron; ++i) { char * chewing_begin = buf_begin + offset + i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); - guint32 * freq = (guint32 *)(chewing_begin + - phrase_length * sizeof(ChewingKey)); - total_freq += *freq; + + guint32 * pfreq = (guint32 *)(chewing_begin + + phrase_length * sizeof(ChewingKey)); + guint32 freq = UnalignedMemory<guint32>::load(pfreq); + total_freq += freq; if (0 == pinyin_compare_with_tones(keys, (ChewingKey *)chewing_begin, phrase_length)) { @@ -128,8 +134,9 @@ void PhraseItem::increase_pronunciation_possibility(ChewingKey * keys, if (delta > 0 && total_freq > total_freq + delta) return; - *freq += delta; + freq += delta; total_freq += delta; + 
UnalignedMemory<guint32>::store(freq, pfreq); } } } @@ -505,7 +512,7 @@ bool SubPhraseIndex::merge(PhraseIndexLogger * logger){ break; } default: - assert(false); + abort(); } } return true; @@ -527,8 +534,13 @@ bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile, phrase_token_t cur_token = 0; while (!feof(infile)){ +#ifdef __APPLE__ + int num = fscanf(infile, "%255s %255[^ \t] %u %ld", + pinyin, phrase, &token, &freq); +#else int num = fscanf(infile, "%255s %255s %u %ld", pinyin, phrase, &token, &freq); +#endif if (4 != num) continue; @@ -796,7 +808,7 @@ bool _compute_new_header(PhraseIndexLogger * logger, break; } default: - assert(false); + abort(); } } diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h index 83dfb51..f97ac65 100644 --- a/src/storage/phrase_index.h +++ b/src/storage/phrase_index.h @@ -31,6 +31,7 @@ #include "memory_chunk.h" #include "phrase_index_logger.h" #include "table_info.h" +#include "unaligned_memory.h" /** * Phrase Index File Format @@ -121,8 +122,7 @@ public: * */ guint32 get_unigram_frequency(){ - char * buf_begin = (char *)m_chunk.begin(); - return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8))); + return m_chunk.get_content<guint32>(sizeof(guint8) + sizeof(guint8)); } /** @@ -142,12 +142,13 @@ public: for ( int i = 0 ; i < npron ; ++i){ char * chewing_begin = buf_begin + offset + i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); - guint32 * freq = (guint32 *)(chewing_begin + - phrase_length * sizeof(ChewingKey)); - total_freq += *freq; + + guint32 freq = UnalignedMemory<guint32>::load(chewing_begin + + phrase_length * sizeof(ChewingKey)); + total_freq += freq; if ( 0 == pinyin_compare_with_tones(keys, (ChewingKey *)chewing_begin, phrase_length) ){ - matched += *freq; + matched += freq; } } diff --git a/src/storage/phrase_index_logger.h b/src/storage/phrase_index_logger.h index cffd937..d1e42b6 100644 --- a/src/storage/phrase_index_logger.h +++ 
b/src/storage/phrase_index_logger.h @@ -294,7 +294,7 @@ public: break; } default: - assert(false); + abort(); } /* store log record. */ diff --git a/src/storage/phrase_large_table2.cpp b/src/storage/phrase_large_table2.cpp index 38dafb3..28eb313 100644 --- a/src/storage/phrase_large_table2.cpp +++ b/src/storage/phrase_large_table2.cpp @@ -184,7 +184,7 @@ PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){ CASE(15); CASE(16); default: - assert(false); + abort(); } } g_array_free(m_phrase_array_indexes, TRUE); @@ -228,7 +228,7 @@ int PhraseLengthIndexLevel2::search(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE } @@ -348,7 +348,7 @@ int PhraseLengthIndexLevel2::add_index(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE @@ -402,7 +402,7 @@ int PhraseLengthIndexLevel2::remove_index(int phrase_length, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE } @@ -472,8 +472,13 @@ bool PhraseLargeTable2::load_text(FILE * infile){ size_t freq; while (!feof(infile)) { +#ifdef __APPLE__ + int num = fscanf(infile, "%255s %255[^ \t] %u %ld", + pinyin, phrase, &token, &freq); +#else int num = fscanf(infile, "%255s %255s %u %ld", pinyin, phrase, &token, &freq); +#endif if (4 != num) continue; @@ -556,7 +561,8 @@ bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end) { char * buf_begin = (char *) chunk->begin(); - guint32 nindex = *((guint32 *)(buf_begin + offset)); + guint32 nindex = chunk->get_content<guint32>(offset); + table_offset_t * index = (table_offset_t *) (buf_begin + offset + sizeof(guint32)); @@ -600,7 +606,7 @@ bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk, CASE(15); CASE(16); default: - assert(false); + abort(); } #undef CASE } @@ -656,7 +662,7 @@ bool PhraseLengthIndexLevel2::store(MemoryChunk * new_chunk, CASE(15); CASE(16); default: - assert(false); + abort(); } //add '#' new_chunk->set_content(offset, 
&c_separate, sizeof(char)); @@ -776,7 +782,7 @@ bool PhraseLengthIndexLevel2::mask_out(phrase_token_t mask, CASE(15); CASE(16); default: - assert(false); + abort(); } } /* shrink self array. */ diff --git a/src/storage/phrase_large_table3.cpp b/src/storage/phrase_large_table3.cpp index 696c612..1e8fd76 100644 --- a/src/storage/phrase_large_table3.cpp +++ b/src/storage/phrase_large_table3.cpp @@ -128,8 +128,13 @@ bool PhraseLargeTable3::load_text(FILE * infile){ size_t freq; while (!feof(infile)) { +#ifdef __APPLE__ + int num = fscanf(infile, "%255s %255[^ \t] %u %ld", + pinyin, phrase, &token, &freq); +#else int num = fscanf(infile, "%255s %255s %u %ld", pinyin, phrase, &token, &freq); +#endif if (4 != num) continue; diff --git a/src/storage/phrase_large_table3.h b/src/storage/phrase_large_table3.h index 2774767..b4da01d 100644 --- a/src/storage/phrase_large_table3.h +++ b/src/storage/phrase_large_table3.h @@ -24,6 +24,7 @@ #include <stdio.h> #include "novel_types.h" #include "memory_chunk.h" +#include "pinyin_utils.h" #ifdef HAVE_BERKELEY_DB #include "phrase_large_table3_bdb.h" @@ -68,7 +69,8 @@ public: static inline int reduce_tokens(const PhraseTokens tokens, - TokenVector tokenarray) { + TokenVector tokenarray, + bool validate = true) { int num = 0; g_array_set_size(tokenarray, 0); @@ -82,8 +84,9 @@ static inline int reduce_tokens(const PhraseTokens tokens, g_array_append_vals(tokenarray, array->data, array->len); } - /* the following line will be removed in future after code are verified. */ - assert(0 <= num && num <= 4); + /* the following lines will be removed in future after code are verified. 
*/ + if (validate) + assert(0 <= num && num <= 4); return num; } diff --git a/src/storage/phrase_large_table3_bdb.cpp b/src/storage/phrase_large_table3_bdb.cpp index 03632ae..9074170 100644 --- a/src/storage/phrase_large_table3_bdb.cpp +++ b/src/storage/phrase_large_table3_bdb.cpp @@ -24,6 +24,44 @@ namespace pinyin{ +/* keep the following function synced between dbm implementations + for consistent phrase key compare. */ + +inline int compare_phrase(ucs4_t * lhs, ucs4_t * rhs, int phrase_length) { + int result; + for (int i = 0; i < phrase_length; ++i) { + result = lhs[i] - rhs[i]; + if (0 != result) + return result; + } + + return 0; +} + +/* keep dbm key compare function inside the corresponding dbm file + to get more flexibility. */ + +static bool bdb_phrase_continue_search(const DBT *dbt1, + const DBT *dbt2) { + ucs4_t * lhs_phrase = (ucs4_t *) dbt1->data; + int lhs_phrase_length = dbt1->size / sizeof(ucs4_t); + ucs4_t * rhs_phrase = (ucs4_t *) dbt2->data; + int rhs_phrase_length = dbt2->size / sizeof(ucs4_t); + + /* The key in dbm is longer than the key in application. */ + if (lhs_phrase_length >= rhs_phrase_length) + return false; + + int min_phrase_length = lhs_phrase_length; + + int result = compare_phrase (lhs_phrase, rhs_phrase, min_phrase_length); + if (0 != result) + return false; + + /* continue the longer phrase search. */ + return true; +} + PhraseLargeTable3::PhraseLargeTable3() { /* create in-memory db. 
*/ m_db = NULL; @@ -169,6 +207,64 @@ int PhraseLargeTable3::search(int phrase_length, return result; } +int PhraseLargeTable3::search_suggestion(int phrase_length, + /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + + if (NULL == m_db) + return result; + assert(NULL != m_entry); + + DBC * cursorp = NULL; + /* Get a cursor */ + int ret = m_db->cursor(m_db, NULL, &cursorp, 0); + if (ret != 0) + return result; + + DBT db_key1; + memset(&db_key1, 0, sizeof(DBT)); + db_key1.data = (void *) phrase; + db_key1.size = phrase_length * sizeof(ucs4_t); + + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + /* Get the prefix entry */ + ret = cursorp->c_get(cursorp, &db_key1, &db_data, DB_SET); + if (ret != 0) { + cursorp->c_close(cursorp); + return result; + } + + /* Get the next entry */ + DBT db_key2; + memset(&db_key2, 0, sizeof(DBT)); + memset(&db_data, 0, sizeof(DBT)); + ret = cursorp->c_get(cursorp, &db_key2, &db_data, DB_NEXT); + if (ret != 0) { + cursorp->c_close(cursorp); + return result; + } + + while(bdb_phrase_continue_search(&db_key1, &db_key2)) { + + m_entry->m_chunk.set_chunk(db_data.data, db_data.size, NULL); + result = m_entry->search(tokens) | result; + m_entry->m_chunk.set_size(0); + + memset(&db_key2, 0, sizeof(DBT)); + memset(&db_data, 0, sizeof(DBT)); + ret = cursorp->c_get(cursorp, &db_key2, &db_data, DB_NEXT); + if (ret != 0) { + cursorp->c_close(cursorp); + return result; + } + } + + cursorp->c_close(cursorp); + return result; +} + /* add_index/remove_index method */ int PhraseLargeTable3::add_index(int phrase_length, /* in */ const ucs4_t phrase[], @@ -302,6 +398,10 @@ bool PhraseLargeTable3::mask_out(phrase_token_t mask, db_data.size = entry.m_chunk.size(); int ret = cursorp->put(cursorp, &db_key, &db_data, DB_CURRENT); assert(ret == 0); + + /* Initialize our DBTs. 
*/ + memset(&db_key, 0, sizeof(DBT)); + memset(&db_data, 0, sizeof(DBT)); } assert(ret == DB_NOTFOUND); diff --git a/src/storage/phrase_large_table3_bdb.h b/src/storage/phrase_large_table3_bdb.h index 73f7625..da8f199 100644 --- a/src/storage/phrase_large_table3_bdb.h +++ b/src/storage/phrase_large_table3_bdb.h @@ -58,6 +58,8 @@ public: /* search method */ int search(int phrase_length, /* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const; + int search_suggestion(int phrase_length, /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const; /* add_index/remove_index method */ int add_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token); diff --git a/src/storage/phrase_large_table3_kyotodb.cpp b/src/storage/phrase_large_table3_kyotodb.cpp index 529aa65..f4b6faa 100644 --- a/src/storage/phrase_large_table3_kyotodb.cpp +++ b/src/storage/phrase_large_table3_kyotodb.cpp @@ -20,7 +20,7 @@ #include "phrase_large_table3.h" #include <kchashdb.h> -#include <kcprotodb.h> +#include <kccachedb.h> #include "kyotodb_utils.h" @@ -28,10 +28,47 @@ using namespace kyotocabinet; namespace pinyin{ +/* keep the following function synced between dbm implementations + for consistent phrase key compare. */ +inline int compare_phrase(ucs4_t * lhs, ucs4_t * rhs, int phrase_length) { + int result; + for (int i = 0; i < phrase_length; ++i) { + result = lhs[i] - rhs[i]; + if (0 != result) + return result; + } + + return 0; +} + +/* keep dbm key compare function inside the corresponding dbm file + to get more flexibility. */ + +bool kyotodb_phrase_continue_search(const char* akbuf, size_t aksiz, + const char* bkbuf, size_t bksiz) { + ucs4_t * lhs_phrase = (ucs4_t *) akbuf; + int lhs_phrase_length = aksiz / sizeof(ucs4_t); + ucs4_t * rhs_phrase = (ucs4_t *) bkbuf; + int rhs_phrase_length = bksiz / sizeof(ucs4_t); + + /* The key in dbm is longer than the key in application. 
*/ + if (lhs_phrase_length >= rhs_phrase_length) + return false; + + int min_phrase_length = lhs_phrase_length; + + int result = compare_phrase (lhs_phrase, rhs_phrase, min_phrase_length); + if (0 != result) + return false; + + /* continue the longer phrase search. */ + return true; +} + PhraseLargeTable3::PhraseLargeTable3() { /* create in-memory db. */ m_db = new ProtoTreeDB; - assert(m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE)); + check_result(m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE)); m_entry = new PhraseTableEntry; } @@ -80,6 +117,10 @@ bool PhraseLargeTable3::load_db(const char * filename) { if (!m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE)) return false; + if (!m_db->load_snapshot(filename, NULL)) + return false; + +#if 0 /* load db into memory. */ BasicDB * tmp_db = new TreeDB; if (!tmp_db->open(filename, BasicDB::OREADER)) @@ -90,6 +131,7 @@ bool PhraseLargeTable3::load_db(const char * filename) { tmp_db->close(); delete tmp_db; +#endif return true; } @@ -99,6 +141,10 @@ bool PhraseLargeTable3::store_db(const char * new_filename){ if ( ret != 0 && errno != ENOENT) return false; + if (!m_db->dump_snapshot(new_filename, NULL)) + return false; + +#if 0 BasicDB * tmp_db = new TreeDB; if (!tmp_db->open(new_filename, BasicDB::OWRITER|BasicDB::OCREATE)) return false; @@ -109,6 +155,7 @@ bool PhraseLargeTable3::store_db(const char * new_filename){ tmp_db->synchronize(); tmp_db->close(); delete tmp_db; +#endif return true; } @@ -137,14 +184,68 @@ int PhraseLargeTable3::search(int phrase_length, m_entry->m_chunk.set_size(vsiz); /* m_chunk may re-allocate here. 
*/ char * vbuf = (char *) m_entry->m_chunk.begin(); - assert (vsiz == m_db->get(kbuf, phrase_length * sizeof(ucs4_t), - vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, phrase_length * sizeof(ucs4_t), + vbuf, vsiz)); result = m_entry->search(tokens) | result; return result; } +int PhraseLargeTable3::search_suggestion(int phrase_length, + /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + + if (NULL == m_db) + return result; + assert(NULL != m_entry); + + const char * akbuf = (char *) phrase; + const size_t aksiz = phrase_length * sizeof(ucs4_t); + const int32_t vsiz = m_db->check(akbuf, aksiz); + /* -1 on failure. */ + if (-1 == vsiz) + return result; + + BasicDB::Cursor * cursor = m_db->cursor(); + bool retval = cursor->jump(akbuf, aksiz); + if (!retval) { + delete cursor; + return result; + } + + /* Get the next entry */ + retval = cursor->step(); + if (!retval) { + delete cursor; + return result; + } + + size_t bksiz = 0; + const char * bkbuf = cursor->get_key(&bksiz); + while(kyotodb_phrase_continue_search(akbuf, aksiz, bkbuf, bksiz)) { + size_t bvsiz = 0; + char * bvbuf = cursor->get_value(&bvsiz); + m_entry->m_chunk.set_chunk(bvbuf, bvsiz, NULL); + result = m_entry->search(tokens) | result; + m_entry->m_chunk.set_size(0); + delete [] bvbuf; + + retval = cursor->step(); + if (!retval) { + delete cursor; + return result; + } + + bksiz = 0; + bkbuf = cursor->get_key(&bksiz); + } + + delete cursor; + return result; +} + /* add_index/remove_index method */ int PhraseLargeTable3::add_index(int phrase_length, /* in */ const ucs4_t phrase[], @@ -192,7 +293,7 @@ int PhraseLargeTable3::add_index(int phrase_length, m_entry->m_chunk.set_size(vsiz); /* m_chunk may re-allocate here. 
*/ vbuf = (char *) m_entry->m_chunk.begin(); - assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); int result = m_entry->add_index(token); @@ -223,7 +324,7 @@ int PhraseLargeTable3::remove_index(int phrase_length, m_entry->m_chunk.set_size(vsiz); /* m_chunk may re-allocate here. */ vbuf = (char *) m_entry->m_chunk.begin(); - assert(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); + check_result(vsiz == m_db->get(kbuf, ksiz, vbuf, vsiz)); int result = m_entry->remove_index(token); if (ERROR_OK != result) diff --git a/src/storage/phrase_large_table3_kyotodb.h b/src/storage/phrase_large_table3_kyotodb.h index d122de0..c7f3b87 100644 --- a/src/storage/phrase_large_table3_kyotodb.h +++ b/src/storage/phrase_large_table3_kyotodb.h @@ -60,6 +60,8 @@ public: /* search method */ int search(int phrase_length, /* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const; + int search_suggestion(int phrase_length, /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const; /* add_index/remove_index method */ int add_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token); diff --git a/src/storage/pinyin_custom2.h b/src/storage/pinyin_custom2.h index fe2e3bd..320e846 100644 --- a/src/storage/pinyin_custom2.h +++ b/src/storage/pinyin_custom2.h @@ -28,7 +28,7 @@ G_BEGIN_DECLS /** * PinyinTableFlag: */ -enum PinyinTableFlag{ +typedef enum{ IS_PINYIN = 1U << 1, IS_ZHUYIN = 1U << 2, PINYIN_INCOMPLETE = 1U << 3, @@ -38,7 +38,7 @@ enum PinyinTableFlag{ USE_DIVIDED_TABLE = 1U << 7, USE_RESPLIT_TABLE = 1U << 8, DYNAMIC_ADJUST = 1U << 9 -}; +} PinyinTableFlag; /** * PinyinAmbiguity2: @@ -46,7 +46,7 @@ enum PinyinTableFlag{ * The enums of pinyin ambiguities. 
* */ -enum PinyinAmbiguity2{ +typedef enum{ PINYIN_AMB_C_CH = 1U << 10, PINYIN_AMB_S_SH = 1U << 11, PINYIN_AMB_Z_ZH = 1U << 12, @@ -58,7 +58,7 @@ enum PinyinAmbiguity2{ PINYIN_AMB_EN_ENG = 1U << 18, PINYIN_AMB_IN_ING = 1U << 19, PINYIN_AMB_ALL = 0x3FFU << 10 -}; +} PinyinAmbiguity2; /** * PinyinCorrection2: @@ -67,7 +67,7 @@ enum PinyinAmbiguity2{ * */ -enum PinyinCorrection2{ +typedef enum{ PINYIN_CORRECT_GN_NG = 1U << 21, PINYIN_CORRECT_MG_NG = 1U << 22, PINYIN_CORRECT_IOU_IU = 1U << 23, @@ -77,37 +77,35 @@ enum PinyinCorrection2{ PINYIN_CORRECT_V_U = 1U << 27, PINYIN_CORRECT_ON_ONG = 1U << 28, PINYIN_CORRECT_ALL = 0xFFU << 21 -}; +} PinyinCorrection2; /** - * PinyinCorrection2: + * ZhuyinCorrection2: * - * The enums of pinyin corrections. + * The enums of zhuyin corrections. * */ -enum ZhuyinCorrection2{ +typedef enum{ ZHUYIN_CORRECT_HSU = 1U << 29, ZHUYIN_CORRECT_ETEN26 = 1U << 30, ZHUYIN_CORRECT_SHUFFLE = 1U << 31, ZHUYIN_CORRECT_ALL = 0x7U << 29 -}; +} ZhuyinCorrection2; /** * @brief enums of Full Pinyin Schemes. */ -enum FullPinyinScheme -{ +typedef enum{ FULL_PINYIN_HANYU = 1, FULL_PINYIN_LUOMA = 2, FULL_PINYIN_SECONDARY_ZHUYIN = 3, FULL_PINYIN_DEFAULT = FULL_PINYIN_HANYU -}; +} FullPinyinScheme; /** * @brief enums of Double Pinyin Schemes. */ -enum DoublePinyinScheme -{ +typedef enum{ DOUBLE_PINYIN_ZRM = 1, DOUBLE_PINYIN_MS = 2, DOUBLE_PINYIN_ZIGUANG = 3, @@ -116,13 +114,12 @@ enum DoublePinyinScheme DOUBLE_PINYIN_XHE = 6, DOUBLE_PINYIN_CUSTOMIZED = 30, /* for user's keyboard */ DOUBLE_PINYIN_DEFAULT = DOUBLE_PINYIN_MS -}; +} DoublePinyinScheme; /** * @brief enums of Zhuyin Schemes. 
*/ -enum ZhuyinScheme -{ +typedef enum{ ZHUYIN_STANDARD = 1, ZHUYIN_HSU = 2, ZHUYIN_IBM = 3, @@ -133,7 +130,7 @@ enum ZhuyinScheme ZHUYIN_HSU_DVORAK = 8, ZHUYIN_DACHEN_CP26 = 9, ZHUYIN_DEFAULT = ZHUYIN_STANDARD -}; +} ZhuyinScheme; G_END_DECLS diff --git a/src/storage/pinyin_custom3.h b/src/storage/pinyin_custom3.h deleted file mode 100644 index 37e89bb..0000000 --- a/src/storage/pinyin_custom3.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * libpinyin - * Library to deal with pinyin. - * - * Copyright (C) 2016 Peng Wu <alexepico@gmail.com> - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - - -#include <glib.h> - - -G_BEGIN_DECLS - - -/** - * PinyinStandardOption: type pinyin_option_t. - * - * The enums of pinyin standard option. - * - */ -enum PinyinStandardOption{ - IS_PINYIN = 1U << 1, - IS_ZHUYIN = 1U << 2, - PINYIN_INCOMPLETE = 1U << 3, - ZHUYIN_INCOMPLETE = 1U << 4, - USE_TONE = 1U << 5, - FORCE_TONE = 1U << 6, - USE_DIVIDED_TABLE = 1U << 7, - USE_RESPLIT_TABLE = 1U << 8, - DYNAMIC_ADJUST = 1U << 9, - FULL_PINYIN_SUPPORT_QUOTATION = 1U << 10, -}; - -/** - * PinyinFuzzyOption: type pinyin_option_t. - * - * The enums of pinyin fuzzy option. 
- * - */ -enum PinyinFuzzyOption{ - PINYIN_FUZZY_C_CH = 1U << 10, - PINYIN_FUZZY_S_SH = 1U << 11, - PINYIN_FUZZY_Z_ZH = 1U << 12, - PINYIN_FUZZY_F_H = 1U << 13, - PINYIN_FUZZY_G_K = 1U << 14, - PINYIN_FUZZY_L_N = 1U << 15, - PINYIN_FUZZY_L_R = 1U << 16, - PINYIN_FUZZY_AN_ANG = 1U << 17, - PINYIN_FUZZY_EN_ENG = 1U << 18, - PINYIN_FUZZY_IN_ING = 1U << 19, - PINYIN_FUZZY_ALL = 0x3FFU << 10 -}; - -/** - * PinyinCorrectOption: type pinyin_option_t. - * - * The enums of pinyin correct option. - * - */ -enum PinyinCorrectOption{ - PINYIN_CORRECT_GN_NG = 1U << 21, - PINYIN_CORRECT_MG_NG = 1U << 22, - PINYIN_CORRECT_IOU_IU = 1U << 23, - PINYIN_CORRECT_UEI_UI = 1U << 24, - PINYIN_CORRECT_UEN_UN = 1U << 25, - PINYIN_CORRECT_UE_VE = 1U << 26, - PINYIN_CORRECT_V_U = 1U << 27, - PINYIN_CORRECT_ON_ONG = 1U << 28, - PINYIN_CORRECT_ALL = 0xFFU << 21, -}; - -/** - * ZhuyinCorrectOption: type pinyin_option_t. - * - * The enums of zhuyin correct option. - * - */ -enum ZhuyinCorrectOption{ - ZHUYIN_CORRECT_HSU = 1U << 29, - ZHUYIN_CORRECT_ETEN26 = 1U << 30, - ZHUYIN_CORRECT_SHUFFLE = 1U << 31, - ZHUYIN_CORRECT_ALL = 0x7U << 29 -}; - -/** - * @brief enums of Full Pinyin Schemes. - */ -enum FullPinyinScheme -{ - FULL_PINYIN_HANYU = 1, - FULL_PINYIN_LUOMA = 2, - FULL_PINYIN_SECONDARY_BOPOMOFO = 3, - FULL_PINYIN_DEFAULT = FULL_PINYIN_HANYU -}; - -/** - * @brief enums of Double Pinyin Schemes. - */ -enum DoublePinyinScheme -{ - DOUBLE_PINYIN_ZRM = 1, - DOUBLE_PINYIN_MS = 2, - DOUBLE_PINYIN_ZIGUANG = 3, - DOUBLE_PINYIN_ABC = 4, - DOUBLE_PINYIN_PYJJ = 5, - DOUBLE_PINYIN_XHE = 6, - DOUBLE_PINYIN_CUSTOMIZED = 30, /* for user's keyboard */ - DOUBLE_PINYIN_DEFAULT = DOUBLE_PINYIN_MS -}; - -/** - * @brief enums of Zhuyin Schemes. 
- */ -enum ZhuyinScheme -{ - ZHUYIN_STANDARD = 1, - ZHUYIN_HSU = 2, - ZHUYIN_IBM = 3, - ZHUYIN_GINYIEH = 4, - ZHUYIN_ETEN = 5, - ZHUYIN_ETEN26 = 6, - ZHUYIN_STANDARD_DVORAK = 7, - ZHUYIN_HSU_DVORAK = 8, - ZHUYIN_DACHEN_CP26 = 9, - ZHUYIN_DEFAULT = ZHUYIN_STANDARD -}; - - -G_END_DECLS diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp index da32455..ce640c3 100644 --- a/src/storage/pinyin_parser2.cpp +++ b/src/storage/pinyin_parser2.cpp @@ -66,6 +66,7 @@ struct parse_value_t{ ChewingKeyRest m_key_rest; gint16 m_num_keys; gint16 m_parsed_len; + gint16 m_distance; gint16 m_last_step; /* constructor */ @@ -73,6 +74,7 @@ public: parse_value_t(){ m_num_keys = 0; m_parsed_len = 0; + m_distance = 0; m_last_step = -1; } }; @@ -121,7 +123,8 @@ static inline bool search_pinyin_index2(pinyin_option_t options, const pinyin_index_item_t * index, size_t len, const char * pinyin, - ChewingKey & key){ + ChewingKey & key, + gint16 & distance){ pinyin_index_item_t item; memset(&item, 0, sizeof(item)); item.m_pinyin_input = pinyin; @@ -141,6 +144,7 @@ static inline bool search_pinyin_index2(pinyin_option_t options, return false; key = content_table[index->m_table_index].m_chewing_key; + distance = index->m_distance; assert(key.get_table_index() == index->m_table_index); return true; } @@ -159,6 +163,7 @@ FullPinyinParser2::FullPinyinParser2 (){ bool FullPinyinParser2::parse_one_key (pinyin_option_t options, ChewingKey & key, + gint16 & distance, const char * pinyin, int len) const { /* "'" are not accepted in parse_one_key. */ gchar * input = g_strndup(pinyin, len); @@ -189,7 +194,7 @@ bool FullPinyinParser2::parse_one_key (pinyin_option_t options, /* Note: optimize here? 
*/ input[parsed_len] = '\0'; if (!search_pinyin_index2(options, m_pinyin_index, m_pinyin_index_len, - input, key)) { + input, key, distance)) { g_free(input); return false; } @@ -239,6 +244,7 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys, nextstep->m_key_rest = ChewingKeyRest(); nextstep->m_num_keys = curstep->m_num_keys; nextstep->m_parsed_len = curstep->m_parsed_len + 1; + nextstep->m_distance = curstep->m_distance; nextstep->m_last_step = i; next_sep = 0; continue; @@ -265,13 +271,14 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys, nextstep = &g_array_index(m_parse_steps, parse_value_t, n); /* gen next step */ + gint16 distance = 0; const char * onepinyin = input + m; gint16 onepinyinlen = n - m; value = parse_value_t(); ChewingKey key; ChewingKeyRest rest; bool parsed = parse_one_key - (options, key, onepinyin, onepinyinlen); + (options, key, distance, onepinyin, onepinyinlen); rest.m_raw_begin = m; rest.m_raw_end = n; if (!parsed) continue; @@ -281,6 +288,7 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys, value.m_key = key; value.m_key_rest = rest; value.m_num_keys = curstep->m_num_keys + 1; value.m_parsed_len = curstep->m_parsed_len + onepinyinlen; + value.m_distance = curstep->m_distance + distance; value.m_last_step = m; /* save next step */ @@ -297,21 +305,26 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys, /* handle with the same pinyin length and the number of keys */ if (value.m_parsed_len == nextstep->m_parsed_len && - value.m_num_keys == nextstep->m_num_keys) { + value.m_num_keys == nextstep->m_num_keys && + value.m_distance < nextstep->m_distance) + *nextstep = value; #if 0 - /* prefer the 'a' at the end of clause, - * ex: "zheyanga$" -> "zhe'yang'a$". 
- */ - if (value.m_parsed_len == len && - (nextstep->m_key.m_initial != CHEWING_ZERO_INITIAL && - nextstep->m_key.m_final == CHEWING_A) && - (value.m_key.m_initial == CHEWING_ZERO_INITIAL && - value.m_key.m_middle == CHEWING_ZERO_MIDDLE && - value.m_key.m_final == CHEWING_A)) - *nextstep = value; + /* prefer the 'a' at the end of clause, + * ex: "zheyanga$" -> "zhe'yang'a$". + */ + if (value.m_parsed_len == len && + (value.m_parsed_len == nextstep->m_parsed_len && + value.m_num_keys == nextstep->m_num_keys && + value.m_distance == nextstep->m_distance) && + (nextstep->m_key.m_initial != CHEWING_ZERO_INITIAL && + nextstep->m_key.m_middle == CHEWING_ZERO_MIDDLE && + nextstep->m_key.m_final == CHEWING_A) && + (value.m_key.m_initial == CHEWING_ZERO_INITIAL && + value.m_key.m_middle == CHEWING_ZERO_MIDDLE && + value.m_key.m_final == CHEWING_A)) + *nextstep = value; #endif - } } } } @@ -382,7 +395,7 @@ bool FullPinyinParser2::set_scheme(FullPinyinScheme scheme){ m_pinyin_index_len = G_N_ELEMENTS(secondary_zhuyin_index); break; default: - assert(false); + abort(); } return true; } @@ -391,6 +404,7 @@ bool FullPinyinParser2::set_scheme(FullPinyinScheme scheme){ bool DoublePinyinParser2::parse_one_key(pinyin_option_t options, ChewingKey & key, + gint16 & distance, const char *str, int len) const { options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL); @@ -537,9 +551,10 @@ int DoublePinyinParser2::parse(pinyin_option_t options, ChewingKeyVector & keys, i = std_lite::min(maximum_len - parsed_len, (int)max_double_pinyin_length); + gint16 distance = 0; ChewingKey key; ChewingKeyRest key_rest; for (; i > 0; --i) { - bool success = parse_one_key(options, key, cur_str, i); + bool success = parse_one_key(options, key, distance, cur_str, i); if (success) break; } @@ -593,7 +608,7 @@ bool DoublePinyinParser2::set_scheme(DoublePinyinScheme scheme) { m_fallback_table = double_pinyin_xhe_fallback; return true; case DOUBLE_PINYIN_CUSTOMIZED: - assert(FALSE); + abort(); }; return false; /* no 
such scheme. */ @@ -607,6 +622,7 @@ PinyinDirectParser2::PinyinDirectParser2 (){ bool PinyinDirectParser2::parse_one_key(pinyin_option_t options, ChewingKey & key, + gint16 & distance, const char *str, int len) const { /* "'" are not accepted in parse_one_key. */ gchar * input = g_strndup(str, len); @@ -637,7 +653,7 @@ bool PinyinDirectParser2::parse_one_key(pinyin_option_t options, /* Note: optimize here? */ input[parsed_len] = '\0'; if (!search_pinyin_index2(options, m_pinyin_index, m_pinyin_index_len, - input, key)) { + input, key, distance)) { g_free(input); return false; } @@ -663,8 +679,6 @@ int PinyinDirectParser2::parse(pinyin_option_t options, g_array_set_size(keys, 0); g_array_set_size(key_rests, 0); - ChewingKey key; ChewingKeyRest key_rest; - int parsed_len = 0; int i = 0, cur = 0, next = 0; while (cur < len) { @@ -675,7 +689,9 @@ int PinyinDirectParser2::parse(pinyin_option_t options, } next = i; - if (parse_one_key(options, key, str + cur, next - cur)) { + gint16 distance = 0; + ChewingKey key; ChewingKeyRest key_rest; + if (parse_one_key(options, key, distance, str + cur, next - cur)) { key_rest.m_raw_begin = cur; key_rest.m_raw_end = next; /* save the pinyin. */ diff --git a/src/storage/pinyin_parser2.h b/src/storage/pinyin_parser2.h index 4a2f0f2..790c0cb 100644 --- a/src/storage/pinyin_parser2.h +++ b/src/storage/pinyin_parser2.h @@ -42,6 +42,7 @@ typedef struct { const char * m_pinyin_input; guint32 m_flags; guint16 m_table_index; + guint16 m_distance; } pinyin_index_item_t; typedef struct { @@ -114,7 +115,7 @@ public: * Parse only one struct ChewingKey from a string. 
* */ - virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const = 0; + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const = 0; /** * PhoneticParser2::parse: @@ -156,7 +157,7 @@ public: g_array_free(m_parse_steps, TRUE); } - virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const; /* Note: * the parse method will use dynamic programming to drive parse_one_key. @@ -195,7 +196,7 @@ public: virtual ~DoublePinyinParser2() {} - virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const; virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; @@ -215,7 +216,7 @@ public: virtual ~PinyinDirectParser2() {} - virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const; virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; }; diff --git a/src/storage/pinyin_parser_table.h b/src/storage/pinyin_parser_table.h index ad8bf7c..031a629 100644 --- a/src/storage/pinyin_parser_table.h +++ b/src/storage/pinyin_parser_table.h @@ -7,666 +7,666 @@ namespace pinyin{ const pinyin_index_item_t pinyin_index[] = { -{"a", IS_ZHUYIN|IS_PINYIN, 1}, -{"agn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 4}, -{"ai", IS_ZHUYIN|IS_PINYIN, 2}, -{"amg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 4}, -{"an", IS_ZHUYIN|IS_PINYIN, 3}, -{"ang", IS_ZHUYIN|IS_PINYIN, 4}, 
-{"ao", IS_ZHUYIN|IS_PINYIN, 5}, -{"b", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 6}, -{"ba", IS_ZHUYIN|IS_PINYIN, 7}, -{"bagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 10}, -{"bai", IS_ZHUYIN|IS_PINYIN, 8}, -{"bamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 10}, -{"ban", IS_ZHUYIN|IS_PINYIN, 9}, -{"bang", IS_ZHUYIN|IS_PINYIN, 10}, -{"bao", IS_ZHUYIN|IS_PINYIN, 11}, -{"begn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 14}, -{"bei", IS_ZHUYIN|IS_PINYIN, 12}, -{"bemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 14}, -{"ben", IS_ZHUYIN|IS_PINYIN, 13}, -{"beng", IS_ZHUYIN|IS_PINYIN, 14}, -{"bi", IS_ZHUYIN|IS_PINYIN, 15}, -{"bian", IS_ZHUYIN|IS_PINYIN, 16}, -{"biao", IS_ZHUYIN|IS_PINYIN, 17}, -{"bie", IS_ZHUYIN|IS_PINYIN, 18}, -{"bign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 20}, -{"bimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 20}, -{"bin", IS_ZHUYIN|IS_PINYIN, 19}, -{"bing", IS_ZHUYIN|IS_PINYIN, 20}, -{"bo", IS_ZHUYIN|IS_PINYIN, 21}, -{"bu", IS_ZHUYIN|IS_PINYIN, 22}, -{"c", IS_PINYIN|PINYIN_INCOMPLETE, 23}, -{"ca", IS_ZHUYIN|IS_PINYIN, 24}, -{"cagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 27}, -{"cai", IS_ZHUYIN|IS_PINYIN, 25}, -{"camg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 27}, -{"can", IS_ZHUYIN|IS_PINYIN, 26}, -{"cang", IS_ZHUYIN|IS_PINYIN, 27}, -{"cao", IS_ZHUYIN|IS_PINYIN, 28}, -{"ce", IS_ZHUYIN|IS_PINYIN, 29}, -{"cegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 31}, -{"cemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 31}, -{"cen", IS_ZHUYIN|IS_PINYIN, 30}, -{"ceng", IS_ZHUYIN|IS_PINYIN, 31}, -{"ch", IS_PINYIN|PINYIN_INCOMPLETE, 32}, -{"cha", IS_ZHUYIN|IS_PINYIN, 33}, -{"chagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 36}, -{"chai", IS_ZHUYIN|IS_PINYIN, 34}, -{"chamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 36}, -{"chan", IS_ZHUYIN|IS_PINYIN, 35}, -{"chang", IS_ZHUYIN|IS_PINYIN, 36}, -{"chao", IS_ZHUYIN|IS_PINYIN, 37}, -{"che", IS_ZHUYIN|IS_PINYIN, 38}, -{"chegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 40}, -{"chemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 40}, -{"chen", IS_ZHUYIN|IS_PINYIN, 39}, -{"cheng", IS_ZHUYIN|IS_PINYIN, 40}, -{"chi", 
IS_ZHUYIN|IS_PINYIN, 41}, -{"chogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 42}, -{"chomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 42}, -{"chon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 42}, -{"chong", IS_ZHUYIN|IS_PINYIN, 42}, -{"chou", IS_ZHUYIN|IS_PINYIN, 43}, -{"chu", IS_ZHUYIN|IS_PINYIN, 44}, -{"chuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 48}, -{"chuai", IS_ZHUYIN|IS_PINYIN, 46}, -{"chuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 48}, -{"chuan", IS_ZHUYIN|IS_PINYIN, 47}, -{"chuang", IS_ZHUYIN|IS_PINYIN, 48}, -{"chuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 49}, -{"chuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 50}, -{"chui", IS_ZHUYIN|IS_PINYIN, 49}, -{"chun", IS_ZHUYIN|IS_PINYIN, 50}, -{"chuo", IS_ZHUYIN|IS_PINYIN, 51}, -{"ci", IS_ZHUYIN|IS_PINYIN, 52}, -{"cogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 53}, -{"comg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 53}, -{"con", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 53}, -{"cong", IS_ZHUYIN|IS_PINYIN, 53}, -{"cou", IS_ZHUYIN|IS_PINYIN, 54}, -{"cu", IS_ZHUYIN|IS_PINYIN, 55}, -{"cuan", IS_ZHUYIN|IS_PINYIN, 56}, -{"cuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 57}, -{"cuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 58}, -{"cui", IS_ZHUYIN|IS_PINYIN, 57}, -{"cun", IS_ZHUYIN|IS_PINYIN, 58}, -{"cuo", IS_ZHUYIN|IS_PINYIN, 59}, -{"d", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 60}, -{"da", IS_ZHUYIN|IS_PINYIN, 61}, -{"dagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 64}, -{"dai", IS_ZHUYIN|IS_PINYIN, 62}, -{"damg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 64}, -{"dan", IS_ZHUYIN|IS_PINYIN, 63}, -{"dang", IS_ZHUYIN|IS_PINYIN, 64}, -{"dao", IS_ZHUYIN|IS_PINYIN, 65}, -{"de", IS_ZHUYIN|IS_PINYIN, 66}, -{"degn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 69}, -{"dei", IS_ZHUYIN|IS_PINYIN, 67}, -{"demg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 69}, -{"deng", IS_ZHUYIN|IS_PINYIN, 69}, -{"di", IS_ZHUYIN|IS_PINYIN, 70}, -{"dia", IS_ZHUYIN|IS_PINYIN, 71}, -{"dian", IS_ZHUYIN|IS_PINYIN, 72}, -{"diao", IS_ZHUYIN|IS_PINYIN, 73}, -{"die", IS_ZHUYIN|IS_PINYIN, 74}, -{"dign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 76}, -{"dimg", 
IS_PINYIN|PINYIN_CORRECT_MG_NG, 76}, -{"ding", IS_ZHUYIN|IS_PINYIN, 76}, -{"diou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 77}, -{"diu", IS_ZHUYIN|IS_PINYIN, 77}, -{"dogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 78}, -{"domg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 78}, -{"don", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 78}, -{"dong", IS_ZHUYIN|IS_PINYIN, 78}, -{"dou", IS_ZHUYIN|IS_PINYIN, 79}, -{"du", IS_ZHUYIN|IS_PINYIN, 80}, -{"duan", IS_ZHUYIN|IS_PINYIN, 81}, -{"duei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 82}, -{"duen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 83}, -{"dui", IS_ZHUYIN|IS_PINYIN, 82}, -{"dun", IS_ZHUYIN|IS_PINYIN, 83}, -{"duo", IS_ZHUYIN|IS_PINYIN, 84}, -{"e", IS_ZHUYIN|IS_PINYIN, 85}, -{"egn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 88}, -{"ei", IS_ZHUYIN|IS_PINYIN, 86}, -{"emg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 88}, -{"en", IS_ZHUYIN|IS_PINYIN, 87}, -{"er", IS_ZHUYIN|IS_PINYIN, 89}, -{"f", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 90}, -{"fa", IS_ZHUYIN|IS_PINYIN, 91}, -{"fagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 93}, -{"famg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 93}, -{"fan", IS_ZHUYIN|IS_PINYIN, 92}, -{"fang", IS_ZHUYIN|IS_PINYIN, 93}, -{"fegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 97}, -{"fei", IS_ZHUYIN|IS_PINYIN, 95}, -{"femg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 97}, -{"fen", IS_ZHUYIN|IS_PINYIN, 96}, -{"feng", IS_ZHUYIN|IS_PINYIN, 97}, -{"fo", IS_ZHUYIN|IS_PINYIN, 98}, -{"fou", IS_ZHUYIN|IS_PINYIN, 99}, -{"fu", IS_ZHUYIN|IS_PINYIN, 100}, -{"g", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 101}, -{"ga", IS_ZHUYIN|IS_PINYIN, 102}, -{"gagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 105}, -{"gai", IS_ZHUYIN|IS_PINYIN, 103}, -{"gamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 105}, -{"gan", IS_ZHUYIN|IS_PINYIN, 104}, -{"gang", IS_ZHUYIN|IS_PINYIN, 105}, -{"gao", IS_ZHUYIN|IS_PINYIN, 106}, -{"ge", IS_ZHUYIN|IS_PINYIN, 107}, -{"gegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 110}, -{"gei", IS_ZHUYIN|IS_PINYIN, 108}, -{"gemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 110}, -{"gen", IS_ZHUYIN|IS_PINYIN, 109}, -{"geng", 
IS_ZHUYIN|IS_PINYIN, 110}, -{"gogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 111}, -{"gomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 111}, -{"gon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 111}, -{"gong", IS_ZHUYIN|IS_PINYIN, 111}, -{"gou", IS_ZHUYIN|IS_PINYIN, 112}, -{"gu", IS_ZHUYIN|IS_PINYIN, 113}, -{"gua", IS_ZHUYIN|IS_PINYIN, 114}, -{"guagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 117}, -{"guai", IS_ZHUYIN|IS_PINYIN, 115}, -{"guamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 117}, -{"guan", IS_ZHUYIN|IS_PINYIN, 116}, -{"guang", IS_ZHUYIN|IS_PINYIN, 117}, -{"guei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 118}, -{"guen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 119}, -{"gui", IS_ZHUYIN|IS_PINYIN, 118}, -{"gun", IS_ZHUYIN|IS_PINYIN, 119}, -{"guo", IS_ZHUYIN|IS_PINYIN, 120}, -{"h", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 121}, -{"ha", IS_ZHUYIN|IS_PINYIN, 122}, -{"hagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 125}, -{"hai", IS_ZHUYIN|IS_PINYIN, 123}, -{"hamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 125}, -{"han", IS_ZHUYIN|IS_PINYIN, 124}, -{"hang", IS_ZHUYIN|IS_PINYIN, 125}, -{"hao", IS_ZHUYIN|IS_PINYIN, 126}, -{"he", IS_ZHUYIN|IS_PINYIN, 127}, -{"hegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 130}, -{"hei", IS_ZHUYIN|IS_PINYIN, 128}, -{"hemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 130}, -{"hen", IS_ZHUYIN|IS_PINYIN, 129}, -{"heng", IS_ZHUYIN|IS_PINYIN, 130}, -{"hogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 131}, -{"homg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 131}, -{"hon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 131}, -{"hong", IS_ZHUYIN|IS_PINYIN, 131}, -{"hou", IS_ZHUYIN|IS_PINYIN, 132}, -{"hu", IS_ZHUYIN|IS_PINYIN, 133}, -{"hua", IS_ZHUYIN|IS_PINYIN, 134}, -{"huagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 137}, -{"huai", IS_ZHUYIN|IS_PINYIN, 135}, -{"huamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 137}, -{"huan", IS_ZHUYIN|IS_PINYIN, 136}, -{"huang", IS_ZHUYIN|IS_PINYIN, 137}, -{"huei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 138}, -{"huen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 139}, -{"hui", IS_ZHUYIN|IS_PINYIN, 138}, -{"hun", IS_ZHUYIN|IS_PINYIN, 139}, -{"huo", 
IS_ZHUYIN|IS_PINYIN, 140}, -{"j", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 141}, -{"ji", IS_ZHUYIN|IS_PINYIN, 142}, -{"jia", IS_ZHUYIN|IS_PINYIN, 143}, -{"jiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 145}, -{"jiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 145}, -{"jian", IS_ZHUYIN|IS_PINYIN, 144}, -{"jiang", IS_ZHUYIN|IS_PINYIN, 145}, -{"jiao", IS_ZHUYIN|IS_PINYIN, 146}, -{"jie", IS_ZHUYIN|IS_PINYIN, 147}, -{"jign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 149}, -{"jimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 149}, -{"jin", IS_ZHUYIN|IS_PINYIN, 148}, -{"jing", IS_ZHUYIN|IS_PINYIN, 149}, -{"jiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 150}, -{"jiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 150}, -{"jion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 150}, -{"jiong", IS_ZHUYIN|IS_PINYIN, 150}, -{"jiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 151}, -{"jiu", IS_ZHUYIN|IS_PINYIN, 151}, -{"ju", IS_ZHUYIN|IS_PINYIN, 152}, -{"juan", IS_ZHUYIN|IS_PINYIN, 153}, -{"jue", IS_ZHUYIN|IS_PINYIN, 154}, -{"juen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 155}, -{"jun", IS_ZHUYIN|IS_PINYIN, 155}, -{"jv", IS_PINYIN|PINYIN_CORRECT_V_U, 152}, -{"jvan", IS_PINYIN|PINYIN_CORRECT_V_U, 153}, -{"jve", IS_PINYIN|PINYIN_CORRECT_V_U, 154}, -{"jvn", IS_PINYIN|PINYIN_CORRECT_V_U, 155}, -{"k", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 156}, -{"ka", IS_ZHUYIN|IS_PINYIN, 157}, -{"kagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 160}, -{"kai", IS_ZHUYIN|IS_PINYIN, 158}, -{"kamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 160}, -{"kan", IS_ZHUYIN|IS_PINYIN, 159}, -{"kang", IS_ZHUYIN|IS_PINYIN, 160}, -{"kao", IS_ZHUYIN|IS_PINYIN, 161}, -{"ke", IS_ZHUYIN|IS_PINYIN, 162}, -{"kegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 165}, -{"kemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 165}, -{"ken", IS_ZHUYIN|IS_PINYIN, 164}, -{"keng", IS_ZHUYIN|IS_PINYIN, 165}, -{"kogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 166}, -{"komg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 166}, -{"kon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 166}, -{"kong", IS_ZHUYIN|IS_PINYIN, 166}, -{"kou", IS_ZHUYIN|IS_PINYIN, 167}, 
-{"ku", IS_ZHUYIN|IS_PINYIN, 168}, -{"kua", IS_ZHUYIN|IS_PINYIN, 169}, -{"kuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 172}, -{"kuai", IS_ZHUYIN|IS_PINYIN, 170}, -{"kuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 172}, -{"kuan", IS_ZHUYIN|IS_PINYIN, 171}, -{"kuang", IS_ZHUYIN|IS_PINYIN, 172}, -{"kuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 173}, -{"kuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 174}, -{"kui", IS_ZHUYIN|IS_PINYIN, 173}, -{"kun", IS_ZHUYIN|IS_PINYIN, 174}, -{"kuo", IS_ZHUYIN|IS_PINYIN, 175}, -{"l", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 176}, -{"la", IS_ZHUYIN|IS_PINYIN, 177}, -{"lagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 180}, -{"lai", IS_ZHUYIN|IS_PINYIN, 178}, -{"lamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 180}, -{"lan", IS_ZHUYIN|IS_PINYIN, 179}, -{"lang", IS_ZHUYIN|IS_PINYIN, 180}, -{"lao", IS_ZHUYIN|IS_PINYIN, 181}, -{"le", IS_ZHUYIN|IS_PINYIN, 182}, -{"legn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 185}, -{"lei", IS_ZHUYIN|IS_PINYIN, 183}, -{"lemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 185}, -{"leng", IS_ZHUYIN|IS_PINYIN, 185}, -{"li", IS_ZHUYIN|IS_PINYIN, 186}, -{"lia", IS_ZHUYIN|IS_PINYIN, 187}, -{"liagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 189}, -{"liamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 189}, -{"lian", IS_ZHUYIN|IS_PINYIN, 188}, -{"liang", IS_ZHUYIN|IS_PINYIN, 189}, -{"liao", IS_ZHUYIN|IS_PINYIN, 190}, -{"lie", IS_ZHUYIN|IS_PINYIN, 191}, -{"lign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 193}, -{"limg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 193}, -{"lin", IS_ZHUYIN|IS_PINYIN, 192}, -{"ling", IS_ZHUYIN|IS_PINYIN, 193}, -{"liou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 194}, -{"liu", IS_ZHUYIN|IS_PINYIN, 194}, -{"lo", IS_ZHUYIN|IS_PINYIN, 195}, -{"logn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 196}, -{"lomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 196}, -{"lon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 196}, -{"long", IS_ZHUYIN|IS_PINYIN, 196}, -{"lou", IS_ZHUYIN|IS_PINYIN, 197}, -{"lu", IS_ZHUYIN|IS_PINYIN, 198}, -{"luan", IS_ZHUYIN|IS_PINYIN, 199}, -{"lue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 203}, -{"luen", 
IS_PINYIN|PINYIN_CORRECT_UEN_UN, 200}, -{"lun", IS_ZHUYIN|IS_PINYIN, 200}, -{"luo", IS_ZHUYIN|IS_PINYIN, 201}, -{"lv", IS_ZHUYIN|IS_PINYIN, 202}, -{"lve", IS_ZHUYIN|IS_PINYIN, 203}, -{"m", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 204}, -{"ma", IS_ZHUYIN|IS_PINYIN, 205}, -{"magn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 208}, -{"mai", IS_ZHUYIN|IS_PINYIN, 206}, -{"mamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 208}, -{"man", IS_ZHUYIN|IS_PINYIN, 207}, -{"mang", IS_ZHUYIN|IS_PINYIN, 208}, -{"mao", IS_ZHUYIN|IS_PINYIN, 209}, -{"me", IS_ZHUYIN|IS_PINYIN, 210}, -{"megn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 213}, -{"mei", IS_ZHUYIN|IS_PINYIN, 211}, -{"memg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 213}, -{"men", IS_ZHUYIN|IS_PINYIN, 212}, -{"meng", IS_ZHUYIN|IS_PINYIN, 213}, -{"mi", IS_ZHUYIN|IS_PINYIN, 214}, -{"mian", IS_ZHUYIN|IS_PINYIN, 215}, -{"miao", IS_ZHUYIN|IS_PINYIN, 216}, -{"mie", IS_ZHUYIN|IS_PINYIN, 217}, -{"mign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 219}, -{"mimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 219}, -{"min", IS_ZHUYIN|IS_PINYIN, 218}, -{"ming", IS_ZHUYIN|IS_PINYIN, 219}, -{"miou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 220}, -{"miu", IS_ZHUYIN|IS_PINYIN, 220}, -{"mo", IS_ZHUYIN|IS_PINYIN, 221}, -{"mou", IS_ZHUYIN|IS_PINYIN, 222}, -{"mu", IS_ZHUYIN|IS_PINYIN, 223}, -{"n", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 224}, -{"na", IS_ZHUYIN|IS_PINYIN, 225}, -{"nagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 228}, -{"nai", IS_ZHUYIN|IS_PINYIN, 226}, -{"namg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 228}, -{"nan", IS_ZHUYIN|IS_PINYIN, 227}, -{"nang", IS_ZHUYIN|IS_PINYIN, 228}, -{"nao", IS_ZHUYIN|IS_PINYIN, 229}, -{"ne", IS_ZHUYIN|IS_PINYIN, 230}, -{"negn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 233}, -{"nei", IS_ZHUYIN|IS_PINYIN, 231}, -{"nemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 233}, -{"nen", IS_ZHUYIN|IS_PINYIN, 232}, -{"neng", IS_ZHUYIN|IS_PINYIN, 233}, -{"ng", IS_ZHUYIN|IS_PINYIN, 234}, -{"ni", IS_ZHUYIN|IS_PINYIN, 235}, -{"niagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 238}, -{"niamg", 
IS_PINYIN|PINYIN_CORRECT_MG_NG, 238}, -{"nian", IS_ZHUYIN|IS_PINYIN, 237}, -{"niang", IS_ZHUYIN|IS_PINYIN, 238}, -{"niao", IS_ZHUYIN|IS_PINYIN, 239}, -{"nie", IS_ZHUYIN|IS_PINYIN, 240}, -{"nign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 242}, -{"nimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 242}, -{"nin", IS_ZHUYIN|IS_PINYIN, 241}, -{"ning", IS_ZHUYIN|IS_PINYIN, 242}, -{"niou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 243}, -{"niu", IS_ZHUYIN|IS_PINYIN, 243}, -{"nogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 244}, -{"nomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 244}, -{"non", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 244}, -{"nong", IS_ZHUYIN|IS_PINYIN, 244}, -{"nou", IS_ZHUYIN|IS_PINYIN, 245}, -{"nu", IS_ZHUYIN|IS_PINYIN, 246}, -{"nuan", IS_ZHUYIN|IS_PINYIN, 247}, -{"nue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 251}, -{"nuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 248}, -{"nuo", IS_ZHUYIN|IS_PINYIN, 249}, -{"nv", IS_ZHUYIN|IS_PINYIN, 250}, -{"nve", IS_ZHUYIN|IS_PINYIN, 251}, -{"o", IS_ZHUYIN|IS_PINYIN, 252}, -{"ou", IS_ZHUYIN|IS_PINYIN, 253}, -{"p", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 254}, -{"pa", IS_ZHUYIN|IS_PINYIN, 255}, -{"pagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 258}, -{"pai", IS_ZHUYIN|IS_PINYIN, 256}, -{"pamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 258}, -{"pan", IS_ZHUYIN|IS_PINYIN, 257}, -{"pang", IS_ZHUYIN|IS_PINYIN, 258}, -{"pao", IS_ZHUYIN|IS_PINYIN, 259}, -{"pegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 262}, -{"pei", IS_ZHUYIN|IS_PINYIN, 260}, -{"pemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 262}, -{"pen", IS_ZHUYIN|IS_PINYIN, 261}, -{"peng", IS_ZHUYIN|IS_PINYIN, 262}, -{"pi", IS_ZHUYIN|IS_PINYIN, 263}, -{"pian", IS_ZHUYIN|IS_PINYIN, 264}, -{"piao", IS_ZHUYIN|IS_PINYIN, 265}, -{"pie", IS_ZHUYIN|IS_PINYIN, 266}, -{"pign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 268}, -{"pimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 268}, -{"pin", IS_ZHUYIN|IS_PINYIN, 267}, -{"ping", IS_ZHUYIN|IS_PINYIN, 268}, -{"po", IS_ZHUYIN|IS_PINYIN, 269}, -{"pou", IS_ZHUYIN|IS_PINYIN, 270}, -{"pu", IS_ZHUYIN|IS_PINYIN, 271}, -{"q", 
IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 272}, -{"qi", IS_ZHUYIN|IS_PINYIN, 273}, -{"qia", IS_ZHUYIN|IS_PINYIN, 274}, -{"qiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 276}, -{"qiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 276}, -{"qian", IS_ZHUYIN|IS_PINYIN, 275}, -{"qiang", IS_ZHUYIN|IS_PINYIN, 276}, -{"qiao", IS_ZHUYIN|IS_PINYIN, 277}, -{"qie", IS_ZHUYIN|IS_PINYIN, 278}, -{"qign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 280}, -{"qimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 280}, -{"qin", IS_ZHUYIN|IS_PINYIN, 279}, -{"qing", IS_ZHUYIN|IS_PINYIN, 280}, -{"qiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 281}, -{"qiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 281}, -{"qion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 281}, -{"qiong", IS_ZHUYIN|IS_PINYIN, 281}, -{"qiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 282}, -{"qiu", IS_ZHUYIN|IS_PINYIN, 282}, -{"qu", IS_ZHUYIN|IS_PINYIN, 283}, -{"quan", IS_ZHUYIN|IS_PINYIN, 284}, -{"que", IS_ZHUYIN|IS_PINYIN, 285}, -{"quen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 286}, -{"qun", IS_ZHUYIN|IS_PINYIN, 286}, -{"qv", IS_PINYIN|PINYIN_CORRECT_V_U, 283}, -{"qvan", IS_PINYIN|PINYIN_CORRECT_V_U, 284}, -{"qve", IS_PINYIN|PINYIN_CORRECT_V_U, 285}, -{"qvn", IS_PINYIN|PINYIN_CORRECT_V_U, 286}, -{"r", IS_PINYIN|PINYIN_INCOMPLETE, 287}, -{"ragn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 289}, -{"ramg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 289}, -{"ran", IS_ZHUYIN|IS_PINYIN, 288}, -{"rang", IS_ZHUYIN|IS_PINYIN, 289}, -{"rao", IS_ZHUYIN|IS_PINYIN, 290}, -{"re", IS_ZHUYIN|IS_PINYIN, 291}, -{"regn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 293}, -{"remg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 293}, -{"ren", IS_ZHUYIN|IS_PINYIN, 292}, -{"reng", IS_ZHUYIN|IS_PINYIN, 293}, -{"ri", IS_ZHUYIN|IS_PINYIN, 294}, -{"rogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 295}, -{"romg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 295}, -{"ron", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 295}, -{"rong", IS_ZHUYIN|IS_PINYIN, 295}, -{"rou", IS_ZHUYIN|IS_PINYIN, 296}, -{"ru", IS_ZHUYIN|IS_PINYIN, 297}, -{"ruan", IS_ZHUYIN|IS_PINYIN, 299}, -{"ruei", 
IS_PINYIN|PINYIN_CORRECT_UEI_UI, 300}, -{"ruen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 301}, -{"rui", IS_ZHUYIN|IS_PINYIN, 300}, -{"run", IS_ZHUYIN|IS_PINYIN, 301}, -{"ruo", IS_ZHUYIN|IS_PINYIN, 302}, -{"s", IS_PINYIN|PINYIN_INCOMPLETE, 303}, -{"sa", IS_ZHUYIN|IS_PINYIN, 304}, -{"sagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 307}, -{"sai", IS_ZHUYIN|IS_PINYIN, 305}, -{"samg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 307}, -{"san", IS_ZHUYIN|IS_PINYIN, 306}, -{"sang", IS_ZHUYIN|IS_PINYIN, 307}, -{"sao", IS_ZHUYIN|IS_PINYIN, 308}, -{"se", IS_ZHUYIN|IS_PINYIN, 309}, -{"segn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 311}, -{"semg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 311}, -{"sen", IS_ZHUYIN|IS_PINYIN, 310}, -{"seng", IS_ZHUYIN|IS_PINYIN, 311}, -{"sh", IS_PINYIN|PINYIN_INCOMPLETE, 312}, -{"sha", IS_ZHUYIN|IS_PINYIN, 313}, -{"shagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 316}, -{"shai", IS_ZHUYIN|IS_PINYIN, 314}, -{"shamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 316}, -{"shan", IS_ZHUYIN|IS_PINYIN, 315}, -{"shang", IS_ZHUYIN|IS_PINYIN, 316}, -{"shao", IS_ZHUYIN|IS_PINYIN, 317}, -{"she", IS_ZHUYIN|IS_PINYIN, 318}, -{"shegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 321}, -{"shei", IS_ZHUYIN|IS_PINYIN, 319}, -{"shemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 321}, -{"shen", IS_ZHUYIN|IS_PINYIN, 320}, -{"sheng", IS_ZHUYIN|IS_PINYIN, 321}, -{"shi", IS_ZHUYIN|IS_PINYIN, 322}, -{"shou", IS_ZHUYIN|IS_PINYIN, 323}, -{"shu", IS_ZHUYIN|IS_PINYIN, 324}, -{"shua", IS_ZHUYIN|IS_PINYIN, 325}, -{"shuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 328}, -{"shuai", IS_ZHUYIN|IS_PINYIN, 326}, -{"shuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 328}, -{"shuan", IS_ZHUYIN|IS_PINYIN, 327}, -{"shuang", IS_ZHUYIN|IS_PINYIN, 328}, -{"shuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 329}, -{"shuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 330}, -{"shui", IS_ZHUYIN|IS_PINYIN, 329}, -{"shun", IS_ZHUYIN|IS_PINYIN, 330}, -{"shuo", IS_ZHUYIN|IS_PINYIN, 331}, -{"si", IS_ZHUYIN|IS_PINYIN, 332}, -{"sogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 333}, -{"somg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 333}, 
-{"son", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 333}, -{"song", IS_ZHUYIN|IS_PINYIN, 333}, -{"sou", IS_ZHUYIN|IS_PINYIN, 334}, -{"su", IS_ZHUYIN|IS_PINYIN, 335}, -{"suan", IS_ZHUYIN|IS_PINYIN, 336}, -{"suei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 337}, -{"suen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 338}, -{"sui", IS_ZHUYIN|IS_PINYIN, 337}, -{"sun", IS_ZHUYIN|IS_PINYIN, 338}, -{"suo", IS_ZHUYIN|IS_PINYIN, 339}, -{"t", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 340}, -{"ta", IS_ZHUYIN|IS_PINYIN, 341}, -{"tagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 344}, -{"tai", IS_ZHUYIN|IS_PINYIN, 342}, -{"tamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 344}, -{"tan", IS_ZHUYIN|IS_PINYIN, 343}, -{"tang", IS_ZHUYIN|IS_PINYIN, 344}, -{"tao", IS_ZHUYIN|IS_PINYIN, 345}, -{"te", IS_ZHUYIN|IS_PINYIN, 346}, -{"tegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 347}, -{"temg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 347}, -{"teng", IS_ZHUYIN|IS_PINYIN, 347}, -{"ti", IS_ZHUYIN|IS_PINYIN, 348}, -{"tian", IS_ZHUYIN|IS_PINYIN, 349}, -{"tiao", IS_ZHUYIN|IS_PINYIN, 350}, -{"tie", IS_ZHUYIN|IS_PINYIN, 351}, -{"tign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 352}, -{"timg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 352}, -{"ting", IS_ZHUYIN|IS_PINYIN, 352}, -{"togn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 353}, -{"tomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 353}, -{"ton", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 353}, -{"tong", IS_ZHUYIN|IS_PINYIN, 353}, -{"tou", IS_ZHUYIN|IS_PINYIN, 354}, -{"tu", IS_ZHUYIN|IS_PINYIN, 355}, -{"tuan", IS_ZHUYIN|IS_PINYIN, 356}, -{"tuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 357}, -{"tuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 358}, -{"tui", IS_ZHUYIN|IS_PINYIN, 357}, -{"tun", IS_ZHUYIN|IS_PINYIN, 358}, -{"tuo", IS_ZHUYIN|IS_PINYIN, 359}, -{"w", IS_PINYIN|PINYIN_INCOMPLETE, 360}, -{"wa", IS_ZHUYIN|IS_PINYIN, 361}, -{"wagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 364}, -{"wai", IS_ZHUYIN|IS_PINYIN, 362}, -{"wamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 364}, -{"wan", IS_ZHUYIN|IS_PINYIN, 363}, -{"wang", IS_ZHUYIN|IS_PINYIN, 364}, -{"wegn", 
IS_PINYIN|PINYIN_CORRECT_GN_NG, 367}, -{"wei", IS_ZHUYIN|IS_PINYIN, 365}, -{"wemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 367}, -{"wen", IS_ZHUYIN|IS_PINYIN, 366}, -{"weng", IS_ZHUYIN|IS_PINYIN, 367}, -{"wo", IS_ZHUYIN|IS_PINYIN, 368}, -{"wu", IS_ZHUYIN|IS_PINYIN, 369}, -{"x", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 370}, -{"xi", IS_ZHUYIN|IS_PINYIN, 371}, -{"xia", IS_ZHUYIN|IS_PINYIN, 372}, -{"xiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 374}, -{"xiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 374}, -{"xian", IS_ZHUYIN|IS_PINYIN, 373}, -{"xiang", IS_ZHUYIN|IS_PINYIN, 374}, -{"xiao", IS_ZHUYIN|IS_PINYIN, 375}, -{"xie", IS_ZHUYIN|IS_PINYIN, 376}, -{"xign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 378}, -{"ximg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 378}, -{"xin", IS_ZHUYIN|IS_PINYIN, 377}, -{"xing", IS_ZHUYIN|IS_PINYIN, 378}, -{"xiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 379}, -{"xiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 379}, -{"xion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 379}, -{"xiong", IS_ZHUYIN|IS_PINYIN, 379}, -{"xiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 380}, -{"xiu", IS_ZHUYIN|IS_PINYIN, 380}, -{"xu", IS_ZHUYIN|IS_PINYIN, 381}, -{"xuan", IS_ZHUYIN|IS_PINYIN, 382}, -{"xue", IS_ZHUYIN|IS_PINYIN, 383}, -{"xuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 384}, -{"xun", IS_ZHUYIN|IS_PINYIN, 384}, -{"xv", IS_PINYIN|PINYIN_CORRECT_V_U, 381}, -{"xvan", IS_PINYIN|PINYIN_CORRECT_V_U, 382}, -{"xve", IS_PINYIN|PINYIN_CORRECT_V_U, 383}, -{"xvn", IS_PINYIN|PINYIN_CORRECT_V_U, 384}, -{"y", IS_PINYIN|PINYIN_INCOMPLETE, 385}, -{"ya", IS_ZHUYIN|IS_PINYIN, 386}, -{"yagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 389}, -{"yamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 389}, -{"yan", IS_ZHUYIN|IS_PINYIN, 388}, -{"yang", IS_ZHUYIN|IS_PINYIN, 389}, -{"yao", IS_ZHUYIN|IS_PINYIN, 390}, -{"ye", IS_ZHUYIN|IS_PINYIN, 391}, -{"yi", IS_ZHUYIN|IS_PINYIN, 392}, -{"yign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 394}, -{"yimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 394}, -{"yin", IS_ZHUYIN|IS_PINYIN, 393}, -{"ying", IS_ZHUYIN|IS_PINYIN, 394}, -{"yo", 
IS_ZHUYIN|IS_PINYIN, 395}, -{"yogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 396}, -{"yomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 396}, -{"yon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 396}, -{"yong", IS_ZHUYIN|IS_PINYIN, 396}, -{"you", IS_ZHUYIN|IS_PINYIN, 397}, -{"yu", IS_ZHUYIN|IS_PINYIN, 398}, -{"yuan", IS_ZHUYIN|IS_PINYIN, 399}, -{"yue", IS_ZHUYIN|IS_PINYIN, 400}, -{"yuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 401}, -{"yun", IS_ZHUYIN|IS_PINYIN, 401}, -{"yv", IS_PINYIN|PINYIN_CORRECT_V_U, 398}, -{"yvan", IS_PINYIN|PINYIN_CORRECT_V_U, 399}, -{"yve", IS_PINYIN|PINYIN_CORRECT_V_U, 400}, -{"yvn", IS_PINYIN|PINYIN_CORRECT_V_U, 401}, -{"z", IS_PINYIN|PINYIN_INCOMPLETE, 402}, -{"za", IS_ZHUYIN|IS_PINYIN, 403}, -{"zagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 406}, -{"zai", IS_ZHUYIN|IS_PINYIN, 404}, -{"zamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 406}, -{"zan", IS_ZHUYIN|IS_PINYIN, 405}, -{"zang", IS_ZHUYIN|IS_PINYIN, 406}, -{"zao", IS_ZHUYIN|IS_PINYIN, 407}, -{"ze", IS_ZHUYIN|IS_PINYIN, 408}, -{"zegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 411}, -{"zei", IS_ZHUYIN|IS_PINYIN, 409}, -{"zemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 411}, -{"zen", IS_ZHUYIN|IS_PINYIN, 410}, -{"zeng", IS_ZHUYIN|IS_PINYIN, 411}, -{"zh", IS_PINYIN|PINYIN_INCOMPLETE, 412}, -{"zha", IS_ZHUYIN|IS_PINYIN, 413}, -{"zhagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 416}, -{"zhai", IS_ZHUYIN|IS_PINYIN, 414}, -{"zhamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 416}, -{"zhan", IS_ZHUYIN|IS_PINYIN, 415}, -{"zhang", IS_ZHUYIN|IS_PINYIN, 416}, -{"zhao", IS_ZHUYIN|IS_PINYIN, 417}, -{"zhe", IS_ZHUYIN|IS_PINYIN, 418}, -{"zhegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 421}, -{"zhemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 421}, -{"zhen", IS_ZHUYIN|IS_PINYIN, 420}, -{"zheng", IS_ZHUYIN|IS_PINYIN, 421}, -{"zhi", IS_ZHUYIN|IS_PINYIN, 422}, -{"zhogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 423}, -{"zhomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 423}, -{"zhon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 423}, -{"zhong", IS_ZHUYIN|IS_PINYIN, 423}, -{"zhou", IS_ZHUYIN|IS_PINYIN, 424}, -{"zhu", IS_ZHUYIN|IS_PINYIN, 
425}, -{"zhua", IS_ZHUYIN|IS_PINYIN, 426}, -{"zhuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 429}, -{"zhuai", IS_ZHUYIN|IS_PINYIN, 427}, -{"zhuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 429}, -{"zhuan", IS_ZHUYIN|IS_PINYIN, 428}, -{"zhuang", IS_ZHUYIN|IS_PINYIN, 429}, -{"zhuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 430}, -{"zhuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 431}, -{"zhui", IS_ZHUYIN|IS_PINYIN, 430}, -{"zhun", IS_ZHUYIN|IS_PINYIN, 431}, -{"zhuo", IS_ZHUYIN|IS_PINYIN, 432}, -{"zi", IS_ZHUYIN|IS_PINYIN, 433}, -{"zogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 434}, -{"zomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 434}, -{"zon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 434}, -{"zong", IS_ZHUYIN|IS_PINYIN, 434}, -{"zou", IS_ZHUYIN|IS_PINYIN, 435}, -{"zu", IS_ZHUYIN|IS_PINYIN, 436}, -{"zuan", IS_ZHUYIN|IS_PINYIN, 437}, -{"zuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 438}, -{"zuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 439}, -{"zui", IS_ZHUYIN|IS_PINYIN, 438}, -{"zun", IS_ZHUYIN|IS_PINYIN, 439}, -{"zuo", IS_ZHUYIN|IS_PINYIN, 440} +{"a", IS_ZHUYIN|IS_PINYIN, 1, 0}, +{"agn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 4, 1}, +{"ai", IS_ZHUYIN|IS_PINYIN, 2, 0}, +{"amg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 4, 1}, +{"an", IS_ZHUYIN|IS_PINYIN, 3, 0}, +{"ang", IS_ZHUYIN|IS_PINYIN, 4, 0}, +{"ao", IS_ZHUYIN|IS_PINYIN, 5, 0}, +{"b", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 6, 0}, +{"ba", IS_ZHUYIN|IS_PINYIN, 7, 0}, +{"bagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 10, 1}, +{"bai", IS_ZHUYIN|IS_PINYIN, 8, 0}, +{"bamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 10, 1}, +{"ban", IS_ZHUYIN|IS_PINYIN, 9, 0}, +{"bang", IS_ZHUYIN|IS_PINYIN, 10, 0}, +{"bao", IS_ZHUYIN|IS_PINYIN, 11, 0}, +{"begn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 14, 1}, +{"bei", IS_ZHUYIN|IS_PINYIN, 12, 0}, +{"bemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 14, 1}, +{"ben", IS_ZHUYIN|IS_PINYIN, 13, 0}, +{"beng", IS_ZHUYIN|IS_PINYIN, 14, 0}, +{"bi", IS_ZHUYIN|IS_PINYIN, 15, 0}, +{"bian", IS_ZHUYIN|IS_PINYIN, 16, 0}, +{"biao", IS_ZHUYIN|IS_PINYIN, 17, 0}, +{"bie", IS_ZHUYIN|IS_PINYIN, 18, 0}, 
+{"bign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 20, 1}, +{"bimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 20, 1}, +{"bin", IS_ZHUYIN|IS_PINYIN, 19, 0}, +{"bing", IS_ZHUYIN|IS_PINYIN, 20, 0}, +{"bo", IS_ZHUYIN|IS_PINYIN, 21, 0}, +{"bu", IS_ZHUYIN|IS_PINYIN, 22, 0}, +{"c", IS_PINYIN|PINYIN_INCOMPLETE, 23, 0}, +{"ca", IS_ZHUYIN|IS_PINYIN, 24, 0}, +{"cagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 27, 1}, +{"cai", IS_ZHUYIN|IS_PINYIN, 25, 0}, +{"camg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 27, 1}, +{"can", IS_ZHUYIN|IS_PINYIN, 26, 0}, +{"cang", IS_ZHUYIN|IS_PINYIN, 27, 0}, +{"cao", IS_ZHUYIN|IS_PINYIN, 28, 0}, +{"ce", IS_ZHUYIN|IS_PINYIN, 29, 0}, +{"cegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 31, 1}, +{"cemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 31, 1}, +{"cen", IS_ZHUYIN|IS_PINYIN, 30, 0}, +{"ceng", IS_ZHUYIN|IS_PINYIN, 31, 0}, +{"ch", IS_PINYIN|PINYIN_INCOMPLETE, 32, 0}, +{"cha", IS_ZHUYIN|IS_PINYIN, 33, 0}, +{"chagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 36, 1}, +{"chai", IS_ZHUYIN|IS_PINYIN, 34, 0}, +{"chamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 36, 1}, +{"chan", IS_ZHUYIN|IS_PINYIN, 35, 0}, +{"chang", IS_ZHUYIN|IS_PINYIN, 36, 0}, +{"chao", IS_ZHUYIN|IS_PINYIN, 37, 0}, +{"che", IS_ZHUYIN|IS_PINYIN, 38, 0}, +{"chegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 40, 1}, +{"chemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 40, 1}, +{"chen", IS_ZHUYIN|IS_PINYIN, 39, 0}, +{"cheng", IS_ZHUYIN|IS_PINYIN, 40, 0}, +{"chi", IS_ZHUYIN|IS_PINYIN, 41, 0}, +{"chogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 42, 1}, +{"chomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 42, 1}, +{"chon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 42, 1}, +{"chong", IS_ZHUYIN|IS_PINYIN, 42, 0}, +{"chou", IS_ZHUYIN|IS_PINYIN, 43, 0}, +{"chu", IS_ZHUYIN|IS_PINYIN, 44, 0}, +{"chuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 48, 1}, +{"chuai", IS_ZHUYIN|IS_PINYIN, 46, 0}, +{"chuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 48, 1}, +{"chuan", IS_ZHUYIN|IS_PINYIN, 47, 0}, +{"chuang", IS_ZHUYIN|IS_PINYIN, 48, 0}, +{"chuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 49, 1}, +{"chuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 50, 1}, 
+{"chui", IS_ZHUYIN|IS_PINYIN, 49, 0}, +{"chun", IS_ZHUYIN|IS_PINYIN, 50, 0}, +{"chuo", IS_ZHUYIN|IS_PINYIN, 51, 0}, +{"ci", IS_ZHUYIN|IS_PINYIN, 52, 0}, +{"cogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 53, 1}, +{"comg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 53, 1}, +{"con", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 53, 1}, +{"cong", IS_ZHUYIN|IS_PINYIN, 53, 0}, +{"cou", IS_ZHUYIN|IS_PINYIN, 54, 0}, +{"cu", IS_ZHUYIN|IS_PINYIN, 55, 0}, +{"cuan", IS_ZHUYIN|IS_PINYIN, 56, 0}, +{"cuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 57, 1}, +{"cuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 58, 1}, +{"cui", IS_ZHUYIN|IS_PINYIN, 57, 0}, +{"cun", IS_ZHUYIN|IS_PINYIN, 58, 0}, +{"cuo", IS_ZHUYIN|IS_PINYIN, 59, 0}, +{"d", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 60, 0}, +{"da", IS_ZHUYIN|IS_PINYIN, 61, 0}, +{"dagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 64, 1}, +{"dai", IS_ZHUYIN|IS_PINYIN, 62, 0}, +{"damg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 64, 1}, +{"dan", IS_ZHUYIN|IS_PINYIN, 63, 0}, +{"dang", IS_ZHUYIN|IS_PINYIN, 64, 0}, +{"dao", IS_ZHUYIN|IS_PINYIN, 65, 0}, +{"de", IS_ZHUYIN|IS_PINYIN, 66, 0}, +{"degn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 69, 1}, +{"dei", IS_ZHUYIN|IS_PINYIN, 67, 0}, +{"demg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 69, 1}, +{"deng", IS_ZHUYIN|IS_PINYIN, 69, 0}, +{"di", IS_ZHUYIN|IS_PINYIN, 70, 0}, +{"dia", IS_ZHUYIN|IS_PINYIN, 71, 0}, +{"dian", IS_ZHUYIN|IS_PINYIN, 72, 0}, +{"diao", IS_ZHUYIN|IS_PINYIN, 73, 0}, +{"die", IS_ZHUYIN|IS_PINYIN, 74, 0}, +{"dign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 76, 1}, +{"dimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 76, 1}, +{"ding", IS_ZHUYIN|IS_PINYIN, 76, 0}, +{"diou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 77, 1}, +{"diu", IS_ZHUYIN|IS_PINYIN, 77, 0}, +{"dogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 78, 1}, +{"domg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 78, 1}, +{"don", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 78, 1}, +{"dong", IS_ZHUYIN|IS_PINYIN, 78, 0}, +{"dou", IS_ZHUYIN|IS_PINYIN, 79, 0}, +{"du", IS_ZHUYIN|IS_PINYIN, 80, 0}, +{"duan", IS_ZHUYIN|IS_PINYIN, 81, 0}, +{"duei", 
IS_PINYIN|PINYIN_CORRECT_UEI_UI, 82, 1}, +{"duen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 83, 1}, +{"dui", IS_ZHUYIN|IS_PINYIN, 82, 0}, +{"dun", IS_ZHUYIN|IS_PINYIN, 83, 0}, +{"duo", IS_ZHUYIN|IS_PINYIN, 84, 0}, +{"e", IS_ZHUYIN|IS_PINYIN, 85, 0}, +{"egn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 88, 1}, +{"ei", IS_ZHUYIN|IS_PINYIN, 86, 0}, +{"emg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 88, 1}, +{"en", IS_ZHUYIN|IS_PINYIN, 87, 0}, +{"er", IS_ZHUYIN|IS_PINYIN, 89, 0}, +{"f", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 90, 0}, +{"fa", IS_ZHUYIN|IS_PINYIN, 91, 0}, +{"fagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 93, 1}, +{"famg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 93, 1}, +{"fan", IS_ZHUYIN|IS_PINYIN, 92, 0}, +{"fang", IS_ZHUYIN|IS_PINYIN, 93, 0}, +{"fegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 97, 1}, +{"fei", IS_ZHUYIN|IS_PINYIN, 95, 0}, +{"femg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 97, 1}, +{"fen", IS_ZHUYIN|IS_PINYIN, 96, 0}, +{"feng", IS_ZHUYIN|IS_PINYIN, 97, 0}, +{"fo", IS_ZHUYIN|IS_PINYIN, 98, 0}, +{"fou", IS_ZHUYIN|IS_PINYIN, 99, 0}, +{"fu", IS_ZHUYIN|IS_PINYIN, 100, 0}, +{"g", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 101, 0}, +{"ga", IS_ZHUYIN|IS_PINYIN, 102, 0}, +{"gagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 105, 1}, +{"gai", IS_ZHUYIN|IS_PINYIN, 103, 0}, +{"gamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 105, 1}, +{"gan", IS_ZHUYIN|IS_PINYIN, 104, 0}, +{"gang", IS_ZHUYIN|IS_PINYIN, 105, 0}, +{"gao", IS_ZHUYIN|IS_PINYIN, 106, 0}, +{"ge", IS_ZHUYIN|IS_PINYIN, 107, 0}, +{"gegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 110, 1}, +{"gei", IS_ZHUYIN|IS_PINYIN, 108, 0}, +{"gemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 110, 1}, +{"gen", IS_ZHUYIN|IS_PINYIN, 109, 0}, +{"geng", IS_ZHUYIN|IS_PINYIN, 110, 0}, +{"gogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 111, 1}, +{"gomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 111, 1}, +{"gon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 111, 1}, +{"gong", IS_ZHUYIN|IS_PINYIN, 111, 0}, +{"gou", IS_ZHUYIN|IS_PINYIN, 112, 0}, +{"gu", IS_ZHUYIN|IS_PINYIN, 113, 0}, +{"gua", IS_ZHUYIN|IS_PINYIN, 114, 
0}, +{"guagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 117, 1}, +{"guai", IS_ZHUYIN|IS_PINYIN, 115, 0}, +{"guamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 117, 1}, +{"guan", IS_ZHUYIN|IS_PINYIN, 116, 0}, +{"guang", IS_ZHUYIN|IS_PINYIN, 117, 0}, +{"guei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 118, 1}, +{"guen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 119, 1}, +{"gui", IS_ZHUYIN|IS_PINYIN, 118, 0}, +{"gun", IS_ZHUYIN|IS_PINYIN, 119, 0}, +{"guo", IS_ZHUYIN|IS_PINYIN, 120, 0}, +{"h", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 121, 0}, +{"ha", IS_ZHUYIN|IS_PINYIN, 122, 0}, +{"hagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 125, 1}, +{"hai", IS_ZHUYIN|IS_PINYIN, 123, 0}, +{"hamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 125, 1}, +{"han", IS_ZHUYIN|IS_PINYIN, 124, 0}, +{"hang", IS_ZHUYIN|IS_PINYIN, 125, 0}, +{"hao", IS_ZHUYIN|IS_PINYIN, 126, 0}, +{"he", IS_ZHUYIN|IS_PINYIN, 127, 0}, +{"hegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 130, 1}, +{"hei", IS_ZHUYIN|IS_PINYIN, 128, 0}, +{"hemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 130, 1}, +{"hen", IS_ZHUYIN|IS_PINYIN, 129, 0}, +{"heng", IS_ZHUYIN|IS_PINYIN, 130, 0}, +{"hogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 131, 1}, +{"homg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 131, 1}, +{"hon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 131, 1}, +{"hong", IS_ZHUYIN|IS_PINYIN, 131, 0}, +{"hou", IS_ZHUYIN|IS_PINYIN, 132, 0}, +{"hu", IS_ZHUYIN|IS_PINYIN, 133, 0}, +{"hua", IS_ZHUYIN|IS_PINYIN, 134, 0}, +{"huagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 137, 1}, +{"huai", IS_ZHUYIN|IS_PINYIN, 135, 0}, +{"huamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 137, 1}, +{"huan", IS_ZHUYIN|IS_PINYIN, 136, 0}, +{"huang", IS_ZHUYIN|IS_PINYIN, 137, 0}, +{"huei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 138, 1}, +{"huen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 139, 1}, +{"hui", IS_ZHUYIN|IS_PINYIN, 138, 0}, +{"hun", IS_ZHUYIN|IS_PINYIN, 139, 0}, +{"huo", IS_ZHUYIN|IS_PINYIN, 140, 0}, +{"j", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 141, 0}, +{"ji", IS_ZHUYIN|IS_PINYIN, 142, 0}, +{"jia", IS_ZHUYIN|IS_PINYIN, 143, 0}, +{"jiagn", 
IS_PINYIN|PINYIN_CORRECT_GN_NG, 145, 1}, +{"jiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 145, 1}, +{"jian", IS_ZHUYIN|IS_PINYIN, 144, 0}, +{"jiang", IS_ZHUYIN|IS_PINYIN, 145, 0}, +{"jiao", IS_ZHUYIN|IS_PINYIN, 146, 0}, +{"jie", IS_ZHUYIN|IS_PINYIN, 147, 0}, +{"jign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 149, 1}, +{"jimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 149, 1}, +{"jin", IS_ZHUYIN|IS_PINYIN, 148, 0}, +{"jing", IS_ZHUYIN|IS_PINYIN, 149, 0}, +{"jiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 150, 1}, +{"jiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 150, 1}, +{"jion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 150, 1}, +{"jiong", IS_ZHUYIN|IS_PINYIN, 150, 0}, +{"jiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 151, 1}, +{"jiu", IS_ZHUYIN|IS_PINYIN, 151, 0}, +{"ju", IS_ZHUYIN|IS_PINYIN, 152, 0}, +{"juan", IS_ZHUYIN|IS_PINYIN, 153, 0}, +{"jue", IS_ZHUYIN|IS_PINYIN, 154, 0}, +{"juen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 155, 1}, +{"jun", IS_ZHUYIN|IS_PINYIN, 155, 0}, +{"jv", IS_PINYIN|PINYIN_CORRECT_V_U, 152, 1}, +{"jvan", IS_PINYIN|PINYIN_CORRECT_V_U, 153, 1}, +{"jve", IS_PINYIN|PINYIN_CORRECT_V_U, 154, 1}, +{"jvn", IS_PINYIN|PINYIN_CORRECT_V_U, 155, 1}, +{"k", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 156, 0}, +{"ka", IS_ZHUYIN|IS_PINYIN, 157, 0}, +{"kagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 160, 1}, +{"kai", IS_ZHUYIN|IS_PINYIN, 158, 0}, +{"kamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 160, 1}, +{"kan", IS_ZHUYIN|IS_PINYIN, 159, 0}, +{"kang", IS_ZHUYIN|IS_PINYIN, 160, 0}, +{"kao", IS_ZHUYIN|IS_PINYIN, 161, 0}, +{"ke", IS_ZHUYIN|IS_PINYIN, 162, 0}, +{"kegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 165, 1}, +{"kemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 165, 1}, +{"ken", IS_ZHUYIN|IS_PINYIN, 164, 0}, +{"keng", IS_ZHUYIN|IS_PINYIN, 165, 0}, +{"kogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 166, 1}, +{"komg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 166, 1}, +{"kon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 166, 1}, +{"kong", IS_ZHUYIN|IS_PINYIN, 166, 0}, +{"kou", IS_ZHUYIN|IS_PINYIN, 167, 0}, +{"ku", IS_ZHUYIN|IS_PINYIN, 168, 0}, +{"kua", 
IS_ZHUYIN|IS_PINYIN, 169, 0}, +{"kuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 172, 1}, +{"kuai", IS_ZHUYIN|IS_PINYIN, 170, 0}, +{"kuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 172, 1}, +{"kuan", IS_ZHUYIN|IS_PINYIN, 171, 0}, +{"kuang", IS_ZHUYIN|IS_PINYIN, 172, 0}, +{"kuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 173, 1}, +{"kuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 174, 1}, +{"kui", IS_ZHUYIN|IS_PINYIN, 173, 0}, +{"kun", IS_ZHUYIN|IS_PINYIN, 174, 0}, +{"kuo", IS_ZHUYIN|IS_PINYIN, 175, 0}, +{"l", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 176, 0}, +{"la", IS_ZHUYIN|IS_PINYIN, 177, 0}, +{"lagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 180, 1}, +{"lai", IS_ZHUYIN|IS_PINYIN, 178, 0}, +{"lamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 180, 1}, +{"lan", IS_ZHUYIN|IS_PINYIN, 179, 0}, +{"lang", IS_ZHUYIN|IS_PINYIN, 180, 0}, +{"lao", IS_ZHUYIN|IS_PINYIN, 181, 0}, +{"le", IS_ZHUYIN|IS_PINYIN, 182, 0}, +{"legn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 185, 1}, +{"lei", IS_ZHUYIN|IS_PINYIN, 183, 0}, +{"lemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 185, 1}, +{"leng", IS_ZHUYIN|IS_PINYIN, 185, 0}, +{"li", IS_ZHUYIN|IS_PINYIN, 186, 0}, +{"lia", IS_ZHUYIN|IS_PINYIN, 187, 0}, +{"liagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 189, 1}, +{"liamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 189, 1}, +{"lian", IS_ZHUYIN|IS_PINYIN, 188, 0}, +{"liang", IS_ZHUYIN|IS_PINYIN, 189, 0}, +{"liao", IS_ZHUYIN|IS_PINYIN, 190, 0}, +{"lie", IS_ZHUYIN|IS_PINYIN, 191, 0}, +{"lign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 193, 1}, +{"limg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 193, 1}, +{"lin", IS_ZHUYIN|IS_PINYIN, 192, 0}, +{"ling", IS_ZHUYIN|IS_PINYIN, 193, 0}, +{"liou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 194, 1}, +{"liu", IS_ZHUYIN|IS_PINYIN, 194, 0}, +{"lo", IS_ZHUYIN|IS_PINYIN, 195, 0}, +{"logn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 196, 1}, +{"lomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 196, 1}, +{"lon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 196, 1}, +{"long", IS_ZHUYIN|IS_PINYIN, 196, 0}, +{"lou", IS_ZHUYIN|IS_PINYIN, 197, 0}, +{"lu", IS_ZHUYIN|IS_PINYIN, 198, 0}, +{"luan", 
IS_ZHUYIN|IS_PINYIN, 199, 0}, +{"lue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 203, 1}, +{"luen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 200, 1}, +{"lun", IS_ZHUYIN|IS_PINYIN, 200, 0}, +{"luo", IS_ZHUYIN|IS_PINYIN, 201, 0}, +{"lv", IS_ZHUYIN|IS_PINYIN, 202, 0}, +{"lve", IS_ZHUYIN|IS_PINYIN, 203, 0}, +{"m", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 204, 0}, +{"ma", IS_ZHUYIN|IS_PINYIN, 205, 0}, +{"magn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 208, 1}, +{"mai", IS_ZHUYIN|IS_PINYIN, 206, 0}, +{"mamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 208, 1}, +{"man", IS_ZHUYIN|IS_PINYIN, 207, 0}, +{"mang", IS_ZHUYIN|IS_PINYIN, 208, 0}, +{"mao", IS_ZHUYIN|IS_PINYIN, 209, 0}, +{"me", IS_ZHUYIN|IS_PINYIN, 210, 0}, +{"megn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 213, 1}, +{"mei", IS_ZHUYIN|IS_PINYIN, 211, 0}, +{"memg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 213, 1}, +{"men", IS_ZHUYIN|IS_PINYIN, 212, 0}, +{"meng", IS_ZHUYIN|IS_PINYIN, 213, 0}, +{"mi", IS_ZHUYIN|IS_PINYIN, 214, 0}, +{"mian", IS_ZHUYIN|IS_PINYIN, 215, 0}, +{"miao", IS_ZHUYIN|IS_PINYIN, 216, 0}, +{"mie", IS_ZHUYIN|IS_PINYIN, 217, 0}, +{"mign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 219, 1}, +{"mimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 219, 1}, +{"min", IS_ZHUYIN|IS_PINYIN, 218, 0}, +{"ming", IS_ZHUYIN|IS_PINYIN, 219, 0}, +{"miou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 220, 1}, +{"miu", IS_ZHUYIN|IS_PINYIN, 220, 0}, +{"mo", IS_ZHUYIN|IS_PINYIN, 221, 0}, +{"mou", IS_ZHUYIN|IS_PINYIN, 222, 0}, +{"mu", IS_ZHUYIN|IS_PINYIN, 223, 0}, +{"n", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 224, 0}, +{"na", IS_ZHUYIN|IS_PINYIN, 225, 0}, +{"nagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 228, 1}, +{"nai", IS_ZHUYIN|IS_PINYIN, 226, 0}, +{"namg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 228, 1}, +{"nan", IS_ZHUYIN|IS_PINYIN, 227, 0}, +{"nang", IS_ZHUYIN|IS_PINYIN, 228, 0}, +{"nao", IS_ZHUYIN|IS_PINYIN, 229, 0}, +{"ne", IS_ZHUYIN|IS_PINYIN, 230, 0}, +{"negn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 233, 1}, +{"nei", IS_ZHUYIN|IS_PINYIN, 231, 0}, +{"nemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 
233, 1}, +{"nen", IS_ZHUYIN|IS_PINYIN, 232, 0}, +{"neng", IS_ZHUYIN|IS_PINYIN, 233, 0}, +{"ng", IS_ZHUYIN|IS_PINYIN, 234, 0}, +{"ni", IS_ZHUYIN|IS_PINYIN, 235, 0}, +{"niagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 238, 1}, +{"niamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 238, 1}, +{"nian", IS_ZHUYIN|IS_PINYIN, 237, 0}, +{"niang", IS_ZHUYIN|IS_PINYIN, 238, 0}, +{"niao", IS_ZHUYIN|IS_PINYIN, 239, 0}, +{"nie", IS_ZHUYIN|IS_PINYIN, 240, 0}, +{"nign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 242, 1}, +{"nimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 242, 1}, +{"nin", IS_ZHUYIN|IS_PINYIN, 241, 0}, +{"ning", IS_ZHUYIN|IS_PINYIN, 242, 0}, +{"niou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 243, 1}, +{"niu", IS_ZHUYIN|IS_PINYIN, 243, 0}, +{"nogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 244, 1}, +{"nomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 244, 1}, +{"non", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 244, 1}, +{"nong", IS_ZHUYIN|IS_PINYIN, 244, 0}, +{"nou", IS_ZHUYIN|IS_PINYIN, 245, 0}, +{"nu", IS_ZHUYIN|IS_PINYIN, 246, 0}, +{"nuan", IS_ZHUYIN|IS_PINYIN, 247, 0}, +{"nue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 251, 1}, +{"nuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 248, 1}, +{"nuo", IS_ZHUYIN|IS_PINYIN, 249, 0}, +{"nv", IS_ZHUYIN|IS_PINYIN, 250, 0}, +{"nve", IS_ZHUYIN|IS_PINYIN, 251, 0}, +{"o", IS_ZHUYIN|IS_PINYIN, 252, 0}, +{"ou", IS_ZHUYIN|IS_PINYIN, 253, 0}, +{"p", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 254, 0}, +{"pa", IS_ZHUYIN|IS_PINYIN, 255, 0}, +{"pagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 258, 1}, +{"pai", IS_ZHUYIN|IS_PINYIN, 256, 0}, +{"pamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 258, 1}, +{"pan", IS_ZHUYIN|IS_PINYIN, 257, 0}, +{"pang", IS_ZHUYIN|IS_PINYIN, 258, 0}, +{"pao", IS_ZHUYIN|IS_PINYIN, 259, 0}, +{"pegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 262, 1}, +{"pei", IS_ZHUYIN|IS_PINYIN, 260, 0}, +{"pemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 262, 1}, +{"pen", IS_ZHUYIN|IS_PINYIN, 261, 0}, +{"peng", IS_ZHUYIN|IS_PINYIN, 262, 0}, +{"pi", IS_ZHUYIN|IS_PINYIN, 263, 0}, +{"pian", IS_ZHUYIN|IS_PINYIN, 264, 0}, +{"piao", 
IS_ZHUYIN|IS_PINYIN, 265, 0}, +{"pie", IS_ZHUYIN|IS_PINYIN, 266, 0}, +{"pign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 268, 1}, +{"pimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 268, 1}, +{"pin", IS_ZHUYIN|IS_PINYIN, 267, 0}, +{"ping", IS_ZHUYIN|IS_PINYIN, 268, 0}, +{"po", IS_ZHUYIN|IS_PINYIN, 269, 0}, +{"pou", IS_ZHUYIN|IS_PINYIN, 270, 0}, +{"pu", IS_ZHUYIN|IS_PINYIN, 271, 0}, +{"q", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 272, 0}, +{"qi", IS_ZHUYIN|IS_PINYIN, 273, 0}, +{"qia", IS_ZHUYIN|IS_PINYIN, 274, 0}, +{"qiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 276, 1}, +{"qiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 276, 1}, +{"qian", IS_ZHUYIN|IS_PINYIN, 275, 0}, +{"qiang", IS_ZHUYIN|IS_PINYIN, 276, 0}, +{"qiao", IS_ZHUYIN|IS_PINYIN, 277, 0}, +{"qie", IS_ZHUYIN|IS_PINYIN, 278, 0}, +{"qign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 280, 1}, +{"qimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 280, 1}, +{"qin", IS_ZHUYIN|IS_PINYIN, 279, 0}, +{"qing", IS_ZHUYIN|IS_PINYIN, 280, 0}, +{"qiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 281, 1}, +{"qiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 281, 1}, +{"qion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 281, 1}, +{"qiong", IS_ZHUYIN|IS_PINYIN, 281, 0}, +{"qiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 282, 1}, +{"qiu", IS_ZHUYIN|IS_PINYIN, 282, 0}, +{"qu", IS_ZHUYIN|IS_PINYIN, 283, 0}, +{"quan", IS_ZHUYIN|IS_PINYIN, 284, 0}, +{"que", IS_ZHUYIN|IS_PINYIN, 285, 0}, +{"quen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 286, 1}, +{"qun", IS_ZHUYIN|IS_PINYIN, 286, 0}, +{"qv", IS_PINYIN|PINYIN_CORRECT_V_U, 283, 1}, +{"qvan", IS_PINYIN|PINYIN_CORRECT_V_U, 284, 1}, +{"qve", IS_PINYIN|PINYIN_CORRECT_V_U, 285, 1}, +{"qvn", IS_PINYIN|PINYIN_CORRECT_V_U, 286, 1}, +{"r", IS_PINYIN|PINYIN_INCOMPLETE, 287, 0}, +{"ragn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 289, 1}, +{"ramg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 289, 1}, +{"ran", IS_ZHUYIN|IS_PINYIN, 288, 0}, +{"rang", IS_ZHUYIN|IS_PINYIN, 289, 0}, +{"rao", IS_ZHUYIN|IS_PINYIN, 290, 0}, +{"re", IS_ZHUYIN|IS_PINYIN, 291, 0}, +{"regn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 293, 
1}, +{"remg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 293, 1}, +{"ren", IS_ZHUYIN|IS_PINYIN, 292, 0}, +{"reng", IS_ZHUYIN|IS_PINYIN, 293, 0}, +{"ri", IS_ZHUYIN|IS_PINYIN, 294, 0}, +{"rogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 295, 1}, +{"romg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 295, 1}, +{"ron", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 295, 1}, +{"rong", IS_ZHUYIN|IS_PINYIN, 295, 0}, +{"rou", IS_ZHUYIN|IS_PINYIN, 296, 0}, +{"ru", IS_ZHUYIN|IS_PINYIN, 297, 0}, +{"ruan", IS_ZHUYIN|IS_PINYIN, 299, 0}, +{"ruei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 300, 1}, +{"ruen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 301, 1}, +{"rui", IS_ZHUYIN|IS_PINYIN, 300, 0}, +{"run", IS_ZHUYIN|IS_PINYIN, 301, 0}, +{"ruo", IS_ZHUYIN|IS_PINYIN, 302, 0}, +{"s", IS_PINYIN|PINYIN_INCOMPLETE, 303, 0}, +{"sa", IS_ZHUYIN|IS_PINYIN, 304, 0}, +{"sagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 307, 1}, +{"sai", IS_ZHUYIN|IS_PINYIN, 305, 0}, +{"samg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 307, 1}, +{"san", IS_ZHUYIN|IS_PINYIN, 306, 0}, +{"sang", IS_ZHUYIN|IS_PINYIN, 307, 0}, +{"sao", IS_ZHUYIN|IS_PINYIN, 308, 0}, +{"se", IS_ZHUYIN|IS_PINYIN, 309, 0}, +{"segn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 311, 1}, +{"semg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 311, 1}, +{"sen", IS_ZHUYIN|IS_PINYIN, 310, 0}, +{"seng", IS_ZHUYIN|IS_PINYIN, 311, 0}, +{"sh", IS_PINYIN|PINYIN_INCOMPLETE, 312, 0}, +{"sha", IS_ZHUYIN|IS_PINYIN, 313, 0}, +{"shagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 316, 1}, +{"shai", IS_ZHUYIN|IS_PINYIN, 314, 0}, +{"shamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 316, 1}, +{"shan", IS_ZHUYIN|IS_PINYIN, 315, 0}, +{"shang", IS_ZHUYIN|IS_PINYIN, 316, 0}, +{"shao", IS_ZHUYIN|IS_PINYIN, 317, 0}, +{"she", IS_ZHUYIN|IS_PINYIN, 318, 0}, +{"shegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 321, 1}, +{"shei", IS_ZHUYIN|IS_PINYIN, 319, 0}, +{"shemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 321, 1}, +{"shen", IS_ZHUYIN|IS_PINYIN, 320, 0}, +{"sheng", IS_ZHUYIN|IS_PINYIN, 321, 0}, +{"shi", IS_ZHUYIN|IS_PINYIN, 322, 0}, +{"shou", IS_ZHUYIN|IS_PINYIN, 323, 0}, +{"shu", IS_ZHUYIN|IS_PINYIN, 324, 0}, 
+{"shua", IS_ZHUYIN|IS_PINYIN, 325, 0}, +{"shuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 328, 1}, +{"shuai", IS_ZHUYIN|IS_PINYIN, 326, 0}, +{"shuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 328, 1}, +{"shuan", IS_ZHUYIN|IS_PINYIN, 327, 0}, +{"shuang", IS_ZHUYIN|IS_PINYIN, 328, 0}, +{"shuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 329, 1}, +{"shuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 330, 1}, +{"shui", IS_ZHUYIN|IS_PINYIN, 329, 0}, +{"shun", IS_ZHUYIN|IS_PINYIN, 330, 0}, +{"shuo", IS_ZHUYIN|IS_PINYIN, 331, 0}, +{"si", IS_ZHUYIN|IS_PINYIN, 332, 0}, +{"sogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 333, 1}, +{"somg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 333, 1}, +{"son", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 333, 1}, +{"song", IS_ZHUYIN|IS_PINYIN, 333, 0}, +{"sou", IS_ZHUYIN|IS_PINYIN, 334, 0}, +{"su", IS_ZHUYIN|IS_PINYIN, 335, 0}, +{"suan", IS_ZHUYIN|IS_PINYIN, 336, 0}, +{"suei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 337, 1}, +{"suen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 338, 1}, +{"sui", IS_ZHUYIN|IS_PINYIN, 337, 0}, +{"sun", IS_ZHUYIN|IS_PINYIN, 338, 0}, +{"suo", IS_ZHUYIN|IS_PINYIN, 339, 0}, +{"t", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 340, 0}, +{"ta", IS_ZHUYIN|IS_PINYIN, 341, 0}, +{"tagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 344, 1}, +{"tai", IS_ZHUYIN|IS_PINYIN, 342, 0}, +{"tamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 344, 1}, +{"tan", IS_ZHUYIN|IS_PINYIN, 343, 0}, +{"tang", IS_ZHUYIN|IS_PINYIN, 344, 0}, +{"tao", IS_ZHUYIN|IS_PINYIN, 345, 0}, +{"te", IS_ZHUYIN|IS_PINYIN, 346, 0}, +{"tegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 347, 1}, +{"temg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 347, 1}, +{"teng", IS_ZHUYIN|IS_PINYIN, 347, 0}, +{"ti", IS_ZHUYIN|IS_PINYIN, 348, 0}, +{"tian", IS_ZHUYIN|IS_PINYIN, 349, 0}, +{"tiao", IS_ZHUYIN|IS_PINYIN, 350, 0}, +{"tie", IS_ZHUYIN|IS_PINYIN, 351, 0}, +{"tign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 352, 1}, +{"timg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 352, 1}, +{"ting", IS_ZHUYIN|IS_PINYIN, 352, 0}, +{"togn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 353, 1}, +{"tomg", 
IS_PINYIN|PINYIN_CORRECT_MG_NG, 353, 1}, +{"ton", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 353, 1}, +{"tong", IS_ZHUYIN|IS_PINYIN, 353, 0}, +{"tou", IS_ZHUYIN|IS_PINYIN, 354, 0}, +{"tu", IS_ZHUYIN|IS_PINYIN, 355, 0}, +{"tuan", IS_ZHUYIN|IS_PINYIN, 356, 0}, +{"tuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 357, 1}, +{"tuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 358, 1}, +{"tui", IS_ZHUYIN|IS_PINYIN, 357, 0}, +{"tun", IS_ZHUYIN|IS_PINYIN, 358, 0}, +{"tuo", IS_ZHUYIN|IS_PINYIN, 359, 0}, +{"w", IS_PINYIN|PINYIN_INCOMPLETE, 360, 0}, +{"wa", IS_ZHUYIN|IS_PINYIN, 361, 0}, +{"wagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 364, 1}, +{"wai", IS_ZHUYIN|IS_PINYIN, 362, 0}, +{"wamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 364, 1}, +{"wan", IS_ZHUYIN|IS_PINYIN, 363, 0}, +{"wang", IS_ZHUYIN|IS_PINYIN, 364, 0}, +{"wegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 367, 1}, +{"wei", IS_ZHUYIN|IS_PINYIN, 365, 0}, +{"wemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 367, 1}, +{"wen", IS_ZHUYIN|IS_PINYIN, 366, 0}, +{"weng", IS_ZHUYIN|IS_PINYIN, 367, 0}, +{"wo", IS_ZHUYIN|IS_PINYIN, 368, 0}, +{"wu", IS_ZHUYIN|IS_PINYIN, 369, 0}, +{"x", IS_ZHUYIN|IS_PINYIN|PINYIN_INCOMPLETE|ZHUYIN_INCOMPLETE, 370, 0}, +{"xi", IS_ZHUYIN|IS_PINYIN, 371, 0}, +{"xia", IS_ZHUYIN|IS_PINYIN, 372, 0}, +{"xiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 374, 1}, +{"xiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 374, 1}, +{"xian", IS_ZHUYIN|IS_PINYIN, 373, 0}, +{"xiang", IS_ZHUYIN|IS_PINYIN, 374, 0}, +{"xiao", IS_ZHUYIN|IS_PINYIN, 375, 0}, +{"xie", IS_ZHUYIN|IS_PINYIN, 376, 0}, +{"xign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 378, 1}, +{"ximg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 378, 1}, +{"xin", IS_ZHUYIN|IS_PINYIN, 377, 0}, +{"xing", IS_ZHUYIN|IS_PINYIN, 378, 0}, +{"xiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 379, 1}, +{"xiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 379, 1}, +{"xion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 379, 1}, +{"xiong", IS_ZHUYIN|IS_PINYIN, 379, 0}, +{"xiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 380, 1}, +{"xiu", IS_ZHUYIN|IS_PINYIN, 380, 0}, +{"xu", IS_ZHUYIN|IS_PINYIN, 381, 0}, +{"xuan", 
IS_ZHUYIN|IS_PINYIN, 382, 0}, +{"xue", IS_ZHUYIN|IS_PINYIN, 383, 0}, +{"xuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 384, 1}, +{"xun", IS_ZHUYIN|IS_PINYIN, 384, 0}, +{"xv", IS_PINYIN|PINYIN_CORRECT_V_U, 381, 1}, +{"xvan", IS_PINYIN|PINYIN_CORRECT_V_U, 382, 1}, +{"xve", IS_PINYIN|PINYIN_CORRECT_V_U, 383, 1}, +{"xvn", IS_PINYIN|PINYIN_CORRECT_V_U, 384, 1}, +{"y", IS_PINYIN|PINYIN_INCOMPLETE, 385, 0}, +{"ya", IS_ZHUYIN|IS_PINYIN, 386, 0}, +{"yagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 389, 1}, +{"yamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 389, 1}, +{"yan", IS_ZHUYIN|IS_PINYIN, 388, 0}, +{"yang", IS_ZHUYIN|IS_PINYIN, 389, 0}, +{"yao", IS_ZHUYIN|IS_PINYIN, 390, 0}, +{"ye", IS_ZHUYIN|IS_PINYIN, 391, 0}, +{"yi", IS_ZHUYIN|IS_PINYIN, 392, 0}, +{"yign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 394, 1}, +{"yimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 394, 1}, +{"yin", IS_ZHUYIN|IS_PINYIN, 393, 0}, +{"ying", IS_ZHUYIN|IS_PINYIN, 394, 0}, +{"yo", IS_ZHUYIN|IS_PINYIN, 395, 0}, +{"yogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 396, 1}, +{"yomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 396, 1}, +{"yon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 396, 1}, +{"yong", IS_ZHUYIN|IS_PINYIN, 396, 0}, +{"you", IS_ZHUYIN|IS_PINYIN, 397, 0}, +{"yu", IS_ZHUYIN|IS_PINYIN, 398, 0}, +{"yuan", IS_ZHUYIN|IS_PINYIN, 399, 0}, +{"yue", IS_ZHUYIN|IS_PINYIN, 400, 0}, +{"yuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 401, 1}, +{"yun", IS_ZHUYIN|IS_PINYIN, 401, 0}, +{"yv", IS_PINYIN|PINYIN_CORRECT_V_U, 398, 1}, +{"yvan", IS_PINYIN|PINYIN_CORRECT_V_U, 399, 1}, +{"yve", IS_PINYIN|PINYIN_CORRECT_V_U, 400, 1}, +{"yvn", IS_PINYIN|PINYIN_CORRECT_V_U, 401, 1}, +{"z", IS_PINYIN|PINYIN_INCOMPLETE, 402, 0}, +{"za", IS_ZHUYIN|IS_PINYIN, 403, 0}, +{"zagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 406, 1}, +{"zai", IS_ZHUYIN|IS_PINYIN, 404, 0}, +{"zamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 406, 1}, +{"zan", IS_ZHUYIN|IS_PINYIN, 405, 0}, +{"zang", IS_ZHUYIN|IS_PINYIN, 406, 0}, +{"zao", IS_ZHUYIN|IS_PINYIN, 407, 0}, +{"ze", IS_ZHUYIN|IS_PINYIN, 408, 0}, +{"zegn", 
IS_PINYIN|PINYIN_CORRECT_GN_NG, 411, 1}, +{"zei", IS_ZHUYIN|IS_PINYIN, 409, 0}, +{"zemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 411, 1}, +{"zen", IS_ZHUYIN|IS_PINYIN, 410, 0}, +{"zeng", IS_ZHUYIN|IS_PINYIN, 411, 0}, +{"zh", IS_PINYIN|PINYIN_INCOMPLETE, 412, 0}, +{"zha", IS_ZHUYIN|IS_PINYIN, 413, 0}, +{"zhagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 416, 1}, +{"zhai", IS_ZHUYIN|IS_PINYIN, 414, 0}, +{"zhamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 416, 1}, +{"zhan", IS_ZHUYIN|IS_PINYIN, 415, 0}, +{"zhang", IS_ZHUYIN|IS_PINYIN, 416, 0}, +{"zhao", IS_ZHUYIN|IS_PINYIN, 417, 0}, +{"zhe", IS_ZHUYIN|IS_PINYIN, 418, 0}, +{"zhegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 421, 1}, +{"zhemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 421, 1}, +{"zhen", IS_ZHUYIN|IS_PINYIN, 420, 0}, +{"zheng", IS_ZHUYIN|IS_PINYIN, 421, 0}, +{"zhi", IS_ZHUYIN|IS_PINYIN, 422, 0}, +{"zhogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 423, 1}, +{"zhomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 423, 1}, +{"zhon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 423, 1}, +{"zhong", IS_ZHUYIN|IS_PINYIN, 423, 0}, +{"zhou", IS_ZHUYIN|IS_PINYIN, 424, 0}, +{"zhu", IS_ZHUYIN|IS_PINYIN, 425, 0}, +{"zhua", IS_ZHUYIN|IS_PINYIN, 426, 0}, +{"zhuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 429, 1}, +{"zhuai", IS_ZHUYIN|IS_PINYIN, 427, 0}, +{"zhuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 429, 1}, +{"zhuan", IS_ZHUYIN|IS_PINYIN, 428, 0}, +{"zhuang", IS_ZHUYIN|IS_PINYIN, 429, 0}, +{"zhuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 430, 1}, +{"zhuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 431, 1}, +{"zhui", IS_ZHUYIN|IS_PINYIN, 430, 0}, +{"zhun", IS_ZHUYIN|IS_PINYIN, 431, 0}, +{"zhuo", IS_ZHUYIN|IS_PINYIN, 432, 0}, +{"zi", IS_ZHUYIN|IS_PINYIN, 433, 0}, +{"zogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 434, 1}, +{"zomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 434, 1}, +{"zon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 434, 1}, +{"zong", IS_ZHUYIN|IS_PINYIN, 434, 0}, +{"zou", IS_ZHUYIN|IS_PINYIN, 435, 0}, +{"zu", IS_ZHUYIN|IS_PINYIN, 436, 0}, +{"zuan", IS_ZHUYIN|IS_PINYIN, 437, 0}, +{"zuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 438, 1}, 
+{"zuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 439, 1}, +{"zui", IS_ZHUYIN|IS_PINYIN, 438, 0}, +{"zun", IS_ZHUYIN|IS_PINYIN, 439, 0}, +{"zuo", IS_ZHUYIN|IS_PINYIN, 440, 0} }; const pinyin_index_item_t luoma_pinyin_index[] = { diff --git a/src/storage/pinyin_phrase3.h b/src/storage/pinyin_phrase3.h index d2e75ad..d23c657 100644 --- a/src/storage/pinyin_phrase3.h +++ b/src/storage/pinyin_phrase3.h @@ -223,8 +223,8 @@ inline int phrase_compare_with_tones(const PinyinIndexItem2<phrase_length> &lhs, } template<size_t phrase_length> -inline int phrase_less_than_with_tones(const PinyinIndexItem2<phrase_length> &lhs, - const PinyinIndexItem2<phrase_length> &rhs) +inline bool phrase_less_than_with_tones(const PinyinIndexItem2<phrase_length> &lhs, + const PinyinIndexItem2<phrase_length> &rhs) { return 0 > phrase_compare_with_tones<phrase_length>(lhs, rhs); } diff --git a/src/storage/special_table.h b/src/storage/special_table.h index 7916a65..8263236 100644 --- a/src/storage/special_table.h +++ b/src/storage/special_table.h @@ -41,6 +41,7 @@ const resplit_table_item_t resplit_table[] = { {{"chu", "nan"}, {ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ZERO_FINAL), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"chun", "an"}, {ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100}, {{"dan", "gan"}, {ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 0, {"dang", "an"}, {ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100}, {{"e", "nai"}, {ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"en", "ai"}, {ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100}, +{{"e", "nen"}, {ChewingKey(CHEWING_ZERO_INITIAL, 
CHEWING_ZERO_MIDDLE, CHEWING_E), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, 100, {"en", "en"}, {ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, 300}, {{"fa", "nan"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"fan", "an"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100}, {{"fan", "gai"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"fang", "ai"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100}, {{"fan", "gan"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"fang", "an"}, {ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100}, @@ -55,6 +56,7 @@ const resplit_table_item_t resplit_table[] = { {{"ji", "nou"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ZERO_FINAL), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 0, {"jin", "ou"}, {ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_IN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 100}, {{"jia", "nai"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"jian", "ai"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100}, {{"jia", "nan"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"jian", "an"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100}, +{{"jia", "nao"}, 
{ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, 100, {"jian", "ao"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, 300}, {{"jia", "ne"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_E)}, 0, {"jian", "e"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E)}, 100}, {{"jia", "nou"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 0, {"jian", "ou"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 100}, {{"jian", "gan"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100, {"jiang", "an"}, {ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100}, @@ -84,6 +86,7 @@ const resplit_table_item_t resplit_table[] = { {{"qia", "ne"}, {ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_A), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_E)}, 0, {"qian", "e"}, {ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E)}, 100}, {{"qin", "gai"}, {ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_IN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"qing", "ai"}, {ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ING), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100}, {{"qin", "gan"}, {ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_IN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 0, {"qing", "an"}, {ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ING), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100}, +{{"qu", "na"}, {ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ZERO_FINAL), 
ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_A)}, 300, {"qun", "a"}, {ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_A)}, 100}, {{"re", "nai"}, {ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_E), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 0, {"ren", "ai"}, {ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, 100}, {{"re", "nan"}, {ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_E), ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 0, {"ren", "an"}, {ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EN), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, 100}, {{"san", "gou"}, {ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AN), ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 0, {"sang", "ou"}, {ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ANG), ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, 100}, diff --git a/src/storage/table_info.cpp b/src/storage/table_info.cpp index 9fe3865..41c7da0 100644 --- a/src/storage/table_info.cpp +++ b/src/storage/table_info.cpp @@ -113,7 +113,17 @@ static TABLE_PHONETIC_TYPE to_table_phonetic_type(const char * str) { if (0 == strcmp("zhuyin", str)) return ZHUYIN_TABLE; - assert(FALSE); + abort(); +} + +static TABLE_DATABASE_FORMAT_TYPE to_table_database_format_type(const char * str) { + if (0 == strcmp("BerkeleyDB", str)) + return BERKELEY_DB_FORMAT; + + if (0 == strcmp("KyotoCabinet", str)) + return KYOTO_CABINET_FORMAT; + + abort(); } static TABLE_TARGET to_table_target(const char * str) { @@ -123,7 +133,7 @@ static TABLE_TARGET to_table_target(const char * str) { if (0 == strcmp("addon", str)) return ADDON_TABLE; - assert(FALSE); + abort(); } static guint8 to_index_of_default_tables(const char * str) { @@ -137,7 +147,7 @@ static guint8 to_index_of_default_tables(const char * str) { HANDLE(NETWORK_DICTIONARY); 
HANDLE(USER_DICTIONARY); - assert(FALSE); + abort(); } static guint8 to_index_of_addon_tables(const char * str) { @@ -156,11 +166,21 @@ static PHRASE_FILE_TYPE to_file_type(const char * str) { HANDLE(DICTIONARY); HANDLE(USER_FILE); - assert(FALSE); + abort(); } #undef HANDLE +static const char * from_table_database_format_type(const TABLE_DATABASE_FORMAT_TYPE format) { + if (format == BERKELEY_DB_FORMAT) + return "BerkeleyDB"; + + if (format == KYOTO_CABINET_FORMAT) + return "KyotoCabinet"; + + abort(); +} + bool SystemTableInfo2::load(const char * filename) { reset(); @@ -196,9 +216,13 @@ bool SystemTableInfo2::load(const char * filename) { TABLE_PHONETIC_TYPE type = PINYIN_TABLE; char str[256]; - num = fscanf(input, "source table format:%255s", str); + num = fscanf(input, "source table format:%255s\n", str); type = to_table_phonetic_type(str); + TABLE_DATABASE_FORMAT_TYPE format = UNKNOWN_FORMAT; + num = fscanf(input, "database format:%255s\n", str); + format = to_table_database_format_type (str); + #if 0 printf("binver:%d modelver:%d lambda:%f\n", binver, modelver, lambda); printf("type:%d\n", type); @@ -211,6 +235,8 @@ bool SystemTableInfo2::load(const char * filename) { /* Note: support pinyin and zhuyin table now. 
*/ assert(PINYIN_TABLE == type || ZHUYIN_TABLE == type); m_table_phonetic_type = type; + assert(BERKELEY_DB_FORMAT == format || KYOTO_CABINET_FORMAT == format); + m_table_database_format_type = format; int index = 0; char tableinfo[256], dictstr[256]; @@ -309,12 +335,19 @@ bool UserTableInfo::load(const char * filename) { return false; } + TABLE_DATABASE_FORMAT_TYPE format = UNKNOWN_FORMAT; + char str[256]; + num = fscanf(input, "database format:%255s\n", str); + if (EOF != num) + format = to_table_database_format_type (str); + #if 0 printf("binver:%d modelver:%d\n", binver, modelver); #endif m_binary_format_version = binver; m_model_data_version = modelver; + m_table_database_format_type = format; fclose(input); @@ -334,6 +367,8 @@ bool UserTableInfo::save(const char * filename) { fprintf(output, "binary format version:%d\n", m_binary_format_version); fprintf(output, "model data version:%d\n", m_model_data_version); + fprintf(output, "database format:%s\n", + from_table_database_format_type (m_table_database_format_type)); fclose(output); @@ -349,11 +384,15 @@ bool UserTableInfo::is_conform(const SystemTableInfo2 * sysinfo) { if (sysinfo->m_model_data_version != m_model_data_version) return false; + if (sysinfo->m_table_database_format_type != m_table_database_format_type) + return false; + return true; } bool UserTableInfo::make_conform(const SystemTableInfo2 * sysinfo) { m_binary_format_version = sysinfo->m_binary_format_version; m_model_data_version = sysinfo->m_model_data_version; + m_table_database_format_type = sysinfo->m_table_database_format_type; return true; } diff --git a/src/storage/table_info.h b/src/storage/table_info.h index bc3837f..4b8f252 100644 --- a/src/storage/table_info.h +++ b/src/storage/table_info.h @@ -32,6 +32,12 @@ typedef enum { } TABLE_PHONETIC_TYPE; typedef enum { + UNKNOWN_FORMAT, + BERKELEY_DB_FORMAT, + KYOTO_CABINET_FORMAT, +} TABLE_DATABASE_FORMAT_TYPE; + +typedef enum { DEFAULT_TABLE, ADDON_TABLE, } TABLE_TARGET; @@ -62,6 +68,7 
@@ private: gfloat m_lambda; TABLE_PHONETIC_TYPE m_table_phonetic_type; + TABLE_DATABASE_FORMAT_TYPE m_table_database_format_type; pinyin_table_info_t m_default_tables[PHRASE_INDEX_LIBRARY_COUNT]; @@ -90,6 +97,7 @@ class UserTableInfo{ private: int m_binary_format_version; int m_model_data_version; + TABLE_DATABASE_FORMAT_TYPE m_table_database_format_type; private: void reset(); diff --git a/src/storage/zhuyin_parser2.cpp b/src/storage/zhuyin_parser2.cpp index 99ae4d2..3f14eab 100644 --- a/src/storage/zhuyin_parser2.cpp +++ b/src/storage/zhuyin_parser2.cpp @@ -160,6 +160,7 @@ static int search_chewing_symbols2(const zhuyin_symbol_item_t * symbol_table, bool ZhuyinSimpleParser2::parse_one_key(pinyin_option_t options, ChewingKey & key, + gint16 & distance, const char * str, int len) const { options &= ~PINYIN_AMB_ALL; unsigned char tone = CHEWING_ZERO_TONE; @@ -240,9 +241,10 @@ int ZhuyinSimpleParser2::parse(pinyin_option_t options, i = std_lite::min(maximum_len - parsed_len, (int)max_chewing_length); + gint16 distance = 0; ChewingKey key; ChewingKeyRest key_rest; for (; i > 0; --i) { - bool success = parse_one_key(options, key, cur_str, i); + bool success = parse_one_key(options, key, distance, cur_str, i); if (success) break; } @@ -289,7 +291,7 @@ bool ZhuyinSimpleParser2::set_scheme(ZhuyinScheme scheme) { m_symbol_table = chewing_standard_dvorak_symbols; m_tone_table = chewing_standard_dvorak_tones; default: - assert(FALSE); + abort(); } return false; @@ -331,6 +333,7 @@ bool ZhuyinSimpleParser2::in_chewing_scheme(pinyin_option_t options, bool ZhuyinDiscreteParser2::parse_one_key(pinyin_option_t options, ChewingKey & key, + gint16 & distance, const char * str, int len) const { if (0 == len) return false; @@ -430,9 +433,10 @@ int ZhuyinDiscreteParser2::parse(pinyin_option_t options, i = std_lite::min(maximum_len - parsed_len, (int)max_chewing_length); + gint16 distance = 0; ChewingKey key; ChewingKeyRest key_rest; for (; i > 0; --i) { - bool success = 
parse_one_key(options, key, cur_str, i); + bool success = parse_one_key(options, key, distance, cur_str, i); if (success) break; } @@ -480,7 +484,7 @@ bool ZhuyinDiscreteParser2::set_scheme(ZhuyinScheme scheme) { INIT_PARSER(hsu_zhuyin_index, hsu_dvorak); break; default: - assert(FALSE); + abort(); } #undef INIT_PARSER @@ -567,6 +571,7 @@ static int count_same_chars(const char * str, int len) { bool ZhuyinDaChenCP26Parser2::parse_one_key(pinyin_option_t options, ChewingKey & key, + gint16 & distance, const char *str, int len) const { if (0 == len) return false; @@ -731,15 +736,16 @@ int ZhuyinDaChenCP26Parser2::parse(pinyin_option_t options, /* maximum forward match for chewing. */ int parsed_len = 0; const char * cur_str = NULL; - ChewingKey key; ChewingKeyRest key_rest; while (parsed_len < maximum_len) { cur_str = str + parsed_len; i = std_lite::min(maximum_len - parsed_len, (int)max_chewing_dachen26_length); + gint16 distance = 0; + ChewingKey key; ChewingKeyRest key_rest; for (; i > 0; --i) { - bool success = parse_one_key(options, key, cur_str, i); + bool success = parse_one_key(options, key, distance, cur_str, i); if (success) break; } @@ -851,6 +857,7 @@ ZhuyinDirectParser2::ZhuyinDirectParser2 (){ bool ZhuyinDirectParser2::parse_one_key(pinyin_option_t options, ChewingKey & key, + gint16 & distance, const char *str, int len) const { options &= ~PINYIN_AMB_ALL; /* by default, chewing will use the first tone. 
*/ @@ -910,8 +917,6 @@ int ZhuyinDirectParser2::parse(pinyin_option_t options, g_array_set_size(keys, 0); g_array_set_size(key_rests, 0); - ChewingKey key; ChewingKeyRest key_rest; - int parsed_len = 0; int i = 0, cur = 0, next = 0; while (cur < len) { @@ -922,7 +927,9 @@ int ZhuyinDirectParser2::parse(pinyin_option_t options, } next = i; - if (parse_one_key(options, key, str + cur, next - cur)) { + gint16 distance = 0; + ChewingKey key; ChewingKeyRest key_rest; + if (parse_one_key(options, key, distance, str + cur, next - cur)) { #if 0 /* as direct parser handles data source, assume the data is correct when loading. */ diff --git a/src/storage/zhuyin_parser2.h b/src/storage/zhuyin_parser2.h index 45af804..8a9c550 100644 --- a/src/storage/zhuyin_parser2.h +++ b/src/storage/zhuyin_parser2.h @@ -98,7 +98,7 @@ public: virtual ~ZhuyinSimpleParser2() {} - virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const; virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; @@ -142,7 +142,7 @@ public: virtual ~ZhuyinDiscreteParser2() {} - virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const; virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; @@ -167,7 +167,7 @@ public: virtual ~ZhuyinDaChenCP26Parser2() {} - virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const; virtual int parse(pinyin_option_t options, 
ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; @@ -187,7 +187,7 @@ public: virtual ~ZhuyinDirectParser2() {} - virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, gint16 & distance, const char *str, int len) const; virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; }; diff --git a/src/zhuyin.cpp b/src/zhuyin.cpp index cb3ce70..1aeecf5 100644 --- a/src/zhuyin.cpp +++ b/src/zhuyin.cpp @@ -733,7 +733,7 @@ bool zhuyin_set_chewing_scheme(zhuyin_context_t * context, context->m_chewing_parser = new ZhuyinDaChenCP26Parser2(); break; default: - assert(FALSE); + abort(); } return true; } @@ -989,7 +989,7 @@ bool zhuyin_get_sentence(zhuyin_instance_t * instance, return false; MatchResult result = NULL; - assert(results.get_result(0, result)); + check_result(results.get_result(0, result)); bool retval = pinyin::convert_to_utf8 (context->m_phrase_index, result, @@ -1007,9 +1007,10 @@ bool zhuyin_parse_full_pinyin(zhuyin_instance_t * instance, /* disable the pinyin correction options. 
*/ options &= ~PINYIN_CORRECT_ALL; + gint16 distance = 0; int pinyin_len = strlen(onepinyin); bool retval = context->m_full_pinyin_parser->parse_one_key - (options, *onekey, onepinyin, pinyin_len); + (options, *onekey, distance, onepinyin, pinyin_len); return retval; } @@ -1047,9 +1048,10 @@ bool zhuyin_parse_chewing(zhuyin_instance_t * instance, zhuyin_context_t * & context = instance->m_context; zhuyin_option_t options = context->m_options; + gint16 distance = 0; int chewing_len = strlen(onechewing); bool retval = context->m_chewing_parser->parse_one_key - (options, *onekey, onechewing, chewing_len ); + (options, *onekey, distance, onechewing, chewing_len); return retval; } @@ -1181,7 +1183,7 @@ static phrase_token_t _get_previous_token(zhuyin_instance_t * instance, /* use the first candidate. */ MatchResult result = NULL; - assert(results.get_result(0, result)); + check_result(results.get_result(0, result)); phrase_token_t cur_token = g_array_index (result, phrase_token_t, offset); @@ -1298,7 +1300,7 @@ static bool _compute_phrase_length(zhuyin_context_t * context, switch(candidate->m_candidate_type) { case BEST_MATCH_CANDIDATE: - assert(FALSE); + abort(); case NORMAL_CANDIDATE_AFTER_CURSOR: case NORMAL_CANDIDATE_BEFORE_CURSOR: { phrase_index->get_phrase_item(candidate->m_token, item); @@ -1306,7 +1308,7 @@ static bool _compute_phrase_length(zhuyin_context_t * context, break; } case ZOMBIE_CANDIDATE: - assert(FALSE); + abort(); } } @@ -1336,7 +1338,7 @@ static bool _compute_phrase_strings_of_items(zhuyin_instance_t * instance, &(candidate->m_phrase_string)); break; case ZOMBIE_CANDIDATE: - assert(FALSE); + abort(); } } @@ -1705,7 +1707,7 @@ bool zhuyin_train(zhuyin_instance_t * instance){ context->m_modified = true; MatchResult result = NULL; - assert(results.get_result(0, result)); + check_result(results.get_result(0, result)); bool retval = context->m_pinyin_lookup->train_result3 (&matrix, instance->m_constraints, result); @@ -1944,6 +1946,28 @@ bool 
zhuyin_get_zhuyin_key_rest_length(zhuyin_instance_t * instance, return true; } +/* find the first zero ChewingKey "'". */ +static size_t _compute_zero_start(PhoneticKeyMatrix & matrix, size_t offset) { + ChewingKey key; ChewingKeyRest key_rest; + const ChewingKey zero_key; + + ssize_t index = offset - 1; + for (; index > 0; --index) { + const size_t size = matrix.get_column_size(index); + + if (1 != size) + break; + + matrix.get_item(index, 0, key, key_rest); + if (zero_key == key) + offset = index; + else + break; + } + + return offset; +} + /* when lookup offset: get the previous non-zero ChewingKey. */ bool zhuyin_get_zhuyin_offset(zhuyin_instance_t * instance, @@ -1960,6 +1984,7 @@ bool zhuyin_get_zhuyin_offset(zhuyin_instance_t * instance, break; } + offset = _compute_zero_start(matrix, offset); _check_offset(matrix, offset); *poffset = offset; @@ -1991,6 +2016,7 @@ bool zhuyin_get_left_zhuyin_offset(zhuyin_instance_t * instance, break; } + offset = _compute_zero_start(matrix, offset); _check_offset(matrix, left); *pleft = left; @@ -2007,6 +2033,7 @@ bool zhuyin_get_right_zhuyin_offset(zhuyin_instance_t * instance, size_t right = offset; ChewingKey key; ChewingKeyRest key_rest; + const ChewingKey zero_key; for (size_t index = right; index < matrix.size() - 1; ++index) { const size_t size = matrix.get_column_size(index); @@ -2014,7 +2041,10 @@ bool zhuyin_get_right_zhuyin_offset(zhuyin_instance_t * instance, break; matrix.get_item(index, 0, key, key_rest); - break; + if (zero_key == key) + right = index + 1; + else + break; } if (0 == matrix.get_column_size(right)) diff --git a/src/zhuyin.h b/src/zhuyin.h index e627001..22f7eee 100644 --- a/src/zhuyin.h +++ b/src/zhuyin.h @@ -594,7 +594,7 @@ bool zhuyin_get_n_zhuyin(zhuyin_instance_t * instance, * */ bool zhuyin_get_zhuyin_key(zhuyin_instance_t * instance, - guint index, + size_t index, ChewingKey ** key); /** @@ -608,7 +608,7 @@ bool zhuyin_get_zhuyin_key(zhuyin_instance_t * instance, * */ bool 
zhuyin_get_zhuyin_key_rest(zhuyin_instance_t * instance, - guint index, + size_t index, ChewingKeyRest ** key_rest); /** diff --git a/tests/Makefile.am b/tests/Makefile.am index 34da2d0..5cf6d32 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -23,7 +23,7 @@ CLEANFILES = *.bak ACLOCAL = aclocal -I $(ac_aux_dir) -INCLUDES = -I$(top_srcdir)/src \ +AM_CPPFLAGS = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ @@ -40,20 +40,20 @@ noinst_PROGRAMS = test_pinyin \ test_pinyin_SOURCES = test_pinyin.cpp -test_pinyin_LDADD = ../src/libpinyin.la +test_pinyin_LDADD = ../src/libpinyin.la @GLIB2_LIBS@ test_phrase_SOURCES = test_phrase.cpp -test_phrase_LDADD = ../src/libpinyin.la +test_phrase_LDADD = ../src/libpinyin.la @GLIB2_LIBS@ test_chewing_SOURCES = test_chewing.cpp -test_chewing_LDADD = ../src/libpinyin.la +test_chewing_LDADD = ../src/libpinyin.la @GLIB2_LIBS@ if ENABLE_LIBZHUYIN noinst_PROGRAMS += test_zhuyin test_zhuyin_SOURCES = test_zhuyin.cpp -test_zhuyin_LDADD = ../src/libzhuyin.la +test_zhuyin_LDADD = ../src/libzhuyin.la @GLIB2_LIBS@ endif diff --git a/tests/include/CMakeLists.txt b/tests/include/CMakeLists.txt index 3ad956c..43c029f 100644 --- a/tests/include/CMakeLists.txt +++ b/tests/include/CMakeLists.txt @@ -7,3 +7,5 @@ target_link_libraries( test_memory_chunk pinyin ) + +add_test(NAME memory_chunk COMMAND test_memory_chunk) diff --git a/tests/include/Makefile.am b/tests/include/Makefile.am index 4a22aea..a140ff0 100644 --- a/tests/include/Makefile.am +++ b/tests/include/Makefile.am @@ -14,7 +14,7 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. 
-INCLUDES = -I$(top_srcdir)/src \ +AM_CPPFLAGS = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ diff --git a/tests/include/test_memory_chunk.cpp b/tests/include/test_memory_chunk.cpp index 5042882..f496e4d 100644 --- a/tests/include/test_memory_chunk.cpp +++ b/tests/include/test_memory_chunk.cpp @@ -55,13 +55,13 @@ int main(int argc, char * argv[]){ printf("%d\t%d\n", *p3, *(p3+1)); int tmp; - assert(chunk->get_content(sizeof(int), &tmp, sizeof(int))); + check_result(chunk->get_content(sizeof(int), &tmp, sizeof(int))); printf("%d\n", tmp); - assert(chunk->save("/tmp/test.bin")); - assert(chunk->load("/tmp/test.bin")); + check_result(chunk->save("/tmp/test.bin")); + check_result(chunk->load("/tmp/test.bin")); #ifdef LIBPINYIN_USE_MMAP - assert(chunk->mmap("/tmp/test.bin")); + check_result(chunk->mmap("/tmp/test.bin")); #endif delete chunk; diff --git a/tests/lookup/Makefile.am b/tests/lookup/Makefile.am index 10c5301..027298b 100644 --- a/tests/lookup/Makefile.am +++ b/tests/lookup/Makefile.am @@ -14,14 +14,19 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. 
-INCLUDES = -I$(top_srcdir)/src \ +AM_CPPFLAGS = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ -I$(top_srcdir)/tests \ @GLIB2_CFLAGS@ -LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ +LDADD = \ + ../../src/libpinyin_internal.a \ + ../../src/storage/libstorage.a \ + ../../src/lookup/liblookup.a \ + @GLIB2_LIBS@ \ + $(NULL) noinst_PROGRAMS = test_pinyin_lookup \ test_phrase_lookup diff --git a/tests/lookup/test_pinyin_lookup.cpp b/tests/lookup/test_pinyin_lookup.cpp index c1552b4..9cc9f86 100644 --- a/tests/lookup/test_pinyin_lookup.cpp +++ b/tests/lookup/test_pinyin_lookup.cpp @@ -115,7 +115,7 @@ int main( int argc, char * argv[]){ for (size_t i = 0; i < results.size(); ++i) { MatchResult result = NULL; - assert(results.get_result(i, result)); + check_result(results.get_result(i, result)); for (size_t j = 0; j < result->len; ++j){ phrase_token_t * token = &g_array_index(result, phrase_token_t, j); diff --git a/tests/storage/CMakeLists.txt b/tests/storage/CMakeLists.txt index 378e134..4b3a60f 100644 --- a/tests/storage/CMakeLists.txt +++ b/tests/storage/CMakeLists.txt @@ -40,6 +40,8 @@ target_link_libraries( pinyin ) +add_test(NAME phrase_index_logger COMMAND test_phrase_index_logger) + add_executable( test_phrase_table test_phrase_table.cpp @@ -60,6 +62,8 @@ target_link_libraries( pinyin ) +add_test(NAME ngram COMMAND test_ngram) + add_executable( test_flexible_ngram test_flexible_ngram.cpp @@ -69,3 +73,5 @@ target_link_libraries( test_flexible_ngram pinyin ) + +add_test(NAME flexible_ngram COMMAND test_flexible_ngram) diff --git a/tests/storage/Makefile.am b/tests/storage/Makefile.am index 6f75534..6b05918 100644 --- a/tests/storage/Makefile.am +++ b/tests/storage/Makefile.am @@ -14,14 +14,19 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. 
-INCLUDES = -I$(top_srcdir)/src \ +AM_CPPFLAGS = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ -I$(top_srcdir)/tests \ @GLIB2_CFLAGS@ -LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ +LDADD = \ + ../../src/libpinyin_internal.a \ + ../../src/storage/libstorage.a \ + ../../src/lookup/liblookup.a \ + @GLIB2_LIBS@ \ + $(NULL) TESTS = test_phrase_index_logger \ test_ngram \ diff --git a/tests/storage/test_flexible_ngram.cpp b/tests/storage/test_flexible_ngram.cpp index 0aaf15c..28b1d70 100644 --- a/tests/storage/test_flexible_ngram.cpp +++ b/tests/storage/test_flexible_ngram.cpp @@ -26,7 +26,7 @@ int main(int argc, char * argv[]) { typedef FlexibleSingleGram<guint32, guint32>::ArrayItemWithToken array_item_t; const guint32 total_freq = 16; - assert(single_gram.set_array_header(total_freq)); + check_result(single_gram.set_array_header(total_freq)); phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3 }; guint32 freqs[6] = { 1, 2, 4, 8, 16, 32}; @@ -35,9 +35,9 @@ int main(int argc, char * argv[]) { for ( size_t i = 0; i < G_N_ELEMENTS(tokens); ++i ){ if ( single_gram.get_array_item(tokens[i], freq) ) - assert(single_gram.set_array_item(tokens[i], freqs[i])); + check_result(single_gram.set_array_item(tokens[i], freqs[i])); else - assert(single_gram.insert_array_item(tokens[i], freqs[i])); + check_result(single_gram.insert_array_item(tokens[i], freqs[i])); } single_gram.get_array_item(3, freq); @@ -53,16 +53,16 @@ int main(int argc, char * argv[]) { printf("item:%d:%d\n", item->m_token, item->m_item); } - assert(single_gram.get_array_header(freq)); + check_result(single_gram.get_array_header(freq)); assert(freq == total_freq); FlexibleBigram<guint32, guint32, guint32> bigram("TEST"); - assert(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE)); + check_result(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE)); bigram.store(1, &single_gram); - assert(single_gram.insert_array_item(5, 
8)); - assert(single_gram.remove_array_item(1, freq)); - assert(single_gram.set_array_header(32)); - assert(single_gram.get_array_header(freq)); + check_result(single_gram.insert_array_item(5, 8)); + check_result(single_gram.remove_array_item(1, freq)); + check_result(single_gram.set_array_header(32)); + check_result(single_gram.get_array_header(freq)); printf("new array header:%d\n", freq); bigram.store(2, &single_gram); @@ -122,7 +122,7 @@ int main(int argc, char * argv[]) { delete train_gram; } - assert(bigram.remove(1)); + check_result(bigram.remove(1)); bigram.get_all_items(items); printf("-----------------------items----------------------------\n"); diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp index 0313f1c..3070ee9 100644 --- a/tests/storage/test_ngram.cpp +++ b/tests/storage/test_ngram.cpp @@ -6,7 +6,7 @@ int main(int argc, char * argv[]){ SingleGram single_gram; const guint32 total_freq = 16; - assert(single_gram.set_total_freq(total_freq)); + check_result(single_gram.set_total_freq(total_freq)); phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3}; guint32 freqs[6] = { 1, 2, 4, 8, 16, 32}; @@ -15,9 +15,9 @@ int main(int argc, char * argv[]){ for(size_t i = 0; i < 6 ;++i){ if ( single_gram.get_freq(tokens[i], freq)) - assert(single_gram.set_freq(tokens[i], freqs[i])); + check_result(single_gram.set_freq(tokens[i], freqs[i])); else - assert(single_gram.insert_freq(tokens[i], freqs[i])); + check_result(single_gram.insert_freq(tokens[i], freqs[i])); } single_gram.get_freq(3, freq); @@ -33,14 +33,14 @@ int main(int argc, char * argv[]){ printf("item:%d:%f\n", item->m_token, item->m_freq); } - assert(single_gram.get_total_freq(freq)); + check_result(single_gram.get_total_freq(freq)); assert(freq == total_freq); Bigram bigram; - assert(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE)); + check_result(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE)); bigram.store(1, &single_gram); - 
assert(single_gram.insert_freq(5, 8)); - assert(single_gram.remove_freq(1, freq)); + check_result(single_gram.insert_freq(5, 8)); + check_result(single_gram.remove_freq(1, freq)); single_gram.set_total_freq(32); bigram.store(2, &single_gram); @@ -61,7 +61,7 @@ int main(int argc, char * argv[]){ } printf("--------------------------------------------------------\n"); - assert(single_gram.get_total_freq(freq)); + check_result(single_gram.get_total_freq(freq)); printf("total_freq:%d\n", freq); g_array_free(array, TRUE); @@ -75,8 +75,8 @@ int main(int argc, char * argv[]){ printf("item:%d\n", *token); } - assert(bigram.load_db("/tmp/test.db")); - assert(bigram.save_db("/tmp/test.db")); + check_result(bigram.save_db("/tmp/snapshot.db")); + check_result(bigram.load_db("/tmp/snapshot.db")); g_array_free(items, TRUE); diff --git a/tests/storage/test_phrase_index.cpp b/tests/storage/test_phrase_index.cpp index fa0721a..0fa6a29 100644 --- a/tests/storage/test_phrase_index.cpp +++ b/tests/storage/test_phrase_index.cpp @@ -38,11 +38,11 @@ int main(int argc, char * argv[]){ assert(string1 == string2); FacadePhraseIndex phrase_index_test; - assert(!phrase_index_test.add_phrase_item(1, &phrase_item)); + check_result(!phrase_index_test.add_phrase_item(1, &phrase_item)); MemoryChunk* chunk = new MemoryChunk; - assert(phrase_index_test.store(0, chunk)); - assert(phrase_index_test.load(0, chunk)); + check_result(phrase_index_test.store(0, chunk)); + check_result(phrase_index_test.load(0, chunk)); PhraseItem item2; guint32 time = record_time(); diff --git a/tests/storage/test_phrase_index_logger.cpp b/tests/storage/test_phrase_index_logger.cpp index 4f77964..452b162 100644 --- a/tests/storage/test_phrase_index_logger.cpp +++ b/tests/storage/test_phrase_index_logger.cpp @@ -30,7 +30,7 @@ int main(int argc, char * argv[]){ phrase_index.load(1, chunk); PhraseIndexRange range; - assert(ERROR_OK == phrase_index.get_range(1, range)); + check_result(ERROR_OK == phrase_index.get_range(1, 
range)); for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) { phrase_index.add_unigram_frequency(i, 1); } @@ -45,7 +45,7 @@ int main(int argc, char * argv[]){ chunk = new MemoryChunk; chunk->load("../../data/gb_char.bin"); new_chunk = new MemoryChunk; - assert(phrase_index.diff(1, chunk, new_chunk)); + check_result(phrase_index.diff(1, chunk, new_chunk)); new_chunk->save("/tmp/gb_char.dbin"); delete new_chunk; @@ -54,7 +54,7 @@ int main(int argc, char * argv[]){ phrase_index.load(1, chunk); new_chunk = new MemoryChunk; new_chunk->load("/tmp/gb_char.dbin"); - assert(phrase_index.merge(1, new_chunk)); + check_result(phrase_index.merge(1, new_chunk)); chunk = new MemoryChunk; phrase_index.store(1, chunk); chunk->save("/tmp/gb_char2.bin"); diff --git a/tests/storage/test_table_info.cpp b/tests/storage/test_table_info.cpp index e2e4893..27933bc 100644 --- a/tests/storage/test_table_info.cpp +++ b/tests/storage/test_table_info.cpp @@ -44,7 +44,7 @@ void dump_table_info(const pinyin_table_info_t * table_info) { break; default: - assert(false); + abort(); } } @@ -90,8 +90,8 @@ int main(int argc, char * argv[]) { retval = user_table_info.is_conform(&system_table_info); assert(retval); - assert(user_table_info.save("/tmp/user.conf")); - assert(user_table_info.load("/tmp/user.conf")); + check_result(user_table_info.save("/tmp/user.conf")); + check_result(user_table_info.load("/tmp/user.conf")); retval = user_table_info.is_conform(&system_table_info); assert(retval); diff --git a/tests/test_pinyin.cpp b/tests/test_pinyin.cpp index 8eadf89..316cf1e 100644 --- a/tests/test_pinyin.cpp +++ b/tests/test_pinyin.cpp @@ -69,8 +69,8 @@ int main(int argc, char * argv[]){ size_t len = pinyin_parse_more_full_pinyins(instance, linebuf); pinyin_guess_sentence_with_prefix(instance, prefixbuf); - pinyin_guess_candidates(instance, 0, - SORT_BY_PHRASE_LENGTH_AND_FREQUENCY); + guint sort_option = SORT_BY_PHRASE_LENGTH | SORT_BY_FREQUENCY; + pinyin_guess_candidates(instance, 0, 
sort_option); size_t i = 0; for (i = 0; i <= len; ++i) { diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am index 4354107..72e6743 100644 --- a/utils/segment/Makefile.am +++ b/utils/segment/Makefile.am @@ -16,14 +16,17 @@ MAINTAINERCLEANFILES = Makefile.in -INCLUDES = -I$(top_srcdir)/src \ - -I$(top_srcdir)/src/include \ - -I$(top_srcdir)/src/storage \ - -I$(top_srcdir)/src/lookup \ - -I$(top_srcdir)/utils \ - @GLIB2_CFLAGS@ +AM_CPPFLAGS = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ -LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ +LDADD = ../../src/libpinyin_internal.a \ + ../../src/storage/libstorage.a \ + ../../src/lookup/liblookup.a \ + @GLIB2_LIBS@ noinst_PROGRAMS = spseg ngseg mergeseq diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am index f5326a1..256386b 100644 --- a/utils/storage/Makefile.am +++ b/utils/storage/Makefile.am @@ -14,14 +14,19 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. 
-INCLUDES = -I$(top_srcdir)/src \ +AM_CPPFLAGS = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ -I$(top_srcdir)/utils \ @GLIB2_CFLAGS@ -LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ +LDADD = \ + ../../src/libpinyin_internal.a \ + ../../src/storage/libstorage.a \ + ../../src/lookup/liblookup.a \ + @GLIB2_LIBS@ \ + $(NULL) bin_PROGRAMS = gen_binary_files \ import_interpolation diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp index fe73d75..c4620c4 100644 --- a/utils/storage/gen_binary_files.cpp +++ b/utils/storage/gen_binary_files.cpp @@ -95,16 +95,16 @@ int main(int argc, char * argv[]){ g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } + g_option_context_free(context); SystemTableInfo2 system_table_info; - gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + const gchar * filename = SYSTEM_TABLE_INFO; bool retval = system_table_info.load(filename); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } - g_free(filename); const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp index 041c85c..a07ed47 100644 --- a/utils/storage/import_interpolation.cpp +++ b/utils/storage/import_interpolation.cpp @@ -76,7 +76,7 @@ static ssize_t my_getline(FILE * input){ bool parse_headline(){ /* enter "\data" line */ - assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "")); + check_result(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "")); /* read "\data" line */ if ( !taglib_read(linebuf, line_type, values, required) ) { @@ -99,13 +99,13 @@ bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table, Bigram * bigram){ taglib_push_state(); - assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); - assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); - 
assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); + check_result(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + check_result(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + check_result(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); do { retry: - assert(taglib_read(linebuf, line_type, values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch(line_type) { case END_LINE: goto end; @@ -118,7 +118,7 @@ bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table, parse_bigram(input, phrase_table, phrase_index, bigram); goto retry; default: - assert(false); + abort(); } } while (my_getline(input) != -1) ; @@ -131,17 +131,17 @@ bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index){ taglib_push_state(); - assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", "")); + check_result(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", "")); do { - assert(taglib_read(linebuf, line_type, values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_1_ITEM_LINE:{ /* handle \item in \1-gram */ TAGLIB_GET_TOKEN(token, 0); TAGLIB_GET_PHRASE_STRING(word, 1); - assert(taglib_validate_token_with_string - (phrase_index, token, word)); + check_result(taglib_validate_token_with_string + (phrase_index, token, word)); TAGLIB_GET_TAGVALUE(glong, count, atol); phrase_index->add_unigram_frequency(token, count); @@ -152,7 +152,7 @@ bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table, case GRAM_2_LINE: goto end; default: - assert(false); + abort(); } } while (my_getline(input) != -1); @@ -166,24 +166,24 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, Bigram * bigram){ taglib_push_state(); - assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", "")); + check_result(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", "")); phrase_token_t last_token = 0; SingleGram * 
last_single_gram = NULL; do { - assert(taglib_read(linebuf, line_type, values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_2_ITEM_LINE:{ /* handle \item in \2-gram */ /* two tokens */ TAGLIB_GET_TOKEN(token1, 0); TAGLIB_GET_PHRASE_STRING(word1, 1); - assert(taglib_validate_token_with_string - (phrase_index, token1, word1)); + check_result(taglib_validate_token_with_string + (phrase_index, token1, word1)); TAGLIB_GET_TOKEN(token2, 2); TAGLIB_GET_PHRASE_STRING(word2, 3); - assert(taglib_validate_token_with_string - (phrase_index, token2, word2)); + check_result(taglib_validate_token_with_string + (phrase_index, token2, word2)); TAGLIB_GET_TAGVALUE(glong, count, atol); @@ -209,10 +209,10 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, /* save the freq */ assert(NULL != last_single_gram); guint32 total_freq = 0; - assert(last_single_gram->get_total_freq(total_freq)); - assert(last_single_gram->insert_freq(token2, count)); + check_result(last_single_gram->get_total_freq(total_freq)); + check_result(last_single_gram->insert_freq(token2, count)); total_freq += count; - assert(last_single_gram->set_total_freq(total_freq)); + check_result(last_single_gram->set_total_freq(total_freq)); break; } case END_LINE: @@ -220,7 +220,7 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, case GRAM_2_LINE: goto end; default: - assert(false); + abort(); } } while (my_getline(input) != -1); @@ -255,13 +255,12 @@ int main(int argc, char * argv[]){ SystemTableInfo2 system_table_info; - gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + const gchar * filename = SYSTEM_TABLE_INFO; bool retval = system_table_info.load(filename); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } - g_free(filename); PhraseLargeTable3 phrase_table; diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am index a70945a..d82cacf 100644 --- 
a/utils/training/Makefile.am +++ b/utils/training/Makefile.am @@ -16,14 +16,17 @@ MAINTAINERCLEANFILES = Makefile.in -INCLUDES = -I$(top_srcdir)/src \ - -I$(top_srcdir)/src/include \ - -I$(top_srcdir)/src/storage \ - -I$(top_srcdir)/src/lookup \ - -I$(top_srcdir)/utils \ - @GLIB2_CFLAGS@ - -LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ +AM_CPPFLAGS = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ + +LDADD = ../../src/libpinyin_internal.a \ + ../../src/storage/libstorage.a \ + ../../src/lookup/liblookup.a \ + @GLIB2_LIBS@ noinst_HEADERS = k_mixture_model.h diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp index a0e1dc6..089eadc 100644 --- a/utils/training/estimate_interpolation.cpp +++ b/utils/training/estimate_interpolation.cpp @@ -55,7 +55,7 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram, parameter_t elem_poss = 0; if (bigram && bigram->get_freq(token, freq)){ guint32 total_freq; - assert(bigram->get_total_freq(total_freq)); + check_result(bigram->get_total_freq(total_freq)); assert(0 != total_freq); elem_poss = freq / (parameter_t) total_freq; } @@ -78,7 +78,7 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram, next_lambda += deleted_count * (numerator / (numerator + part_of_denominator)); } - assert(deleted_bigram->get_total_freq(table_num)); + check_result(deleted_bigram->get_total_freq(table_num)); next_lambda /= table_num; g_array_free(array, TRUE); diff --git a/utils/training/estimate_k_mixture_model.cpp b/utils/training/estimate_k_mixture_model.cpp index 779f67d..f5c5777 100644 --- a/utils/training/estimate_k_mixture_model.cpp +++ b/utils/training/estimate_k_mixture_model.cpp @@ -41,7 +41,7 @@ parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram, parameter_t epsilon = 0.001; KMixtureModelMagicHeader magic_header; - 
assert(unigram->get_magic_header(magic_header)); + check_result(unigram->get_magic_header(magic_header)); assert(0 != magic_header.m_total_freq); while (fabs(lambda - next_lambda) > epsilon){ @@ -64,7 +64,7 @@ parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram, KMixtureModelArrayHeader array_header; KMixtureModelArrayItem array_item; if ( bigram && bigram->get_array_item(token, array_item) ){ - assert(bigram->get_array_header(array_header)); + check_result(bigram->get_array_header(array_header)); assert(0 != array_header.m_WC); elem_poss = array_item.m_WC / (parameter_t) array_header.m_WC; } @@ -85,7 +85,7 @@ parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram, next_lambda += deleted_count * (numerator / (numerator + part_of_denominator)); } KMixtureModelArrayHeader header; - assert(deleted_bigram->get_array_header(header)); + check_result(deleted_bigram->get_array_header(header)); assert(0 != header.m_WC); next_lambda /= header.m_WC; @@ -131,9 +131,9 @@ int main(int argc, char * argv[]){ KMixtureModelArrayHeader array_header; if (single_gram) - assert(single_gram->get_array_header(array_header)); + check_result(single_gram->get_array_header(array_header)); KMixtureModelArrayHeader deleted_array_header; - assert(deleted_single_gram->get_array_header(deleted_array_header)); + check_result(deleted_single_gram->get_array_header(deleted_array_header)); if ( 0 != deleted_array_header.m_WC ) { parameter_t lambda = compute_interpolation(deleted_single_gram, &bigram, single_gram); diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp index aa33e6b..d795c0d 100644 --- a/utils/training/eval_correction_rate.cpp +++ b/utils/training/eval_correction_rate.cpp @@ -45,14 +45,14 @@ bool get_possible_pinyin(FacadePhraseIndex * phrase_index, key_index = 0; max_freq = 0; for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) { freq = 0; - assert(item.get_nth_pronunciation(m, buffer, freq)); + 
check_result(item.get_nth_pronunciation(m, buffer, freq)); if ( freq > max_freq ) { key_index = m; max_freq = freq; } } - assert(item.get_nth_pronunciation(key_index, buffer, freq)); + check_result(item.get_nth_pronunciation(key_index, buffer, freq)); assert(max_freq == freq); guint8 len = item.get_phrase_length(); g_array_append_vals(keys, buffer, len); @@ -105,7 +105,7 @@ bool do_one_test(PhoneticLookup<1, 1> * pinyin_lookup, get_best_match(phrase_index, pinyin_lookup, &matrix, &results); assert(1 == results.size()); - assert(results.get_result(0, guessed_tokens)); + check_result(results.get_result(0, guessed_tokens)); /* compare the results */ char * sentence = NULL; char * guessed_sentence = NULL; diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp index fe11cb6..8466d3b 100644 --- a/utils/training/export_k_mixture_model.cpp +++ b/utils/training/export_k_mixture_model.cpp @@ -55,7 +55,7 @@ bool print_k_mixture_model_array_headers(FILE * output, for (size_t i = 0; i < items->len; ++i) { phrase_token_t token = g_array_index(items, phrase_token_t, i); KMixtureModelArrayHeader array_header; - assert(bigram->get_array_header(token, array_header)); + check_result(bigram->get_array_header(token, array_header)); char * phrase = taglib_token_to_string(phrase_index, token); if ( phrase ) fprintf(output, "\\item %d %s count %d freq %d\n", @@ -76,7 +76,7 @@ bool print_k_mixture_model_array_items(FILE * output, for (size_t i = 0; i < items->len; ++i) { phrase_token_t token = g_array_index(items, phrase_token_t, i); KMixtureModelSingleGram * single_gram = NULL; - assert(bigram->load(token, single_gram)); + check_result(bigram->load(token, single_gram)); FlexibleBigramPhraseArray array = g_array_new (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); single_gram->retrieve_all(array); diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp index 9f61bd7..074c198 100644 --- 
a/utils/training/gen_deleted_ngram.cpp +++ b/utils/training/gen_deleted_ngram.cpp @@ -109,9 +109,9 @@ int main(int argc, char * argv[]){ guint32 freq, total_freq; //increase freq if (single_gram->get_freq(cur_token, freq)) - assert(single_gram->set_freq(cur_token, freq + 1)); + check_result(single_gram->set_freq(cur_token, freq + 1)); else - assert(single_gram->insert_freq(cur_token, 1)); + check_result(single_gram->insert_freq(cur_token, 1)); //increase total freq single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp index b4066d0..94969a4 100644 --- a/utils/training/gen_k_mixture_model.cpp +++ b/utils/training/gen_k_mixture_model.cpp @@ -156,19 +156,19 @@ static void train_word_pair(HashofUnigram hash_of_unigram, */ if ( count > maximum_occurs_allowed ){ gpointer value = NULL; - assert( g_hash_table_lookup_extended - (hash_of_unigram, GUINT_TO_POINTER(token2), - NULL, &value) ); + check_result(g_hash_table_lookup_extended + (hash_of_unigram, GUINT_TO_POINTER(token2), + NULL, &value)); guint32 freq = GPOINTER_TO_UINT(value); freq -= count; if ( freq > 0 ) { g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2), GUINT_TO_POINTER(freq)); } else if ( freq == 0 ) { - assert(g_hash_table_steal(hash_of_unigram, - GUINT_TO_POINTER(token2))); + check_result(g_hash_table_steal(hash_of_unigram, + GUINT_TO_POINTER(token2))); } else { - assert(false); + abort(); } return; } @@ -178,24 +178,24 @@ static void train_word_pair(HashofUnigram hash_of_unigram, if ( 1 == count ) array_item.m_n_1 ++; array_item.m_Mr = std_lite::max(array_item.m_Mr, count); - assert(single_gram->set_array_item(token2, array_item)); + check_result(single_gram->set_array_item(token2, array_item)); } else { /* item doesn't exist. */ /* the same as above. 
*/ if ( count > g_maximum_occurs ){ gpointer value = NULL; - assert( g_hash_table_lookup_extended - (hash_of_unigram, GUINT_TO_POINTER(token2), - NULL, &value) ); + check_result(g_hash_table_lookup_extended + (hash_of_unigram, GUINT_TO_POINTER(token2), + NULL, &value)); guint32 freq = GPOINTER_TO_UINT(value); freq -= count; if ( freq > 0 ) { g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2), GUINT_TO_POINTER(freq)); } else if ( freq == 0 ) { - assert(g_hash_table_steal(hash_of_unigram, - GUINT_TO_POINTER(token2))); + check_result(g_hash_table_steal(hash_of_unigram, + GUINT_TO_POINTER(token2))); } else { - assert(false); + abort(); } return; } @@ -206,7 +206,7 @@ static void train_word_pair(HashofUnigram hash_of_unigram, if ( 1 == count ) array_item.m_n_1 = 1; array_item.m_Mr = count; - assert(single_gram->insert_array_item(token2, array_item)); + check_result(single_gram->insert_array_item(token2, array_item)); } /* save delta in the array header. */ @@ -224,14 +224,14 @@ bool train_single_gram(HashofUnigram hash_of_unigram, assert(NULL != single_gram); delta = 0; /* delta in WC of single_gram. 
*/ KMixtureModelArrayHeader array_header; - assert(single_gram->get_array_header(array_header)); + check_result(single_gram->get_array_header(array_header)); guint32 saved_array_header_WC = array_header.m_WC; HashofSecondWord hash_of_second_word = NULL; gpointer key, value = NULL; - assert(g_hash_table_lookup_extended - (hash_of_document, GUINT_TO_POINTER(token1), - NULL, &value)); + check_result(g_hash_table_lookup_extended + (hash_of_document, GUINT_TO_POINTER(token1), + NULL, &value)); hash_of_second_word = (HashofSecondWord) value; assert(NULL != hash_of_second_word); @@ -244,7 +244,7 @@ bool train_single_gram(HashofUnigram hash_of_unigram, train_word_pair(hash_of_unigram, single_gram, token2, count); } - assert(single_gram->get_array_header(array_header)); + check_result(single_gram->get_array_header(array_header)); delta = array_header.m_WC - saved_array_header_WC; return true; } @@ -268,7 +268,7 @@ static bool train_second_word(HashofUnigram hash_of_unigram, } /* save the single gram. 
*/ - assert(bigram->store(token1, single_gram)); + check_result(bigram->store(token1, single_gram)); delete single_gram; KMixtureModelMagicHeader magic_header; @@ -282,7 +282,7 @@ static bool train_second_word(HashofUnigram hash_of_unigram, return false; } magic_header.m_WC += delta; - assert(bigram->set_magic_header(magic_header)); + check_result(bigram->set_magic_header(magic_header)); return true; } @@ -306,13 +306,13 @@ static bool post_processing_unigram(KMixtureModelBigram * bigram, } KMixtureModelMagicHeader magic_header; - assert(bigram->get_magic_header(magic_header)); + check_result(bigram->get_magic_header(magic_header)); if ( magic_header.m_total_freq + total_freq < magic_header.m_total_freq ){ fprintf(stderr, "the m_total_freq in magic header overflows.\n"); return false; } magic_header.m_total_freq += total_freq; - assert(bigram->set_magic_header(magic_header)); + check_result(bigram->set_magic_header(magic_header)); return true; } @@ -369,8 +369,8 @@ int main(int argc, char * argv[]){ HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal); - assert(read_document(&phrase_table, &phrase_index, document, - hash_of_document, hash_of_unigram)); + check_result(read_document(&phrase_table, &phrase_index, document, + hash_of_document, hash_of_unigram)); fclose(document); document = NULL; @@ -386,9 +386,9 @@ int main(int argc, char * argv[]){ } KMixtureModelMagicHeader magic_header; - assert(bigram.get_magic_header(magic_header)); + check_result(bigram.get_magic_header(magic_header)); magic_header.m_N ++; - assert(bigram.set_magic_header(magic_header)); + check_result(bigram.set_magic_header(magic_header)); post_processing_unigram(&bigram, hash_of_unigram); diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp index 9d9c643..3e73cbe 100644 --- a/utils/training/gen_ngram.cpp +++ b/utils/training/gen_ngram.cpp @@ -113,9 +113,9 @@ int main(int argc, char * argv[]){ guint32 freq, total_freq; /* increase freq */ if 
(single_gram->get_freq(cur_token, freq)) - assert(single_gram->set_freq(cur_token, freq + 1)); + check_result(single_gram->set_freq(cur_token, freq + 1)); else - assert(single_gram->insert_freq(cur_token, 1)); + check_result(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp index f399408..b50db3c 100644 --- a/utils/training/gen_unigram.cpp +++ b/utils/training/gen_unigram.cpp @@ -93,13 +93,12 @@ int main(int argc, char * argv[]){ SystemTableInfo2 system_table_info; - gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + const gchar * filename = SYSTEM_TABLE_INFO; bool retval = system_table_info.load(filename); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } - g_free(filename); const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp index 1a34871..45f8d5e 100644 --- a/utils/training/import_k_mixture_model.cpp +++ b/utils/training/import_k_mixture_model.cpp @@ -77,7 +77,7 @@ static ssize_t my_getline(FILE * input){ bool parse_headline(KMixtureModelBigram * bigram){ /* enter "\data" line */ - assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", "")); + check_result(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", "")); /* read "\data" line */ if ( !taglib_read(linebuf, line_type, values, required) ) { @@ -111,13 +111,13 @@ bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table, KMixtureModelBigram * bigram){ taglib_push_state(); - assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); - assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); - assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); + check_result(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + 
check_result(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + check_result(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); do { retry: - assert(taglib_read(linebuf, line_type, values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch(line_type) { case END_LINE: goto end; @@ -130,7 +130,7 @@ bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table, parse_bigram(input, phrase_table, phrase_index, bigram); goto retry; default: - assert(false); + abort(); } } while (my_getline(input) != -1) ; @@ -144,17 +144,17 @@ bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table, KMixtureModelBigram * bigram){ taglib_push_state(); - assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", "")); + check_result(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", "")); do { - assert(taglib_read(linebuf, line_type, values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_1_ITEM_LINE:{ /* handle \item in \1-gram */ TAGLIB_GET_TOKEN(token, 0); TAGLIB_GET_PHRASE_STRING(word, 1); - assert(taglib_validate_token_with_string - (phrase_index, token, word)); + check_result(taglib_validate_token_with_string + (phrase_index, token, word)); TAGLIB_GET_TAGVALUE(glong, count, atol); TAGLIB_GET_TAGVALUE(glong, freq, atol); @@ -170,7 +170,7 @@ bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table, case GRAM_2_LINE: goto end; default: - assert(false); + abort(); } } while (my_getline(input) != -1); @@ -184,26 +184,26 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, KMixtureModelBigram * bigram){ taglib_push_state(); - assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, - "count:T:N_n_0:n_1:Mr", "")); + check_result(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, + "count:T:N_n_0:n_1:Mr", "")); phrase_token_t last_token = null_token; KMixtureModelSingleGram * last_single_gram = NULL; do { - assert(taglib_read(linebuf, line_type, 
values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_2_ITEM_LINE:{ /* handle \item in \2-gram */ /* two tokens */ TAGLIB_GET_TOKEN(token1, 0); TAGLIB_GET_PHRASE_STRING(word1, 1); - assert(taglib_validate_token_with_string - (phrase_index, token1, word1)); + check_result(taglib_validate_token_with_string + (phrase_index, token1, word1)); TAGLIB_GET_TOKEN(token2, 2); TAGLIB_GET_PHRASE_STRING(word2, 3); - assert(taglib_validate_token_with_string - (phrase_index, token2, word2)); + check_result(taglib_validate_token_with_string + (phrase_index, token2, word2)); TAGLIB_GET_TAGVALUE(glong, count, atol); TAGLIB_GET_TAGVALUE(glong, T, atol); @@ -236,7 +236,7 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, } assert(NULL != last_single_gram); - assert(last_single_gram->insert_array_item(token2, array_item)); + check_result(last_single_gram->insert_array_item(token2, array_item)); break; } case END_LINE: @@ -244,7 +244,7 @@ bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, case GRAM_2_LINE: goto end; default: - assert(false); + abort(); } } while (my_getline(input) != -1); diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h index 5b7bfde..3fa1dc9 100644 --- a/utils/training/k_mixture_model.h +++ b/utils/training/k_mixture_model.h @@ -79,7 +79,7 @@ static inline parameter_t compute_Pr_G_3(corpus_count_t k, return (alpha * gamma / (B - 1)) * pow((1 - 1 / (B - 1)) , k - 2); } - assert(false); + abort(); } static inline parameter_t compute_Pr_G_3_with_count(corpus_count_t k, diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp index 4879ac7..3a549a6 100644 --- a/utils/training/k_mixture_model_to_interpolation.cpp +++ b/utils/training/k_mixture_model_to_interpolation.cpp @@ -58,8 +58,8 @@ static ssize_t my_getline(FILE * input){ bool parse_headline(FILE * input, FILE * output) { /* enter 
"\data" line */ - assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", - "count:N:total_freq")); + check_result(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", + "count:N:total_freq")); /* read "\data" line */ if ( !taglib_read(linebuf, line_type, values, required) ) { @@ -83,13 +83,13 @@ bool parse_headline(FILE * input, FILE * output) { bool parse_body(FILE * input, FILE * output){ taglib_push_state(); - assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); - assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); - assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); + check_result(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + check_result(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + check_result(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); do { retry: - assert(taglib_read(linebuf, line_type, values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch(line_type) { case END_LINE: fprintf(output, "\\end\n"); @@ -105,7 +105,7 @@ bool parse_body(FILE * input, FILE * output){ parse_bigram(input, output); goto retry; default: - assert(false); + abort(); } } while (my_getline(input) != -1); @@ -117,10 +117,10 @@ bool parse_body(FILE * input, FILE * output){ bool parse_unigram(FILE * input, FILE * output){ taglib_push_state(); - assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count")); + check_result(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count")); do { - assert(taglib_read(linebuf, line_type, values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch(line_type) { case GRAM_1_ITEM_LINE: { /* handle \item in \1-gram */ @@ -143,7 +143,7 @@ bool parse_unigram(FILE * input, FILE * output){ case GRAM_2_LINE: goto end; default: - assert(false); + abort(); } } while (my_getline(input) != -1); @@ -155,11 +155,11 @@ bool parse_unigram(FILE * input, FILE * output){ bool parse_bigram(FILE * input, FILE * output){ taglib_push_state(); - 
assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, - "count", "T:N_n_0:n_1:Mr")); + check_result(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, + "count", "T:N_n_0:n_1:Mr")); do { - assert(taglib_read(linebuf, line_type, values, required)); + check_result(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_2_ITEM_LINE:{ /* handle \item in \2-gram */ @@ -180,7 +180,7 @@ bool parse_bigram(FILE * input, FILE * output){ case GRAM_2_LINE: goto end; default: - assert(false); + abort(); } } while (my_getline(input) != -1); diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp index 9554505..b27b4fd 100644 --- a/utils/training/merge_k_mixture_model.cpp +++ b/utils/training/merge_k_mixture_model.cpp @@ -104,7 +104,7 @@ static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target, if (!target->get_magic_header(target_magic_header)) { memset(&target_magic_header, 0, sizeof(KMixtureModelMagicHeader)); } - assert(new_one->get_magic_header(new_magic_header)); + check_result(new_one->get_magic_header(new_magic_header)); if ( target_magic_header.m_WC + new_magic_header.m_WC < std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){ fprintf(stderr, "the m_WC integer in magic header overflows.\n"); @@ -124,7 +124,7 @@ static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target, merged_magic_header.m_total_freq = target_magic_header.m_total_freq + new_magic_header.m_total_freq; - assert(target->set_magic_header(merged_magic_header)); + check_result(target->set_magic_header(merged_magic_header)); return true; } @@ -139,7 +139,7 @@ static bool merge_array_items( /* in & out */ KMixtureModelBigram * target, KMixtureModelSingleGram * target_single_gram = NULL; KMixtureModelSingleGram * new_single_gram = NULL; - assert(new_one->load(*token, new_single_gram)); + check_result(new_one->load(*token, new_single_gram)); bool exists_in_target = target->load(*token, 
target_single_gram); if ( !exists_in_target ){ target->store(*token, new_single_gram); @@ -152,8 +152,8 @@ static bool merge_array_items( /* in & out */ KMixtureModelBigram * target, KMixtureModelArrayHeader new_array_header; KMixtureModelArrayHeader merged_array_header; - assert(new_one->get_array_header(*token, new_array_header)); - assert(target->get_array_header(*token, target_array_header)); + check_result(new_one->get_array_header(*token, new_array_header)); + check_result(target->get_array_header(*token, target_array_header)); memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader)); merged_array_header.m_WC = target_array_header.m_WC + @@ -176,7 +176,7 @@ static bool merge_array_items( /* in & out */ KMixtureModelBigram * target, FlexibleBigramPhraseArray merged_array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); - assert(merge_two_phrase_array(target_array, new_array, merged_array)); + check_result(merge_two_phrase_array(target_array, new_array, merged_array)); g_array_free(target_array, TRUE); g_array_free(new_array, TRUE); @@ -189,8 +189,8 @@ static bool merge_array_items( /* in & out */ KMixtureModelBigram * target, merged_single_gram->insert_array_item(item->m_token, item->m_item); } - assert(merged_single_gram->set_array_header(merged_array_header)); - assert(target->store(*token, merged_single_gram)); + check_result(merged_single_gram->set_array_header(merged_array_header)); + check_result(target->store(*token, merged_single_gram)); delete merged_single_gram; g_array_free(merged_array, TRUE); } diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp index 988bf1a..5434908 100644 --- a/utils/training/prune_k_mixture_model.cpp +++ b/utils/training/prune_k_mixture_model.cpp @@ -162,10 +162,10 @@ int main(int argc, char * argv[]){ &g_array_index(removed_array, KMixtureModelArrayItemWithToken, m); KMixtureModelArrayHeader array_header; - assert(bigram.get_array_header(item->m_token, 
array_header)); + check_result(bigram.get_array_header(item->m_token, array_header)); array_header.m_freq -= item->m_item.m_WC; assert(array_header.m_freq >= 0); - assert(bigram.set_array_header(item->m_token, array_header)); + check_result(bigram.set_array_header(item->m_token, array_header)); } g_array_free(removed_array, TRUE); @@ -180,9 +180,9 @@ int main(int argc, char * argv[]){ KMixtureModelArrayHeader array_header; for ( size_t i = 0; i < items->len; ++i ){ phrase_token_t * token = &g_array_index(items, phrase_token_t, i); - assert(bigram.get_array_header(*token, array_header)); + check_result(bigram.get_array_header(*token, array_header)); if ( 0 == array_header.m_WC && 0 == array_header.m_freq ) - assert(bigram.remove(*token)); + check_result(bigram.remove(*token)); } g_array_free(items, TRUE); diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp index 91a4b2c..be4352b 100644 --- a/utils/training/validate_k_mixture_model.cpp +++ b/utils/training/validate_k_mixture_model.cpp @@ -55,7 +55,7 @@ bool validate_unigram(KMixtureModelBigram * bigram){ for (size_t i = 0; i < items->len; ++i) { phrase_token_t * token = &g_array_index(items, phrase_token_t, i); KMixtureModelArrayHeader array_header; - assert(bigram->get_array_header(*token, array_header)); + check_result(bigram->get_array_header(*token, array_header)); word_count += array_header.m_WC; total_freq += array_header.m_freq; } @@ -88,14 +88,14 @@ bool validate_bigram(KMixtureModelBigram * bigram){ for (size_t i = 0; i < items->len; ++i) { phrase_token_t * token = &g_array_index(items, phrase_token_t, i); KMixtureModelSingleGram * single_gram = NULL; - assert(bigram->load(*token, single_gram)); + check_result(bigram->load(*token, single_gram)); FlexibleBigramPhraseArray array = g_array_new (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); single_gram->retrieve_all(array); KMixtureModelArrayHeader array_header; - 
assert(single_gram->get_array_header(array_header)); + check_result(single_gram->get_array_header(array_header)); guint32 expected_sum = array_header.m_WC; guint32 freq = array_header.m_freq; diff --git a/utils/utils_helper.h b/utils/utils_helper.h index d44d129..385415f 100644 --- a/utils/utils_helper.h +++ b/utils/utils_helper.h @@ -22,6 +22,7 @@ #ifndef UTILS_HELPER_H #define UTILS_HELPER_H +#include "pinyin_utils.h" #define TAGLIB_GET_TOKEN(var, index) \ phrase_token_t var = null_token; \ @@ -42,8 +43,8 @@ type var; \ { \ gpointer value = NULL; \ - assert(g_hash_table_lookup_extended \ - (required, #var, NULL, &value)); \ + check_result(g_hash_table_lookup_extended \ + (required, #var, NULL, &value)); \ var = conv((const char *)value); \ } @@ -55,13 +56,13 @@ \ gchar ** strs = g_strsplit_set(line, " \t", 2); \ if (2 != g_strv_length(strs)) \ - assert(false); \ + abort(); \ \ phrase_token_t _token = atoi(strs[0]); \ const char * phrase = strs[1]; \ if (null_token != _token) \ - assert(taglib_validate_token_with_string \ - (phrase_index, _token, phrase)); \ + check_result(taglib_validate_token_with_string \ + (phrase_index, _token, phrase)); \ \ var = _token; \ \ |