-rw-r--r--  .gitignore  85
-rw-r--r--  AUTHORS  2
-rw-r--r--  CMakeLists.txt  151
-rw-r--r--  COPYING  339
-rw-r--r--  ChangeLog  37
-rw-r--r--  Makefile.am  30
-rw-r--r--  NEWS  0
-rw-r--r--  README  4
-rwxr-xr-x  autogen.sh  30
-rw-r--r--  cmake/FindBerkeleyDB.cmake  25
-rw-r--r--  cmake/FindGLIB2.cmake  53
-rw-r--r--  configure.ac  105
-rw-r--r--  data/CMakeLists.txt  95
-rw-r--r--  data/Makefile.am  67
-rw-r--r--  data/table.conf  17
-rw-r--r--  doc/Makefile.am  24
-rw-r--r--  doc/gen_binary_files.1  1
-rw-r--r--  doc/gen_unigram.1  1
-rw-r--r--  doc/import_interpolation.1  1
-rw-r--r--  doc/libpinyin.1  38
-rw-r--r--  libpinyin.pc.in  15
-rw-r--r--  libpinyin.spec.in  121
-rw-r--r--  scripts/Makefile.data  15
-rw-r--r--  scripts/bopomofo.py  530
-rw-r--r--  scripts/chewing.py  73
-rw-r--r--  scripts/chewing_enum.h.in  45
-rw-r--r--  scripts/chewing_table.h.in  50
-rw-r--r--  scripts/chewingkey.py  150
-rw-r--r--  scripts/correct.py  95
-rw-r--r--  scripts/double_pinyin_table.h.in  56
-rw-r--r--  scripts/genbopomofoheader.py  123
-rw-r--r--  scripts/genchewingkey.py  41
-rw-r--r--  scripts/gendoublepinyinheader.py  69
-rw-r--r--  scripts/genpinyinheader.py  46
-rw-r--r--  scripts/genpinyins.py  57
-rw-r--r--  scripts/genpinyintable.py  115
-rw-r--r--  scripts/genspecialtable.py  93
-rw-r--r--  scripts/pinyin.py  400
-rw-r--r--  scripts/pinyin_parser_table.h.in  34
-rw-r--r--  scripts/pinyintable.py  168
-rw-r--r--  scripts/specials.txt  0
-rw-r--r--  scripts/specialtable.py  123
-rw-r--r--  scripts/utils.py  41
-rw-r--r--  src/CMakeLists.txt  50
-rw-r--r--  src/Makefile.am  59
-rw-r--r--  src/include/CMakeLists.txt  11
-rw-r--r--  src/include/Makefile.am  25
-rw-r--r--  src/include/memory_chunk.h  413
-rw-r--r--  src/include/novel_types.h  155
-rw-r--r--  src/include/stl_lite.h  45
-rw-r--r--  src/libpinyin.ver  58
-rw-r--r--  src/lookup/CMakeLists.txt  23
-rw-r--r--  src/lookup/Makefile.am  36
-rw-r--r--  src/lookup/lookup.cpp  73
-rw-r--r--  src/lookup/lookup.h  79
-rw-r--r--  src/lookup/phrase_lookup.cpp  434
-rw-r--r--  src/lookup/phrase_lookup.h  142
-rw-r--r--  src/lookup/pinyin_lookup2.cpp  730
-rw-r--r--  src/lookup/pinyin_lookup2.h  240
-rw-r--r--  src/pinyin.cpp  2096
-rw-r--r--  src/pinyin.h  719
-rw-r--r--  src/pinyin_internal.cpp  4
-rw-r--r--  src/pinyin_internal.h  73
-rw-r--r--  src/storage/CMakeLists.txt  38
-rw-r--r--  src/storage/Makefile.am  59
-rw-r--r--  src/storage/chewing_enum.h  104
-rw-r--r--  src/storage/chewing_key.h  111
-rw-r--r--  src/storage/chewing_large_table.cpp  1047
-rw-r--r--  src/storage/chewing_large_table.h  154
-rw-r--r--  src/storage/chewing_table.h  221
-rw-r--r--  src/storage/double_pinyin_table.h  371
-rw-r--r--  src/storage/facade_chewing_table.h  216
-rw-r--r--  src/storage/facade_phrase_table2.h  203
-rw-r--r--  src/storage/flexible_ngram.h  719
-rw-r--r--  src/storage/ngram.cpp  602
-rw-r--r--  src/storage/ngram.h  329
-rw-r--r--  src/storage/phrase_index.cpp  860
-rw-r--r--  src/storage/phrase_index.h  839
-rw-r--r--  src/storage/phrase_index_logger.h  305
-rw-r--r--  src/storage/phrase_large_table2.cpp  809
-rw-r--r--  src/storage/phrase_large_table2.h  157
-rw-r--r--  src/storage/pinyin_custom2.h  111
-rw-r--r--  src/storage/pinyin_parser2.cpp  989
-rw-r--r--  src/storage/pinyin_parser2.h  361
-rw-r--r--  src/storage/pinyin_parser_table.h  3393
-rw-r--r--  src/storage/pinyin_phrase2.h  267
-rw-r--r--  src/storage/table_info.cpp  272
-rw-r--r--  src/storage/table_info.h  97
-rw-r--r--  src/storage/tag_utility.cpp  420
-rw-r--r--  src/storage/tag_utility.h  151
-rw-r--r--  tests/CMakeLists.txt  33
-rw-r--r--  tests/Makefile.am  50
-rw-r--r--  tests/include/CMakeLists.txt  9
-rw-r--r--  tests/include/Makefile.am  31
-rw-r--r--  tests/include/test_memory_chunk.cpp  64
-rw-r--r--  tests/lookup/CMakeLists.txt  21
-rw-r--r--  tests/lookup/Makefile.am  34
-rw-r--r--  tests/lookup/test_phrase_lookup.cpp  118
-rw-r--r--  tests/lookup/test_pinyin_lookup.cpp  126
-rw-r--r--  tests/storage/CMakeLists.txt  71
-rw-r--r--  tests/storage/Makefile.am  71
-rw-r--r--  tests/storage/test_chewing_table.cpp  148
-rw-r--r--  tests/storage/test_flexible_ngram.cpp  138
-rw-r--r--  tests/storage/test_ngram.cpp  87
-rw-r--r--  tests/storage/test_parser2.cpp  144
-rw-r--r--  tests/storage/test_phrase_index.cpp  122
-rw-r--r--  tests/storage/test_phrase_index_logger.cpp  67
-rw-r--r--  tests/storage/test_phrase_table.cpp  86
-rw-r--r--  tests/storage/test_table_info.cpp  84
-rw-r--r--  tests/test_chewing.cpp  68
-rw-r--r--  tests/test_phrase.cpp  74
-rw-r--r--  tests/test_pinyin.cpp  97
-rw-r--r--  tests/tests_helper.h  86
-rw-r--r--  tests/timer.h  48
-rw-r--r--  utils/CMakeLists.txt  3
-rw-r--r--  utils/Makefile.am  27
-rw-r--r--  utils/segment/CMakeLists.txt  19
-rw-r--r--  utils/segment/Makefile.am  39
-rw-r--r--  utils/segment/mergeseq.cpp  278
-rw-r--r--  utils/segment/ngseg.cpp  261
-rw-r--r--  utils/segment/spseg.cpp  343
-rw-r--r--  utils/storage/CMakeLists.txt  29
-rw-r--r--  utils/storage/Makefile.am  45
-rw-r--r--  utils/storage/export_interpolation.cpp  144
-rw-r--r--  utils/storage/gen_binary_files.cpp  115
-rw-r--r--  utils/storage/gen_pinyin_table.cpp  330
-rw-r--r--  utils/storage/import_interpolation.cpp  313
-rw-r--r--  utils/training/CMakeLists.txt  129
-rw-r--r--  utils/training/Makefile.am  97
-rw-r--r--  utils/training/estimate_interpolation.cpp  144
-rw-r--r--  utils/training/estimate_k_mixture_model.cpp  159
-rw-r--r--  utils/training/eval_correction_rate.cpp  211
-rw-r--r--  utils/training/export_k_mixture_model.cpp  156
-rw-r--r--  utils/training/gen_deleted_ngram.cpp  128
-rw-r--r--  utils/training/gen_k_mixture_model.cpp  411
-rw-r--r--  utils/training/gen_ngram.cpp  136
-rw-r--r--  utils/training/gen_unigram.cpp  111
-rw-r--r--  utils/training/import_k_mixture_model.cpp  322
-rw-r--r--  utils/training/k_mixture_model.h  172
-rw-r--r--  utils/training/k_mixture_model_to_interpolation.cpp  214
-rw-r--r--  utils/training/merge_k_mixture_model.cpp  239
-rw-r--r--  utils/training/prune_k_mixture_model.cpp  192
-rw-r--r--  utils/training/validate_k_mixture_model.cpp  174
-rw-r--r--  utils/utils_helper.h  147
144 files changed, 29200 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1d8db03
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,85 @@
+configure
+Makefile
+Makefile.in
+aclocal.m4
+config.log
+INSTALL
+autom4te.cache
+config.guess
+config.h
+config.h.in
+config.status
+config.sub
+depcomp
+install-sh
+libtool
+ltmain.sh
+missing
+stamp-h1
+libltdl
+*~
+*.o
+*.lo
+*.pyc
+.deps
+.libs
+tags
+TAGS
+cscope.*
+*.la
+libpinyin.pc
+libpinyin.spec
+libpinyin.so*
+src/lookup/liblookup.a
+src/storage/libstorage.a
+tests/include/test_memory_chunk
+tests/lookup/test_phrase_lookup
+tests/storage/test_flexible_ngram
+tests/storage/test_ngram
+tests/storage/test_parser
+tests/storage/test_parser2
+tests/storage/test_phrase_index
+tests/storage/test_phrase_index_logger
+tests/storage/test_phrase_table
+tests/test_chewing
+tests/test_phrase
+tests/test_pinyin
+tests/lookup/test_pinyin_lookup
+tests/storage/test_chewing_table
+tests/storage/test_pinyin_table
+utils/segment/ngseg
+utils/segment/spseg
+utils/storage/export_interpolation
+utils/storage/gen_binary_files
+utils/storage/gen_pinyin_table
+utils/storage/gen_chewing_table
+utils/storage/gen_zhuyin_map
+utils/storage/import_interpolation
+utils/training/estimate_interpolation
+utils/training/estimate_k_mixture_model
+utils/training/eval_correction_rate
+utils/training/export_k_mixture_model
+utils/training/gen_deleted_ngram
+utils/training/gen_k_mixture_model
+utils/training/gen_ngram
+utils/training/gen_unigram
+utils/training/import_k_mixture_model
+utils/training/k_mixture_model_to_interpolation
+utils/training/merge_k_mixture_model
+utils/training/prune_k_mixture_model
+utils/training/validate_k_mixture_model
+data/bigram.db
+data/gb_char.bin
+data/gb_char.dbin
+data/gb_char.table
+data/gbk_char.bin
+data/gbk_char.dbin
+data/gbk_char.table
+data/interpolation.text
+data/phrase_index.bin
+data/pinyin_index.bin
+data/user.db
+CMakeFiles
+CMakeCache.txt
+cmake_install.cmake
+CTestTestfile.cmake
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..41a6682
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,2 @@
+James Su 2002,2003,2006 <suzhe@tsinghua.edu.cn>
+Peng Wu 2006-2007 2010-2011 <alexepico@gmail.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..09421f6
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,151 @@
+## Copyright (C) 2011 BYVoid
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+######## Project settings
+cmake_minimum_required(VERSION 2.8)
+set (PACKAGE_NAME libpinyin)
+project (${PACKAGE_NAME} CXX C)
+enable_testing()
+
+######## Package information
+set (PACKAGE_URL https://github.com/libpinyin/libpinyin)
+set (PACKAGE_BUGREPORT https://github.com/libpinyin/libpinyin/issues)
+set (LIBPINYIN_VERSION_MAJOR 0)
+set (LIBPINYIN_VERSION_MINOR 7)
+set (LIBPINYIN_VERSION_REVISION 0)
+set (LIBPINYIN_BINARY_VERSION 2.0)
+
+if (CMAKE_BUILD_TYPE MATCHES Debug)
+ set (version_suffix .Debug)
+endif (CMAKE_BUILD_TYPE MATCHES Debug)
+
+set (
+ LIBPINYIN_VERSION
+ ${LIBPINYIN_VERSION_MAJOR}.${LIBPINYIN_VERSION_MINOR}.${LIBPINYIN_VERSION_REVISION}${version_suffix}
+)
+
+set (VERSION ${LIBPINYIN_VERSION})
+
+######## Validation
+
+include(CheckIncludeFileCXX)
+check_include_file_cxx(locale.h HAVE_LOCALE_H)
+check_include_file_cxx(libintl.h HAVE_LIBINTL_H)
+check_include_file_cxx(stdlib.h HAVE_STDLIB_H)
+check_include_file_cxx(string.h HAVE_STRING_H)
+check_include_file_cxx(sys/time.h HAVE_SYS_TIME_H)
+check_include_file_cxx(unistd.h HAVE_UNISTD_H)
+
+include(CheckFunctionExists)
+check_function_exists(gettimeofday HAVE_GETTIMEOFDAY)
+check_function_exists(malloc HAVE_MALLOC)
+check_function_exists(memcmp HAVE_MEMCMP)
+check_function_exists(memmove HAVE_MEMMOVE)
+check_function_exists(memset HAVE_MEMSET)
+check_function_exists(realloc HAVE_REALLOC)
+check_function_exists(setlocale HAVE_SETLOCALE)
+check_function_exists(stat HAVE_STAT)
+
+include(CheckTypeSize)
+check_type_size(size_t SIZE_OF_SIZE_T)
+
+set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+find_package(GLIB2 REQUIRED)
+find_package(BerkeleyDB REQUIRED)
+
+######## Windows
+
+if (WIN32)
+ set(CMAKE_SHARED_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX})
+ set(CMAKE_STATIC_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX})
+endif (WIN32)
+
+######## Directory
+
+set (DIR_PREFIX ${CMAKE_INSTALL_PREFIX})
+set (DIR_LIBRARY ${DIR_PREFIX}/${CMAKE_SHARED_LIBRARY_PREFIX})
+set (DIR_LIBRARY_STATIC ${DIR_PREFIX}/${CMAKE_STATIC_LIBRARY_PREFIX})
+set (DIR_INCLUDE ${DIR_PREFIX}/include)
+set (DIR_SHARE ${DIR_PREFIX}/share)
+set (DIR_BIN ${DIR_PREFIX}/bin)
+set (DIR_ETC ${DIR_PREFIX}/etc)
+
+if (DEFINED CMAKE_INSTALL_LIBDIR)
+ set (DIR_LIBRARY ${CMAKE_INSTALL_LIBDIR})
+ set (DIR_LIBRARY_STATIC ${CMAKE_INSTALL_LIBDIR})
+endif (DEFINED CMAKE_INSTALL_LIBDIR)
+
+if (DEFINED SHARE_INSTALL_PREFIX)
+ set (DIR_SHARE ${SHARE_INSTALL_PREFIX})
+endif (DEFINED SHARE_INSTALL_PREFIX)
+
+if (DEFINED INCLUDE_INSTALL_DIR)
+ set (DIR_INCLUDE ${INCLUDE_INSTALL_DIR})
+endif (DEFINED INCLUDE_INSTALL_DIR)
+
+if (DEFINED SYSCONF_INSTALL_DIR)
+ set (DIR_ETC ${SYSCONF_INSTALL_DIR})
+endif (DEFINED SYSCONF_INSTALL_DIR)
+
+set (DIR_SHARE_LIBPINYIN ${DIR_SHARE}/libpinyin)
+set (DIR_INCLUDE_LIBPINYIN ${DIR_INCLUDE}/libpinyin-${LIBPINYIN_BINARY_VERSION})
+
+######## Configuration
+
+set (prefix ${DIR_PREFIX})
+set (exec_prefix ${DIR_PREFIX})
+set (libdir ${DIR_LIBRARY})
+set (includedir ${DIR_INCLUDE})
+set (datadir ${DIR_SHARE})
+
+configure_file(
+ libpinyin.pc.in
+ libpinyin.pc
+ @ONLY
+)
+
+install(
+ FILES
+ ${CMAKE_BINARY_DIR}/libpinyin.pc
+ DESTINATION
+ ${DIR_LIBRARY}/pkgconfig
+)
+
+######## Definition
+
+if (CMAKE_BUILD_TYPE MATCHES Debug)
+ add_definitions(
+ -O0
+ -g3
+ )
+endif (CMAKE_BUILD_TYPE MATCHES Debug)
+
+include_directories(
+ ${GLIB2_INCLUDE_DIR}
+ ${PROJECT_SOURCE_DIR}/src
+ ${PROJECT_SOURCE_DIR}/src/include
+ ${PROJECT_SOURCE_DIR}/src/storage
+ ${PROJECT_SOURCE_DIR}/src/lookup
+ ${PROJECT_SOURCE_DIR}/utils
+ ${PROJECT_SOURCE_DIR}/tests
+)
+
+######## Subdirectories
+
+add_subdirectory(src)
+add_subdirectory(tests)
+add_subdirectory(utils)
+add_subdirectory(data)
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..f773af9
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,37 @@
+version 0.6.91
+* support ucs4 characters;
+* support guessing a sentence from a prefix;
+* initial support for fuzzy pinyin segmentation.
+
+version 0.6.0
+* the first official release of 0.6.x.
+
+version 0.5.92
+* fixes new parsers and chewing large table;
+* improves pinyin_save.
+
+version 0.5.91
+* some code refactoring and simplification;
+* fixes the self-learning workaround.
+
+version 0.5.0
+* the first official release of 0.5.x.
+
+version 0.4.93
+* fixes some bugs in new parsers.
+
+version 0.4.92
+* enable parallel make.
+
+version 0.4.91
+* New parsers for full pinyin/double pinyin/chewing.
+ * libpinyin now fully supports all pinyin auto corrections in
+ibus-pinyin;
+ * libpinyin now better supports an/ang, en/eng, in/ing fuzzy
+pinyin match.
+
+version 0.3.0
+* the first official release of 0.3.x.
+
+version 0.2.99
+* import from pinyin.
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..aac12f0
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,30 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+EXTRA_DIST = COPYING
+
+AUTOMAKE_OPTIONS = gnu
+SUBDIRS = src tests utils data doc
+
+MAINTAINERCLEANFILES = Makefile.in
+
+CLEANFILES = *.bak
+
+ACLOCAL = aclocal -I .
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libpinyin.pc
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/NEWS
diff --git a/README b/README
new file mode 100644
index 0000000..6d2eea3
--- /dev/null
+++ b/README
@@ -0,0 +1,4 @@
+libpinyin
+Library to deal with pinyin.
+
+The libpinyin project aims to provide the core algorithms for intelligent sentence-based Chinese pinyin input methods.
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000..0eb6f85
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+# Run this to generate all the initial makefiles, etc.
+
+srcdir=`dirname $0`
+test -z "$srcdir" && srcdir=.
+
+PKG_NAME="libpinyin"
+
+(test -f $srcdir/configure.ac \
+ && test -f $srcdir/README ) || {
+ echo -n "**Error**: Directory "\`$srcdir\'" does not look like the"
+ echo " top-level $PKG_NAME directory"
+ exit 1
+}
+
+which gnome-autogen.sh || {
+ echo "You need to install gnome-common from the GNOME CVS"
+ exit 1
+}
+
+(test -f $srcdir/ChangeLog) || {
+ touch $srcdir/ChangeLog
+}
+
+CFLAGS=${CFLAGS-"-Wall -Werror"}
+
+ACLOCAL_FLAGS="$ACLOCAL_FLAGS"
+REQUIRED_AUTOMAKE_VERSION=1.8
+
+. gnome-autogen.sh "$@"
diff --git a/cmake/FindBerkeleyDB.cmake b/cmake/FindBerkeleyDB.cmake
new file mode 100644
index 0000000..749f166
--- /dev/null
+++ b/cmake/FindBerkeleyDB.cmake
@@ -0,0 +1,25 @@
+# - Try to find Berkeley DB
+# Once done this will define
+#
+# BERKELEY_DB_FOUND - system has Berkeley DB
+# BERKELEY_DB_INCLUDE_DIR - the Berkeley DB include directory
+# BERKELEY_DB_LIBRARIES - Link these to use Berkeley DB
+# BERKELEY_DB_DEFINITIONS - Compiler switches required for using Berkeley DB
+
+# Copyright (c) 2006, Alexander Dymo, <adymo@kdevelop.org>
+#
+# Redistribution and use is allowed according to the terms of the BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+
+FIND_PATH(BERKELEY_DB_INCLUDE_DIR db.h
+ /usr/include/db4
+ /usr/local/include/db4
+)
+
+FIND_LIBRARY(BERKELEY_DB_LIBRARIES NAMES db )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Berkeley "Could not find Berkeley DB >= 4.1" BERKELEY_DB_INCLUDE_DIR BERKELEY_DB_LIBRARIES)
+# show the BERKELEY_DB_INCLUDE_DIR and BERKELEY_DB_LIBRARIES variables only in the advanced view
+MARK_AS_ADVANCED(BERKELEY_DB_INCLUDE_DIR BERKELEY_DB_LIBRARIES )
+
diff --git a/cmake/FindGLIB2.cmake b/cmake/FindGLIB2.cmake
new file mode 100644
index 0000000..8c55991
--- /dev/null
+++ b/cmake/FindGLIB2.cmake
@@ -0,0 +1,53 @@
+# - Try to find the GLIB2 libraries
+# Once done this will define
+#
+# GLIB2_FOUND - system has glib2
+# GLIB2_INCLUDE_DIR - the glib2 include directory
+# GLIB2_LIBRARIES - glib2 library
+
+# Copyright (c) 2008 Laurent Montel, <montel@kde.org>
+#
+# Redistribution and use is allowed according to the terms of the BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+
+
+if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES)
+ # Already in cache, be silent
+ set(GLIB2_FIND_QUIETLY TRUE)
+endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES)
+
+find_package(PkgConfig)
+pkg_check_modules(PC_LibGLIB2 QUIET glib-2.0)
+
+find_path(GLIB2_MAIN_INCLUDE_DIR
+ NAMES glib.h
+ HINTS ${PC_LibGLIB2_INCLUDEDIR}
+ PATH_SUFFIXES glib-2.0)
+
+find_library(GLIB2_LIBRARY
+ NAMES glib-2.0
+ HINTS ${PC_LibGLIB2_LIBDIR}
+)
+
+set(GLIB2_LIBRARIES ${GLIB2_LIBRARY})
+
+# search the glibconfig.h include dir under the same root where the library is found
+get_filename_component(glib2LibDir "${GLIB2_LIBRARIES}" PATH)
+
+find_path(GLIB2_INTERNAL_INCLUDE_DIR glibconfig.h
+ PATH_SUFFIXES glib-2.0/include
+ HINTS ${PC_LibGLIB2_INCLUDEDIR} "${glib2LibDir}" ${CMAKE_SYSTEM_LIBRARY_PATH})
+
+set(GLIB2_INCLUDE_DIR "${GLIB2_MAIN_INCLUDE_DIR}")
+
+# not sure if this include dir is optional or required
+# for now it is optional
+if(GLIB2_INTERNAL_INCLUDE_DIR)
+ set(GLIB2_INCLUDE_DIR ${GLIB2_INCLUDE_DIR} "${GLIB2_INTERNAL_INCLUDE_DIR}")
+endif(GLIB2_INTERNAL_INCLUDE_DIR)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GLIB2 DEFAULT_MSG GLIB2_LIBRARIES GLIB2_MAIN_INCLUDE_DIR)
+
+mark_as_advanced(GLIB2_INCLUDE_DIR GLIB2_LIBRARIES)
+
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..34fe28a
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,105 @@
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+
+m4_define([libpinyin_major_version], [0])
+m4_define([libpinyin_minor_version], [9])
+m4_define([libpinyin_micro_version], [93])
+m4_define([libpinyin_abi_current], [4])
+m4_define([libpinyin_abi_revision], [0])
+
+m4_define([libpinyin_version],
+ [libpinyin_major_version.libpinyin_minor_version.libpinyin_micro_version])
+m4_define([libpinyin_binary_version],
+ [libpinyin_abi_current.libpinyin_abi_revision])
+
+AC_PREREQ(2.60)
+AC_INIT([libpinyin], [libpinyin_version], [https://github.com/libpinyin/libpinyin/issues/new])
+AM_INIT_AUTOMAKE
+AC_CONFIG_SRCDIR([config.h.in])
+AC_CONFIG_HEADER([config.h])
+m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])
+
+# Define a string for binary compatibility
+m4_define([lt_current], [libpinyin_abi_current])
+m4_define([lt_revision], [libpinyin_abi_revision])
+LT_VERSION_INFO="lt_current:lt_revision"
+AC_SUBST(LT_VERSION_INFO)
+
+LIBPINYIN_BINARY_VERSION="libpinyin_binary_version"
+AC_SUBST(LIBPINYIN_BINARY_VERSION)
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CC
+AC_PROG_CPP
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AC_PROG_MAKE_SET
+
+AC_GNU_SOURCE
+
+# Init libtool
+AC_PROG_LIBTOOL
+AC_SUBST(LIBTOOL_DEPS)
+
+# libtool option to control which symbols are exported
+# right now, symbols starting with _ are not exported
+LIBTOOL_EXPORT_OPTIONS='-export-symbols-regex "^[[^_]].*"'
+AC_SUBST(LIBTOOL_EXPORT_OPTIONS)
+
+# Checks for libraries.
+PKG_CHECK_MODULES(GLIB2, [glib-2.0 >= 2.4.0])
+
+# Checks for header files.
+AC_HEADER_STDC
+AC_CHECK_HEADERS([locale.h stdlib.h string.h sys/time.h unistd.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_HEADER_STDBOOL
+AC_C_CONST
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_HEADER_TIME
+
+# Checks for library functions.
+AC_FUNC_MALLOC
+AC_FUNC_MEMCMP
+AC_FUNC_REALLOC
+AC_FUNC_STAT
+AC_FUNC_MMAP
+AC_CHECK_FUNCS([gettimeofday memmove memset setlocale])
+
+AC_CHECK_HEADERS([libintl.h string.h])
+
+AC_CHECK_HEADER([db.h], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4]))
+
+AC_SEARCH_LIBS([db_create], [db], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4]))
+
+
+AC_CONFIG_FILES([libpinyin.pc
+ libpinyin.spec
+ Makefile
+ doc/Makefile
+ data/Makefile
+ src/Makefile
+ src/include/Makefile
+ src/storage/Makefile
+ src/lookup/Makefile
+ tests/Makefile
+ tests/include/Makefile
+ tests/storage/Makefile
+ tests/lookup/Makefile
+ utils/Makefile
+ utils/storage/Makefile
+ utils/segment/Makefile
+ utils/training/Makefile
+])
+
+AC_OUTPUT
+
+AC_MSG_RESULT([
+Build options:
+ Version $VERSION
+ Install prefix $prefix
+])
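For orientation, a from-source build using the autotools files added above (autogen.sh, configure.ac, Makefile.am) would conventionally run as sketched below. This sequence is illustrative rather than part of the commit, and it assumes gnome-common, autoconf/automake/libtool, glib2 and Berkeley DB development files are installed, since autogen.sh and configure.ac check for them:

    ./autogen.sh --prefix=/usr
    make
    make install

The top-level CMakeLists.txt added in this commit provides an equivalent CMake-based build of the same sources.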
diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt
new file mode 100644
index 0000000..7301279
--- /dev/null
+++ b/data/CMakeLists.txt
@@ -0,0 +1,95 @@
+set(
+ BINARY_MODEL_DATA
+ gb_char.bin
+ gbk_char.bin
+ phrase_index.bin
+ pinyin_index.bin
+ bigram.db
+)
+
+set(
+ BINARY_MODEL_DATA_FILES
+ ${CMAKE_BINARY_DIR}/data/gb_char.bin
+ ${CMAKE_BINARY_DIR}/data/gbk_char.bin
+ ${CMAKE_BINARY_DIR}/data/phrase_index.bin
+ ${CMAKE_BINARY_DIR}/data/pinyin_index.bin
+ ${CMAKE_BINARY_DIR}/data/bigram.db
+)
+
+set(
+ gen_binary_files_BIN
+ ${CMAKE_BINARY_DIR}/utils/storage/gen_binary_files
+)
+
+set(
+ import_interpolation_BIN
+ ${CMAKE_BINARY_DIR}/utils/storage/import_interpolation
+)
+
+set(
+ gen_unigram_BIN
+ ${CMAKE_BINARY_DIR}/utils/training/gen_unigram
+)
+
+add_custom_target(
+ data
+ ALL
+ DEPENDS
+ ${BINARY_MODEL_DATA}
+)
+
+add_custom_command(
+ OUTPUT
+ ${CMAKE_SOURCE_DIR}/data/gb_char.table
+ ${CMAKE_SOURCE_DIR}/data/gbk_char.table
+ ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+ COMMENT
+ "Downloading textual model data..."
+ COMMAND
+ wget http://downloads.sourceforge.net/libpinyin/models/model5.text.tar.gz
+ COMMAND
+ tar xvf model5.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data
+)
+
+add_custom_command(
+ OUTPUT
+ gb_char.bin
+ gbk_char.bin
+ phrase_index.bin
+ pinyin_index.bin
+ COMMENT
+ "Building binary model data..."
+ COMMAND
+ ${gen_binary_files_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data
+ DEPENDS
+ gen_binary_files
+ ${CMAKE_SOURCE_DIR}/data/gb_char.table
+ ${CMAKE_SOURCE_DIR}/data/gbk_char.table
+)
+
+add_custom_command(
+ OUTPUT
+ bigram.db
+ COMMENT
+ "Building binary bigram data..."
+ COMMAND
+ ${import_interpolation_BIN} < ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+ COMMAND
+ ${gen_unigram_BIN}
+ DEPENDS
+ import_interpolation
+ ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+)
+
+install(
+ FILES
+ ${BINARY_MODEL_DATA_FILES}
+ DESTINATION
+ ${DIR_SHARE_LIBPINYIN}/data
+)
+
+set_directory_properties(
+ PROPERTIES
+ ADDITIONAL_MAKE_CLEAN_FILES
+ ${BINARY_MODEL_DATA_FILES}
+)
diff --git a/data/Makefile.am b/data/Makefile.am
new file mode 100644
index 0000000..c75fd95
--- /dev/null
+++ b/data/Makefile.am
@@ -0,0 +1,67 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2011 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+tablefiles = gb_char.table gbk_char.table \
+ merged.table \
+ art.table culture.table economy.table \
+ geology.table history.table life.table \
+ nature.table scitech.table \
+ society.table sport.table
+
+binfiles = ${tablefiles:.table=.bin}
+
+
+textual_model_data = interpolation2.text \
+ $(tablefiles)
+
+
+binary_model_data = phrase_index.bin pinyin_index.bin \
+ bigram.db \
+ $(binfiles)
+
+
+MAINTAINERCLEANFILES = Makefile.in
+
+EXTRA_DIST = $(textual_model_data) \
+ table.conf
+
+libpinyin_db_DATA = $(binary_model_data) \
+ table.conf
+
+libpinyin_dbdir = $(libdir)/libpinyin/data
+
+CLEANFILES = $(binary_model_data)
+
+interpolation2.text:
+ wget http://downloads.sourceforge.net/libpinyin/models/model6.text.tar.gz
+ tar xvf model6.text.tar.gz -C $(top_srcdir)/data
+
+
+$(tablefiles): interpolation2.text
+
+bigram.db: $(textual_model_data)
+ $(RM) $(binary_model_data)
+ ../utils/storage/gen_binary_files --table-dir $(top_srcdir)/data
+ ../utils/storage/import_interpolation --table-dir $(top_srcdir)/data < $(top_srcdir)/data/interpolation2.text
+ ../utils/training/gen_unigram --table-dir $(top_srcdir)/data
+
+phrase_index.bin pinyin_index.bin $(binfiles): bigram.db
+
+modify:
+ git reset --hard
+ sed -i -r -e "s'lambda parameter:0\\.[0-9]{3,6}'lambda parameter:$(LAMBDA_PARAMETER)'" table.conf
diff --git a/data/table.conf b/data/table.conf
new file mode 100644
index 0000000..096907c
--- /dev/null
+++ b/data/table.conf
@@ -0,0 +1,17 @@
+binary format version:3
+model data version:6
+lambda parameter:0.276607
+
+4 art.table art.bin art.dbin DICTIONARY
+5 culture.table culture.bin culture.dbin DICTIONARY
+6 economy.table economy.bin economy.dbin DICTIONARY
+7 geology.table geology.bin geology.dbin DICTIONARY
+8 history.table history.bin history.dbin DICTIONARY
+
+9 life.table life.bin life.dbin DICTIONARY
+10 nature.table nature.bin nature.dbin DICTIONARY
+11 scitech.table scitech.bin scitech.dbin DICTIONARY
+12 society.table society.bin society.dbin DICTIONARY
+13 sport.table sport.bin sport.dbin DICTIONARY
+
+14 NULL NULL network.bin USER_FILE
\ No newline at end of file
diff --git a/doc/Makefile.am b/doc/Makefile.am
new file mode 100644
index 0000000..358100e
--- /dev/null
+++ b/doc/Makefile.am
@@ -0,0 +1,24 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+man_MANS = libpinyin.1 \
+ gen_binary_files.1 \
+ import_interpolation.1 \
+ gen_unigram.1
+
+EXTRA_DIST = $(man_MANS)
diff --git a/doc/gen_binary_files.1 b/doc/gen_binary_files.1
new file mode 100644
index 0000000..394a953
--- /dev/null
+++ b/doc/gen_binary_files.1
@@ -0,0 +1 @@
+.so man1/libpinyin.1
diff --git a/doc/gen_unigram.1 b/doc/gen_unigram.1
new file mode 100644
index 0000000..394a953
--- /dev/null
+++ b/doc/gen_unigram.1
@@ -0,0 +1 @@
+.so man1/libpinyin.1
diff --git a/doc/import_interpolation.1 b/doc/import_interpolation.1
new file mode 100644
index 0000000..394a953
--- /dev/null
+++ b/doc/import_interpolation.1
@@ -0,0 +1 @@
+.so man1/libpinyin.1
diff --git a/doc/libpinyin.1 b/doc/libpinyin.1
new file mode 100644
index 0000000..419ef90
--- /dev/null
+++ b/doc/libpinyin.1
@@ -0,0 +1,38 @@
+.TH LIBPINYIN "1" "Feb 2012" "libpinyin" "User Commands"
+
+.SH NAME
+libpinyin \- Library to deal with pinyin
+
+.SH DESCRIPTION
+The libpinyin project aims to provide the core algorithms for intelligent sentence-based Chinese pinyin input methods.
+
+.SH TOOLS
+gen_binary_files \- generate the initial binary pinyin model files
+import_interpolation \- import libpinyin textual format model data
+gen_unigram \- increase the unigram frequency for all phrases
+
+.SH USAGE
+.HP
+gen_binary_files --table-dir <DIRNAME>
+.RS
+.HP
+.B --table-dir
+Read textual format files from the <DIRNAME> directory.
+.RE
+.HP
+import_interpolation \< <MODELFILE>
+.HP
+gen_unigram
+
+.SH EXAMPLE
+Download model.text.tar.gz, extract all files into a folder, then run the commands below to generate the binary model data.
+
+.RS
+rm gb_char.bin gbk_char.bin phrase_index.bin pinyin_index.bin bigram.db
+
+gen_binary_files --table-dir ../data
+
+import_interpolation < ../data/interpolation.text
+
+gen_unigram
+.RE
diff --git a/libpinyin.pc.in b/libpinyin.pc.in
new file mode 100644
index 0000000..ea08282
--- /dev/null
+++ b/libpinyin.pc.in
@@ -0,0 +1,15 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+pkgdatadir=@libdir@/libpinyin
+
+libpinyinincludedir=${includedir}/libpinyin-@VERSION@
+libpinyin_binary_version=@LIBPINYIN_BINARY_VERSION@
+
+Name: libpinyin
+Description: Library to deal with pinyin
+Version: @VERSION@
+Requires: glib-2.0
+Libs: -L${libdir} -lpinyin
+Cflags: -I${libpinyinincludedir}
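Once the generated libpinyin.pc is installed under $(libdir)/pkgconfig, a client program would normally obtain compile and link flags through pkg-config. A minimal sketch (demo.c is a hypothetical client source, not part of this commit):

    cc demo.c $(pkg-config --cflags --libs libpinyin) -o demo

Because the Cflags entry points at ${includedir}/libpinyin-@VERSION@, clients can include the public headers without hard-coding a versioned path.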
diff --git a/libpinyin.spec.in b/libpinyin.spec.in
new file mode 100644
index 0000000..00be0d0
--- /dev/null
+++ b/libpinyin.spec.in
@@ -0,0 +1,121 @@
+Name: libpinyin
+Version: @VERSION@
+Release: 1%{?dist}
+Summary: Library to deal with pinyin
+
+License: GPLv2+
+URL: https://github.com/libpinyin/libpinyin
+Source0: http://downloads.sourceforge.net/libpinyin/libpinyin/%{name}-%{version}.tar.gz
+
+BuildRequires: db4-devel, glib2-devel
+Requires: %{name}-data%{?_isa} = %{version}-%{release}
+
+%description
+The libpinyin project aims to provide the core algorithms
+for intelligent sentence-based Chinese pinyin input methods.
+
+
+%package devel
+Summary: Development files for %{name}
+Requires: %{name} = %{version}-%{release}
+
+%description devel
+The %{name}-devel package contains libraries and header files for
+developing applications that use %{name}.
+
+
+%package data
+Summary: Data files for %{name}
+Requires: %{name} = %{version}-%{release}
+
+%description data
+The %{name}-data package contains data files.
+
+
+%package tools
+Summary: Tools for %{name}
+Requires: %{name} = %{version}-%{release}
+
+%description tools
+The %{name}-tools package contains tools.
+
+
+%prep
+%setup -q
+
+
+%build
+%configure --disable-static
+make %{?_smp_mflags}
+
+%install
+make install DESTDIR=$RPM_BUILD_ROOT
+find $RPM_BUILD_ROOT -name '*.la' -exec rm -f {} ';'
+
+
+%post -p /sbin/ldconfig
+
+%postun -p /sbin/ldconfig
+
+
+%files
+%doc AUTHORS COPYING README
+%{_libdir}/*.so.*
+%dir %{_datadir}/libpinyin
+
+%files devel
+%doc
+%dir %{_includedir}/libpinyin-@VERSION@
+%{_includedir}/libpinyin-@VERSION@/*
+%{_libdir}/*.so
+%{_libdir}/pkgconfig/libpinyin.pc
+
+%files data
+%doc
+%{_datadir}/libpinyin/data
+
+%files tools
+%{_bindir}/gen_binary_files
+%{_bindir}/import_interpolation
+%{_bindir}/gen_unigram
+%{_mandir}/man1/*.1.*
+
+%changelog
+* Thu May 24 2012 Peng Wu <pwu@redhat.com> - 0.6.91-1
+- Update to 0.6.91
+
+* Mon Feb 13 2012 Peng Wu <pwu@redhat.com> - 0.5.91-1
+- Update to 0.5.91
+
+* Wed Jan 18 2012 Peng Wu <pwu@redhat.com> - 0.5.0-1
+- Update to 0.5.0
+
+* Fri Jan 13 2012 Peng Wu <pwu@redhat.com> - 0.4.93-1
+- Update to 0.4.93
+
+* Mon Jan 09 2012 Peng Wu <pwu@redhat.com> - 0.4.92-2
+- Split tools sub package
+
+* Thu Dec 29 2011 Peng Wu <pwu@redhat.com> - 0.4.92-1
+- Update to 0.4.92
+
+* Tue Dec 27 2011 Peng Wu <pwu@redhat.com> - 0.4.91-1
+- Update to 0.4.91
+
+* Fri Nov 18 2011 Peng Wu <pwu@redhat.com> - 0.3.0-1
+- Update to 0.3.0
+
+* Thu Nov 03 2011 Peng Wu <pwu@redhat.com> - 0.2.99.3-1
+- Update to 0.2.99.3
+
+* Tue Oct 11 2011 Peng Wu <pwu@redhat.com> - 0.2.99.2-1
+- Update to 0.2.99.2
+
+* Wed Sep 28 2011 Peng Wu <pwu@redhat.com> - 0.2.99.1-1
+- Update to 0.2.99.1
+
+* Thu Sep 08 2011 Peng Wu <pwu@redhat.com> - 0.2.99-2
+- Split data sub package
+
+* Wed Aug 31 2011 Peng Wu <alexepico@gmail.com> - 0.2.99-1
+- Initial version
diff --git a/scripts/Makefile.data b/scripts/Makefile.data
new file mode 100644
index 0000000..7929e97
--- /dev/null
+++ b/scripts/Makefile.data
@@ -0,0 +1,15 @@
+all: pinyins.txt
+
+
+pinyins.txt:
+ python3 genpinyins.py
+
+
+update-header:
+ python3 genpinyinheader.py > ../src/storage/pinyin_parser_table.h
+ python3 gendoublepinyinheader.py > ../src/storage/double_pinyin_table.h
+ python3 genbopomofoheader.py > ../src/storage/chewing_table.h
+ python3 genchewingkey.py > ../src/storage/chewing_enum.h
+
+
+.PHONY: pinyins.txt
diff --git a/scripts/bopomofo.py b/scripts/bopomofo.py
new file mode 100644
index 0000000..91a8744
--- /dev/null
+++ b/scripts/bopomofo.py
@@ -0,0 +1,530 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (c) 2010 BYVoid <byvoid1@gmail.com>
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+BOPOMOFO_PINYIN_MAP = {
+ "ㄅ" : "b",
+ "ㄅㄚ" : "ba",
+ "ㄅㄛ" : "bo",
+ "ㄅㄞ" : "bai",
+ "ㄅㄟ" : "bei",
+ "ㄅㄠ" : "bao",
+ "ㄅㄢ" : "ban",
+ "ㄅㄣ" : "ben",
+ "ㄅㄤ" : "bang",
+ "ㄅㄥ" : "beng",
+ "ㄅㄧ" : "bi",
+ "ㄅㄧㄝ" : "bie",
+ "ㄅㄧㄠ" : "biao",
+ "ㄅㄧㄢ" : "bian",
+ "ㄅㄧㄣ" : "bin",
+ "ㄅㄧㄥ" : "bing",
+ "ㄅㄨ" : "bu",
+ "ㄆ" : "p",
+ "ㄆㄚ" : "pa",
+ "ㄆㄛ" : "po",
+ "ㄆㄞ" : "pai",
+ "ㄆㄟ" : "pei",
+ "ㄆㄠ" : "pao",
+ "ㄆㄡ" : "pou",
+ "ㄆㄢ" : "pan",
+ "ㄆㄣ" : "pen",
+ "ㄆㄤ" : "pang",
+ "ㄆㄥ" : "peng",
+ "ㄆㄧ" : "pi",
+ "ㄆㄧㄝ" : "pie",
+ "ㄆㄧㄠ" : "piao",
+ "ㄆㄧㄢ" : "pian",
+ "ㄆㄧㄣ" : "pin",
+ "ㄆㄧㄥ" : "ping",
+ "ㄆㄨ" : "pu",
+ "ㄇ" : "m",
+ "ㄇㄚ" : "ma",
+ "ㄇㄛ" : "mo",
+ "ㄇㄜ" : "me",
+ "ㄇㄞ" : "mai",
+ "ㄇㄟ" : "mei",
+ "ㄇㄠ" : "mao",
+ "ㄇㄡ" : "mou",
+ "ㄇㄢ" : "man",
+ "ㄇㄣ" : "men",
+ "ㄇㄤ" : "mang",
+ "ㄇㄥ" : "meng",
+ "ㄇㄧ" : "mi",
+ "ㄇㄧㄝ" : "mie",
+ "ㄇㄧㄠ" : "miao",
+ "ㄇㄧㄡ" : "miu",
+ "ㄇㄧㄢ" : "mian",
+ "ㄇㄧㄣ" : "min",
+ "ㄇㄧㄥ" : "ming",
+ "ㄇㄨ" : "mu",
+ "ㄈ" : "f",
+ "ㄈㄚ" : "fa",
+ "ㄈㄛ" : "fo",
+ "ㄈㄜ" : "fe",
+ "ㄈㄟ" : "fei",
+ "ㄈㄡ" : "fou",
+ "ㄈㄢ" : "fan",
+ "ㄈㄣ" : "fen",
+ "ㄈㄤ" : "fang",
+ "ㄈㄥ" : "feng",
+ "ㄈㄨ" : "fu",
+ "ㄉ" : "d",
+ "ㄉㄚ" : "da",
+ "ㄉㄜ" : "de",
+ "ㄉㄞ" : "dai",
+ "ㄉㄟ" : "dei",
+ "ㄉㄠ" : "dao",
+ "ㄉㄡ" : "dou",
+ "ㄉㄢ" : "dan",
+ "ㄉㄣ" : "den",
+ "ㄉㄤ" : "dang",
+ "ㄉㄥ" : "deng",
+ "ㄉㄧ" : "di",
+ "ㄉㄧㄚ" : "dia",
+ "ㄉㄧㄝ" : "die",
+ "ㄉㄧㄠ" : "diao",
+ "ㄉㄧㄡ" : "diu",
+ "ㄉㄧㄢ" : "dian",
+ "ㄉㄧㄣ" : "din",
+ "ㄉㄧㄥ" : "ding",
+ "ㄉㄨ" : "du",
+ "ㄉㄨㄛ" : "duo",
+ "ㄉㄨㄟ" : "dui",
+ "ㄉㄨㄢ" : "duan",
+ "ㄉㄨㄣ" : "dun",
+ "ㄉㄨㄥ" : "dong",
+ "ㄊ" : "t",
+ "ㄊㄚ" : "ta",
+ "ㄊㄜ" : "te",
+ "ㄊㄞ" : "tai",
+ "ㄊㄠ" : "tao",
+ "ㄊㄡ" : "tou",
+ "ㄊㄢ" : "tan",
+ "ㄊㄤ" : "tang",
+ "ㄊㄥ" : "teng",
+ "ㄊㄧ" : "ti",
+ "ㄊㄧㄝ" : "tie",
+ "ㄊㄧㄠ" : "tiao",
+ "ㄊㄧㄢ" : "tian",
+ "ㄊㄧㄥ" : "ting",
+ "ㄊㄨ" : "tu",
+ "ㄊㄨㄛ" : "tuo",
+ "ㄊㄨㄟ" : "tui",
+ "ㄊㄨㄢ" : "tuan",
+ "ㄊㄨㄣ" : "tun",
+ "ㄊㄨㄥ" : "tong",
+ "ㄋ" : "n",
+ "ㄋㄚ" : "na",
+ "ㄋㄜ" : "ne",
+ "ㄋㄞ" : "nai",
+ "ㄋㄟ" : "nei",
+ "ㄋㄠ" : "nao",
+ "ㄋㄡ" : "nou",
+ "ㄋㄢ" : "nan",
+ "ㄋㄣ" : "nen",
+ "ㄋㄤ" : "nang",
+ "ㄋㄥ" : "neng",
+ "ㄋㄧ" : "ni",
+ "ㄋㄧㄚ" : "nia",
+ "ㄋㄧㄝ" : "nie",
+ "ㄋㄧㄠ" : "niao",
+ "ㄋㄧㄡ" : "niu",
+ "ㄋㄧㄢ" : "nian",
+ "ㄋㄧㄣ" : "nin",
+ "ㄋㄧㄤ" : "niang",
+ "ㄋㄧㄥ" : "ning",
+ "ㄋㄨ" : "nu",
+ "ㄋㄨㄛ" : "nuo",
+ "ㄋㄨㄢ" : "nuan",
+ "ㄋㄨㄣ" : "nun",
+ "ㄋㄨㄥ" : "nong",
+ "ㄋㄩ" : "nv",
+ "ㄋㄩㄝ" : "nve",
+ "ㄌ" : "l",
+ "ㄌㄚ" : "la",
+ "ㄌㄛ" : "lo",
+ "ㄌㄜ" : "le",
+ "ㄌㄞ" : "lai",
+ "ㄌㄟ" : "lei",
+ "ㄌㄠ" : "lao",
+ "ㄌㄡ" : "lou",
+ "ㄌㄢ" : "lan",
+ "ㄌㄣ" : "len",
+ "ㄌㄤ" : "lang",
+ "ㄌㄥ" : "leng",
+ "ㄌㄧ" : "li",
+ "ㄌㄧㄚ" : "lia",
+ "ㄌㄧㄝ" : "lie",
+ "ㄌㄧㄠ" : "liao",
+ "ㄌㄧㄡ" : "liu",
+ "ㄌㄧㄢ" : "lian",
+ "ㄌㄧㄣ" : "lin",
+ "ㄌㄧㄤ" : "liang",
+ "ㄌㄧㄥ" : "ling",
+ "ㄌㄨ" : "lu",
+ "ㄌㄨㄛ" : "luo",
+ "ㄌㄨㄢ" : "luan",
+ "ㄌㄨㄣ" : "lun",
+ "ㄌㄨㄥ" : "long",
+ "ㄌㄩ" : "lv",
+ "ㄌㄩㄝ" : "lve",
+ "ㄍ" : "g",
+ "ㄍㄚ" : "ga",
+ "ㄍㄜ" : "ge",
+ "ㄍㄞ" : "gai",
+ "ㄍㄟ" : "gei",
+ "ㄍㄠ" : "gao",
+ "ㄍㄡ" : "gou",
+ "ㄍㄢ" : "gan",
+ "ㄍㄣ" : "gen",
+ "ㄍㄤ" : "gang",
+ "ㄍㄥ" : "geng",
+ "ㄍㄨ" : "gu",
+ "ㄍㄨㄚ" : "gua",
+ "ㄍㄨㄛ" : "guo",
+ "ㄍㄨㄞ" : "guai",
+ "ㄍㄨㄟ" : "gui",
+ "ㄍㄨㄢ" : "guan",
+ "ㄍㄨㄣ" : "gun",
+ "ㄍㄨㄤ" : "guang",
+ "ㄍㄨㄥ" : "gong",
+ "ㄎ" : "k",
+ "ㄎㄚ" : "ka",
+ "ㄎㄜ" : "ke",
+ "ㄎㄞ" : "kai",
+ "ㄎㄟ" : "kei",
+ "ㄎㄠ" : "kao",
+ "ㄎㄡ" : "kou",
+ "ㄎㄢ" : "kan",
+ "ㄎㄣ" : "ken",
+ "ㄎㄤ" : "kang",
+ "ㄎㄥ" : "keng",
+ "ㄎㄨ" : "ku",
+ "ㄎㄨㄚ" : "kua",
+ "ㄎㄨㄛ" : "kuo",
+ "ㄎㄨㄞ" : "kuai",
+ "ㄎㄨㄟ" : "kui",
+ "ㄎㄨㄢ" : "kuan",
+ "ㄎㄨㄣ" : "kun",
+ "ㄎㄨㄤ" : "kuang",
+ "ㄎㄨㄥ" : "kong",
+ "ㄏ" : "h",
+ "ㄏㄚ" : "ha",
+ "ㄏㄜ" : "he",
+ "ㄏㄞ" : "hai",
+ "ㄏㄟ" : "hei",
+ "ㄏㄠ" : "hao",
+ "ㄏㄡ" : "hou",
+ "ㄏㄢ" : "han",
+ "ㄏㄣ" : "hen",
+ "ㄏㄤ" : "hang",
+ "ㄏㄥ" : "heng",
+ "ㄏㄨ" : "hu",
+ "ㄏㄨㄚ" : "hua",
+ "ㄏㄨㄛ" : "huo",
+ "ㄏㄨㄞ" : "huai",
+ "ㄏㄨㄟ" : "hui",
+ "ㄏㄨㄢ" : "huan",
+ "ㄏㄨㄣ" : "hun",
+ "ㄏㄨㄤ" : "huang",
+ "ㄏㄨㄥ" : "hong",
+ "ㄐ" : "j",
+ "ㄐㄧ" : "ji",
+ "ㄐㄧㄚ" : "jia",
+ "ㄐㄧㄝ" : "jie",
+ "ㄐㄧㄠ" : "jiao",
+ "ㄐㄧㄡ" : "jiu",
+ "ㄐㄧㄢ" : "jian",
+ "ㄐㄧㄣ" : "jin",
+ "ㄐㄧㄤ" : "jiang",
+ "ㄐㄧㄥ" : "jing",
+ "ㄐㄩ" : "ju",
+ "ㄐㄩㄝ" : "jue",
+ "ㄐㄩㄢ" : "juan",
+ "ㄐㄩㄣ" : "jun",
+ "ㄐㄩㄥ" : "jiong",
+ "ㄑ" : "q",
+ "ㄑㄧ" : "qi",
+ "ㄑㄧㄚ" : "qia",
+ "ㄑㄧㄝ" : "qie",
+ "ㄑㄧㄠ" : "qiao",
+ "ㄑㄧㄡ" : "qiu",
+ "ㄑㄧㄢ" : "qian",
+ "ㄑㄧㄣ" : "qin",
+ "ㄑㄧㄤ" : "qiang",
+ "ㄑㄧㄥ" : "qing",
+ "ㄑㄩ" : "qu",
+ "ㄑㄩㄝ" : "que",
+ "ㄑㄩㄢ" : "quan",
+ "ㄑㄩㄣ" : "qun",
+ "ㄑㄩㄥ" : "qiong",
+ "ㄒ" : "x",
+ "ㄒㄧ" : "xi",
+ "ㄒㄧㄚ" : "xia",
+ "ㄒㄧㄝ" : "xie",
+ "ㄒㄧㄠ" : "xiao",
+ "ㄒㄧㄡ" : "xiu",
+ "ㄒㄧㄢ" : "xian",
+ "ㄒㄧㄣ" : "xin",
+ "ㄒㄧㄤ" : "xiang",
+ "ㄒㄧㄥ" : "xing",
+ "ㄒㄩ" : "xu",
+ "ㄒㄩㄝ" : "xue",
+ "ㄒㄩㄢ" : "xuan",
+ "ㄒㄩㄣ" : "xun",
+ "ㄒㄩㄥ" : "xiong",
+ "ㄓ" : "zhi",
+ "ㄓㄚ" : "zha",
+ "ㄓㄜ" : "zhe",
+ "ㄓㄞ" : "zhai",
+ "ㄓㄟ" : "zhei",
+ "ㄓㄠ" : "zhao",
+ "ㄓㄡ" : "zhou",
+ "ㄓㄢ" : "zhan",
+ "ㄓㄣ" : "zhen",
+ "ㄓㄤ" : "zhang",
+ "ㄓㄥ" : "zheng",
+ "ㄓㄨ" : "zhu",
+ "ㄓㄨㄚ" : "zhua",
+ "ㄓㄨㄛ" : "zhuo",
+ "ㄓㄨㄞ" : "zhuai",
+ "ㄓㄨㄟ" : "zhui",
+ "ㄓㄨㄢ" : "zhuan",
+ "ㄓㄨㄣ" : "zhun",
+ "ㄓㄨㄤ" : "zhuang",
+ "ㄓㄨㄥ" : "zhong",
+ "ㄔ" : "chi",
+ "ㄔㄚ" : "cha",
+ "ㄔㄜ" : "che",
+ "ㄔㄞ" : "chai",
+ "ㄔㄠ" : "chao",
+ "ㄔㄡ" : "chou",
+ "ㄔㄢ" : "chan",
+ "ㄔㄣ" : "chen",
+ "ㄔㄤ" : "chang",
+ "ㄔㄥ" : "cheng",
+ "ㄔㄨ" : "chu",
+ "ㄔㄨㄚ" : "chua",
+ "ㄔㄨㄛ" : "chuo",
+ "ㄔㄨㄞ" : "chuai",
+ "ㄔㄨㄟ" : "chui",
+ "ㄔㄨㄢ" : "chuan",
+ "ㄔㄨㄣ" : "chun",
+ "ㄔㄨㄤ" : "chuang",
+ "ㄔㄨㄥ" : "chong",
+ "ㄕ" : "shi",
+ "ㄕㄚ" : "sha",
+ "ㄕㄜ" : "she",
+ "ㄕㄞ" : "shai",
+ "ㄕㄟ" : "shei",
+ "ㄕㄠ" : "shao",
+ "ㄕㄡ" : "shou",
+ "ㄕㄢ" : "shan",
+ "ㄕㄣ" : "shen",
+ "ㄕㄤ" : "shang",
+ "ㄕㄥ" : "sheng",
+ "ㄕㄨ" : "shu",
+ "ㄕㄨㄚ" : "shua",
+ "ㄕㄨㄛ" : "shuo",
+ "ㄕㄨㄞ" : "shuai",
+ "ㄕㄨㄟ" : "shui",
+ "ㄕㄨㄢ" : "shuan",
+ "ㄕㄨㄣ" : "shun",
+ "ㄕㄨㄤ" : "shuang",
+ "ㄖ" : "ri",
+ "ㄖㄜ" : "re",
+ "ㄖㄠ" : "rao",
+ "ㄖㄡ" : "rou",
+ "ㄖㄢ" : "ran",
+ "ㄖㄣ" : "ren",
+ "ㄖㄤ" : "rang",
+ "ㄖㄥ" : "reng",
+ "ㄖㄨ" : "ru",
+ "ㄖㄨㄚ" : "rua",
+ "ㄖㄨㄛ" : "ruo",
+ "ㄖㄨㄟ" : "rui",
+ "ㄖㄨㄢ" : "ruan",
+ "ㄖㄨㄣ" : "run",
+ "ㄖㄨㄥ" : "rong",
+ "ㄗ" : "zi",
+ "ㄗㄚ" : "za",
+ "ㄗㄜ" : "ze",
+ "ㄗㄞ" : "zai",
+ "ㄗㄟ" : "zei",
+ "ㄗㄠ" : "zao",
+ "ㄗㄡ" : "zou",
+ "ㄗㄢ" : "zan",
+ "ㄗㄣ" : "zen",
+ "ㄗㄤ" : "zang",
+ "ㄗㄥ" : "zeng",
+ "ㄗㄨ" : "zu",
+ "ㄗㄨㄛ" : "zuo",
+ "ㄗㄨㄟ" : "zui",
+ "ㄗㄨㄢ" : "zuan",
+ "ㄗㄨㄣ" : "zun",
+ "ㄗㄨㄥ" : "zong",
+ "ㄘ" : "ci",
+ "ㄘㄚ" : "ca",
+ "ㄘㄜ" : "ce",
+ "ㄘㄞ" : "cai",
+ "ㄘㄠ" : "cao",
+ "ㄘㄡ" : "cou",
+ "ㄘㄢ" : "can",
+ "ㄘㄣ" : "cen",
+ "ㄘㄤ" : "cang",
+ "ㄘㄥ" : "ceng",
+ "ㄘㄨ" : "cu",
+ "ㄘㄨㄛ" : "cuo",
+ "ㄘㄨㄟ" : "cui",
+ "ㄘㄨㄢ" : "cuan",
+ "ㄘㄨㄣ" : "cun",
+ "ㄘㄨㄥ" : "cong",
+ "ㄙ" : "si",
+ "ㄙㄚ" : "sa",
+ "ㄙㄜ" : "se",
+ "ㄙㄞ" : "sai",
+ "ㄙㄠ" : "sao",
+ "ㄙㄡ" : "sou",
+ "ㄙㄢ" : "san",
+ "ㄙㄣ" : "sen",
+ "ㄙㄤ" : "sang",
+ "ㄙㄥ" : "seng",
+ "ㄙㄨ" : "su",
+ "ㄙㄨㄛ" : "suo",
+ "ㄙㄨㄟ" : "sui",
+ "ㄙㄨㄢ" : "suan",
+ "ㄙㄨㄣ" : "sun",
+ "ㄙㄨㄥ" : "song",
+ "ㄚ" : "a",
+ "ㄛ" : "o",
+ "ㄜ" : "e",
+ "ㄞ" : "ai",
+ "ㄟ" : "ei",
+ "ㄠ" : "ao",
+ "ㄡ" : "ou",
+ "ㄢ" : "an",
+ "ㄣ" : "en",
+ "ㄤ" : "ang",
+ "ㄥ" : "eng",
+ "ㄦ" : "er",
+ "ㄧ" : "yi",
+ "ㄧㄚ" : "ya",
+ "ㄧㄛ" : "yo",
+ "ㄧㄝ" : "ye",
+ "ㄧㄞ" : "yai",
+ "ㄧㄠ" : "yao",
+ "ㄧㄡ" : "you",
+ "ㄧㄢ" : "yan",
+ "ㄧㄣ" : "yin",
+ "ㄧㄤ" : "yang",
+ "ㄧㄥ" : "ying",
+ "ㄨ" : "wu",
+ "ㄨㄚ" : "wa",
+ "ㄨㄛ" : "wo",
+ "ㄨㄞ" : "wai",
+ "ㄨㄟ" : "wei",
+ "ㄨㄢ" : "wan",
+ "ㄨㄣ" : "wen",
+ "ㄨㄤ" : "wang",
+ "ㄨㄥ" : "weng",
+ "ㄩ" : "yu",
+ "ㄩㄝ" : "yue",
+ "ㄩㄢ" : "yuan",
+ "ㄩㄣ" : "yun",
+ "ㄩㄥ" : "yong",
+ "ㄫ" : "ng",
+}
+
+PINYIN_BOPOMOFO_MAP = dict([(v, k) for k, v in BOPOMOFO_PINYIN_MAP.items()])
+
+SPECIAL_INITIAL_SET = {'ci', 'chi', 'si', 'shi', 'zi', 'zhi', 'ri'}
+
+'''
+SHENG_YUN_BOPOMOFO_MAP = {
+ "b" : "ㄅ",
+ "p" : "ㄆ",
+ "m" : "ㄇ",
+ "f" : "ㄈ",
+ "d" : "ㄉ",
+ "t" : "ㄊ",
+ "n" : "ㄋ",
+ "l" : "ㄌ",
+ "g" : "ㄍ",
+ "k" : "ㄎ",
+ "h" : "ㄏ",
+ "j" : "ㄐ",
+ "q" : "ㄑ",
+ "x" : "ㄒ",
+ "zh" : "ㄓ",
+ "ch" : "ㄔ",
+ "sh" : "ㄕ",
+ "r" : "ㄖ",
+ "z" : "ㄗ",
+ "c" : "ㄘ",
+ "s" : "ㄙ",
+
+ # 韻母為u,ue,un,uan,ong時ㄧ省略
+ "y" : ("ㄧ", (("u", "ue", "un", "uan", "ong"), "")),
+ "w" : "ㄨ",
+ "a" : "ㄚ",
+ "o" : "ㄛ",
+ "e" : ("ㄜ", ("y", "ㄝ")), # y後面為ㄝ
+
+ # zh ch sh r z c s y後面為空
+ "i" : ("ㄧ", (("zh", "ch", "sh", "r", "z", "c", "s", "y"), "")),
+
+ # jqxy後面為ㄩ w後面為空
+ "u" : ("ㄨ", ("jqxy", "ㄩ")),
+ "v" : "ㄩ",
+ "ai" : "ㄞ",
+ "ei" : "ㄟ",
+ "ao" : "ㄠ",
+ "ou" : "ㄡ",
+ "an" : "ㄢ",
+ "en" : "ㄣ",
+ "ang" : "ㄤ",
+ "eng" : "ㄥ",
+ "er" : "ㄦ",
+ "ia" : "ㄧㄚ",
+ "ie" : "ㄧㄝ",
+ "iai" : "ㄧㄞ",
+ "iao" : "ㄧㄠ",
+ "iu" : "ㄧㄡ",
+ "ian" : "ㄧㄢ",
+ "in" : ("ㄧㄣ", ("y", "ㄣ")), #y後面為ㄣ
+ "iang" : "ㄧㄤ",
+ "ing" : ("ㄧㄥ", ("y", "ㄥ")), #y後面為ㄥ
+ "ua" : "ㄨㄚ",
+ "uo" : "ㄨㄛ",
+ "ue" : "ㄩㄝ",
+ # TODO: "ve" is OK?
+ "ve" : "ㄩㄝ",
+ "uai" : "ㄨㄞ",
+ "ui" : "ㄨㄟ",
+ "uan" : ("ㄨㄢ", ("jqxy", "ㄩㄢ")), # jqxy後面是ㄩㄢ
+ "un" : ("ㄨㄣ", ("jqxy", "ㄩㄣ")), # jqxy後面是ㄩㄣ
+ "uang" : ("ㄨㄤ", ("jqxy", "ㄩㄤ")), # jqxy後面是ㄩㄤ
+ "ong" : ("ㄨㄥ", ("jqxy", "ㄩㄥ")), # y後面為ㄩㄥ
+ "iong" : "ㄩㄥ",
+}
+'''
diff --git a/scripts/chewing.py b/scripts/chewing.py
new file mode 100644
index 0000000..b49c84f
--- /dev/null
+++ b/scripts/chewing.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+ASCII_CHEWING_INITIAL_MAP = {
+ "CHEWING_B" : "ㄅ",
+ "CHEWING_C" : "ㄘ",
+ "CHEWING_CH" : "ㄔ",
+ "CHEWING_D" : "ㄉ",
+ "CHEWING_F" : "ㄈ",
+ "CHEWING_H" : "ㄏ",
+ "CHEWING_G" : "ㄍ",
+ "CHEWING_K" : "ㄎ",
+ "CHEWING_J" : "ㄐ",
+ "CHEWING_M" : "ㄇ",
+ "CHEWING_N" : "ㄋ",
+ "CHEWING_L" : "ㄌ",
+ "CHEWING_R" : "ㄖ",
+ "CHEWING_P" : "ㄆ",
+ "CHEWING_Q" : "ㄑ",
+ "CHEWING_S" : "ㄙ",
+ "CHEWING_SH" : "ㄕ",
+ "CHEWING_T" : "ㄊ",
+ "CHEWING_X" : "ㄒ",
+ "CHEWING_Z" : "ㄗ",
+ "CHEWING_ZH" : "ㄓ",
+}
+
+CHEWING_ASCII_INITIAL_MAP = dict([(v, k) for k, v in ASCII_CHEWING_INITIAL_MAP.items()])
+
+ASCII_CHEWING_MIDDLE_MAP = {
+ "CHEWING_I" : "ㄧ",
+ "CHEWING_U" : "ㄨ",
+ "CHEWING_V" : "ㄩ",
+}
+
+CHEWING_ASCII_MIDDLE_MAP = dict([(v, k) for k, v in ASCII_CHEWING_MIDDLE_MAP.items()])
+
+ASCII_CHEWING_FINAL_MAP = {
+ "CHEWING_A" : "ㄚ",
+ "CHEWING_AI" : "ㄞ",
+ "CHEWING_AN" : "ㄢ",
+ "CHEWING_ANG" : "ㄤ",
+ "CHEWING_AO" : "ㄠ",
+ "CHEWING_E" : "ㄝ", # merge "ㄝ" and "ㄜ"
+ "CHEWING_EI" : "ㄟ",
+ "CHEWING_EN" : "ㄣ",
+ "CHEWING_ENG" : "ㄥ",
+ "CHEWING_ER" : "ㄦ",
+ "CHEWING_NG" : "ㄫ",
+ "CHEWING_O" : "ㄛ",
+ "CHEWING_OU" : "ㄡ",
+}
+
+CHEWING_ASCII_FINAL_MAP = dict([(v, k) for k, v in ASCII_CHEWING_FINAL_MAP.items()])
diff --git a/scripts/chewing_enum.h.in b/scripts/chewing_enum.h.in
new file mode 100644
index 0000000..46072df
--- /dev/null
+++ b/scripts/chewing_enum.h.in
@@ -0,0 +1,45 @@
+#ifndef CHEWING_ENUM_H
+#define CHEWING_ENUM_H
+
+namespace pinyin{
+
+/**
+ * @brief enums of chewing initial element.
+ */
+
+enum ChewingInitial
+{
+@CHEWING_INITIAL@
+};
+
+
+/**
+ * @brief enums of chewing middle element.
+ */
+
+enum ChewingMiddle
+{
+@CHEWING_MIDDLE@
+};
+
+
+/**
+ * @brief enums of chewing final element.
+ */
+enum ChewingFinal
+{
+@CHEWING_FINAL@
+};
+
+
+/**
+ * @brief enums of chewing tone element.
+ */
+enum ChewingTone
+{
+@CHEWING_TONE@
+};
+
+};
+
+#endif
diff --git a/scripts/chewing_table.h.in b/scripts/chewing_table.h.in
new file mode 100644
index 0000000..8780b17
--- /dev/null
+++ b/scripts/chewing_table.h.in
@@ -0,0 +1,50 @@
+#ifndef CHEWING_TABLE_H
+#define CHEWING_TABLE_H
+
+namespace pinyin{
+
+const chewing_symbol_item_t chewing_standard_symbols[] = {
+@STANDARD_SYMBOLS@
+};
+
+const chewing_tone_item_t chewing_standard_tones[] = {
+@STANDARD_TONES@
+};
+
+
+const chewing_symbol_item_t chewing_ginyieh_symbols[] = {
+@GINYIEH_SYMBOLS@
+};
+
+const chewing_tone_item_t chewing_ginyieh_tones[] = {
+@GINYIEH_TONES@
+};
+
+const chewing_symbol_item_t chewing_eten_symbols[] = {
+@ETEN_SYMBOLS@
+};
+
+const chewing_tone_item_t chewing_eten_tones[] = {
+@ETEN_TONES@
+};
+
+const chewing_symbol_item_t chewing_ibm_symbols[] = {
+@IBM_SYMBOLS@
+};
+
+const chewing_tone_item_t chewing_ibm_tones[] = {
+@IBM_TONES@
+};
+
+const char * chewing_tone_table[CHEWING_NUMBER_OF_TONES] = {
+"",
+"ˉ",
+"ˊ",
+"ˇ",
+"ˋ",
+"˙"
+};
+
+};
+
+#endif
diff --git a/scripts/chewingkey.py b/scripts/chewingkey.py
new file mode 100644
index 0000000..5f5770f
--- /dev/null
+++ b/scripts/chewingkey.py
@@ -0,0 +1,150 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+CHEWING_INITIAL_LIST = [
+ 'CHEWING_ZERO_INITIAL', #Zero Initial
+ 'CHEWING_B', #"ㄅ"
+ 'CHEWING_C', #"ㄘ"
+ 'CHEWING_CH', #"ㄔ"
+ 'CHEWING_D', #"ㄉ"
+ 'CHEWING_F', #"ㄈ"
+ 'CHEWING_H', #"ㄏ"
+ 'CHEWING_G', #"ㄍ"
+ 'CHEWING_K', #"ㄎ"
+ 'CHEWING_J', #"ㄐ"
+ 'CHEWING_M', #"ㄇ"
+ 'CHEWING_N', #"ㄋ"
+ 'CHEWING_L', #"ㄌ"
+ 'CHEWING_R', #"ㄖ"
+ 'CHEWING_P', #"ㄆ"
+ 'CHEWING_Q', #"ㄑ"
+ 'CHEWING_S', #"ㄙ"
+ 'CHEWING_SH', #"ㄕ"
+ 'CHEWING_T', #"ㄊ"
+ 'PINYIN_W', #Invalid Chewing
+ 'CHEWING_X', #"ㄒ"
+ 'PINYIN_Y', #Invalid Chewing
+ 'CHEWING_Z', #"ㄗ"
+ 'CHEWING_ZH' #"ㄓ"
+]
+
+
+CHEWING_MIDDLE_LIST = [
+ 'CHEWING_ZERO_MIDDLE', #Zero Middle
+ 'CHEWING_I', #"ㄧ"
+ 'CHEWING_U', #"ㄨ"
+ 'CHEWING_V' #"ㄩ"
+]
+
+
+CHEWING_FINAL_LIST = [
+ 'CHEWING_ZERO_FINAL', #Zero Final
+ 'CHEWING_A', #"ㄚ"
+ 'CHEWING_AI', #"ㄞ"
+ 'CHEWING_AN', #"ㄢ"
+ 'CHEWING_ANG', #"ㄤ"
+ 'CHEWING_AO', #"ㄠ"
+ 'CHEWING_E', #"ㄝ" and "ㄜ"
+ 'INVALID_EA', #Invalid Pinyin/Chewing
+ 'CHEWING_EI', #"ㄟ"
+ 'CHEWING_EN', #"ㄣ"
+ 'CHEWING_ENG', #"ㄥ"
+ 'CHEWING_ER', #"ㄦ"
+ 'CHEWING_NG', #"ㄫ"
+ 'CHEWING_O', #"ㄛ"
+ 'PINYIN_ONG', #"ueng"
+ 'CHEWING_OU', #"ㄡ"
+ 'PINYIN_IN', #"ien"
+ 'PINYIN_ING' #"ieng"
+]
+
+
+CHEWING_TONE_LIST = [
+ 'CHEWING_ZERO_TONE', #Zero Tone
+ 'CHEWING_1', #" "
+ 'CHEWING_2', #'ˊ'
+ 'CHEWING_3', #'ˇ'
+ 'CHEWING_4', #'ˋ'
+ 'CHEWING_5' #'˙'
+]
+
+
+def gen_entries(items, last_enum, num_enum):
+ entries = []
+ for enum, item in enumerate(items, start=0):
+ entry = '{0} = {1}'.format(item, enum)
+ entries.append(entry)
+
+ #last enum
+ entry = last_enum + ' = ' + items[-1]
+ entries.append(entry)
+
+ #num enum
+ entry = num_enum
+ entries.append(entry)
+
+ return ",\n".join(entries)
+
+
+def gen_initials():
+ return gen_entries(CHEWING_INITIAL_LIST, 'CHEWING_LAST_INITIAL',
+ 'CHEWING_NUMBER_OF_INITIALS = CHEWING_LAST_INITIAL + 1')
+
+
+def gen_middles():
+ return gen_entries(CHEWING_MIDDLE_LIST, 'CHEWING_LAST_MIDDLE',
+ 'CHEWING_NUMBER_OF_MIDDLES = CHEWING_LAST_MIDDLE + 1')
+
+
+def gen_finals():
+ return gen_entries(CHEWING_FINAL_LIST, 'CHEWING_LAST_FINAL',
+ 'CHEWING_NUMBER_OF_FINALS = CHEWING_LAST_FINAL + 1')
+
+
+def gen_tones():
+ return gen_entries(CHEWING_TONE_LIST, 'CHEWING_LAST_TONE',
+ 'CHEWING_NUMBER_OF_TONES = CHEWING_LAST_TONE + 1')
+
+
+def gen_table_index(content_table):
+ entries = []
+ for i in range(0, len(CHEWING_INITIAL_LIST)):
+ initial = CHEWING_INITIAL_LIST[i]
+ for m in range(0, len(CHEWING_MIDDLE_LIST)):
+ middle = CHEWING_MIDDLE_LIST[m]
+ for f in range(0, len(CHEWING_FINAL_LIST)):
+ final = CHEWING_FINAL_LIST[f]
+ chewingkey = 'ChewingKey({0}, {1}, {2})'.format(initial, middle, final)
+ index = -1
+ try:
+ index = [x[2] for x in content_table].index(chewingkey)
+ except ValueError:
+ pass
+
+ entry = '{0:<7} /* {1} */'.format(index, chewingkey)
+ entries.append(entry)
+ return ",\n".join(entries)
+
+
+### main function ###
+if __name__ == "__main__":
+ print(gen_initials() + gen_middles() + gen_finals() + gen_tones())
diff --git a/scripts/correct.py b/scripts/correct.py
new file mode 100644
index 0000000..ffd5998
--- /dev/null
+++ b/scripts/correct.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (c) 2007-2008 Peng Huang <shawn.p.huang@gmail.com>
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+auto_correct = [
+ # "correct", "wrong"
+ ("ng", "gn"),
+ ("ng", "mg"),
+ ("iu", "iou"),
+ ("ui", "uei"),
+ ("un", "uen"),
+# ("ue", "ve"),
+ ("ve", "ue"),
+ ("ong", "on"),
+]
+
+auto_correct_ext = [
+ # "correct", "wrong", flag
+ ("ju", "jv", "PINYIN_CORRECT_V_U"),
+ ("qu", "qv", "PINYIN_CORRECT_V_U"),
+ ("xu", "xv", "PINYIN_CORRECT_V_U"),
+ ("yu", "yv", "PINYIN_CORRECT_V_U"),
+
+ ("jue", "jve", "PINYIN_CORRECT_V_U"),
+ ("que", "qve", "PINYIN_CORRECT_V_U"),
+ ("xue", "xve", "PINYIN_CORRECT_V_U"),
+ ("yue", "yve", "PINYIN_CORRECT_V_U"),
+
+ ("juan", "jvan", "PINYIN_CORRECT_V_U"),
+ ("quan", "qvan", "PINYIN_CORRECT_V_U"),
+ ("xuan", "xvan", "PINYIN_CORRECT_V_U"),
+ ("yuan", "yvan", "PINYIN_CORRECT_V_U"),
+
+ ("jun", "jvn", "PINYIN_CORRECT_V_U"),
+ ("qun", "qvn", "PINYIN_CORRECT_V_U"),
+ ("xun", "xvn", "PINYIN_CORRECT_V_U"),
+ ("yun", "yvn", "PINYIN_CORRECT_V_U"),
+
+# ("juang", "jvang", "PINYIN_CORRECT_V_U"),
+# ("quang", "qvang", "PINYIN_CORRECT_V_U"),
+# ("xuang", "xvang", "PINYIN_CORRECT_V_U"),
+# ("yuang", "yvang", "PINYIN_CORRECT_V_U"),
+
+# ("jun", "jven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"),
+# ("qun", "qven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"),
+# ("xun", "xven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"),
+# ("yun", "yven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"),
+]
+
+
+'''
+fuzzy_shengmu = [
+ ("c", "ch"),
+ ("ch", "c"),
+ ("z", "zh"),
+ ("zh", "z"),
+ ("s", "sh"),
+ ("sh", "s"),
+ ("l", "n"),
+ ("n", "l"),
+ ("f", "h"),
+ ("h", "f"),
+ ("l", "r"),
+ ("r", "l"),
+ ("k", "g"),
+ ("g", "k"),
+]
+
+fuzzy_yunmu = [
+ ("an", "ang"),
+ ("ang", "an"),
+ ("en", "eng"),
+ ("eng", "en"),
+ ("in", "ing"),
+ ("ing", "in"),
+]
+'''
diff --git a/scripts/double_pinyin_table.h.in b/scripts/double_pinyin_table.h.in
new file mode 100644
index 0000000..15a8ee9
--- /dev/null
+++ b/scripts/double_pinyin_table.h.in
@@ -0,0 +1,56 @@
+#ifndef DOUBLE_PINYIN_TABLE_H
+#define DOUBLE_PINYIN_TABLE_H
+
+namespace pinyin{
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_mspy_sheng[] = {
+@MSPY_SHENG@
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_mspy_yun[] = {
+@MSPY_YUN@
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_zrm_sheng[] = {
+@ZRM_SHENG@
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_zrm_yun[] = {
+@ZRM_YUN@
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_abc_sheng[] = {
+@ABC_SHENG@
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_abc_yun[] = {
+@ABC_YUN@
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_zgpy_sheng[] = {
+@ZGPY_SHENG@
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_zgpy_yun[] = {
+@ZGPY_YUN@
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_pyjj_sheng[] = {
+@PYJJ_SHENG@
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_pyjj_yun[] = {
+@PYJJ_YUN@
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_xhe_sheng[] = {
+@XHE_SHENG@
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_xhe_yun[] = {
+@XHE_YUN@
+};
+
+};
+
+#endif
diff --git a/scripts/genbopomofoheader.py b/scripts/genbopomofoheader.py
new file mode 100644
index 0000000..cb0fa86
--- /dev/null
+++ b/scripts/genbopomofoheader.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (c) 2010 BYVoid <byvoid1@gmail.com>
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+from operator import itemgetter
+from utils import expand_file
+
+bopomofo = [
+ 'ㄅ', 'ㄆ', 'ㄇ', 'ㄈ', 'ㄉ', 'ㄊ', 'ㄋ', 'ㄌ', 'ㄍ', 'ㄎ',
+ 'ㄏ', 'ㄐ', 'ㄑ', 'ㄒ', 'ㄓ', 'ㄔ', 'ㄕ', 'ㄖ', 'ㄗ', 'ㄘ', 'ㄙ',
+
+ 'ㄧ', 'ㄨ', 'ㄩ', 'ㄚ', 'ㄛ', 'ㄜ', 'ㄝ', 'ㄞ', 'ㄟ', 'ㄠ', 'ㄡ',
+ 'ㄢ', 'ㄣ', 'ㄤ', 'ㄥ', 'ㄦ',
+
+ 'ˉ', 'ˊ', 'ˇ', 'ˋ', '˙',
+]
+
+#陰平聲不標號, use space key
+num_tones = -5
+
+bopomofo_keyboards = {
+ #標準注音鍵盤
+ 'STANDARD':
+ (
+ "1","q","a","z","2","w","s","x","e","d","c","r","f","v","5","t","g","b","y","h","n",
+ "u","j","m","8","i","k",",","9","o","l",".","0","p",";","/","-",
+ " ","6","3","4","7",
+ ),
+ #精業注音鍵盤
+ 'GINYIEH':
+ (
+ "2","w","s","x","3","e","d","c","r","f","v","t","g","b","6","y","h","n","u","j","m",
+ "-","[","'","8","i","k",",","9","o","l",".","0","p",";","/","=",
+ " ","q","a","z","1",
+ ),
+ #倚天注音鍵盤
+ 'ETEN':
+ (
+ "b","p","m","f","d","t","n","l","v","k","h","g","7","c",",",".","/","j",";","'","s",
+ "e","x","u","a","o","r","w","i","q","z","y","8","9","0","-","=",
+ " ","2","3","4","1",
+ ),
+ #IBM注音鍵盤
+ 'IBM':
+ (
+ "1","2","3","4","5","6","7","8","9","0","-","q","w","e","r","t","y","u","i","o","p",
+ "a","s","d","f","g","h","j","k","l",";","z","x","c","v","b","n",
+ " ","m",",",".","/",
+ ),
+}
+
+
+def escape_char(ch):
+ if ch == "'" or ch == "\\":
+ ch = "\\" + ch;
+ return "'{0}'".format(ch)
+
+
+#generate shengmu and yunmu here
+def gen_chewing_symbols(scheme):
+ keyboard = bopomofo_keyboards[scheme]
+ keyboard = keyboard[: num_tones]
+ items = []
+ for (i, key) in enumerate(keyboard):
+ items.append((key, bopomofo[i]))
+ items = sorted(items, key=itemgetter(0))
+ entries = []
+ for (key, string) in items:
+ key = escape_char(key)
+ string = '"{0}"'.format(string)
+ entry = "{{{0: <5}, {1}}}".format(key, string)
+ entries.append(entry)
+ entries.append("{'\\0', NULL}")
+ return ",\n".join(entries)
+
+
+#generate tones here
+def gen_chewing_tones(scheme):
+ keyboard = bopomofo_keyboards[scheme]
+ keyboard = keyboard[num_tones:]
+ items = []
+ for (i, key) in enumerate(keyboard, start=1):
+ items.append((key, i));
+ items = sorted(items, key=itemgetter(0))
+ entries = []
+ for (key, tone) in items:
+ key = escape_char(key);
+ entry = "{{{0: <5}, {1}}}".format(key, tone)
+ entries.append(entry)
+ entries.append("{'\\0', 0}")
+ return ",\n".join(entries)
+
+
+def get_table_content(tablename):
+ (scheme, part) = tablename.split('_', 1)
+ if part == "SYMBOLS":
+ return gen_chewing_symbols(scheme);
+ if part == "TONES":
+ return gen_chewing_tones(scheme);
+
+
+### main function ###
+if __name__ == "__main__":
+ expand_file("chewing_table.h.in", get_table_content)
diff --git a/scripts/genchewingkey.py b/scripts/genchewingkey.py
new file mode 100644
index 0000000..4a0bdcd
--- /dev/null
+++ b/scripts/genchewingkey.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+from utils import expand_file
+from chewingkey import gen_initials, gen_middles, gen_finals, gen_tones
+
+
+def get_table_content(tablename):
+ if tablename == 'CHEWING_INITIAL':
+ return gen_initials()
+ if tablename == 'CHEWING_MIDDLE':
+ return gen_middles()
+ if tablename == 'CHEWING_FINAL':
+ return gen_finals()
+ if tablename == 'CHEWING_TONE':
+ return gen_tones()
+
+
+### main function ###
+if __name__ == "__main__":
+ expand_file("chewing_enum.h.in", get_table_content)
+
diff --git a/scripts/gendoublepinyinheader.py b/scripts/gendoublepinyinheader.py
new file mode 100644
index 0000000..08dd817
--- /dev/null
+++ b/scripts/gendoublepinyinheader.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+import pinyin
+from utils import expand_file
+
+def gen_shengmu_table(scheme):
+ entries = []
+ #select shengmu mapping
+ sheng = pinyin.SHUANGPIN_SCHEMAS[scheme][0]
+ for c in "abcdefghijklmnopqrstuvwxyz;":
+ sh = sheng.get(c, "NULL")
+ if sh != "NULL":
+ sh = '"{0}"'.format(sh)
+ entry = '{{{0: <5}}} /* {1} */'.format(sh, c.upper())
+ entries.append(entry)
+ return ',\n'.join(entries)
+
+
+def gen_yunmu_table(scheme):
+ entries = []
+ #select yunmu mapping
+ yun = pinyin.SHUANGPIN_SCHEMAS[scheme][1]
+ for c in "abcdefghijklmnopqrstuvwxyz;":
+ y = yun.get(c, ("NULL", "NULL"))
+ if len(y) == 1:
+ y1 = y[0]
+ y2 = "NULL"
+ else:
+ y1, y2 = y
+ if y1 != "NULL":
+ y1 = '"{0}"'.format(y1)
+ if y2 != "NULL":
+ y2 = '"{0}"'.format(y2)
+ entry = '{{{{{0: <7}, {1: <7}}}}} /* {2} */'.format(y1, y2, c.upper())
+ entries.append(entry)
+ return ',\n'.join(entries)
+
+
+def get_table_content(tablename):
+ (scheme, part) = tablename.split('_', 1)
+ if part == "SHENG":
+ return gen_shengmu_table(scheme)
+ if part == "YUN":
+ return gen_yunmu_table(scheme)
+
+
+### main function ###
+if __name__ == "__main__":
+ expand_file("double_pinyin_table.h.in", get_table_content)
diff --git a/scripts/genpinyinheader.py b/scripts/genpinyinheader.py
new file mode 100644
index 0000000..81e0538
--- /dev/null
+++ b/scripts/genpinyinheader.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+from utils import expand_file
+from genpinyintable import gen_content_table, \
+ gen_pinyin_index, gen_bopomofo_index, \
+ gen_chewing_key_table
+from genspecialtable import gen_divided_table, gen_resplit_table
+
+def get_table_content(tablename):
+ if tablename == 'CONTENT_TABLE':
+ return gen_content_table()
+ if tablename == 'PINYIN_INDEX':
+ return gen_pinyin_index()
+ if tablename == 'BOPOMOFO_INDEX':
+ return gen_bopomofo_index()
+ if tablename == 'DIVIDED_TABLE':
+ return gen_divided_table()
+ if tablename == 'RESPLIT_TABLE':
+ return gen_resplit_table()
+ if tablename == 'TABLE_INDEX':
+ return gen_chewing_key_table()
+
+
+### main function ###
+if __name__ == "__main__":
+ expand_file("pinyin_parser_table.h.in", get_table_content)
diff --git a/scripts/genpinyins.py b/scripts/genpinyins.py
new file mode 100644
index 0000000..fef40cd
--- /dev/null
+++ b/scripts/genpinyins.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python3
+import os
+from operator import itemgetter
+
+pinyin_dict = {}
+
+
+def strip_tone(old_pinyin_str):
+ oldpinyins = old_pinyin_str.split("'")
+ newpinyins = []
+
+ for pinyin in oldpinyins:
+ if pinyin[-1].isdigit():
+ pinyin = pinyin[:-1]
+ newpinyins.append(pinyin)
+
+ new_pinyin_str = "'".join(newpinyins)
+ return new_pinyin_str
+
+
+def add_pinyin_dict(pinyin, freq):
+ if 0 == freq:
+ return
+ if not pinyin in pinyin_dict:
+ pinyin_dict[pinyin] = freq
+ else:
+ pinyin_dict[pinyin] += freq
+
+
+def load_phrase(filename):
+ phrasefile = open(filename, "r")
+ for line in phrasefile.readlines():
+ line = line.rstrip(os.linesep)
+ (pinyin, word, token, freq) = line.split(None, 3)
+ pinyin = strip_tone(pinyin)
+ freq = int(freq)
+
+ if len(word) in [1, 2]:
+ add_pinyin_dict(pinyin, freq)
+
+ phrasefile.close()
+
+load_phrase("../data/gb_char.table")
+load_phrase("../data/gbk_char.table")
+
+
+def save_pinyin(filename):
+ pinyinfile = open(filename, "w")
+ for pinyin, freq in pinyin_dict.items():
+ freq = str(freq)
+ line = "\t".join((pinyin, freq))
+ pinyinfile.writelines([line, os.linesep])
+ pinyinfile.close()
+
+
+if __name__ == "__main__":
+ save_pinyin("pinyins.txt")
diff --git a/scripts/genpinyintable.py b/scripts/genpinyintable.py
new file mode 100644
index 0000000..cc60034
--- /dev/null
+++ b/scripts/genpinyintable.py
@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+import operator
+import bopomofo
+from pinyintable import *
+from chewingkey import gen_table_index
+
+
+content_table = []
+pinyin_index = []
+bopomofo_index = []
+
+#pinyin table
+def filter_pinyin_list():
+ for (correct, wrong, bopomofo, flags, chewing) in gen_pinyin_list():
+ flags = '|'.join(flags)
+ chewing = "ChewingKey({0})".format(', '.join(chewing))
+ #correct = correct.replace("v", "ü")
+ content_table.append((correct, bopomofo, chewing))
+ if "IS_PINYIN" in flags:
+ pinyin_index.append((wrong, flags, correct))
+ if "IS_CHEWING" in flags:
+ bopomofo_index.append((bopomofo, flags))
+
+
+def sort_all():
+ global content_table, pinyin_index, bopomofo_index
+ #remove duplicates
+ content_table = list(set(content_table))
+ pinyin_index = list(set(pinyin_index))
+ bopomofo_index = list(set(bopomofo_index))
+ #define sort function
+ sortfunc = operator.itemgetter(0)
+ #begin sort
+ content_table = sorted(content_table, key=sortfunc)
+ #prepend zero item to reserve the invalid item
+ content_table.insert(0, ("", "", "ChewingKey()"))
+ #sort index
+ pinyin_index = sorted(pinyin_index, key=sortfunc)
+ bopomofo_index = sorted(bopomofo_index, key=sortfunc)
+
+def get_sheng_yun(pinyin):
+ if pinyin == None:
+ return None, None
+ if pinyin == "":
+ return "", ""
+ if pinyin == "ng":
+ return "", "ng"
+ for i in range(2, 0, -1):
+ s = pinyin[:i]
+ if s in shengmu_list:
+ return s, pinyin[i:]
+ return "", pinyin
+
+def gen_content_table():
+ entries = []
+ for ((correct, bopomofo, chewing)) in content_table:
+ (shengmu, yunmu) = get_sheng_yun(correct)
+ entry = '{{"{0}", "{1}", "{2}", "{3}", {4}}}'.format(correct, shengmu, yunmu, bopomofo, chewing)
+ entries.append(entry)
+ return ',\n'.join(entries)
+
+
+def gen_pinyin_index():
+ entries = []
+ for (wrong, flags, correct) in pinyin_index:
+ index = [x[0] for x in content_table].index(correct)
+ entry = '{{"{0}", {1}, {2}}}'.format(wrong, flags, index)
+ entries.append(entry)
+ return ',\n'.join(entries)
+
+
+def gen_bopomofo_index():
+ entries = []
+ for (bopomofo_str, flags) in bopomofo_index:
+ pinyin_str = bopomofo.BOPOMOFO_PINYIN_MAP[bopomofo_str]
+ index = [x[0] for x in content_table].index(pinyin_str)
+ entry = '{{"{0}", {1}, {2}}}'.format(bopomofo_str, flags, index)
+ entries.append(entry)
+ return ',\n'.join(entries)
+
+
+def gen_chewing_key_table():
+ return gen_table_index(content_table)
+
+
+#init code
+filter_pinyin_list()
+sort_all()
+
+
+### main function ###
+if __name__ == "__main__":
+ #s = gen_content_table() + gen_pinyin_index() + gen_bopomofo_index()
+ s = gen_chewing_key_table()
+ print(s)
diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py
new file mode 100644
index 0000000..061f9d1
--- /dev/null
+++ b/scripts/genspecialtable.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+import operator
+import pinyin
+from pinyintable import get_chewing, get_shengmu_chewing
+from specialtable import *
+
+pinyin_list = sorted(pinyin.PINYIN_LIST)
+shengmu_list = sorted(pinyin.SHENGMU_LIST)
+
+divided_list = []
+resplit_list = []
+
+
+def sort_all():
+ global divided_list, resplit_list
+ divided_list = sorted(divided_list, key=operator.itemgetter(0))
+ resplit_list = sorted(resplit_list, key=operator.itemgetter(0, 1))
+
+'''
+def get_chewing_string(pinyin):
+ #handle shengmu
+ if pinyin not in pinyin_list:
+ if pinyin in shengmu_list:
+ chewing_key = get_shengmu_chewing(pinyin)
+ else:
+ assert False, "Un-expected pinyin string."
+ else:
+ chewing_key = get_chewing(pinyin)
+ chewing_str = 'ChewingKey({0})'.format(', '.join(chewing_key))
+ return chewing_str
+'''
+
+def gen_divided_table():
+ entries = []
+ for (pinyin_key, orig_freq, first_key, second_key, new_freq) \
+ in divided_list:
+
+ if orig_freq >= new_freq:
+ assert orig_freq > 0, "Expected orig_freq > 0 here."
+
+ entry = '{{"{0}", {1}, {{"{2}", "{3}"}}, {4}}}'.format \
+ (pinyin_key, orig_freq, first_key, second_key, new_freq)
+ entries.append(entry)
+ return ',\n'.join(entries)
+
+
+def gen_resplit_table():
+ entries = []
+ for (orig_first_key, orig_second_key, orig_freq, \
+ new_first_key, new_second_key, new_freq) in resplit_list:
+
+ if orig_freq >= new_freq:
+ assert orig_freq > 0, "Expected orig_freq > 0 here."
+
+ entry = '{{{{"{0}", "{1}"}}, {2}, {{"{3}", "{4}"}}, {5}}}'.format \
+ (orig_first_key, orig_second_key, orig_freq,\
+ new_first_key, new_second_key, new_freq)
+ entries.append(entry)
+ return ',\n'.join(entries)
+
+
+#init code, load lists
+divided_list = filter_divided()
+resplit_list = filter_resplit()
+sort_all()
+
+
+### main function ###
+if __name__ == "__main__":
+ s = gen_divided_table() + '\n' + gen_resplit_table()
+ print(s)
+
diff --git a/scripts/pinyin.py b/scripts/pinyin.py
new file mode 100644
index 0000000..dd0e156
--- /dev/null
+++ b/scripts/pinyin.py
@@ -0,0 +1,400 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (c) 2007-2008 Peng Huang <shawn.p.huang@gmail.com>
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+N_ = lambda x : x
+PINYIN_DICT = {
+ "a" : 1, "ai" : 2, "an" : 3, "ang" : 4, "ao" : 5,
+ "ba" : 6, "bai" : 7, "ban" : 8, "bang" : 9, "bao" : 10,
+ "bei" : 11, "ben" : 12, "beng" : 13, "bi" : 14, "bian" : 15,
+ "biao" : 16, "bie" : 17, "bin" : 18, "bing" : 19, "bo" : 20,
+ "bu" : 21, "ca" : 22, "cai" : 23, "can" : 24, "cang" : 25,
+ "cao" : 26, "ce" : 27, "cen" : 28, "ceng" : 29, "ci" : 30,
+ "cong" : 31, "cou" : 32, "cu" : 33, "cuan" : 34, "cui" : 35,
+ "cun" : 36, "cuo" : 37, "cha" : 38, "chai" : 39, "chan" : 40,
+ "chang" : 41, "chao" : 42, "che" : 43, "chen" : 44, "cheng" : 45,
+ "chi" : 46, "chong" : 47, "chou" : 48, "chu" : 49, "chuai" : 50,
+ "chuan" : 51, "chuang" : 52, "chui" : 53, "chun" : 54, "chuo" : 55,
+ "da" : 56, "dai" : 57, "dan" : 58, "dang" : 59, "dao" : 60,
+ "de" : 61, "dei" : 62,
+ # "den" : 63,
+ "deng" : 64, "di" : 65,
+ "dia" : 66, "dian" : 67, "diao" : 68, "die" : 69, "ding" : 70,
+ "diu" : 71, "dong" : 72, "dou" : 73, "du" : 74, "duan" : 75,
+ "dui" : 76, "dun" : 77, "duo" : 78, "e" : 79, "ei" : 80,
+ "en" : 81, "er" : 82, "fa" : 83, "fan" : 84, "fang" : 85,
+ "fei" : 86, "fen" : 87, "feng" : 88, "fo" : 89, "fou" : 90,
+ "fu" : 91, "ga" : 92, "gai" : 93, "gan" : 94, "gang" : 95,
+ "gao" : 96, "ge" : 97, "gei" : 98, "gen" : 99, "geng" : 100,
+ "gong" : 101, "gou" : 102, "gu" : 103, "gua" : 104, "guai" : 105,
+ "guan" : 106, "guang" : 107, "gui" : 108, "gun" : 109, "guo" : 110,
+ "ha" : 111, "hai" : 112, "han" : 113, "hang" : 114, "hao" : 115,
+ "he" : 116, "hei" : 117, "hen" : 118, "heng" : 119, "hong" : 120,
+ "hou" : 121, "hu" : 122, "hua" : 123, "huai" : 124, "huan" : 125,
+ "huang" : 126, "hui" : 127, "hun" : 128, "huo" : 129, "ji" : 130,
+ "jia" : 131, "jian" : 132, "jiang" : 133, "jiao" : 134, "jie" : 135,
+ "jin" : 136, "jing" : 137, "jiong" : 138, "jiu" : 139, "ju" : 140,
+ "juan" : 141, "jue" : 142, "jun" : 143, "ka" : 144, "kai" : 145,
+ "kan" : 146, "kang" : 147, "kao" : 148, "ke" : 149,
+ # "kei" : 150,
+ "ken" : 151, "keng" : 152, "kong" : 153, "kou" : 154, "ku" : 155,
+ "kua" : 156, "kuai" : 157, "kuan" : 158, "kuang" : 159, "kui" : 160,
+ "kun" : 161, "kuo" : 162, "la" : 163, "lai" : 164, "lan" : 165,
+ "lang" : 166, "lao" : 167, "le" : 168, "lei" : 169, "leng" : 170,
+ "li" : 171, "lia" : 172, "lian" : 173, "liang" : 174, "liao" : 175,
+ "lie" : 176, "lin" : 177, "ling" : 178, "liu" : 179,
+ "lo" : 180,
+ "long" : 181, "lou" : 182, "lu" : 183, "luan" : 184,
+ # "lue" : 185,
+ "lun" : 186, "luo" : 187, "lv" : 188, "lve" : 189,
+ "ma" : 190,
+ "mai" : 191, "man" : 192, "mang" : 193, "mao" : 194, "me" : 195,
+ "mei" : 196, "men" : 197, "meng" : 198, "mi" : 199, "mian" : 200,
+ "miao" : 201, "mie" : 202, "min" : 203, "ming" : 204, "miu" : 205,
+ "mo" : 206, "mou" : 207, "mu" : 208, "na" : 209, "nai" : 210,
+ "nan" : 211, "nang" : 212, "nao" : 213, "ne" : 214, "nei" : 215,
+ "nen" : 216, "neng" : 217, "ni" : 218, "nian" : 219, "niang" : 220,
+ "niao" : 221, "nie" : 222, "nin" : 223, "ning" : 224, "niu" : 225,
+ "ng" : 226,
+ "nong" : 227, "nou" : 228, "nu" : 229, "nuan" : 230,
+ # "nue" : 231,
+ "nuo" : 232, "nv" : 233, "nve" : 234,
+ "o" : 235,
+ "ou" : 236, "pa" : 237, "pai" : 238, "pan" : 239, "pang" : 240,
+ "pao" : 241, "pei" : 242, "pen" : 243, "peng" : 244, "pi" : 245,
+ "pian" : 246, "piao" : 247, "pie" : 248, "pin" : 249, "ping" : 250,
+ "po" : 251, "pou" : 252, "pu" : 253, "qi" : 254, "qia" : 255,
+ "qian" : 256, "qiang" : 257, "qiao" : 258, "qie" : 259, "qin" : 260,
+ "qing" : 261, "qiong" : 262, "qiu" : 263, "qu" : 264, "quan" : 265,
+ "que" : 266, "qun" : 267, "ran" : 268, "rang" : 269, "rao" : 270,
+ "re" : 271, "ren" : 272, "reng" : 273, "ri" : 274, "rong" : 275,
+ "rou" : 276, "ru" : 277, "ruan" : 278, "rui" : 279, "run" : 280,
+ "ruo" : 281, "sa" : 282, "sai" : 283, "san" : 284, "sang" : 285,
+ "sao" : 286, "se" : 287, "sen" : 288, "seng" : 289, "si" : 290,
+ "song" : 291, "sou" : 292, "su" : 293, "suan" : 294, "sui" : 295,
+ "sun" : 296, "suo" : 297, "sha" : 298, "shai" : 299, "shan" : 300,
+ "shang" : 301, "shao" : 302, "she" : 303, "shei" : 304, "shen" : 305,
+ "sheng" : 306, "shi" : 307, "shou" : 308, "shu" : 309, "shua" : 310,
+ "shuai" : 311, "shuan" : 312, "shuang" : 313, "shui" : 314, "shun" : 315,
+ "shuo" : 316, "ta" : 317, "tai" : 318, "tan" : 319, "tang" : 320,
+ "tao" : 321, "te" : 322,
+ # "tei" : 323,
+ "teng" : 324, "ti" : 325,
+ "tian" : 326, "tiao" : 327, "tie" : 328, "ting" : 329, "tong" : 330,
+ "tou" : 331, "tu" : 332, "tuan" : 333, "tui" : 334, "tun" : 335,
+ "tuo" : 336, "wa" : 337, "wai" : 338, "wan" : 339, "wang" : 340,
+ "wei" : 341, "wen" : 342, "weng" : 343, "wo" : 344, "wu" : 345,
+ "xi" : 346, "xia" : 347, "xian" : 348, "xiang" : 349, "xiao" : 350,
+ "xie" : 351, "xin" : 352, "xing" : 353, "xiong" : 354, "xiu" : 355,
+ "xu" : 356, "xuan" : 357, "xue" : 358, "xun" : 359, "ya" : 360,
+ "yan" : 361, "yang" : 362, "yao" : 363, "ye" : 364, "yi" : 365,
+ "yin" : 366, "ying" : 367, "yo" : 368, "yong" : 369, "you" : 370,
+ "yu" : 371, "yuan" : 372, "yue" : 373, "yun" : 374, "za" : 375,
+ "zai" : 376, "zan" : 377, "zang" : 378, "zao" : 379, "ze" : 380,
+ "zei" : 381, "zen" : 382, "zeng" : 383, "zi" : 384, "zong" : 385,
+ "zou" : 386, "zu" : 387, "zuan" : 388, "zui" : 389, "zun" : 390,
+ "zuo" : 391, "zha" : 392, "zhai" : 393, "zhan" : 394, "zhang" : 395,
+ "zhao" : 396, "zhe" : 397, "zhen" : 398, "zheng" : 399, "zhi" : 400,
+ "zhong" : 401, "zhou" : 402, "zhu" : 403, "zhua" : 404, "zhuai" : 405,
+ "zhuan" : 406, "zhuang" : 407, "zhui" : 408, "zhun" : 409, "zhuo" : 410,
+ # some weird pinyins
+ #~ "eng" : 411, "chua" : 412, "fe" : 413, "fiao" : 414, "liong" : 415
+}
+
+PINYIN_LIST = PINYIN_DICT.keys ()
+
+
+SHENGMU_DICT = {
+ "b" : 1, "p" : 2, "m" : 3, "f" : 4, "d" : 5,
+ "t" : 6, "n" : 7, "l" : 8, "g" : 9, "k" : 10, "h" : 11,
+ "j" : 12, "q" : 13, "x" : 14, "zh" : 15, "ch" : 16, "sh" : 17,
+ "r" : 18, "z" : 19, "c" : 20, "s" : 21, "y" : 22, "w" : 23
+}
+
+SHENGMU_LIST = SHENGMU_DICT.keys ()
+
+
+YUNMU_DICT = {
+ "a" : 1, "ai" : 2, "an" : 3, "ang" : 4, "ao" : 5,
+ "e" : 6, "ei" : 7, "en" : 8, "eng" : 9, "er" : 10,
+ "i" : 11, "ia" : 12, "ian" : 13, "iang" : 14, "iao" : 15,
+ "ie" : 16, "in" : 17, "ing" : 18, "iong" : 19, "iu" : 20,
+ "o" : 21, "ong" : 22, "ou" : 23, "u" : 24, "ua" : 25,
+ "uai" : 26, "uan" : 27, "uang" : 28, "ue" : 29, "ui" : 30,
+ "un" : 31, "uo" : 32, "v" : 33, "ve" : 34
+}
+
+YUNMU_LIST = YUNMU_DICT.keys ()
+
+
+MOHU_SHENGMU = {
+ "z" : ("z", "zh"),
+ "zh" : ("z", "zh"),
+ "c" : ("c", "ch"),
+ "ch" : ("c", "ch"),
+ "s" : ("s", "sh"),
+ "sh" : ("s", "sh"),
+ "l" : ("l", "n"),
+ "n" : ("l", "n")
+}
+
+MOHU_YUNMU = {
+ "an" : ("an", "ang"),
+ "ang" : ("an", "ang"),
+ "en" : ("en", "eng"),
+ "eng" : ("en", "eng"),
+ "in" : ("in", "ing"),
+ "ing" : ("in", "ing")
+}
+
+MSPY_SHUANGPIN_SHENGMU_DICT = {
+ "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g",
+ "h" : "h", "i" : "ch","j" : "j", "k" : "k", "l" : "l",
+ "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q",
+ "r" : "r", "s" : "s", "t" : "t", "u" : "sh","v" : "zh",
+ "w" : "w", "x" : "x", "y" : "y", "z" : "z"
+}
+
+MSPY_SHUANGPIN_YUNMU_DICT = {
+ "a" : ("a",),
+ "b" : ("ou",),
+ "c" : ("iao",),
+ "d" : ("uang", "iang"),
+ "e" : ("e",),
+ "f" : ("en",),
+ "g" : ("eng", "ng"),
+ "h" : ("ang",),
+ "i" : ("i",),
+ "j" : ("an",),
+ "k" : ("ao",),
+ "l" : ("ai",),
+ "m" : ("ian",),
+ "n" : ("in",),
+ "o" : ("uo", "o"),
+ "p" : ("un",),
+ "q" : ("iu",),
+ "r" : ("uan", "er"),
+ "s" : ("ong", "iong"),
+ "t" : ("ue",),
+ "u" : ("u",),
+ "v" : ("ui","ue"),
+ "w" : ("ia","ua"),
+ "x" : ("ie",),
+ "y" : ("uai", "v"),
+ "z" : ("ei",),
+ ";" : ("ing",)
+}
+
+ZRM_SHUANGPIN_SHENGMU_DICT = {
+ "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g",
+ "h" : "h", "i" : "ch","j" : "j", "k" : "k", "l" : "l",
+ "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q",
+ "r" : "r", "s" : "s", "t" : "t", "u" : "sh","v" : "zh",
+ "w" : "w", "x" : "x", "y" : "y", "z" : "z"
+}
+
+ZRM_SHUANGPIN_YUNMU_DICT = {
+ "a" : ("a",),
+ "b" : ("ou",),
+ "c" : ("iao",),
+ "d" : ("uang", "iang"),
+ "e" : ("e",),
+ "f" : ("en",),
+ "g" : ("eng", "ng"),
+ "h" : ("ang",),
+ "i" : ("i",),
+ "j" : ("an",),
+ "k" : ("ao",),
+ "l" : ("ai",),
+ "m" : ("ian",),
+ "n" : ("in",),
+ "o" : ("uo", "o"),
+ "p" : ("un",),
+ "q" : ("iu",),
+ "r" : ("uan", "er"),
+ "s" : ("ong", "iong"),
+ "t" : ("ue",),
+ "u" : ("u",),
+ "v" : ("ui","v"),
+ "w" : ("ia","ua"),
+ "x" : ("ie",),
+ "y" : ("uai", "ing"),
+ "z" : ("ei",),
+}
+
+ABC_SHUANGPIN_SHENGMU_DICT = {
+ "a" : "zh", "b" : "b", "c" : "c", "d" : "d", "e":"ch", "f" : "f", "g" : "g",
+ "h" : "h", "j" : "j", "k" : "k", "l" : "l",
+ "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q",
+ "r" : "r", "s" : "s", "t" : "t", "v" : "sh",
+ "w" : "w", "x" : "x", "y" : "y", "z" : "z"
+}
+
+ABC_SHUANGPIN_YUNMU_DICT = {
+ "a" : ("a",),
+ "b" : ("ou",),
+ "c" : ("in","uai"),
+ "d" : ("ia", "ua"),
+ "e" : ("e",),
+ "f" : ("en",),
+ "g" : ("eng", "ng"),
+ "h" : ("ang",),
+ "i" : ("i",),
+ "j" : ("an",),
+ "k" : ("ao",),
+ "l" : ("ai",),
+ "m" : ("ue","ui"),
+ "n" : ("un",),
+ "o" : ("uo", "o"),
+ "p" : ("uan",),
+ "q" : ("ei",),
+ "r" : ("er", "iu"),
+ "s" : ("ong", "iong"),
+ "t" : ("iang","uang"),
+ "u" : ("u",),
+ "v" : ("v","ue"),
+ "w" : ("ian",),
+ "x" : ("ie",),
+ "y" : ("ing",),
+ "z" : ("iao",),
+}
+
+ZGPY_SHUANGPIN_SHENGMU_DICT = {
+ "a" : "ch", "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g",
+ "h" : "h", "i" : "sh","j" : "j", "k" : "k", "l" : "l",
+ "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q",
+ "r" : "r", "s" : "s", "t" : "t", "u" : "zh",
+ "w" : "w", "x" : "x", "y" : "y", "z" : "z"
+}
+
+ZGPY_SHUANGPIN_YUNMU_DICT = {
+ "a" : ("a", ),
+ "b" : ("iao", ),
+ "d" : ("ie", ),
+ "e" : ("e", ),
+ "f" : ("ian", ),
+ "g" : ("iang", "uang"),
+ "h" : ("ong", "iong"),
+ "i" : ("i", ),
+ "j" : ("er", "iu"),
+ "k" : ("ei", ),
+ "l" : ("uan", ),
+ "m" : ("un", ),
+ "n" : ("ue", "ui"),
+ "o" : ("uo", "o"),
+ "p" : ("ai", ),
+ "q" : ("ao", ),
+ "r" : ("an", ),
+ "s" : ("ang", ),
+ "t" : ("eng", "ng"),
+ "u" : ("u", ),
+ "v" : ("v", ),
+ "w" : ("en", ),
+ "x" : ("ia", "ua"),
+ "y" : ("in", "uai"),
+ "z" : ("ou" ,),
+ ";" : ("ing", )
+}
+
+PYJJ_SHUANGPIN_SHENGMU_DICT = {
+ "a" : "'", "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g",
+ "h" : "h", "i" : "sh","j" : "j", "k" : "k", "l" : "l",
+ "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q",
+ "r" : "r", "s" : "s", "t" : "t", "u" : "ch","v" : "zh",
+ "w" : "w", "x" : "x", "y" : "y", "z" : "z"
+}
+
+PYJJ_SHUANGPIN_YUNMU_DICT = {
+ "a" : ("a",),
+ "b" : ("ia","ua"),
+ "c" : ("uan",),
+ "d" : ("ao", ),
+ "e" : ("e",),
+ "f" : ("an",),
+ "g" : ("ang",),
+ "h" : ("iang","uang"),
+ "i" : ("i",),
+ "j" : ("ian",),
+ "k" : ("iao",),
+ "l" : ("in",),
+ "m" : ("ie",),
+ "n" : ("iu",),
+ "o" : ("uo", "o"),
+ "p" : ("ou",),
+ "q" : ("er","ing"),
+ "r" : ("en", ),
+ "s" : ("ai", ),
+ "t" : ("eng", "ng"),
+ "u" : ("u",),
+ "v" : ("v","ui"),
+ "w" : ("ei",),
+ "x" : ("uai","ue"),
+ "y" : ("ong","iong"),
+ "z" : ("un",),
+}
+
+XHE_SHUANGPIN_SHENGMU_DICT = {
+ "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g",
+ "h" : "h", "i" : "ch", "j" : "j", "k" : "k", "l" : "l",
+ "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q",
+ "r" : "r", "s" : "s", "t" : "t", "u" : "sh", "v" : "zh",
+ "w" : "w", "x" : "x", "y" : "y", "z" : "z",
+ "a" : "'", "e" : "'"
+}
+
+XHE_SHUANGPIN_YUNMU_DICT = {
+ "a" : ("a",),
+ "b" : ("in",),
+ "c" : ("ao",),
+ "d" : ("ai",),
+ "e" : ("e",),
+ "f" : ("en",),
+ "g" : ("eng", "ng"),
+ "h" : ("ang",),
+ "i" : ("i",),
+ "j" : ("an",),
+ "k" : ("uai", "ing"),
+ "l" : ("iang", "uang"),
+ "m" : ("ian",),
+ "n" : ("iao",),
+ "o" : ("uo", "o"),
+ "p" : ("ie",),
+ "q" : ("iu",),
+ "r" : ("uan", "er"),
+ "s" : ("ong", "iong"),
+ "t" : ("ue",),
+ "u" : ("u",),
+ "v" : ("v", "ui"),
+ "w" : ("ei",),
+ "x" : ("ia", "ua"),
+ "y" : ("un",),
+ "z" : ("ou",),
+}
+
+SHUANGPIN_SCHEMAS = {
+ N_("MSPY") : (MSPY_SHUANGPIN_SHENGMU_DICT, MSPY_SHUANGPIN_YUNMU_DICT),
+ N_("ZRM") : (ZRM_SHUANGPIN_SHENGMU_DICT, ZRM_SHUANGPIN_YUNMU_DICT),
+ N_("ABC") : (ABC_SHUANGPIN_SHENGMU_DICT, ABC_SHUANGPIN_YUNMU_DICT),
+ N_("ZGPY") : (ZGPY_SHUANGPIN_SHENGMU_DICT, ZGPY_SHUANGPIN_YUNMU_DICT),
+ N_("PYJJ") : (PYJJ_SHUANGPIN_SHENGMU_DICT, PYJJ_SHUANGPIN_YUNMU_DICT),
+ N_("XHE") : (XHE_SHUANGPIN_SHENGMU_DICT, XHE_SHUANGPIN_YUNMU_DICT),
+}
+
diff --git a/scripts/pinyin_parser_table.h.in b/scripts/pinyin_parser_table.h.in
new file mode 100644
index 0000000..2f98e0e
--- /dev/null
+++ b/scripts/pinyin_parser_table.h.in
@@ -0,0 +1,34 @@
+#ifndef PINYIN_PARSER_TABLE_H
+#define PINYIN_PARSER_TABLE_H
+
+namespace pinyin{
+
+const pinyin_index_item_t pinyin_index[] = {
+@PINYIN_INDEX@
+};
+
+const chewing_index_item_t chewing_index[] = {
+@BOPOMOFO_INDEX@
+};
+
+const content_table_item_t content_table[] = {
+@CONTENT_TABLE@
+};
+
+const divided_table_item_t divided_table[] = {
+@DIVIDED_TABLE@
+};
+
+const resplit_table_item_t resplit_table[] = {
+@RESPLIT_TABLE@
+};
+
+const gint chewing_key_table[CHEWING_NUMBER_OF_INITIALS *
+ CHEWING_NUMBER_OF_MIDDLES *
+ CHEWING_NUMBER_OF_FINALS] = {
+@TABLE_INDEX@
+};
+
+};
+
+#endif
diff --git a/scripts/pinyintable.py b/scripts/pinyintable.py
new file mode 100644
index 0000000..bddf2dc
--- /dev/null
+++ b/scripts/pinyintable.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+import pinyin
+import bopomofo
+import chewing
+import itertools
+from correct import *
+
+
+pinyin_list = sorted(bopomofo.PINYIN_BOPOMOFO_MAP.keys())
+shengmu_list = sorted(pinyin.SHENGMU_LIST)
+
+
+def check_pinyin_chewing_map():
+ for pinyin_key in pinyin.PINYIN_DICT.keys():
+ if pinyin_key in pinyin_list:
+ pass
+ else:
+ print("pinyin %s has no chewing mapping", pinyin_key)
+
+
+def get_chewing(pinyin_key):
+ initial, middle, final = \
+ 'CHEWING_ZERO_INITIAL', 'CHEWING_ZERO_MIDDLE', 'CHEWING_ZERO_FINAL'
+ assert pinyin_key != None
+ assert pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP
+
+ #handle 'w' and 'y'
+ if pinyin_key[0] == 'w':
+ initial = 'PINYIN_W'
+ if pinyin_key[0] == 'y':
+ initial = 'PINYIN_Y'
+
+ #get chewing string
+ bopomofo_str = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
+
+ #handle bopomofo SPECIAL_INITIAL_SET
+ if pinyin_key in bopomofo.SPECIAL_INITIAL_SET:
+ middle = "CHEWING_I"
+ #normal process
+ for char in bopomofo_str:
+ if char in chewing.CHEWING_ASCII_INITIAL_MAP:
+ initial = chewing.CHEWING_ASCII_INITIAL_MAP[char]
+ if char in chewing.CHEWING_ASCII_MIDDLE_MAP:
+ middle = chewing.CHEWING_ASCII_MIDDLE_MAP[char]
+ if char in chewing.CHEWING_ASCII_FINAL_MAP:
+ final = chewing.CHEWING_ASCII_FINAL_MAP[char]
+ if char == "ㄜ": # merge "ㄝ" and "ㄜ"
+ final = "CHEWING_E"
+
+ post_process_rules = {
+ #handle "ueng"/"ong"
+ ("CHEWING_U", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ONG"),
+ #handle "veng"/"iong"
+ ("CHEWING_V", "CHEWING_ENG"): ("CHEWING_I", "PINYIN_ONG"),
+ #handle "ien"/"in"
+ ("CHEWING_I", "CHEWING_EN"): ("CHEWING_ZERO_MIDDLE", "PINYIN_IN"),
+ #handle "ieng"/"ing"
+ ("CHEWING_I", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ING"),
+ }
+
+ if (middle, final) in post_process_rules:
+ (middle, final) = post_process_rules[(middle, final)]
+
+ return initial, middle, final
+
+
+def gen_pinyin_list():
+ for p in itertools.chain(gen_pinyins(),
+ gen_shengmu(),
+ gen_corrects(),
+ gen_u_to_v(),
+ ):
+ yield p
+
+
+def gen_pinyins():
+ #generate all pinyins in bopomofo
+ for pinyin_key in pinyin_list:
+ flags = []
+ if pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP.keys():
+ flags.append("IS_CHEWING")
+ if pinyin_key in pinyin.PINYIN_LIST or \
+ pinyin_key in pinyin.SHENGMU_LIST:
+ flags.append("IS_PINYIN")
+ if pinyin_key in shengmu_list:
+ flags.append("PINYIN_INCOMPLETE")
+ chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
+ if chewing_key in chewing.CHEWING_ASCII_INITIAL_MAP and \
+ pinyin_key not in bopomofo.SPECIAL_INITIAL_SET:
+ flags.append("CHEWING_INCOMPLETE")
+ yield pinyin_key, pinyin_key, chewing_key, \
+ flags, get_chewing(pinyin_key)
+
+
+def get_shengmu_chewing(shengmu):
+ assert shengmu in shengmu_list, "Expected shengmu here."
+ chewing_key = 'CHEWING_{0}'.format(shengmu.upper())
+ if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP:
+ initial = chewing_key
+ else:
+ initial = 'PINYIN_{0}'.format(shengmu.upper())
+ return initial, "CHEWING_ZERO_MIDDLE", "CHEWING_ZERO_FINAL"
+
+def gen_shengmu():
+ #generate all shengmu
+ for shengmu in shengmu_list:
+ if shengmu in pinyin_list:
+ continue
+ flags = ["IS_PINYIN", "PINYIN_INCOMPLETE"]
+ chewing_key = get_shengmu_chewing(shengmu)
+ chewing_initial = chewing_key[0]
+ if chewing_initial in chewing.ASCII_CHEWING_INITIAL_MAP:
+ chewing_initial = chewing.ASCII_CHEWING_INITIAL_MAP[chewing_initial]
+ yield shengmu, shengmu, chewing_initial, \
+ flags, chewing_key
+
+
+def gen_corrects():
+ #generate corrections
+ for correct, wrong in auto_correct:
+ flags = ['IS_PINYIN', 'PINYIN_CORRECT_{0}_{1}'.format(wrong.upper(),
+ correct.upper())]
+ for pinyin_key in pinyin_list:
+ #fixes partial pinyin instead of the whole pinyin
+ if pinyin_key.endswith(correct) and pinyin_key != correct:
+ chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
+ new_pinyin_key = pinyin_key.replace(correct, wrong)
+ yield pinyin_key, new_pinyin_key, chewing_key,\
+ flags, get_chewing(pinyin_key)
+
+
+def gen_u_to_v():
+ #generate U to V
+ for correct, wrong, flags in auto_correct_ext:
+ #over-ride flags
+ flags = ['IS_PINYIN', 'PINYIN_CORRECT_V_U']
+ pinyin_key = correct
+ chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
+ yield correct, wrong, chewing_key, flags, get_chewing(pinyin_key)
+
+### main function ###
+if __name__ == "__main__":
+ #pre-check here
+ check_pinyin_chewing_map()
+
+ #dump
+ for p in gen_pinyin_list():
+ print (p)
diff --git a/scripts/specials.txt b/scripts/specials.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scripts/specials.txt
diff --git a/scripts/specialtable.py b/scripts/specialtable.py
new file mode 100644
index 0000000..b6fb680
--- /dev/null
+++ b/scripts/specialtable.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+import os
+import sys
+import math
+import pinyin
+
+pinyin_list = sorted(pinyin.PINYIN_LIST)
+shengmu_list = sorted(pinyin.SHENGMU_LIST)
+yunmu_list = sorted(pinyin.YUNMU_LIST)
+
+phrase_dict = {}
+
+
+def load_phrase(filename):
+ phrasefile = open(filename, "r")
+ for line in phrasefile.readlines():
+ line = line.rstrip(os.linesep)
+ (pinyin_str, freq) = line.split(None, 1)
+ freq = int(freq)
+ if 0 == freq:
+ #print(pinyin_str)
+ continue
+
+ # no duplicate here
+ if "'" in pinyin_str:
+ (first_key, second_key) = pinyin_str.split("'")
+ phrase_dict[(first_key, second_key)] = freq
+ else:
+ phrase_dict[pinyin_str] = freq
+ phrasefile.close()
+
+
+def gen_all_divided():
+ for pinyin_key in pinyin_list:
+ for first_key in pinyin_list:
+ if len(pinyin_key) <= len(first_key):
+ continue
+ if not pinyin_key.startswith(first_key):
+ continue
+ second_key = pinyin_key[len(first_key):]
+ if second_key in pinyin_list:
+ yield pinyin_key, first_key, second_key
+
+
+def filter_divided():
+ for (pinyin_key, first_key, second_key) in gen_all_divided():
+ if not (first_key, second_key) in phrase_dict:
+ continue
+ orig_freq = 0
+ if pinyin_key in phrase_dict:
+ orig_freq = phrase_dict[pinyin_key]
+ new_freq = phrase_dict[(first_key, second_key)]
+ yield pinyin_key, orig_freq, first_key, second_key, new_freq
+
+
+def gen_all_resplit():
+ for pinyin_key in pinyin_list:
+ if pinyin_key[-1] in ["n", "g", "r"]:
+ for yun in yunmu_list:
+ if yun not in pinyin_list:
+ continue
+ #check first new pinyin key
+ if not pinyin_key[:-1] in pinyin_list:
+ continue
+ #check second new pinyin key
+ new_pinyin_key = pinyin_key[-1] + yun
+ if new_pinyin_key in pinyin_list:
+ yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key
+'''
+ elif pinyin_key[-1] in ["e"]:
+ #check first new pinyin key
+ if pinyin_key[:-1] in pinyin_list:
+ yield pinyin_key, "r", pinyin_key[:-1], "er"
+'''
+
+
+def filter_resplit():
+ for (orig_first_key, orig_second_key, new_first_key, new_second_key) \
+ in gen_all_resplit():
+ #do the reverse here, as libpinyin pinyin parser is different with
+ #ibus-pinyin's parser.
+ (orig_first_key, orig_second_key, new_first_key, new_second_key) = \
+ (new_first_key, new_second_key, orig_first_key, orig_second_key)
+ if (new_first_key, new_second_key) not in phrase_dict:
+ continue
+ orig_freq = 0
+ new_freq = phrase_dict[(new_first_key, new_second_key)]
+ if (orig_first_key, orig_second_key) in phrase_dict:
+ orig_freq = phrase_dict[(orig_first_key, orig_second_key)]
+ yield orig_first_key, orig_second_key, orig_freq, \
+ new_first_key, new_second_key, new_freq
+
+
+#init code
+load_phrase("pinyins.txt")
+load_phrase("specials.txt")
+
+if __name__ == "__main__":
+ for p in filter_divided():
+ print (p)
+ for p in filter_resplit():
+ print (p)
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000..01bdbc7
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+import os
+
+header = '''/* This file is generated by python scripts. Don't edit this file directly.
+ */
+'''
+
+def expand_file(filename, get_table_content):
+ infile = open(filename, "r")
+ print(header)
+ for line in infile.readlines():
+ line = line.rstrip(os.linesep)
+ if len(line) < 3 :
+ print(line)
+ continue
+ if line[0] == '@' and line[-1] == '@':
+ tablename = line[1:-1]
+ print(get_table_content(tablename))
+ else:
+ print(line)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..4e0b09f
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,50 @@
+set(
+ LIBPINYIN_HEADERS
+ pinyin.h
+)
+
+set(
+ LIBPINYIN_SOURCES
+ pinyin.cpp
+)
+
+add_library(
+ libpinyin
+ SHARED
+ ${LIBPINYIN_SOURCES}
+)
+
+target_link_libraries(
+ libpinyin
+ storage
+ lookup
+)
+
+set_target_properties(
+ libpinyin
+ PROPERTIES
+ OUTPUT_NAME
+ pinyin
+ VERSION
+ 0.0.0
+ SOVERSION
+ 0
+)
+
+install(
+ TARGETS
+ libpinyin
+ LIBRARY DESTINATION
+ ${DIR_LIBRARY}
+)
+
+install(
+ FILES
+ ${LIBPINYIN_HEADERS}
+ DESTINATION
+ ${DIR_INCLUDE_LIBPINYIN}
+)
+
+add_subdirectory(include)
+add_subdirectory(storage)
+add_subdirectory(lookup)
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..5600c86
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,59 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+AUTOMAKE_OPTIONS = gnu
+SUBDIRS = include storage lookup
+
+EXTRA_DIST = libpinyin.ver
+
+MAINTAINERCLEANFILES = Makefile.in
+
+CLEANFILES = *.bak
+
+ACLOCAL = aclocal -I $(ac_aux_dir)
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ @GLIB2_CFLAGS@
+
+libpinyinincludedir = $(includedir)/libpinyin-@VERSION@
+
+libpinyininclude_HEADERS= pinyin.h
+
+noinst_HEADERS = pinyin_internal.h
+
+lib_LTLIBRARIES = libpinyin.la
+
+noinst_LTLIBRARIES = libpinyin_internal.la
+
+libpinyin_la_SOURCES = pinyin.cpp
+
+libpinyin_la_LIBADD = storage/libstorage.la lookup/liblookup.la @GLIB2_LIBS@
+
+libpinyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libpinyin.ver \
+ -version-info @LT_VERSION_INFO@
+
+libpinyin_internal_la_SOURCES = pinyin_internal.cpp
+
+libpinyin_internal_la_LIBADD = storage/libstorage.la lookup/liblookup.la
+
+
+## Note:
+## As the libpinyin internal interface will change, only a static library is
+## provided, to catch errors at compile time rather than at run time.
diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt
new file mode 100644
index 0000000..60d7d4c
--- /dev/null
+++ b/src/include/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(
+ LIBPINYIN_INCLUDE_HEADERS
+ novel_types.h
+)
+
+install(
+ FILES
+ ${LIBPINYIN_INCLUDE_HEADERS}
+ DESTINATION
+ ${DIR_INCLUDE_LIBPINYIN}
+)
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
new file mode 100644
index 0000000..a779d97
--- /dev/null
+++ b/src/include/Makefile.am
@@ -0,0 +1,25 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+libpinyinincludedir = $(includedir)/libpinyin-@VERSION@
+
+libpinyininclude_HEADERS= novel_types.h
+
+noinst_HEADERS = memory_chunk.h \
+ stl_lite.h
diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h
new file mode 100644
index 0000000..7b315af
--- /dev/null
+++ b/src/include/memory_chunk.h
@@ -0,0 +1,413 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef MEMORY_CHUNK_H
+#define MEMORY_CHUNK_H
+
+#include <config.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+#include "stl_lite.h"
+
+namespace pinyin{
+
+/* for unmanaged mode
+ * m_free_func == free, when memory is allocated by malloc
+ * m_free_func == munmap, when memory is allocated by mmap
+ * m_free_func == NULL,
+ *     when memory is a small portion of an allocated area
+ * m_free_func == other,
+ * malloc then free.
+ */
+
+/**
+ * MemoryChunk:
+ *
+ * The utility to manage the memory chunks.
+ *
+ */
+
+class MemoryChunk{
+ typedef void (* free_func_t)(...);
+private:
+ char * m_data_begin;
+    char * m_data_end; //one past the end of the data.
+    char * m_allocated; //one past the end of the allocated area.
+ free_func_t m_free_func;
+
+private:
+ void freemem(){
+ if ((free_func_t)free == m_free_func)
+ free(m_data_begin);
+#ifdef HAVE_MMAP
+ else if ((free_func_t)munmap == m_free_func)
+ munmap(m_data_begin, capacity());
+#endif
+ else
+ assert(FALSE);
+ }
+
+
+ void reset(){
+ if (m_free_func)
+ freemem();
+
+ m_data_begin = NULL;
+ m_data_end = NULL;
+ m_allocated = NULL;
+ m_free_func = NULL;
+ }
+
+ void ensure_has_space(size_t new_size){
+ int delta_size = m_data_begin + new_size - m_data_end;
+ if ( delta_size <= 0 ) return;
+ ensure_has_more_space ( delta_size );
+ }
+
+ /* enlarge function */
+ void ensure_has_more_space(size_t extra_size){
+ if ( 0 == extra_size ) return;
+ size_t newsize;
+ size_t cursize = size();
+ if ( m_free_func != (free_func_t)free ) {
+ /* copy on resize */
+ newsize = cursize + extra_size;
+ /* do the copy */
+ char * tmp = (char *) malloc(newsize);
+ assert(tmp);
+ memset(tmp, 0, newsize);
+ memmove(tmp, m_data_begin, cursize);
+ /* free the origin memory */
+ if (m_free_func)
+ freemem();
+            /* change variables */
+ m_data_begin = tmp;
+ m_data_end = m_data_begin + cursize;
+ m_allocated = m_data_begin + newsize;
+ m_free_func = (free_func_t)free;
+ return;
+ }
+ /* the memory area is managed by this memory chunk */
+ if ( extra_size <= (size_t) (m_allocated - m_data_end))
+ return;
+ newsize = std_lite::max( capacity()<<1, cursize + extra_size);
+ m_data_begin = (char *) realloc(m_data_begin, newsize);
+ assert(m_data_begin);
+ memset(m_data_begin + cursize, 0, newsize - cursize);
+ m_data_end = m_data_begin + cursize;
+ m_allocated = m_data_begin + newsize;
+ return;
+ }
+
+public:
+ /**
+ * MemoryChunk::MemoryChunk:
+ *
+ * The constructor of the MemoryChunk.
+ *
+ */
+ MemoryChunk(){
+ m_data_begin = NULL;
+ m_data_end = NULL;
+ m_allocated = NULL;
+ m_free_func = NULL;
+ }
+
+ /**
+ * MemoryChunk::~MemoryChunk:
+ *
+ * The destructor of the MemoryChunk.
+ *
+ */
+ ~MemoryChunk(){
+ reset();
+ }
+
+ /**
+ * MemoryChunk::begin:
+ *
+ * Read access method, to get the begin of the MemoryChunk.
+ *
+ */
+ void* begin() const{
+ return m_data_begin;
+ }
+
+ /**
+ * MemoryChunk::end:
+ *
+     * Read access method, to get the end of the content in the MemoryChunk.
+ *
+ */
+ void* end() const{
+ return m_data_end;
+ }
+
+ /**
+ * MemoryChunk::size:
+ *
+ * Get the size of the content in the MemoryChunk.
+ *
+ */
+ size_t size() const{
+ return m_data_end - m_data_begin;
+ }
+
+ /**
+ * MemoryChunk::set_size:
+ *
+ * Set the size of the content in the MemoryChunk.
+ *
+ */
+ void set_size(size_t newsize){
+ ensure_has_space(newsize);
+ m_data_end = m_data_begin + newsize;
+ }
+
+ /**
+ * MemoryChunk::capacity:
+ *
+ * Get the capacity of the MemoryChunk.
+ *
+ */
+ size_t capacity(){
+ return m_allocated - m_data_begin;
+ }
+
+ /**
+ * MemoryChunk::set_chunk:
+ * @begin: the begin of the data
+ * @length: the length of the data
+ * @free_func: the function to free the data
+ *
+     * Transfer management of a memory chunk allocated by another part of the
+     * system to this MemoryChunk.
+ *
+ */
+ void set_chunk(void* begin, size_t length, free_func_t free_func){
+ if (m_free_func)
+ freemem();
+
+ m_data_begin = (char *) begin;
+ m_data_end = (char *) m_data_begin + length;
+ m_allocated = (char *) m_data_begin + length;
+ m_free_func = free_func;
+ }
+
+ /**
+ * MemoryChunk::get_sub_chunk:
+ * @offset: the offset in this MemoryChunk.
+ * @length: the data length to be retrieved.
+ * @returns: the newly allocated MemoryChunk.
+ *
+ * Get a sub MemoryChunk from this MemoryChunk.
+ *
+     * Note: uses set_chunk internally;
+     * the returned chunk needs to be deleted by the caller.
+ *
+ */
+ MemoryChunk * get_sub_chunk(size_t offset, size_t length){
+ MemoryChunk * retval = new MemoryChunk();
+ char * begin_pos = m_data_begin + offset;
+ retval->set_chunk(begin_pos, length, NULL);
+ return retval;
+ }
+
+ /**
+ * MemoryChunk::set_content:
+ * @offset: the offset in this MemoryChunk.
+ * @data: the begin of the data to be copied.
+ * @len: the length of the data to be copied.
+ * @returns: whether the data is copied successfully.
+ *
+ * Data are written directly to the memory area in this MemoryChunk.
+ *
+ */
+ bool set_content(size_t offset, const void * data, size_t len){
+ size_t cursize = std_lite::max(size(), offset + len);
+ ensure_has_space(offset + len);
+ memmove(m_data_begin + offset, data, len);
+ m_data_end = m_data_begin + cursize;
+ return true;
+ }
+
+ /**
+ * MemoryChunk::append_content:
+ * @data: the begin of the data to be copied.
+ * @len: the length of the data to be copied.
+ * @returns: whether the data is appended successfully.
+ *
+ * Data are appended at the end of the MemoryChunk.
+ *
+ */
+ bool append_content(const void * data, size_t len){
+ return set_content(size(), data, len);
+ }
+
+ /**
+ * MemoryChunk::insert_content:
+ * @offset: the offset in this MemoryChunk, which starts from zero.
+ * @data: the begin of the data to be copied.
+ * @length: the length of the data to be copied.
+ * @returns: whether the data is inserted successfully.
+ *
+ * Data are written to the memory area,
+     * the original content is moved towards the rear.
+ *
+ */
+ bool insert_content(size_t offset, const void * data, size_t length){
+ ensure_has_more_space(length);
+ size_t move_size = size() - offset;
+ memmove(m_data_begin + offset + length, m_data_begin + offset, move_size);
+ memmove(m_data_begin + offset, data, length);
+ m_data_end += length;
+ return true;
+ }
+
+ /**
+ * MemoryChunk::remove_content:
+ * @offset: the offset in this MemoryChunk.
+ * @length: the length of the removed content.
+ * @returns: whether the content is removed successfully.
+ *
+ * Data are removed directly,
+     * the following content is moved towards the front.
+ *
+ */
+ bool remove_content(size_t offset, size_t length){
+ size_t move_size = size() - offset - length;
+ memmove(m_data_begin + offset, m_data_begin + offset + length, move_size);
+ m_data_end -= length;
+ return true;
+ }
+
+ /**
+ * MemoryChunk::get_content:
+ * @offset: the offset in this MemoryChunk.
+ * @buffer: the buffer to retrieve the content.
+ * @length: the length of content to be retrieved.
+ * @returns: whether the content is retrieved.
+ *
+ * Get the content in this MemoryChunk.
+ *
+ */
+ bool get_content(size_t offset, void * buffer, size_t length){
+ if ( size() < offset + length )
+ return false;
+ memcpy( buffer, m_data_begin + offset, length);
+ return true;
+ }
+
+ /**
+ * MemoryChunk::compact_memory:
+ *
+ * Compact memory, reduce the size.
+ *
+ */
+ void compact_memory(){
+ if ( m_free_func != (free_func_t)free )
+ return;
+ size_t newsize = size();
+ m_data_begin = (char *) realloc(m_data_begin, newsize);
+ m_allocated = m_data_begin + newsize;
+ }
+
+ /**
+ * MemoryChunk::load:
+ * @filename: load the MemoryChunk from the filename.
+ * @returns: whether the load is successful.
+ *
+ * Load the content from the filename.
+ *
+ */
+ bool load(const char * filename){
+ /* free old data */
+ reset();
+
+ int fd = open(filename, O_RDONLY);
+ if (-1 == fd)
+ return false;
+
+ off_t file_size = lseek(fd, 0, SEEK_END);
+ lseek(fd, 0, SEEK_SET);
+
+ int data_len = file_size;
+
+#ifdef HAVE_MMAP
+ void* data = mmap(NULL, data_len, PROT_READ|PROT_WRITE, MAP_PRIVATE,
+ fd, 0);
+
+ if (MAP_FAILED == data) {
+ close(fd);
+ return false;
+ }
+
+ set_chunk(data, data_len, (free_func_t)munmap);
+#else
+ void* data = malloc(data_len);
+ if ( !data ){
+ close(fd);
+ return false;
+ }
+
+ data_len = read(fd, data, data_len);
+ set_chunk(data, data_len, (free_func_t)free);
+#endif
+
+ close(fd);
+ return true;
+ }
+
+ /**
+ * MemoryChunk::save:
+ * @filename: save this MemoryChunk to the filename.
+ * @returns: whether the save is successful.
+ *
+ * Save the content to the filename.
+ *
+ */
+ bool save(const char * filename){
+ int fd = open(filename, O_CREAT|O_WRONLY|O_TRUNC, 0644);
+ if ( -1 == fd )
+ return false;
+
+ size_t data_len = write(fd, begin(), size());
+ if ( data_len != size()){
+ close(fd);
+ return false;
+ }
+
+ fsync(fd);
+ close(fd);
+ return true;
+ }
+};
+
+};
+
+#endif
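
A minimal usage sketch of the MemoryChunk class above, assuming memory_chunk.h and config.h are on the include path; the file name chunk.bin is only an illustration. The chunk either owns its buffer (malloc/realloc, or mmap when HAVE_MMAP is defined) or wraps memory owned elsewhere, and the content accessors grow the buffer on demand:

    #include "memory_chunk.h"
    #include <stdio.h>

    using namespace pinyin;

    int main()
    {
        MemoryChunk chunk;

        /* append two integers; the chunk reallocates as needed. */
        int first = 42, second = 7;
        chunk.append_content(&first, sizeof(first));
        chunk.append_content(&second, sizeof(second));

        /* read the second integer back from offset sizeof(int). */
        int value = 0;
        chunk.get_content(sizeof(int), &value, sizeof(value));
        printf("value = %d, size = %ld\n", value, (long) chunk.size());

        /* persist and reload the buffer; load() prefers mmap when available. */
        chunk.save("chunk.bin");
        MemoryChunk copy;
        copy.load("chunk.bin");
        printf("reloaded size = %ld\n", (long) copy.size());

        return 0;
    }
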
diff --git a/src/include/novel_types.h b/src/include/novel_types.h
new file mode 100644
index 0000000..88c063c
--- /dev/null
+++ b/src/include/novel_types.h
@@ -0,0 +1,155 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This header file contains novel types designed for pinyin processing.
+ */
+
+
+#ifndef NOVEL_TYPES_H
+#define NOVEL_TYPES_H
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+typedef guint32 phrase_token_t;
+typedef gunichar ucs4_t;
+
+/*
+ * Phrase Index Library Definition
+ * Reserve 4-bits for future usage.
+ */
+
+#define PHRASE_MASK 0x00FFFFFF
+#define PHRASE_INDEX_LIBRARY_MASK 0x0F000000
+#define PHRASE_INDEX_LIBRARY_COUNT (1<<4)
+#define PHRASE_INDEX_LIBRARY_INDEX(token) ((token&PHRASE_INDEX_LIBRARY_MASK)>>24)
+#define PHRASE_INDEX_MAKE_TOKEN(phrase_index, token) \
+ ( ( (phrase_index<<24) & PHRASE_INDEX_LIBRARY_MASK)|(token & PHRASE_MASK))
+
+
+/*
+ * PhraseIndexRanges definitions
+ */
+
+struct PhraseIndexRange{
+ phrase_token_t m_range_begin;
+    phrase_token_t m_range_end; /* one past the last item, STL-style */
+};
+
+/* Array of PhraseIndexRange */
+typedef GArray * PhraseIndexRanges[PHRASE_INDEX_LIBRARY_COUNT];
+/* Array of Token */
+typedef GArray * PhraseTokens[PHRASE_INDEX_LIBRARY_COUNT];
+
+
+/*
+ * PinYin Table Definition
+ */
+
+
+/* For both PinYin Table and Phrase Table */
+enum SearchResult{
+ SEARCH_NONE = 0x00, /* found nothing */
+ SEARCH_OK = 0x01 , /* found items */
+ SEARCH_CONTINUED = 0x02 /* has longer word in the storage to search */
+};
+
+/* For Phrase Index */
+enum ErrorResult{
+ ERROR_OK = 0, /* operate ok */
+ ERROR_INSERT_ITEM_EXISTS, /* item already exists */
+    ERROR_REMOVE_ITEM_DONOT_EXISTS, /* item does not exist */
+ ERROR_PHRASE_TOO_LONG, /* the phrase is too long */
+ ERROR_NO_SUB_PHRASE_INDEX, /* sub phrase index is not loaded */
+ ERROR_NO_ITEM, /* item has a null slot */
+ ERROR_OUT_OF_RANGE, /* beyond the end of the sub phrase index */
+ ERROR_FILE_CORRUPTION, /* file is corrupted */
+    ERROR_INTEGER_OVERFLOW, /* integer overflow */
+ ERROR_ALREADY_EXISTS, /* the sub phrase already exists. */
+ ERROR_NO_USER_TABLE /* the user table is not loaded. */
+};
+
+/* For N-gram */
+enum ATTACH_FLAG{
+ ATTACH_READONLY = 1,
+ ATTACH_READWRITE = 0x1 << 1,
+ ATTACH_CREATE = 0x1 << 2,
+};
+
+/*
+ * n-gram Definition
+ * no B parameter (there are duplicated items in uni-gram and bi-gram)
+ * used in both the system n-gram and the user n-gram.
+ * uses the delta technique.
+ */
+
+struct BigramPhraseItem{
+ phrase_token_t m_token;
+ gfloat m_freq; /* P(W2|W1) */
+};
+
+struct BigramPhraseItemWithCount{
+ phrase_token_t m_token;
+ guint32 m_count;
+ gfloat m_freq; /* P(W2|W1) */
+};
+
+typedef GArray * BigramPhraseArray; /* Array of BigramPhraseItem */
+typedef GArray * BigramPhraseWithCountArray; /* Array of BigramPhraseItemWithCount */
+
+#define MAX_PHRASE_LENGTH 16
+
+const phrase_token_t null_token = 0;
+const phrase_token_t sentence_start = 1;
+const phrase_token_t token_min = 0;
+const phrase_token_t token_max = UINT_MAX;
+
+const char c_separate = '#';
+typedef guint32 table_offset_t;
+
+typedef double parameter_t;
+
+/* Array of ChewingKey/ChewingKeyRest */
+typedef GArray * ChewingKeyVector;
+typedef GArray * ChewingKeyRestVector;
+
+/* Array of phrase_token_t */
+typedef GArray * TokenVector;
+typedef TokenVector MatchResults;
+
+/* Array of lookup_constraint_t */
+typedef GArray * CandidateConstraints;
+
+typedef guint32 pinyin_option_t;
+
+typedef enum {
+ RESERVED = 0,
+ GB_DICTIONARY = 1,
+ GBK_DICTIONARY = 2,
+ MERGED_DICTIONARY = 3,
+ USER_DICTIONARY = 15
+} PHRASE_INDEX_LIBRARIES;
+
+G_END_DECLS
+
+#endif
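
A phrase_token_t packs a 4-bit phrase index library number into bits 24-27 (PHRASE_INDEX_LIBRARY_MASK) and the phrase index itself into the low 24 bits (PHRASE_MASK), which is how tokens from GB_DICTIONARY, USER_DICTIONARY and the other libraries are kept apart. A small sketch of the packing macros, assuming novel_types.h and GLib are available; the index value 1234 is arbitrary:

    #include "novel_types.h"
    #include <stdio.h>

    int main()
    {
        /* pack phrase index 1234 of the user dictionary into one token. */
        phrase_token_t token = PHRASE_INDEX_MAKE_TOKEN(USER_DICTIONARY, 1234);

        /* unpack the library number and the phrase index again. */
        guint32 library = PHRASE_INDEX_LIBRARY_INDEX(token);
        guint32 index = token & PHRASE_MASK;

        printf("token = 0x%08x, library = %u, index = %u\n",
               token, library, index);
        return 0;
    }
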
diff --git a/src/include/stl_lite.h b/src/include/stl_lite.h
new file mode 100644
index 0000000..5ad977d
--- /dev/null
+++ b/src/include/stl_lite.h
@@ -0,0 +1,45 @@
+#ifndef STL_LITE_H
+#define STL_LITE_H
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+namespace std_lite{
+
+ /**
+ * To restrict the usage of STL functions in libpinyin,
+ * all needed functions should be imported here.
+ */
+
+
+ using std::min;
+
+
+ using std::max;
+
+
+ using std::pair;
+
+
+ using std::make_pair;
+
+
+ using std::lower_bound;
+
+
+ using std::upper_bound;
+
+
+ using std::equal_range;
+
+
+ using std::make_heap;
+
+
+ using std::pop_heap;
+
+
+}
+#endif
diff --git a/src/libpinyin.ver b/src/libpinyin.ver
new file mode 100644
index 0000000..1b6cc4b
--- /dev/null
+++ b/src/libpinyin.ver
@@ -0,0 +1,58 @@
+LIBPINYIN {
+ global:
+ pinyin_init;
+ pinyin_save;
+ pinyin_set_double_pinyin_scheme;
+ pinyin_set_chewing_scheme;
+ pinyin_load_phrase_library;
+ pinyin_unload_phrase_library;
+ pinyin_begin_add_phrases;
+ pinyin_iterator_add_phrase;
+ pinyin_end_add_phrases;
+ pinyin_fini;
+ pinyin_mask_out;
+ pinyin_set_options;
+ pinyin_alloc_instance;
+ pinyin_free_instance;
+ pinyin_guess_sentence;
+ pinyin_guess_sentence_with_prefix;
+ pinyin_phrase_segment;
+ pinyin_get_sentence;
+ pinyin_parse_full_pinyin;
+ pinyin_parse_more_full_pinyins;
+ pinyin_parse_double_pinyin;
+ pinyin_parse_more_double_pinyins;
+ pinyin_parse_chewing;
+ pinyin_parse_more_chewings;
+ pinyin_in_chewing_keyboard;
+ pinyin_guess_candidates;
+ pinyin_guess_full_pinyin_candidates;
+ pinyin_choose_candidate;
+ pinyin_clear_constraint;
+ pinyin_lookup_tokens;
+ pinyin_train;
+ pinyin_reset;
+ pinyin_get_chewing_string;
+ pinyin_get_pinyin_string;
+ pinyin_get_pinyin_strings;
+ pinyin_token_get_phrase;
+ pinyin_token_get_n_pronunciation;
+ pinyin_token_get_nth_pronunciation;
+ pinyin_token_get_unigram_frequency;
+ pinyin_token_add_unigram_frequency;
+ pinyin_get_n_candidate;
+ pinyin_get_candidate;
+ pinyin_get_candidate_type;
+ pinyin_get_candidate_string;
+ pinyin_get_n_pinyin;
+ pinyin_get_pinyin_key;
+ pinyin_get_pinyin_key_rest;
+ pinyin_get_pinyin_key_rest_positions;
+ pinyin_get_pinyin_key_rest_length;
+ pinyin_get_raw_full_pinyin;
+ pinyin_get_n_phrase;
+ pinyin_get_phrase_token;
+
+ local:
+ *;
+};
diff --git a/src/lookup/CMakeLists.txt b/src/lookup/CMakeLists.txt
new file mode 100644
index 0000000..937b2cb
--- /dev/null
+++ b/src/lookup/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(
+ CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC"
+)
+
+set(
+ LIBLOOKUP_SOURCES
+ pinyin_lookup2.cpp
+ phrase_lookup.cpp
+ lookup.cpp
+)
+
+add_library(
+ lookup
+ STATIC
+ ${LIBLOOKUP_SOURCES}
+)
+
+install(
+ FILES
+ ${LIBLOOKUP_HEADERS}
+ DESTINATION
+ ${DIR_INCLUDE_LIBPINYIN}
+)
diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am
new file mode 100644
index 0000000..00d7df4
--- /dev/null
+++ b/src/lookup/Makefile.am
@@ -0,0 +1,36 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+INCLUDES = -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ @GLIB2_CFLAGS@
+
+noinst_HEADERS = lookup.h \
+ pinyin_lookup2.h \
+ phrase_lookup.h
+
+noinst_LTLIBRARIES = liblookup.la
+
+liblookup_la_CXXFLAGS = "-fPIC"
+
+liblookup_la_LDFLAGS = -static
+
+liblookup_la_SOURCES = pinyin_lookup2.cpp \
+ phrase_lookup.cpp \
+ lookup.cpp
diff --git a/src/lookup/lookup.cpp b/src/lookup/lookup.cpp
new file mode 100644
index 0000000..c32a0ec
--- /dev/null
+++ b/src/lookup/lookup.cpp
@@ -0,0 +1,73 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "lookup.h"
+#include "phrase_index.h"
+
+namespace pinyin{
+
+bool convert_to_utf8(FacadePhraseIndex * phrase_index,
+ MatchResults match_results,
+ /* in */ const char * delimiter,
+ /* in */ bool show_tokens,
+ /* out */ char * & result_string){
+ //init variables
+ if ( NULL == delimiter )
+ delimiter = "";
+ result_string = NULL;
+
+ PhraseItem item;
+
+ for ( size_t i = 0; i < match_results->len; ++i ){
+ phrase_token_t token = g_array_index
+ (match_results, phrase_token_t, i);
+ if ( null_token == token )
+ continue;
+
+ phrase_index->get_phrase_item(token, item);
+ ucs4_t buffer[MAX_PHRASE_LENGTH];
+ item.get_phrase_string(buffer);
+
+ guint8 length = item.get_phrase_length();
+ gchar * phrase = NULL;
+ char * tmp = NULL;
+
+ if (show_tokens) {
+ tmp = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
+ phrase = g_strdup_printf("%d %s", token, tmp);
+ g_free(tmp);
+ } else {
+ phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
+ }
+
+ tmp = result_string;
+ if ( NULL == result_string )
+ result_string = g_strdup(phrase);
+ else
+ result_string = g_strconcat(result_string, delimiter, phrase, NULL);
+ g_free(phrase);
+ g_free(tmp);
+ }
+ return true;
+}
+
+};
diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h
new file mode 100644
index 0000000..8dc1a89
--- /dev/null
+++ b/src/lookup/lookup.h
@@ -0,0 +1,79 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef LOOKUP_H
+#define LOOKUP_H
+
+
+/** @file lookup.h
+ * @brief the definitions of common lookup related classes and structs.
+ */
+
+#include "novel_types.h"
+#include <limits.h>
+
+namespace pinyin{
+
+typedef phrase_token_t lookup_key_t;
+
+struct lookup_value_t{
+ /* previous and current tokens of the node */
+ phrase_token_t m_handles[2];
+ /* maximum possibility of current node */
+ gfloat m_poss;
+ /* trace back information for final step */
+ gint32 m_last_step;
+
+ lookup_value_t(gfloat poss = FLT_MAX){
+ m_handles[0] = null_token; m_handles[1] = null_token;
+ m_poss = poss;
+ m_last_step = -1;
+ }
+};
+
+
+class FacadePhraseIndex;
+
+
+/* Note:
+ * LookupStepIndex:
+ * the main purpose of the lookup step index is to serve as an index
+ * into the lookup step content, so that the same node reached with
+ * different possibilities can be merged quickly, keeping only the
+ * highest value for that node.
+ * LookupStepContent:
+ * the place to store the lookup values of the current step,
+ * indexed by the lookup step index.
+ * See also the comments on lookup_value_t.
+ */
+
+typedef GHashTable * LookupStepIndex;
+/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */
+typedef GArray * LookupStepContent; /* array of lookup_value_t */
+
+bool convert_to_utf8(FacadePhraseIndex * phrase_index,
+ MatchResults match_results,
+ /* in */ const char * delimiter,
+ /* in */ bool show_tokens,
+ /* out */ char * & result_string);
+
+};
+#endif
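
The LookupStepIndex/LookupStepContent pair declared above is the core bookkeeping of the dynamic programming in both lookup classes: the GArray stores one lookup_value_t per distinct ending token of a step, and the GHashTable maps that token to its slot so a newly generated candidate for the same token can be merged by keeping the larger m_poss. A standalone sketch of that merge step, assuming only GLib; the simplified struct and the helper save_candidate are illustrations, not part of the library:

    #include <glib.h>

    typedef guint32 phrase_token_t;

    struct lookup_value_t {
        phrase_token_t m_token;
        gfloat m_poss;
    };

    /* keep only the best value per token, as save_next_step does. */
    static void save_candidate(GHashTable * index, GArray * content,
                               phrase_token_t token, gfloat poss)
    {
        gpointer key = NULL, value = NULL;
        if (!g_hash_table_lookup_extended(index, GUINT_TO_POINTER(token),
                                          &key, &value)) {
            lookup_value_t item = { token, poss };
            g_array_append_val(content, item);
            g_hash_table_insert(index, GUINT_TO_POINTER(token),
                                GUINT_TO_POINTER(content->len - 1));
            return;
        }

        lookup_value_t * found = &g_array_index(content, lookup_value_t,
                                                GPOINTER_TO_UINT(value));
        if (poss > found->m_poss)
            found->m_poss = poss;
    }

    int main()
    {
        GHashTable * index = g_hash_table_new(g_direct_hash, g_direct_equal);
        GArray * content = g_array_new(FALSE, FALSE, sizeof(lookup_value_t));

        save_candidate(index, content, 42, -3.5f);
        save_candidate(index, content, 42, -2.0f); /* merged, keeps -2.0 */

        g_print("len = %u, poss = %f\n", content->len,
                g_array_index(content, lookup_value_t, 0).m_poss);

        g_hash_table_destroy(index);
        g_array_free(content, TRUE);
        return 0;
    }
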
diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp
new file mode 100644
index 0000000..f7da0b7
--- /dev/null
+++ b/src/lookup/phrase_lookup.cpp
@@ -0,0 +1,434 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <math.h>
+#include "stl_lite.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "facade_phrase_table2.h"
+#include "ngram.h"
+#include "phrase_lookup.h"
+
+using namespace pinyin;
+
+
+/*
+const gfloat PhraseLookup::bigram_lambda = lambda;
+const gfloat PhraseLookup::unigram_lambda = 1 - lambda;
+*/
+
+static bool populate_prefixes(GPtrArray * steps_index,
+ GPtrArray * steps_content) {
+
+ lookup_key_t initial_key = sentence_start;
+ lookup_value_t initial_value(log(1));
+ initial_value.m_handles[1] = sentence_start;
+
+ LookupStepContent initial_step_content = (LookupStepContent)
+ g_ptr_array_index(steps_content, 0);
+ g_array_append_val(initial_step_content, initial_value);
+
+ LookupStepIndex initial_step_index = (LookupStepIndex)
+ g_ptr_array_index(steps_index, 0);
+ g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key),
+ GUINT_TO_POINTER(initial_step_content->len - 1));
+
+ return true;
+}
+
+static bool init_steps(GPtrArray * steps_index,
+ GPtrArray * steps_content,
+ int nstep) {
+
+ /* add null start step */
+ g_ptr_array_set_size(steps_index, nstep);
+ g_ptr_array_set_size(steps_content, nstep);
+
+ for ( int i = 0; i < nstep; ++i ){
+ /* initialize steps_index */
+ g_ptr_array_index(steps_index, i) = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+ /* initialize steps_content */
+ g_ptr_array_index(steps_content, i) = g_array_new
+ (FALSE, FALSE, sizeof(lookup_value_t));
+ }
+
+ return true;
+}
+
+static void clear_steps(GPtrArray * steps_index,
+ GPtrArray * steps_content){
+ /* clear steps_index */
+ for ( size_t i = 0; i < steps_index->len; ++i){
+ GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
+ g_hash_table_destroy(table);
+ g_ptr_array_index(steps_index, i) = NULL;
+ }
+
+ /* free steps_content */
+ for ( size_t i = 0; i < steps_content->len; ++i){
+ GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
+ g_array_free(array, TRUE);
+ g_ptr_array_index(steps_content, i) = NULL;
+ }
+}
+
+PhraseLookup::PhraseLookup(const gfloat lambda,
+ FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram)
+ : bigram_lambda(lambda),
+ unigram_lambda(1. - lambda)
+{
+ m_phrase_table = phrase_table;
+ m_phrase_index = phrase_index;
+ m_system_bigram = system_bigram;
+ m_user_bigram = user_bigram;
+
+ m_steps_index = g_ptr_array_new();
+ m_steps_content = g_ptr_array_new();
+
+ /* the member variables below are saved in get_best_match call. */
+ m_sentence = NULL;
+ m_sentence_length = 0;
+}
+
+PhraseLookup::~PhraseLookup(){
+ clear_steps(m_steps_index, m_steps_content);
+ g_ptr_array_free(m_steps_index, TRUE);
+ g_ptr_array_free(m_steps_content, TRUE);
+}
+
+bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
+ MatchResults & results){
+ m_sentence_length = sentence_length;
+ m_sentence = sentence;
+ int nstep = m_sentence_length + 1;
+
+ clear_steps(m_steps_index, m_steps_content);
+
+ init_steps(m_steps_index, m_steps_content, nstep);
+
+ populate_prefixes(m_steps_index, m_steps_content);
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ m_phrase_index->prepare_tokens(tokens);
+
+ for ( int i = 0; i < nstep - 1; ++i ){
+ for ( int m = i + 1; m < nstep; ++m ){
+
+ /* do one phrase table search. */
+ int result = m_phrase_table->search(m - i, sentence + i, tokens);
+
+ /* found next phrase */
+ if ( result & SEARCH_OK ) {
+ search_bigram2(i, tokens),
+ search_unigram2(i, tokens);
+ }
+
+ /* no longer phrase */
+ if (!(result & SEARCH_CONTINUED))
+ break;
+ }
+ }
+
+ m_phrase_index->destroy_tokens(tokens);
+
+ return final_step(results);
+}
+
+#if 0
+
+bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0);
+ /* find the maximum node */
+ for ( size_t i = 1; i < lookup_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ return unigram_gen_next_step(nstep, max_value, token);
+}
+
+bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return false;
+
+ for ( size_t i = 0; i < lookup_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
+ phrase_token_t index_token = cur_value->m_handles[1];
+ SingleGram * system, * user;
+ m_system_bigram->load(index_token, system);
+ m_user_bigram->load(index_token, user);
+
+ if ( !merge_single_gram(&m_merged_single_gram, system, user) )
+ continue;
+
+ guint32 freq;
+ if ( m_merged_single_gram.get_freq(token, freq) ){
+ guint32 total_freq;
+ m_merged_single_gram.get_total_freq(total_freq);
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
+ }
+
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+
+ return found;
+}
+
+#endif
+
+bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return found;
+
+ /* find the maximum node */
+ lookup_value_t * max_value = &g_array_index
+ (lookup_content, lookup_value_t, 0);
+
+ for (size_t i = 1; i < lookup_content->len; ++i) {
+ lookup_value_t * cur_value = &g_array_index
+ (lookup_content, lookup_value_t, i);
+ if (cur_value->m_poss > max_value->m_poss)
+ max_value = cur_value;
+ }
+
+ /* iterate over tokens */
+ for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+ GArray * array = tokens[n];
+ if (NULL == array)
+ continue;
+
+ /* just skip the loop when the length is zero. */
+ for (size_t k = 0; k < array->len; ++k) {
+ phrase_token_t token =
+ g_array_index(array, phrase_token_t, k);
+
+ found = unigram_gen_next_step
+ (nstep, max_value, token) || found;
+ }
+ }
+
+ return found;
+}
+
+bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if (0 == lookup_content->len)
+ return found;
+
+ for (size_t i = 0; i < lookup_content->len; ++i) {
+ lookup_value_t * cur_value = &g_array_index
+ (lookup_content, lookup_value_t, i);
+ phrase_token_t index_token = cur_value->m_handles[1];
+
+ SingleGram * system = NULL, * user = NULL;
+ m_system_bigram->load(index_token, system);
+ m_user_bigram->load(index_token, user);
+
+ if (!merge_single_gram
+ (&m_merged_single_gram, system, user))
+ continue;
+
+ /* iterate over tokens */
+ for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+ GArray * array = tokens[n];
+ if (NULL == array)
+ continue;
+
+ /* just skip the loop when the length is zero. */
+ for (size_t k = 0; k < array->len; ++k) {
+ phrase_token_t token =
+ g_array_index(array, phrase_token_t, k);
+
+ guint32 freq = 0;
+ if (m_merged_single_gram.get_freq(token, freq)) {
+ guint32 total_freq = 0;
+ m_merged_single_gram.get_total_freq(total_freq);
+
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
+ }
+ }
+ }
+
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+
+ return found;
+}
+
+bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
+phrase_token_t token){
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
+ m_phrase_index->get_phrase_index_total_freq();
+ if ( elem_poss < DBL_EPSILON )
+ return false;
+
+ lookup_value_t next_value;
+ next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
+ next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda);
+ next_value.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_value, &next_value);
+}
+
+bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){
+
+ if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
+ (gdouble) m_phrase_index->get_phrase_index_total_freq();
+
+ if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
+ return false;
+
+ lookup_value_t next_value;
+ next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
+ next_value.m_poss = cur_value->m_poss +
+ log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss );
+ next_value.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_value, &next_value);
+}
+
+bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){
+
+ LookupStepIndex next_lookup_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, next_step_pos);
+ LookupStepContent next_lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, next_step_pos);
+
+ lookup_key_t next_key = next_value->m_handles[1];
+
+ gpointer key = NULL, value = NULL;
+ gboolean lookup_result = g_hash_table_lookup_extended
+ (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
+
+ if (!lookup_result){
+ g_array_append_val(next_lookup_content, *next_value);
+ g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key),
+ GUINT_TO_POINTER(next_lookup_content->len - 1));
+ return true;
+ }else{
+ size_t step_index = GPOINTER_TO_UINT(value);
+ lookup_value_t * orig_next_value = &g_array_index
+ (next_lookup_content, lookup_value_t, step_index);
+
+ if ( orig_next_value->m_poss < next_value->m_poss ){
+ orig_next_value->m_handles[0] = next_value->m_handles[0];
+ assert(orig_next_value->m_handles[1] == next_value->m_handles[1]);
+ orig_next_value->m_poss = next_value->m_poss;
+ orig_next_value->m_last_step = next_value->m_last_step;
+ return true;
+ }
+ return false;
+ }
+}
+
+bool PhraseLookup::final_step(MatchResults & results ){
+
+ /* reset results */
+ g_array_set_size(results, m_steps_content->len - 1);
+ for ( size_t i = 0; i < results->len; ++i ){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ *token = null_token;
+ }
+
+ /* find max element */
+ size_t last_step_pos = m_steps_content->len - 1;
+ LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index
+ (m_steps_content, last_step_pos);
+ if ( last_step_content->len == 0 )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index
+ (last_step_content, lookup_value_t, 0);
+ for ( size_t i = 1; i < last_step_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index
+ (last_step_content, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ /* backtracing */
+ while( true ){
+ int cur_step_pos = max_value->m_last_step;
+ if ( -1 == cur_step_pos )
+ break;
+
+ phrase_token_t * token = &g_array_index
+ (results, phrase_token_t, cur_step_pos);
+ *token = max_value->m_handles[1];
+
+ phrase_token_t last_token = max_value->m_handles[0];
+ LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean result = g_hash_table_lookup_extended
+ (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
+ if ( !result )
+ return false;
+
+ LookupStepContent lookup_step_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, cur_step_pos);
+ max_value = &g_array_index
+ (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+ }
+
+ /* no need to reverse the result */
+ return true;
+}
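
Both *_gen_next_step methods above score a candidate token by linear interpolation of the bigram and unigram estimates, bigram_lambda * P(w2|w1) + unigram_lambda * P(w2), accumulated in log space on top of the previous step's score. A tiny standalone sketch of that scoring rule; the lambda and probability values are made-up numbers, not taken from any model:

    #include <math.h>
    #include <stdio.h>

    /* interpolated step score, as in PhraseLookup::bigram_gen_next_step. */
    static double next_poss(double prev_poss, double bigram_lambda,
                            double bigram_poss, double unigram_poss)
    {
        double unigram_lambda = 1.0 - bigram_lambda;
        return prev_poss +
            log(bigram_lambda * bigram_poss + unigram_lambda * unigram_poss);
    }

    int main()
    {
        /* made-up probabilities for illustration only. */
        double score = next_poss(log(1.0), 0.5, 0.02, 0.001);
        printf("score = %f\n", score);
        return 0;
    }
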
diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h
new file mode 100644
index 0000000..cf65692
--- /dev/null
+++ b/src/lookup/phrase_lookup.h
@@ -0,0 +1,142 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef PHRASE_LOOKUP_H
+#define PHRASE_LOOKUP_H
+
+#include "novel_types.h"
+#include "ngram.h"
+#include "lookup.h"
+
+/**
+ * phrase_lookup.h
+ *
+ * The definitions of phrase lookup related classes and structs.
+ *
+ */
+
+namespace pinyin{
+
+/**
+ * PhraseLookup:
+ *
+ * The phrase lookup class to convert the sentence to phrase tokens.
+ *
+ */
+class PhraseLookup{
+private:
+ const gfloat bigram_lambda;
+ const gfloat unigram_lambda;
+
+ PhraseItem m_cache_phrase_item;
+ SingleGram m_merged_single_gram;
+protected:
+    //saved variables
+ FacadePhraseTable2 * m_phrase_table;
+ FacadePhraseIndex * m_phrase_index;
+ Bigram * m_system_bigram;
+ Bigram * m_user_bigram;
+
+ //internal step data structure
+ GPtrArray * m_steps_index;
+ /* Array of LookupStepIndex */
+ GPtrArray * m_steps_content;
+ /* Array of LookupStepContent */
+
+ /* Saved sentence */
+ int m_sentence_length;
+ ucs4_t * m_sentence;
+
+protected:
+    /* Explicitly search for the next phrase,
+     * to avoid a double phrase lookup when the next step has only one token.
+     */
+ bool search_unigram2(int nstep, PhraseTokens tokens);
+ bool search_bigram2(int nstep, PhraseTokens tokens);
+
+ bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
+ bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);
+
+ bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step);
+
+ bool final_step(MatchResults & results);
+public:
+ /**
+ * PhraseLookup::PhraseLookup:
+ * @lambda: the lambda parameter for interpolation model.
+ * @phrase_table: the phrase table.
+ * @phrase_index: the phrase index.
+ * @system_bigram: the system bi-gram.
+ * @user_bigram: the user bi-gram.
+ *
+ * The constructor of the PhraseLookup.
+ *
+ */
+ PhraseLookup(const gfloat lambda,
+ FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram);
+
+ /**
+ * PhraseLookup::~PhraseLookup:
+ *
+ * The destructor of the PhraseLookup.
+ *
+ */
+ ~PhraseLookup();
+
+ /**
+ * PhraseLookup::get_best_match:
+ * @sentence_length: the length of the sentence in ucs4 characters.
+ * @sentence: the ucs4 characters of the sentence.
+ * @results: the segmented sentence in the form of phrase tokens.
+ * @returns: whether the segment operation is successful.
+ *
+ * Segment the sentence into phrase tokens.
+ *
+     * Note: this method only accepts characters that are in the phrase large table.
+ *
+ */
+ bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results);
+
+ /**
+ * PhraseLookup::convert_to_utf8:
+ * @results: the guessed sentence in the form of phrase tokens.
+ * @result_string: the converted sentence in utf8 string.
+ * @returns: whether the convert operation is successful.
+ *
+ * Convert the sentence from phrase tokens to the utf8 string.
+ *
+     * Note: free the result_string with g_free.
+ *
+ */
+ bool convert_to_utf8(MatchResults results,
+ /* out */ char * & result_string)
+ {
+ return pinyin::convert_to_utf8(m_phrase_index, results,
+ "\n", true, result_string);
+ }
+};
+
+};
+
+#endif
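
A call sketch for the PhraseLookup class declared above, assuming the phrase table, phrase index and bigrams have already been loaded and a PhraseLookup has been constructed elsewhere. get_best_match fills one slot per character: each matched phrase writes its token at its starting position and the remaining covered positions stay null_token; convert_to_utf8 then prints one "token phrase" pair per line:

    #include "phrase_lookup.h"
    #include <glib.h>
    #include <stdio.h>

    using namespace pinyin;

    /* segment one ucs4 sentence with an already initialized PhraseLookup. */
    void segment_sentence(PhraseLookup * phrase_lookup,
                          ucs4_t * sentence, int len)
    {
        MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));

        if (phrase_lookup->get_best_match(len, sentence, results)) {
            char * utf8 = NULL;
            phrase_lookup->convert_to_utf8(results, utf8);
            if (utf8)
                printf("%s\n", utf8);
            g_free(utf8);
        }

        g_array_free(results, TRUE);
    }
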
diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp
new file mode 100644
index 0000000..2250a93
--- /dev/null
+++ b/src/lookup/pinyin_lookup2.cpp
@@ -0,0 +1,730 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <math.h>
+#include "facade_chewing_table.h"
+#include "pinyin_lookup2.h"
+#include "stl_lite.h"
+
+using namespace pinyin;
+
+/*
+const gfloat PinyinLookup2::bigram_lambda = lambda;
+const gfloat PinyinLookup2::unigram_lambda = 1 - lambda;
+*/
+
+/* internal definition */
+static const size_t nbeam = 32;
+
+static bool dump_max_value(GPtrArray * values){
+ if (0 == values->len)
+ return false;
+
+ const lookup_value_t * max =
+ (const lookup_value_t *) g_ptr_array_index(values, 0);
+
+ for (size_t i = 1; i < values->len; ++i) {
+ const lookup_value_t * cur =
+ (const lookup_value_t *) g_ptr_array_index(values, i);
+
+ if (cur->m_poss > max->m_poss)
+ max = cur;
+ }
+
+ printf("max value: %f\n", max->m_poss);
+
+ return true;
+}
+
+static bool dump_all_values(GPtrArray * values) {
+ if (0 == values->len)
+ return false;
+
+ printf("values:");
+ for (size_t i = 0; i < values->len; ++i) {
+ const lookup_value_t * cur =
+ (const lookup_value_t *) g_ptr_array_index(values, i);
+
+ printf("%f\t", cur->m_poss);
+ }
+ printf("\n");
+
+ return true;
+}
+
+/* populate the candidates. */
+static bool populate_candidates(/* out */ GPtrArray * candidates,
+ /* in */ LookupStepContent step) {
+ g_ptr_array_set_size(candidates, 0);
+
+ if (0 == step->len)
+ return false;
+
+ for (size_t i = 0; i < step->len; ++i) {
+ lookup_value_t * value = &g_array_index
+ (step, lookup_value_t, i);
+
+ g_ptr_array_add(candidates, value);
+ }
+
+ /* dump_max_value(candidates); */
+
+ return true;
+}
+
+static bool lookup_value_less_than(lookup_value_t * lhs, lookup_value_t * rhs){
+ return lhs->m_poss < rhs->m_poss;
+}
+
+/* use a max-heap to get the top results. */
+static bool get_top_results(/* out */ GPtrArray * topresults,
+ /* in */ GPtrArray * candidates) {
+ g_ptr_array_set_size(topresults, 0);
+
+ if (0 == candidates->len)
+ return false;
+
+ lookup_value_t ** begin =
+ (lookup_value_t **) &g_ptr_array_index(candidates, 0);
+ lookup_value_t ** end =
+ (lookup_value_t **) &g_ptr_array_index(candidates, candidates->len);
+
+ std_lite::make_heap(begin, end, lookup_value_less_than);
+
+ while (end != begin) {
+ lookup_value_t * one = *begin;
+ g_ptr_array_add(topresults, one);
+
+ std_lite::pop_heap(begin, end, lookup_value_less_than);
+ --end;
+
+ if (topresults->len >= nbeam)
+ break;
+ }
+
+ /* dump_all_values(topresults); */
+
+ return true;
+}
+
+static bool populate_prefixes(GPtrArray * steps_index,
+ GPtrArray * steps_content,
+ TokenVector prefixes) {
+ assert(prefixes->len > 0);
+
+ for (size_t i = 0; i < prefixes->len; ++i) {
+ phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
+ lookup_key_t initial_key = token;
+ lookup_value_t initial_value(log(1));
+ initial_value.m_handles[1] = token;
+
+ LookupStepContent initial_step_content = (LookupStepContent)
+ g_ptr_array_index(steps_content, 0);
+ initial_step_content = g_array_append_val
+ (initial_step_content, initial_value);
+
+ LookupStepIndex initial_step_index = (LookupStepIndex)
+ g_ptr_array_index(steps_index, 0);
+ g_hash_table_insert(initial_step_index,
+ GUINT_TO_POINTER(initial_key),
+ GUINT_TO_POINTER(initial_step_content->len - 1));
+ }
+
+ return true;
+}
+
+static bool init_steps(GPtrArray * steps_index,
+ GPtrArray * steps_content,
+ int nstep){
+ /* add null start step */
+ g_ptr_array_set_size(steps_index, nstep);
+ g_ptr_array_set_size(steps_content, nstep);
+
+ for (int i = 0; i < nstep; ++i) {
+ /* initialize steps_index */
+ g_ptr_array_index(steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal);
+ /* initialize steps_content */
+ g_ptr_array_index(steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t));
+ }
+
+ return true;
+}
+
+static void clear_steps(GPtrArray * steps_index, GPtrArray * steps_content){
+ /* clear steps_index */
+ for ( size_t i = 0; i < steps_index->len; ++i){
+ GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
+ g_hash_table_destroy(table);
+ g_ptr_array_index(steps_index, i) = NULL;
+ }
+
+ /* clear steps_content */
+ for ( size_t i = 0; i < steps_content->len; ++i){
+ GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
+ g_array_free(array, TRUE);
+ g_ptr_array_index(steps_content, i) = NULL;
+ }
+}
+
+
+PinyinLookup2::PinyinLookup2(const gfloat lambda,
+ pinyin_option_t options,
+ FacadeChewingTable * pinyin_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram)
+ : bigram_lambda(lambda),
+ unigram_lambda(1. - lambda)
+{
+ m_options = options;
+ m_pinyin_table = pinyin_table;
+ m_phrase_index = phrase_index;
+ m_system_bigram = system_bigram;
+ m_user_bigram = user_bigram;
+
+ m_steps_index = g_ptr_array_new();
+ m_steps_content = g_ptr_array_new();
+
+ /* the member variables below are saved in get_best_match call. */
+ m_keys = NULL;
+ m_constraints = NULL;
+}
+
+PinyinLookup2::~PinyinLookup2(){
+ clear_steps(m_steps_index, m_steps_content);
+ g_ptr_array_free(m_steps_index, TRUE);
+ g_ptr_array_free(m_steps_content, TRUE);
+}
+
+
+bool PinyinLookup2::get_best_match(TokenVector prefixes,
+ ChewingKeyVector keys,
+ CandidateConstraints constraints,
+ MatchResults & results){
+ m_constraints = constraints;
+ m_keys = keys;
+ int nstep = keys->len + 1;
+
+ clear_steps(m_steps_index, m_steps_content);
+
+ init_steps(m_steps_index, m_steps_content, nstep);
+
+ populate_prefixes(m_steps_index, m_steps_content, prefixes);
+
+ PhraseIndexRanges ranges;
+ memset(ranges, 0, sizeof(PhraseIndexRanges));
+ m_phrase_index->prepare_ranges(ranges);
+
+ GPtrArray * candidates = g_ptr_array_new();
+ GPtrArray * topresults = g_ptr_array_new();
+
+ /* begin the viterbi beam search. */
+ for ( int i = 0; i < nstep - 1; ++i ){
+ lookup_constraint_t * cur_constraint = &g_array_index
+ (m_constraints, lookup_constraint_t, i);
+
+ if (CONSTRAINT_NOSEARCH == cur_constraint->m_type)
+ continue;
+
+ LookupStepContent step = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, i);
+
+ populate_candidates(candidates, step);
+ get_top_results(topresults, candidates);
+
+ if (0 == topresults->len)
+ continue;
+
+ for ( int m = i + 1; m < nstep; ++m ){
+ const int len = m - i;
+ if (len > MAX_PHRASE_LENGTH)
+ break;
+
+ lookup_constraint_t * next_constraint = &g_array_index
+ (m_constraints, lookup_constraint_t, m - 1);
+
+ if (CONSTRAINT_NOSEARCH == next_constraint->m_type)
+ break;
+
+ ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data;
+ /* do one pinyin table search. */
+ int result = m_pinyin_table->search(len, pinyin_keys + i, ranges);
+
+ if (result & SEARCH_OK) {
+ /* assume topresults always contains items. */
+ search_bigram2(topresults, i, ranges),
+ search_unigram2(topresults, i, ranges);
+ }
+
+ /* poke the next constraint. */
+ ++ next_constraint;
+ if (CONSTRAINT_ONESTEP == next_constraint->m_type)
+ break;
+
+ /* no longer pinyin */
+ if (!(result & SEARCH_CONTINUED))
+ break;
+ }
+ }
+
+ m_phrase_index->destroy_ranges(ranges);
+
+ g_ptr_array_free(candidates, TRUE);
+ g_ptr_array_free(topresults, TRUE);
+
+ return final_step(results);
+}
+
+bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
+ PhraseIndexRanges ranges) {
+
+ if (0 == topresults->len)
+ return false;
+
+ lookup_value_t * max = (lookup_value_t *)
+ g_ptr_array_index(topresults, 0);
+
+ lookup_constraint_t * constraint =
+ &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+ if (CONSTRAINT_ONESTEP == constraint->m_type) {
+ return unigram_gen_next_step(nstep, max, constraint->m_token);
+ }
+
+ bool found = false;
+
+ if (NO_CONSTRAINT == constraint->m_type) {
+ for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+ GArray * array = ranges[m];
+ if ( !array ) continue;
+
+ for ( size_t n = 0; n < array->len; ++n){
+ PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+ for ( phrase_token_t token = range->m_range_begin;
+ token != range->m_range_end; ++token){
+ found = unigram_gen_next_step(nstep, max, token)|| found;
+ }
+ }
+ }
+ }
+
+ return found;
+}
+
+bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
+ PhraseIndexRanges ranges) {
+
+ lookup_constraint_t * constraint =
+ &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+ bool found = false;
+ BigramPhraseArray bigram_phrase_items = g_array_new
+ (FALSE, FALSE, sizeof(BigramPhraseItem));
+
+ for (size_t i = 0; i < topresults->len; ++i) {
+ lookup_value_t * value = (lookup_value_t *)
+ g_ptr_array_index(topresults, i);
+
+ phrase_token_t index_token = value->m_handles[1];
+
+ SingleGram * system = NULL, * user = NULL;
+ m_system_bigram->load(index_token, system);
+ m_user_bigram->load(index_token, user);
+
+ if ( !merge_single_gram(&m_merged_single_gram, system, user) )
+ continue;
+
+ if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+ phrase_token_t token = constraint->m_token;
+
+ guint32 freq;
+ if( m_merged_single_gram.get_freq(token, freq) ){
+ guint32 total_freq;
+ m_merged_single_gram.get_total_freq(total_freq);
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found;
+ }
+ }
+
+ if (NO_CONSTRAINT == constraint->m_type) {
+ for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+ GArray * array = ranges[m];
+ if ( !array ) continue;
+
+ for ( size_t n = 0; n < array->len; ++n){
+ PhraseIndexRange * range =
+ &g_array_index(array, PhraseIndexRange, n);
+
+ g_array_set_size(bigram_phrase_items, 0);
+ m_merged_single_gram.search(range, bigram_phrase_items);
+ for( size_t k = 0; k < bigram_phrase_items->len; ++k) {
+ BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+ found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found;
+ }
+ }
+ }
+ }
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+
+ g_array_free(bigram_phrase_items, TRUE);
+ return found;
+}
+
+
+bool PinyinLookup2::unigram_gen_next_step(int nstep,
+ lookup_value_t * cur_step,
+ phrase_token_t token) {
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
+ m_phrase_index->get_phrase_index_total_freq();
+ if ( elem_poss < DBL_EPSILON )
+ return false;
+
+ ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
+ gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
+ if (pinyin_poss < FLT_EPSILON )
+ return false;
+
+ lookup_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
+ next_step.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup2::bigram_gen_next_step(int nstep,
+ lookup_value_t * cur_step,
+ phrase_token_t token,
+ gfloat bigram_poss) {
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
+ (gdouble) m_phrase_index->get_phrase_index_total_freq();
+ if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
+ return false;
+
+ ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
+ gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
+ if ( pinyin_poss < FLT_EPSILON )
+ return false;
+
+ lookup_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_poss = cur_step->m_poss +
+ log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss);
+ next_step.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
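+
+/* A sketch of the scoring used by the two gen_next_step() helpers above,
+ * assuming bigram_lambda and unigram_lambda are derived from the
+ * constructor's single lambda and sum to one:
+ *
+ *   unigram step: m_poss += log(unigram_lambda * P_uni(w) * P(pinyin | w))
+ *   bigram  step: m_poss += log((bigram_lambda * P_bi(w | prev)
+ *                                + unigram_lambda * P_uni(w)) * P(pinyin | w))
+ *
+ * The FLT_EPSILON/DBL_EPSILON guards above drop a candidate before log()
+ * would be taken of a zero argument. */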
+
+bool PinyinLookup2::save_next_step(int next_step_pos,
+ lookup_value_t * cur_step,
+ lookup_value_t * next_step){
+
+ lookup_key_t next_key = next_step->m_handles[1];
+ LookupStepIndex next_lookup_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, next_step_pos);
+ LookupStepContent next_lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, next_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean lookup_result = g_hash_table_lookup_extended
+ (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
+
+ if ( !lookup_result ){
+ g_array_append_val(next_lookup_content, *next_step);
+ g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1));
+ return true;
+ }else{
+ size_t step_index = GPOINTER_TO_UINT(value);
+ lookup_value_t * orig_next_value = &g_array_index
+ (next_lookup_content, lookup_value_t, step_index);
+
+ if ( orig_next_value->m_poss < next_step->m_poss) {
+ /* found better result. */
+ orig_next_value->m_handles[0] = next_step->m_handles[0];
+ assert(orig_next_value->m_handles[1] == next_step->m_handles[1]);
+ orig_next_value->m_poss = next_step->m_poss;
+ orig_next_value->m_last_step = next_step->m_last_step;
+ return true;
+ }
+
+ return false;
+ }
+}
+
+bool PinyinLookup2::final_step(MatchResults & results){
+
+ /* reset results */
+ g_array_set_size(results, m_steps_content->len - 1);
+ for (size_t i = 0; i < results->len; ++i){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ *token = null_token;
+ }
+
+ /* find max element */
+ size_t last_step_pos = m_steps_content->len - 1;
+ GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos);
+ if ( last_step_array->len == 0 )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0);
+ for ( size_t i = 1; i < last_step_array->len; ++i){
+ lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ /* backtracking */
+ while( true ){
+ int cur_step_pos = max_value->m_last_step;
+ if ( -1 == cur_step_pos )
+ break;
+
+ phrase_token_t * token = &g_array_index
+ (results, phrase_token_t, cur_step_pos);
+ *token = max_value->m_handles[1];
+
+ phrase_token_t last_token = max_value->m_handles[0];
+ LookupStepIndex lookup_step_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, cur_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean result = g_hash_table_lookup_extended
+ (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
+ if (!result)
+ return false;
+
+ LookupStepContent lookup_step_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, cur_step_pos);
+ max_value = &g_array_index
+ (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+ }
+
+ /* no need to reverse the result */
+ return true;
+}
+
+
+bool PinyinLookup2::train_result2(ChewingKeyVector keys,
+ CandidateConstraints constraints,
+ MatchResults results) {
+ const guint32 initial_seed = 23 * 3;
+ const guint32 expand_factor = 2;
+ const guint32 unigram_factor = 7;
+ const guint32 pinyin_factor = 1;
+ const guint32 ceiling_seed = 23 * 15 * 64;
+
+ /* begin training based on constraints and results. */
+ bool train_next = false;
+ ChewingKey * pinyin_keys = (ChewingKey *) keys->data;
+
+ phrase_token_t last_token = sentence_start;
+ /* constraints->len + 1 == results->len */
+ for (size_t i = 0; i < constraints->len; ++i) {
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ if (null_token == *token)
+ continue;
+
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, i);
+ if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) {
+ if (CONSTRAINT_ONESTEP == constraint->m_type) {
+ assert(*token == constraint->m_token);
+ train_next = true;
+ } else {
+ train_next = false;
+ }
+
+ guint32 seed = initial_seed;
+ /* train bi-gram first, and get train seed. */
+ if (last_token) {
+ SingleGram * user = NULL;
+ m_user_bigram->load(last_token, user);
+
+ guint32 total_freq = 0;
+ if (!user) {
+ user = new SingleGram;
+ }
+ assert(user->get_total_freq(total_freq));
+
+ guint32 freq = 0;
+ /* compute train factor */
+ if (!user->get_freq(*token, freq)) {
+ assert(user->insert_freq(*token, 0));
+ seed = initial_seed;
+ } else {
+ seed = std_lite::max(freq, initial_seed);
+ seed *= expand_factor;
+ seed = std_lite::min(seed, ceiling_seed);
+ }
+
+ /* protect against total_freq overflow */
+ if (seed > 0 && total_freq > total_freq + seed)
+ goto next;
+
+ assert(user->set_total_freq(total_freq + seed));
+ /* if total_freq does not overflow, then freq won't overflow. */
+ assert(user->set_freq(*token, freq + seed));
+ assert(m_user_bigram->store(last_token, user));
+ next:
+ assert(NULL != user);
+ if (user)
+ delete user;
+ }
+
+ /* train uni-gram */
+ m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+ m_cache_phrase_item.increase_pronunciation_possibility
+ (m_options, pinyin_keys + i, seed * pinyin_factor);
+ m_phrase_index->add_unigram_frequency
+ (*token, seed * unigram_factor);
+ }
+ last_token = *token;
+ }
+ return true;
+}
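+
+/* Note: with the constants above, repeatedly confirming the same word after
+ * the same predecessor is expected to grow the training seed geometrically:
+ * the seed starts at initial_seed (23 * 3 = 69), later rounds take
+ * max(stored bigram freq, 69) * expand_factor, capped at ceiling_seed
+ * (23 * 15 * 64 = 22080). Each round then adds seed * pinyin_factor to the
+ * pronunciation count and seed * unigram_factor to the unigram frequency. */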
+
+
+int PinyinLookup2::add_constraint(CandidateConstraints constraints,
+ size_t index,
+ phrase_token_t token) {
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return 0;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ if ( index + phrase_length > constraints->len )
+ return 0;
+
+ for (size_t i = index; i < index + phrase_length; ++i){
+ clear_constraint(constraints, i);
+ }
+
+ /* store one step constraint */
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, index);
+ constraint->m_type = CONSTRAINT_ONESTEP;
+ constraint->m_token = token;
+
+ /* propagate no search constraint */
+ for (size_t i = 1; i < phrase_length; ++i){
+ constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
+ constraint->m_type = CONSTRAINT_NOSEARCH;
+ constraint->m_constraint_step = index;
+ }
+
+ return phrase_length;
+}
+
+bool PinyinLookup2::clear_constraint(CandidateConstraints constraints,
+ int index) {
+ if (index < 0 || index >= constraints->len)
+ return false;
+
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, index);
+
+ if (NO_CONSTRAINT == constraint->m_type)
+ return false;
+
+ if (CONSTRAINT_NOSEARCH == constraint->m_type){
+ index = constraint->m_constraint_step;
+ constraint = &g_array_index(constraints, lookup_constraint_t, index);
+ }
+
+ /* now the constraint variable points to the one-step constraint. */
+ assert(constraint->m_type == CONSTRAINT_ONESTEP);
+
+ phrase_token_t token = constraint->m_token;
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ for ( size_t i = 0; i < phrase_length; ++i){
+ if (index + i >= constraints->len)
+ continue;
+
+ constraint = &g_array_index
+ (constraints, lookup_constraint_t, index + i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+
+ return true;
+}
+
+bool PinyinLookup2::validate_constraint(CandidateConstraints constraints,
+ ChewingKeyVector keys) {
+ /* resize constraints array first */
+ size_t constraints_length = constraints->len;
+
+ if ( keys->len > constraints_length ){
+ g_array_set_size(constraints, keys->len);
+
+ /* initialize new element */
+ for( size_t i = constraints_length; i < keys->len; ++i){
+ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+
+ }else if (keys->len < constraints_length ){
+ /* just shrink it */
+ g_array_set_size(constraints, keys->len);
+ }
+
+ for ( size_t i = 0; i < constraints->len; ++i){
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, i);
+
+ /* handle one step constraint */
+ if ( constraint->m_type == CONSTRAINT_ONESTEP ){
+
+ phrase_token_t token = constraint->m_token;
+ m_phrase_index->get_phrase_item(token, m_cache_phrase_item);
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+
+ /* clear too long constraint */
+ if (i + phrase_length > constraints->len){
+ clear_constraint(constraints, i);
+ continue;
+ }
+
+ ChewingKey * pinyin_keys = (ChewingKey *)keys->data;
+ /* clear invalid pinyin */
+ gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys + i);
+ if (pinyin_poss < FLT_EPSILON)
+ clear_constraint(constraints, i);
+ }
+ }
+ return true;
+}
diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h
new file mode 100644
index 0000000..dbe15c9
--- /dev/null
+++ b/src/lookup/pinyin_lookup2.h
@@ -0,0 +1,240 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef PINYIN_LOOKUP2_H
+#define PINYIN_LOOKUP2_H
+
+
+#include <float.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "chewing_key.h"
+#include "phrase_index.h"
+#include "ngram.h"
+#include "lookup.h"
+
+
+namespace pinyin{
+
+/**
+ * pinyin_lookup2.h
+ *
+ * The definitions of pinyin lookup related classes and structs.
+ *
+ */
+
+
+
+enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH };
+
+struct lookup_constraint_t{
+ /* current type of the step */
+ constraint_type m_type;
+
+ /* Note:
+ * value of m_type:
+ * NO_CONSTRAINT:
+ * no value in the union below;
+ * search all possible next words.
+ * CONSTRAINT_ONESTEP:
+ * m_token contains the next word;
+ * only this word is used to search for the next step,
+ * which is the case for user-selected candidates.
+ * CONSTRAINT_NOSEARCH:
+ * m_constraint_step contains the value
+ * which points back to the CONSTRAINT_ONESTEP step.
+ * no search is allowed for the current step.
+ */
+
+ union{
+ phrase_token_t m_token;
+ guint32 m_constraint_step; /* the step index of the CONSTRAINT_ONESTEP entry */
+ };
+};
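+
+/* For illustration only (see PinyinLookup2::add_constraint): choosing a
+ * hypothetical three-character candidate at step 2 is expected to yield
+ *
+ *   constraints[2] = { CONSTRAINT_ONESTEP,  m_token = <candidate token> }
+ *   constraints[3] = { CONSTRAINT_NOSEARCH, m_constraint_step = 2 }
+ *   constraints[4] = { CONSTRAINT_NOSEARCH, m_constraint_step = 2 }
+ *
+ * while the untouched steps stay NO_CONSTRAINT. */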
+
+
+/**
+ * PinyinLookup2:
+ *
+ * The pinyin lookup class converts pinyin keys into the guessed sentence.
+ *
+ */
+class PinyinLookup2{
+private:
+ const gfloat bigram_lambda;
+ const gfloat unigram_lambda;
+
+ PhraseItem m_cache_phrase_item;
+ SingleGram m_merged_single_gram;
+
+protected:
+ /* saved variables */
+ CandidateConstraints m_constraints;
+ ChewingKeyVector m_keys;
+
+ pinyin_option_t m_options;
+ FacadeChewingTable * m_pinyin_table;
+ FacadePhraseIndex * m_phrase_index;
+ Bigram * m_system_bigram;
+ Bigram * m_user_bigram;
+
+ /* internal step data structure */
+ GPtrArray * m_steps_index;
+ /* Array of LookupStepIndex */
+ GPtrArray * m_steps_content;
+ /* Array of LookupStepContent */
+
+
+ bool search_unigram2(GPtrArray * topresults, int nstep,
+ PhraseIndexRanges ranges);
+ bool search_bigram2(GPtrArray * topresults, int nstep,
+ PhraseIndexRanges ranges);
+
+ bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
+ bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss);
+
+ bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step);
+
+ bool final_step(MatchResults & results);
+
+public:
+ /**
+ * PinyinLookup2::PinyinLookup2:
+ * @lambda: the lambda parameter for the interpolation model.
+ * @options: the pinyin options.
+ * @pinyin_table: the pinyin table.
+ * @phrase_index: the phrase index.
+ * @system_bigram: the system bi-gram.
+ * @user_bigram: the user bi-gram.
+ *
+ * The constructor of the PinyinLookup2.
+ *
+ */
+ PinyinLookup2(const gfloat lambda,
+ pinyin_option_t options,
+ FacadeChewingTable * pinyin_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram);
+
+ /**
+ * PinyinLookup2::~PinyinLookup2:
+ *
+ * The destructor of the PinyinLookup2.
+ *
+ */
+ ~PinyinLookup2();
+
+ /**
+ * PinyinLookup2::set_options:
+ * @options: the pinyin options.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the pinyin options.
+ *
+ */
+ bool set_options(pinyin_option_t options) {
+ m_options = options;
+ return true;
+ }
+
+ /**
+ * PinyinLookup2::get_best_match:
+ * @prefixes: the phrase tokens before the guessed sentence.
+ * @keys: the pinyin keys of the guessed sentence.
+ * @constraints: the constraints on the guessed sentence.
+ * @results: the guessed sentence in the form of the phrase tokens.
+ * @returns: whether the guess operation is successful.
+ *
+ * Guess the best sentence according to user inputs.
+ *
+ */
+ bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+
+ /**
+ * PinyinLookup2::train_result2:
+ * @keys: the pinyin keys of the guessed sentence.
+ * @constraints: the constraints on the guessed sentence.
+ * @results: the guessed sentence in the form of the phrase tokens.
+ * @returns: whether the train operation is successful.
+ *
+ * Self-learn from the guessed sentence based on the constraints.
+ *
+ */
+ bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results);
+
+ /**
+ * PinyinLookup2::convert_to_utf8:
+ * @results: the guessed sentence in the form of the phrase tokens.
+ * @result_string: the guessed sentence in the utf8 encoding.
+ * @returns: whether the convert operation is successful.
+ *
+ * Convert the guessed sentence from the phrase tokens to the utf8 string.
+ *
+ */
+ bool convert_to_utf8(MatchResults results,
+ /* out */ char * & result_string)
+ {
+ return pinyin::convert_to_utf8(m_phrase_index, results,
+ NULL, false, result_string);
+ }
+
+
+ /**
+ * PinyinLookup2::add_constraint:
+ * @constraints: the constraints on the guessed sentence.
+ * @index: the character offset in the guessed sentence.
+ * @token: the phrase token in the candidate list chosen by user.
+ * @returns: the number of characters in the chosen token.
+ *
+ * Add one constraint to the constraints on the guessed sentence.
+ *
+ */
+ int add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token);
+
+ /**
+ * PinyinLookup2::clear_constraint:
+ * @constraints: the constraints on the guessed sentence.
+ * @index: the character offset in the guessed sentence.
+ * @returns: whether the clear operation is successful.
+ *
+ * Clear one constraint in the constraints on the guessed sentence.
+ *
+ */
+ bool clear_constraint(CandidateConstraints constraints, int index);
+
+ /**
+ * PinyinLookup2::validate_constraint:
+ * @constraints: the constraints on the guessed sentence.
+ * @keys: the pinyin keys of the guessed sentence.
+ * @returns: whether the validate operation is successful.
+ *
+ * Validate the old constraints with the new pinyin keys.
+ *
+ */
+ bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys);
+
+};
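+
+/* A minimal usage sketch, assuming the GArrays are created with g_array_new
+ * and the keys come from one of the pinyin parsers:
+ *
+ *   PinyinLookup2 lookup(lambda, options, pinyin_table, phrase_index,
+ *                        system_bigram, user_bigram);
+ *   lookup.validate_constraint(constraints, keys);
+ *   lookup.get_best_match(prefixes, keys, constraints, results);
+ *   lookup.add_constraint(constraints, offset, chosen_token);
+ *   lookup.get_best_match(prefixes, keys, constraints, results);
+ *   lookup.train_result2(keys, constraints, results);
+ *
+ *   char * utf8 = NULL;
+ *   lookup.convert_to_utf8(results, utf8);
+ *   g_free(utf8);
+ */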
+
+};
+
+#endif
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
new file mode 100644
index 0000000..95215ae
--- /dev/null
+++ b/src/pinyin.cpp
@@ -0,0 +1,2096 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "pinyin.h"
+#include <stdio.h>
+#include <unistd.h>
+#include <glib/gstdio.h>
+#include "pinyin_internal.h"
+
+
+using namespace pinyin;
+
+/* a glue layer for input method integration. */
+
+typedef GArray * CandidateVector; /* GArray of lookup_candidate_t */
+
+struct _pinyin_context_t{
+ pinyin_option_t m_options;
+
+ FullPinyinParser2 * m_full_pinyin_parser;
+ DoublePinyinParser2 * m_double_pinyin_parser;
+ ChewingParser2 * m_chewing_parser;
+
+ FacadeChewingTable * m_pinyin_table;
+ FacadePhraseTable2 * m_phrase_table;
+ FacadePhraseIndex * m_phrase_index;
+ Bigram * m_system_bigram;
+ Bigram * m_user_bigram;
+
+ PinyinLookup2 * m_pinyin_lookup;
+ PhraseLookup * m_phrase_lookup;
+
+ char * m_system_dir;
+ char * m_user_dir;
+ bool m_modified;
+
+ SystemTableInfo m_system_table_info;
+};
+
+struct _pinyin_instance_t{
+ pinyin_context_t * m_context;
+ gchar * m_raw_full_pinyin;
+ TokenVector m_prefixes;
+ ChewingKeyVector m_pinyin_keys;
+ ChewingKeyRestVector m_pinyin_key_rests;
+ CandidateConstraints m_constraints;
+ MatchResults m_match_results;
+ CandidateVector m_candidates;
+};
+
+struct _lookup_candidate_t{
+ lookup_candidate_type_t m_candidate_type;
+ gchar * m_phrase_string;
+ phrase_token_t m_token;
+ ChewingKeyRest m_orig_rest;
+ gchar * m_new_pinyins;
+ guint32 m_freq; /* the amplified gfloat numerical value. */
+public:
+ _lookup_candidate_t() {
+ m_candidate_type = NORMAL_CANDIDATE;
+ m_phrase_string = NULL;
+ m_token = null_token;
+ m_new_pinyins = NULL;
+ m_freq = 0;
+ }
+};
+
+struct _import_iterator_t{
+ pinyin_context_t * m_context;
+ guint8 m_phrase_index;
+};
+
+
+static bool check_format(pinyin_context_t * context){
+ const char * userdir = context->m_user_dir;
+
+ UserTableInfo user_table_info;
+ gchar * filename = g_build_filename
+ (userdir, USER_TABLE_INFO, NULL);
+ user_table_info.load(filename);
+ g_free(filename);
+
+ bool exists = user_table_info.is_conform
+ (&context->m_system_table_info);
+
+ if (exists)
+ return exists;
+
+ const pinyin_table_info_t * phrase_files =
+ context->m_system_table_info.get_table_info();
+
+ /* clean up files if the version mismatches. */
+ for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (NOT_USED == table_info->m_file_type)
+ continue;
+
+ if (NULL == table_info->m_user_filename)
+ continue;
+
+ const char * userfilename = table_info->m_user_filename;
+
+ /* remove dbin file. */
+ filename = g_build_filename(userdir, userfilename, NULL);
+ unlink(filename);
+ g_free(filename);
+ }
+
+ filename = g_build_filename
+ (userdir, USER_PINYIN_INDEX, NULL);
+ unlink(filename);
+ g_free(filename);
+
+ filename = g_build_filename
+ (userdir, USER_PHRASE_INDEX, NULL);
+ unlink(filename);
+ g_free(filename);
+
+ filename = g_build_filename
+ (userdir, USER_BIGRAM, NULL);
+ unlink(filename);
+ g_free(filename);
+
+ return exists;
+}
+
+static bool mark_version(pinyin_context_t * context){
+ const char * userdir = context->m_user_dir;
+
+ UserTableInfo user_table_info;
+ user_table_info.make_conform(&context->m_system_table_info);
+
+ gchar * filename = g_build_filename
+ (userdir, USER_TABLE_INFO, NULL);
+ bool retval = user_table_info.save(filename);
+ g_free(filename);
+
+ return retval;
+}
+
+pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
+ pinyin_context_t * context = new pinyin_context_t;
+
+ context->m_options = USE_TONE;
+
+ context->m_system_dir = g_strdup(systemdir);
+ context->m_user_dir = g_strdup(userdir);
+ context->m_modified = false;
+
+ gchar * filename = g_build_filename
+ (context->m_system_dir, SYSTEM_TABLE_INFO, NULL);
+ if (!context->m_system_table_info.load(filename)) {
+ fprintf(stderr, "load %s failed!\n", filename);
+ return NULL;
+ }
+ g_free(filename);
+
+
+ check_format(context);
+
+ context->m_full_pinyin_parser = new FullPinyinParser2;
+ context->m_double_pinyin_parser = new DoublePinyinParser2;
+ context->m_chewing_parser = new ChewingParser2;
+
+ /* load chewing table. */
+ context->m_pinyin_table = new FacadeChewingTable;
+
+ /* load system chewing table. */
+ MemoryChunk * chunk = new MemoryChunk;
+ filename = g_build_filename
+ (context->m_system_dir, SYSTEM_PINYIN_INDEX, NULL);
+ if (!chunk->load(filename)) {
+ fprintf(stderr, "open %s failed!\n", filename);
+ return NULL;
+ }
+ g_free(filename);
+
+ /* load user chewing table */
+ MemoryChunk * userchunk = new MemoryChunk;
+ filename = g_build_filename
+ (context->m_user_dir, USER_PINYIN_INDEX, NULL);
+ if (!userchunk->load(filename)) {
+ /* hack here: use local Chewing Table to create empty memory chunk. */
+ ChewingLargeTable table(context->m_options);
+ table.store(userchunk);
+ }
+ g_free(filename);
+
+ context->m_pinyin_table->load(context->m_options, chunk, userchunk);
+
+ /* load phrase table */
+ context->m_phrase_table = new FacadePhraseTable2;
+
+ /* load system phrase table */
+ chunk = new MemoryChunk;
+ filename = g_build_filename
+ (context->m_system_dir, SYSTEM_PHRASE_INDEX, NULL);
+ if (!chunk->load(filename)) {
+ fprintf(stderr, "open %s failed!\n", filename);
+ return NULL;
+ }
+ g_free(filename);
+
+ /* load user phrase table */
+ userchunk = new MemoryChunk;
+ filename = g_build_filename
+ (context->m_user_dir, USER_PHRASE_INDEX, NULL);
+ if (!userchunk->load(filename)) {
+ /* hack here: use local Phrase Table to create empty memory chunk. */
+ PhraseLargeTable2 table;
+ table.store(userchunk);
+ }
+ g_free(filename);
+
+ context->m_phrase_table->load(chunk, userchunk);
+
+ context->m_phrase_index = new FacadePhraseIndex;
+
+ /* hack here: directly call load phrase library. */
+ pinyin_load_phrase_library(context, GB_DICTIONARY);
+ pinyin_load_phrase_library(context, MERGED_DICTIONARY);
+
+ context->m_system_bigram = new Bigram;
+ filename = g_build_filename(context->m_system_dir, SYSTEM_BIGRAM, NULL);
+ context->m_system_bigram->attach(filename, ATTACH_READONLY);
+ g_free(filename);
+
+ context->m_user_bigram = new Bigram;
+ filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL);
+ context->m_user_bigram->load_db(filename);
+ g_free(filename);
+
+ gfloat lambda = context->m_system_table_info.get_lambda();
+
+ context->m_pinyin_lookup = new PinyinLookup2
+ ( lambda, context->m_options,
+ context->m_pinyin_table, context->m_phrase_index,
+ context->m_system_bigram, context->m_user_bigram);
+
+ context->m_phrase_lookup = new PhraseLookup
+ (lambda,
+ context->m_phrase_table, context->m_phrase_index,
+ context->m_system_bigram, context->m_user_bigram);
+
+ return context;
+}
+
+bool pinyin_load_phrase_library(pinyin_context_t * context,
+ guint8 index){
+ if (!(index < PHRASE_INDEX_LIBRARY_COUNT))
+ return false;
+
+ /* check whether the sub phrase index is already loaded. */
+ PhraseIndexRange range;
+ int retval = context->m_phrase_index->get_range(index, range);
+ if (ERROR_OK == retval)
+ return false;
+
+ const pinyin_table_info_t * phrase_files =
+ context->m_system_table_info.get_table_info();
+
+ const pinyin_table_info_t * table_info = phrase_files + index;
+
+ if (SYSTEM_FILE == table_info->m_file_type ||
+ DICTIONARY == table_info->m_file_type) {
+ /* system phrase library */
+ MemoryChunk * chunk = new MemoryChunk;
+
+ const char * systemfilename = table_info->m_system_filename;
+ /* check bin file in system dir. */
+ gchar * chunkfilename = g_build_filename(context->m_system_dir,
+ systemfilename, NULL);
+ chunk->load(chunkfilename);
+ g_free(chunkfilename);
+
+ context->m_phrase_index->load(index, chunk);
+
+ const char * userfilename = table_info->m_user_filename;
+
+ chunkfilename = g_build_filename(context->m_user_dir,
+ userfilename, NULL);
+
+ MemoryChunk * log = new MemoryChunk;
+ log->load(chunkfilename);
+ g_free(chunkfilename);
+
+ /* merge the chunk log. */
+ context->m_phrase_index->merge(index, log);
+ return true;
+ }
+
+ if (USER_FILE == table_info->m_file_type) {
+ /* user phrase library */
+ MemoryChunk * chunk = new MemoryChunk;
+ const char * userfilename = table_info->m_user_filename;
+
+ gchar * chunkfilename = g_build_filename(context->m_user_dir,
+ userfilename, NULL);
+
+ /* check whether the bin file exists; if not, create a new one. */
+ if (chunk->load(chunkfilename)) {
+ context->m_phrase_index->load(index, chunk);
+ } else {
+ delete chunk;
+ context->m_phrase_index->create_sub_phrase(index);
+ }
+
+ g_free(chunkfilename);
+ return true;
+ }
+
+ return false;
+}
+
+bool pinyin_unload_phrase_library(pinyin_context_t * context,
+ guint8 index){
+ /* gb_char.bin and merged.bin can't be unloaded. */
+ if (GB_DICTIONARY == index || MERGED_DICTIONARY == index)
+ return false;
+
+ assert(index < PHRASE_INDEX_LIBRARY_COUNT);
+
+ context->m_phrase_index->unload(index);
+ return true;
+}
+
+import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context,
+ guint8 index){
+ import_iterator_t * iter = new import_iterator_t;
+ iter->m_context = context;
+ iter->m_phrase_index = index;
+ return iter;
+}
+
+bool pinyin_iterator_add_phrase(import_iterator_t * iter,
+ const char * phrase,
+ const char * pinyin,
+ gint count){
+ /* if -1 == count, use the default value. */
+ const gint default_count = 5;
+ const guint32 unigram_factor = 3;
+ if (-1 == count)
+ count = default_count;
+
+ pinyin_context_t * & context = iter->m_context;
+ FacadePhraseTable2 * & phrase_table = context->m_phrase_table;
+ FacadeChewingTable * & pinyin_table = context->m_pinyin_table;
+ FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+
+ bool result = false;
+
+ if (NULL == phrase || NULL == pinyin)
+ return result;
+
+ /* check whether the phrase exists in phrase table */
+ glong len_phrase = 0;
+ ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL);
+
+ pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
+ FullPinyinParser2 parser;
+ ChewingKeyVector keys =
+ g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ ChewingKeyRestVector key_rests =
+ g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+
+ /* parse the pinyin. */
+ parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+
+ if (len_phrase != keys->len)
+ return result;
+
+ if (0 == len_phrase || len_phrase >= MAX_PHRASE_LENGTH)
+ return result;
+
+ phrase_token_t token = null_token;
+ GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+ /* do phrase table search. */
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index->prepare_tokens(tokens);
+ int retval = phrase_table->search(len_phrase, ucs4_phrase, tokens);
+ int num = reduce_tokens(tokens, tokenarray);
+ phrase_index->destroy_tokens(tokens);
+
+ /* find the best token candidate. */
+ for (size_t i = 0; i < tokenarray->len; ++i) {
+ phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i);
+ if (null_token == token) {
+ token = candidate;
+ continue;
+ }
+
+ if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == iter->m_phrase_index) {
+ /* only one phrase string per sub phrase index. */
+ assert(PHRASE_INDEX_LIBRARY_INDEX(token) != iter->m_phrase_index);
+ token = candidate;
+ continue;
+ }
+ }
+ g_array_free(tokenarray, TRUE);
+
+ PhraseItem item;
+ /* check whether it exists in the same sub phrase index; */
+ if (null_token != token &&
+ PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) {
+ /* if so, remove the phrase, add the pinyin for the phrase item,
+ then add it back. */
+ phrase_index->get_phrase_item(token, item);
+ assert(len_phrase == item.get_phrase_length());
+ ucs4_t tmp_phrase[MAX_PHRASE_LENGTH];
+ item.get_phrase_string(tmp_phrase);
+ assert(0 == memcmp
+ (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase));
+
+ PhraseItem * removed_item = NULL;
+ retval = phrase_index->remove_phrase_item(token, removed_item);
+ if (ERROR_OK == retval) {
+ /* maybe check whether there are duplicated pronunciations here. */
+ removed_item->add_pronunciation((ChewingKey *)keys->data,
+ count);
+ phrase_index->add_phrase_item(token, removed_item);
+ delete removed_item;
+ result = true;
+ }
+ } else {
+ /* if it does not exist in the same sub phrase index,
+ get the maximum token,
+ then add it directly with maximum token + 1. */
+ PhraseIndexRange range;
+ retval = phrase_index->get_range(iter->m_phrase_index, range);
+
+ if (ERROR_OK == retval) {
+ token = range.m_range_end;
+ if (0x00000000 == (token & PHRASE_MASK))
+ token++;
+
+ if (len_phrase == keys->len) { /* valid pinyin */
+ phrase_table->add_index(len_phrase, ucs4_phrase, token);
+ pinyin_table->add_index
+ (keys->len, (ChewingKey *)(keys->data), token);
+
+ item.set_phrase_string(len_phrase, ucs4_phrase);
+ item.add_pronunciation((ChewingKey *)(keys->data), count);
+ phrase_index->add_phrase_item(token, &item);
+ phrase_index->add_unigram_frequency(token,
+ count * unigram_factor);
+ result = true;
+ }
+ }
+ }
+
+ g_array_free(key_rests, TRUE);
+ g_array_free(keys, TRUE);
+ g_free(ucs4_phrase);
+ return result;
+}
+
+void pinyin_end_add_phrases(import_iterator_t * iter){
+ /* compact the content memory chunk of phrase index. */
+ iter->m_context->m_phrase_index->compact();
+ iter->m_context->m_modified = true;
+ delete iter;
+}
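+
+/* A minimal usage sketch of the import iterator above; the phrase, the pinyin
+ * and the sub phrase index 15 are examples only, any writable user phrase
+ * library index works:
+ *
+ *   import_iterator_t * iter = pinyin_begin_add_phrases(context, 15);
+ *   pinyin_iterator_add_phrase(iter, "你好", "ni'hao", -1);
+ *   pinyin_end_add_phrases(iter);
+ */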
+
+bool pinyin_save(pinyin_context_t * context){
+ if (!context->m_user_dir)
+ return false;
+
+ if (!context->m_modified)
+ return false;
+
+ context->m_phrase_index->compact();
+
+ const pinyin_table_info_t * phrase_files =
+ context->m_system_table_info.get_table_info();
+
+ /* skip the reserved zero phrase library. */
+ for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ PhraseIndexRange range;
+ int retval = context->m_phrase_index->get_range(i, range);
+
+ if (ERROR_NO_SUB_PHRASE_INDEX == retval)
+ continue;
+
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (NOT_USED == table_info->m_file_type)
+ continue;
+
+ const char * userfilename = table_info->m_user_filename;
+
+ if (NULL == userfilename)
+ continue;
+
+ if (SYSTEM_FILE == table_info->m_file_type ||
+ DICTIONARY == table_info->m_file_type) {
+ /* system phrase library */
+ MemoryChunk * chunk = new MemoryChunk;
+ MemoryChunk * log = new MemoryChunk;
+ const char * systemfilename = table_info->m_system_filename;
+
+ /* check bin file in system dir. */
+ gchar * chunkfilename = g_build_filename(context->m_system_dir,
+ systemfilename, NULL);
+ chunk->load(chunkfilename);
+ g_free(chunkfilename);
+ context->m_phrase_index->diff(i, chunk, log);
+
+ const char * userfilename = table_info->m_user_filename;
+ gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
+
+ gchar * tmppathname = g_build_filename(context->m_user_dir,
+ tmpfilename, NULL);
+ g_free(tmpfilename);
+
+ gchar * chunkpathname = g_build_filename(context->m_user_dir,
+ userfilename, NULL);
+ log->save(tmppathname);
+
+ int result = rename(tmppathname, chunkpathname);
+ if (0 != result)
+ fprintf(stderr, "rename %s to %s failed.\n",
+ tmppathname, chunkpathname);
+
+ g_free(chunkpathname);
+ g_free(tmppathname);
+ delete log;
+ }
+
+ if (USER_FILE == table_info->m_file_type) {
+ /* user phrase library */
+ MemoryChunk * chunk = new MemoryChunk;
+ context->m_phrase_index->store(i, chunk);
+
+ const char * userfilename = table_info->m_user_filename;
+ gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
+ gchar * tmppathname = g_build_filename(context->m_user_dir,
+ tmpfilename, NULL);
+ g_free(tmpfilename);
+
+ gchar * chunkpathname = g_build_filename(context->m_user_dir,
+ userfilename, NULL);
+
+ chunk->save(tmppathname);
+
+ int result = rename(tmppathname, chunkpathname);
+ if (0 != result)
+ fprintf(stderr, "rename %s to %s failed.\n",
+ tmppathname, chunkpathname);
+
+ g_free(chunkpathname);
+ g_free(tmppathname);
+ delete chunk;
+ }
+ }
+
+ /* save user pinyin table */
+ gchar * tmpfilename = g_build_filename
+ (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL);
+ unlink(tmpfilename);
+ gchar * filename = g_build_filename
+ (context->m_user_dir, USER_PINYIN_INDEX, NULL);
+
+ MemoryChunk * chunk = new MemoryChunk;
+ context->m_pinyin_table->store(chunk);
+ chunk->save(tmpfilename);
+ delete chunk;
+
+ int result = rename(tmpfilename, filename);
+ if (0 != result)
+ fprintf(stderr, "rename %s to %s failed.\n",
+ tmpfilename, filename);
+
+ g_free(tmpfilename);
+ g_free(filename);
+
+ /* save user phrase table */
+ tmpfilename = g_build_filename
+ (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL);
+ unlink(tmpfilename);
+ filename = g_build_filename
+ (context->m_user_dir, USER_PHRASE_INDEX, NULL);
+
+ chunk = new MemoryChunk;
+ context->m_phrase_table->store(chunk);
+ chunk->save(tmpfilename);
+ delete chunk;
+
+ result = rename(tmpfilename, filename);
+ if (0 != result)
+ fprintf(stderr, "rename %s to %s failed.\n",
+ tmpfilename, filename);
+
+ g_free(tmpfilename);
+ g_free(filename);
+
+ /* save user bi-gram */
+ tmpfilename = g_build_filename
+ (context->m_user_dir, USER_BIGRAM ".tmp", NULL);
+ unlink(tmpfilename);
+ filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL);
+ context->m_user_bigram->save_db(tmpfilename);
+
+ result = rename(tmpfilename, filename);
+ if (0 != result)
+ fprintf(stderr, "rename %s to %s failed.\n",
+ tmpfilename, filename);
+
+ g_free(tmpfilename);
+ g_free(filename);
+
+ mark_version(context);
+
+ context->m_modified = false;
+ return true;
+}
+
+bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
+ DoublePinyinScheme scheme){
+ context->m_double_pinyin_parser->set_scheme(scheme);
+ return true;
+}
+
+bool pinyin_set_chewing_scheme(pinyin_context_t * context,
+ ChewingScheme scheme){
+ context->m_chewing_parser->set_scheme(scheme);
+ return true;
+}
+
+void pinyin_fini(pinyin_context_t * context){
+ delete context->m_full_pinyin_parser;
+ delete context->m_double_pinyin_parser;
+ delete context->m_chewing_parser;
+ delete context->m_pinyin_table;
+ delete context->m_phrase_table;
+ delete context->m_phrase_index;
+ delete context->m_system_bigram;
+ delete context->m_user_bigram;
+ delete context->m_pinyin_lookup;
+ delete context->m_phrase_lookup;
+
+ g_free(context->m_system_dir);
+ g_free(context->m_user_dir);
+ context->m_modified = false;
+
+ delete context;
+}
+
+bool pinyin_mask_out(pinyin_context_t * context,
+ phrase_token_t mask,
+ phrase_token_t value) {
+
+ context->m_pinyin_table->mask_out(mask, value);
+ context->m_phrase_table->mask_out(mask, value);
+ context->m_user_bigram->mask_out(mask, value);
+
+ const pinyin_table_info_t * phrase_files =
+ context->m_system_table_info.get_table_info();
+
+ /* mask out the phrase index. */
+ for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
+ PhraseIndexRange range;
+ int retval = context->m_phrase_index->get_range(index, range);
+
+ if (ERROR_NO_SUB_PHRASE_INDEX == retval)
+ continue;
+
+ const pinyin_table_info_t * table_info = phrase_files + index;
+
+ if (NOT_USED == table_info->m_file_type)
+ continue;
+
+ const char * userfilename = table_info->m_user_filename;
+
+ if (NULL == userfilename)
+ continue;
+
+ if (SYSTEM_FILE == table_info->m_file_type ||
+ DICTIONARY == table_info->m_file_type) {
+ /* system phrase library */
+ MemoryChunk * chunk = new MemoryChunk;
+
+ const char * systemfilename = table_info->m_system_filename;
+ /* check bin file in system dir. */
+ gchar * chunkfilename = g_build_filename(context->m_system_dir,
+ systemfilename, NULL);
+ chunk->load(chunkfilename);
+ g_free(chunkfilename);
+
+ context->m_phrase_index->load(index, chunk);
+
+ const char * userfilename = table_info->m_user_filename;
+
+ chunkfilename = g_build_filename(context->m_user_dir,
+ userfilename, NULL);
+
+ MemoryChunk * log = new MemoryChunk;
+ log->load(chunkfilename);
+ g_free(chunkfilename);
+
+ /* merge the chunk log with mask. */
+ context->m_phrase_index->merge_with_mask(index, log, mask, value);
+ }
+
+ if (USER_FILE == table_info->m_file_type) {
+ /* user phrase library */
+ context->m_phrase_index->mask_out(index, mask, value);
+ }
+ }
+
+ context->m_phrase_index->compact();
+ return true;
+}
+
+/* copy from options to context->m_options. */
+bool pinyin_set_options(pinyin_context_t * context,
+ pinyin_option_t options){
+ context->m_options = options;
+ context->m_pinyin_table->set_options(context->m_options);
+ context->m_pinyin_lookup->set_options(context->m_options);
+ return true;
+}
+
+
+pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
+ pinyin_instance_t * instance = new pinyin_instance_t;
+ instance->m_context = context;
+
+ instance->m_raw_full_pinyin = NULL;
+
+ instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ instance->m_pinyin_key_rests =
+ g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+ instance->m_constraints = g_array_new
+ (TRUE, FALSE, sizeof(lookup_constraint_t));
+ instance->m_match_results =
+ g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ instance->m_candidates =
+ g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
+
+ return instance;
+}
+
+void pinyin_free_instance(pinyin_instance_t * instance){
+ g_free(instance->m_raw_full_pinyin);
+ g_array_free(instance->m_prefixes, TRUE);
+ g_array_free(instance->m_pinyin_keys, TRUE);
+ g_array_free(instance->m_pinyin_key_rests, TRUE);
+ g_array_free(instance->m_constraints, TRUE);
+ g_array_free(instance->m_match_results, TRUE);
+ g_array_free(instance->m_candidates, TRUE);
+
+ delete instance;
+}
+
+
+static bool pinyin_update_constraints(pinyin_instance_t * instance){
+ pinyin_context_t * & context = instance->m_context;
+ ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
+ CandidateConstraints & constraints = instance->m_constraints;
+
+ size_t key_len = constraints->len;
+ g_array_set_size(constraints, pinyin_keys->len);
+ for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
+ lookup_constraint_t * constraint =
+ &g_array_index(constraints, lookup_constraint_t, i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+
+ context->m_pinyin_lookup->validate_constraint
+ (constraints, pinyin_keys);
+
+ return true;
+}
+
+
+bool pinyin_guess_sentence(pinyin_instance_t * instance){
+ pinyin_context_t * & context = instance->m_context;
+
+ g_array_set_size(instance->m_prefixes, 0);
+ g_array_append_val(instance->m_prefixes, sentence_start);
+
+ pinyin_update_constraints(instance);
+ bool retval = context->m_pinyin_lookup->get_best_match
+ (instance->m_prefixes,
+ instance->m_pinyin_keys,
+ instance->m_constraints,
+ instance->m_match_results);
+
+ return retval;
+}
+
+bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
+ const char * prefix){
+ pinyin_context_t * & context = instance->m_context;
+
+ FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+
+ g_array_set_size(instance->m_prefixes, 0);
+ g_array_append_val(instance->m_prefixes, sentence_start);
+
+ glong len_str = 0;
+ ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL);
+ GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+ if (ucs4_str && len_str) {
+ /* add prefixes. */
+ for (ssize_t i = 1; i <= len_str; ++i) {
+ if (i > MAX_PHRASE_LENGTH)
+ break;
+
+ ucs4_t * start = ucs4_str + len_str - i;
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(tokens));
+ phrase_index->prepare_tokens(tokens);
+ int result = context->m_phrase_table->search(i, start, tokens);
+ int num = reduce_tokens(tokens, tokenarray);
+ phrase_index->destroy_tokens(tokens);
+
+ if (result & SEARCH_OK)
+ g_array_append_vals(instance->m_prefixes,
+ tokenarray->data, tokenarray->len);
+ }
+ }
+ g_array_free(tokenarray, TRUE);
+ g_free(ucs4_str);
+
+ pinyin_update_constraints(instance);
+ bool retval = context->m_pinyin_lookup->get_best_match
+ (instance->m_prefixes,
+ instance->m_pinyin_keys,
+ instance->m_constraints,
+ instance->m_match_results);
+
+ return retval;
+}
+
+bool pinyin_phrase_segment(pinyin_instance_t * instance,
+ const char * sentence){
+ pinyin_context_t * & context = instance->m_context;
+
+ const glong num_of_chars = g_utf8_strlen(sentence, -1);
+ glong ucs4_len = 0;
+ ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
+
+ g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
+
+ bool retval = context->m_phrase_lookup->get_best_match
+ (ucs4_len, ucs4_str, instance->m_match_results);
+
+ g_free(ucs4_str);
+ return retval;
+}
+
+/* the returned sentence should be freed by g_free(). */
+bool pinyin_get_sentence(pinyin_instance_t * instance,
+ char ** sentence){
+ pinyin_context_t * & context = instance->m_context;
+
+ bool retval = pinyin::convert_to_utf8
+ (context->m_phrase_index, instance->m_match_results,
+ NULL, false, *sentence);
+
+ return retval;
+}
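+
+/* A minimal usage sketch of the instance API in this file; the directories
+ * and the input are examples only:
+ *
+ *   pinyin_context_t * context =
+ *       pinyin_init("/usr/share/libpinyin/data", "/home/user/.libpinyin");
+ *   pinyin_instance_t * instance = pinyin_alloc_instance(context);
+ *
+ *   pinyin_parse_more_full_pinyins(instance, "nihao");
+ *   pinyin_guess_sentence(instance);
+ *
+ *   char * sentence = NULL;
+ *   pinyin_get_sentence(instance, &sentence);
+ *   g_free(sentence);
+ *
+ *   pinyin_free_instance(instance);
+ *   pinyin_save(context);
+ *   pinyin_fini(context);
+ */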
+
+bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
+ const char * onepinyin,
+ ChewingKey * onekey){
+ pinyin_context_t * & context = instance->m_context;
+
+ int pinyin_len = strlen(onepinyin);
+ bool retval = context->m_full_pinyin_parser->parse_one_key
+ ( context->m_options, *onekey, onepinyin, pinyin_len);
+ return retval;
+}
+
+size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
+ const char * pinyins){
+ pinyin_context_t * & context = instance->m_context;
+
+ g_free(instance->m_raw_full_pinyin);
+ instance->m_raw_full_pinyin = g_strdup(pinyins);
+ int pinyin_len = strlen(pinyins);
+
+ int parse_len = context->m_full_pinyin_parser->parse
+ ( context->m_options, instance->m_pinyin_keys,
+ instance->m_pinyin_key_rests, pinyins, pinyin_len);
+
+ return parse_len;
+}
+
+bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
+ const char * onepinyin,
+ ChewingKey * onekey){
+ pinyin_context_t * & context = instance->m_context;
+
+ int pinyin_len = strlen(onepinyin);
+ bool retval = context->m_double_pinyin_parser->parse_one_key
+ ( context->m_options, *onekey, onepinyin, pinyin_len);
+ return retval;
+}
+
+size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
+ const char * pinyins){
+ pinyin_context_t * & context = instance->m_context;
+ int pinyin_len = strlen(pinyins);
+
+ int parse_len = context->m_double_pinyin_parser->parse
+ ( context->m_options, instance->m_pinyin_keys,
+ instance->m_pinyin_key_rests, pinyins, pinyin_len);
+
+ return parse_len;
+}
+
+bool pinyin_parse_chewing(pinyin_instance_t * instance,
+ const char * onechewing,
+ ChewingKey * onekey){
+ pinyin_context_t * & context = instance->m_context;
+
+ int chewing_len = strlen(onechewing);
+ bool retval = context->m_chewing_parser->parse_one_key
+ ( context->m_options, *onekey, onechewing, chewing_len );
+ return retval;
+}
+
+size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
+ const char * chewings){
+ pinyin_context_t * & context = instance->m_context;
+ int chewing_len = strlen(chewings);
+
+ int parse_len = context->m_chewing_parser->parse
+ ( context->m_options, instance->m_pinyin_keys,
+ instance->m_pinyin_key_rests, chewings, chewing_len);
+
+ return parse_len;
+}
+
+bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
+ const char key, const char ** symbol) {
+ pinyin_context_t * & context = instance->m_context;
+ return context->m_chewing_parser->in_chewing_scheme
+ (context->m_options, key, symbol);
+}
+
+#if 0
+static gint compare_item_with_token(gconstpointer lhs,
+ gconstpointer rhs) {
+ lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
+ lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
+
+ phrase_token_t token_lhs = item_lhs->m_token;
+ phrase_token_t token_rhs = item_rhs->m_token;
+
+ return (token_lhs - token_rhs);
+}
+#endif
+
+static gint compare_item_with_frequency(gconstpointer lhs,
+ gconstpointer rhs) {
+ lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
+ lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
+
+ guint32 freq_lhs = item_lhs->m_freq;
+ guint32 freq_rhs = item_rhs->m_freq;
+
+ return -(freq_lhs - freq_rhs); /* in descending order */
+}
+
+static phrase_token_t _get_previous_token(pinyin_instance_t * instance,
+ size_t offset) {
+ phrase_token_t prev_token = null_token;
+ ssize_t i;
+
+ if (0 == offset) {
+ /* get previous token from prefixes. */
+ prev_token = sentence_start;
+ size_t prev_token_len = 0;
+
+ pinyin_context_t * context = instance->m_context;
+ TokenVector prefixes = instance->m_prefixes;
+ PhraseItem item;
+
+ for (size_t i = 0; i < prefixes->len; ++i) {
+ phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
+ if (sentence_start == token)
+ continue;
+
+ int retval = context->m_phrase_index->get_phrase_item(token, item);
+ if (ERROR_OK == retval) {
+ size_t token_len = item.get_phrase_length();
+ if (token_len > prev_token_len) {
+ /* found a longer match, save it. */
+ prev_token = token;
+ prev_token_len = token_len;
+ }
+ }
+ }
+ } else {
+ /* get previous token from match results. */
+ assert (0 < offset);
+
+ phrase_token_t cur_token = g_array_index
+ (instance->m_match_results, phrase_token_t, offset);
+ if (null_token != cur_token) {
+ for (i = offset - 1; i >= 0; --i) {
+ cur_token = g_array_index
+ (instance->m_match_results, phrase_token_t, i);
+ if (null_token != cur_token) {
+ prev_token = cur_token;
+ break;
+ }
+ }
+ }
+ }
+
+ return prev_token;
+}
+
+static void _append_items(pinyin_context_t * context,
+ PhraseIndexRanges ranges,
+ lookup_candidate_t * template_item,
+ CandidateVector items) {
+ /* reduce and append to a single GArray. */
+ for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
+ if (NULL == ranges[m])
+ continue;
+
+ for (size_t n = 0; n < ranges[m]->len; ++n) {
+ PhraseIndexRange * range =
+ &g_array_index(ranges[m], PhraseIndexRange, n);
+ for (size_t k = range->m_range_begin;
+ k < range->m_range_end; ++k) {
+ lookup_candidate_t item;
+ item.m_candidate_type = template_item->m_candidate_type;
+ item.m_token = k;
+ item.m_orig_rest = template_item->m_orig_rest;
+ item.m_new_pinyins = g_strdup(template_item->m_new_pinyins);
+ item.m_freq = template_item->m_freq;
+ g_array_append_val(items, item);
+ }
+ }
+ }
+}
+
+#if 0
+static void _remove_duplicated_items(CandidateVector items) {
+ /* remove the duplicated items. */
+ phrase_token_t last_token = null_token, saved_token;
+ for (size_t n = 0; n < items->len; ++n) {
+ lookup_candidate_t * item = &g_array_index
+ (items, lookup_candidate_t, n);
+
+ saved_token = item->m_token;
+ if (last_token == saved_token) {
+ g_array_remove_index(items, n);
+ n--;
+ }
+ last_token = saved_token;
+ }
+}
+#endif
+
+static void _compute_frequency_of_items(pinyin_context_t * context,
+ phrase_token_t prev_token,
+ SingleGram * merged_gram,
+ CandidateVector items) {
+ pinyin_option_t & options = context->m_options;
+ ssize_t i;
+
+ PhraseItem cached_item;
+ /* compute all freqs. */
+ for (i = 0; i < items->len; ++i) {
+ lookup_candidate_t * item = &g_array_index
+ (items, lookup_candidate_t, i);
+ phrase_token_t & token = item->m_token;
+
+ gfloat bigram_poss = 0; guint32 total_freq = 0;
+ if (options & DYNAMIC_ADJUST) {
+ if (null_token != prev_token) {
+ guint32 bigram_freq = 0;
+ merged_gram->get_total_freq(total_freq);
+ merged_gram->get_freq(token, bigram_freq);
+ if (0 != total_freq)
+ bigram_poss = bigram_freq / (gfloat)total_freq;
+ }
+ }
+
+ /* compute the m_freq. */
+ FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+ phrase_index->get_phrase_item(token, cached_item);
+ total_freq = phrase_index->get_phrase_index_total_freq();
+ assert (0 < total_freq);
+
+ gfloat lambda = context->m_system_table_info.get_lambda();
+
+ /* Note: possibility value <= 1.0. */
+ guint32 freq = (lambda * bigram_poss +
+ (1 - lambda) *
+ cached_item.get_unigram_frequency() /
+ (gfloat) total_freq) * 256 * 256 * 256;
+ item->m_freq = freq;
+ }
+}
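+
+/* Note on the scaling above: the interpolated possibility
+ * lambda * bigram + (1 - lambda) * unigram is at most 1.0 and is amplified by
+ * 256 * 256 * 256 so it can be stored and compared as a guint32 in m_freq;
+ * a possibility of 0.5, for example, is expected to become about 8388608. */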
+
+static bool _prepend_sentence_candidate(pinyin_instance_t * instance,
+ CandidateVector candidates) {
+ /* check whether the best match candidate exists. */
+ gchar * sentence = NULL;
+ pinyin_get_sentence(instance, &sentence);
+ if (NULL == sentence)
+ return false;
+ g_free(sentence);
+
+ /* prepend best match candidate to candidates. */
+ lookup_candidate_t candidate;
+ candidate.m_candidate_type = BEST_MATCH_CANDIDATE;
+ g_array_prepend_val(candidates, candidate);
+
+ return true;
+}
+
+static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
+ size_t offset,
+ CandidateVector candidates) {
+ /* populate m_phrase_string in lookup_candidate_t. */
+
+ for(size_t i = 0; i < candidates->len; ++i) {
+ lookup_candidate_t * candidate = &g_array_index
+ (candidates, lookup_candidate_t, i);
+
+ switch(candidate->m_candidate_type) {
+ case BEST_MATCH_CANDIDATE: {
+ gchar * sentence = NULL;
+ pinyin_get_sentence(instance, &sentence);
+ candidate->m_phrase_string = g_strdup
+ (g_utf8_offset_to_pointer(sentence, offset));
+ g_free(sentence);
+ break;
+ }
+ case NORMAL_CANDIDATE:
+ case DIVIDED_CANDIDATE:
+ case RESPLIT_CANDIDATE:
+ pinyin_token_get_phrase
+ (instance, candidate->m_token, NULL,
+ &(candidate->m_phrase_string));
+ break;
+ case ZOMBIE_CANDIDATE:
+ break;
+ }
+ }
+
+ return true;
+}
+
+static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
+ gconstpointer rhs,
+ gpointer userdata) {
+ size_t index_lhs = *((size_t *) lhs);
+ size_t index_rhs = *((size_t *) rhs);
+ CandidateVector candidates = (CandidateVector) userdata;
+
+ lookup_candidate_t * candidate_lhs =
+ &g_array_index(candidates, lookup_candidate_t, index_lhs);
+ lookup_candidate_t * candidate_rhs =
+ &g_array_index(candidates, lookup_candidate_t, index_rhs);
+
+ return -strcmp(candidate_lhs->m_phrase_string,
+ candidate_rhs->m_phrase_string); /* in descending order */
+}
+
+
+static bool _remove_duplicated_items_by_phrase_string
+(pinyin_instance_t * instance,
+ CandidateVector candidates) {
+ size_t i;
+ /* create the GArray of indexed item */
+ GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
+ for (i = 0; i < candidates->len; ++i)
+ g_array_append_val(indices, i);
+
+ /* sort the indices array by phrase string */
+ g_array_sort_with_data
+ (indices, compare_indexed_item_with_phrase_string, candidates);
+
+ /* mark duplicated items as zombie candidates */
+ lookup_candidate_t * cur_item, * saved_item = NULL;
+ for (i = 0; i < indices->len; ++i) {
+ size_t cur_index = g_array_index(indices, size_t, i);
+ cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index);
+
+ /* handle the first candidate */
+ if (NULL == saved_item) {
+ saved_item = cur_item;
+ continue;
+ }
+
+ if (0 == strcmp(saved_item->m_phrase_string,
+ cur_item->m_phrase_string)) {
+ /* found duplicated candidates */
+
+ /* keep best match candidate */
+ if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) {
+ cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ continue;
+ }
+
+ if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
+ saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ saved_item = cur_item;
+ continue;
+ }
+
+ /* keep the higher possibility one
+ to quickly move the word forward in the candidate list */
+ if (cur_item->m_freq > saved_item->m_freq) {
+ /* found a better candidate */
+ saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ saved_item = cur_item;
+ continue;
+ } else {
+ cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ continue;
+ }
+ } else {
+ /* keep the current candidate */
+ saved_item = cur_item;
+ }
+ }
+
+ g_array_free(indices, TRUE);
+
+ /* remove zombie candidates from the returned candidates */
+ for (i = 0; i < candidates->len; ++i) {
+ lookup_candidate_t * candidate = &g_array_index
+ (candidates, lookup_candidate_t, i);
+
+ if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) {
+ g_free(candidate->m_phrase_string);
+ g_free(candidate->m_new_pinyins);
+ g_array_remove_index(candidates, i);
+ i--;
+ }
+ }
+
+ return true;
+}
+
+static bool _free_candidates(CandidateVector candidates) {
+ /* free candidates */
+ for (size_t i = 0; i < candidates->len; ++i) {
+ lookup_candidate_t * candidate = &g_array_index
+ (candidates, lookup_candidate_t, i);
+ g_free(candidate->m_phrase_string);
+ g_free(candidate->m_new_pinyins);
+ }
+ g_array_set_size(candidates, 0);
+
+ return true;
+}
+
+bool pinyin_guess_candidates(pinyin_instance_t * instance,
+ size_t offset) {
+
+ pinyin_context_t * & context = instance->m_context;
+ pinyin_option_t & options = context->m_options;
+ ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
+
+ _free_candidates(instance->m_candidates);
+
+ size_t pinyin_len = pinyin_keys->len - offset;
+ ssize_t i;
+
+ /* lookup the previous token here. */
+ phrase_token_t prev_token = null_token;
+
+ if (options & DYNAMIC_ADJUST) {
+ prev_token = _get_previous_token(instance, offset);
+ }
+
+ SingleGram merged_gram;
+ SingleGram * system_gram = NULL, * user_gram = NULL;
+
+ if (options & DYNAMIC_ADJUST) {
+ if (null_token != prev_token) {
+ context->m_system_bigram->load(prev_token, system_gram);
+ context->m_user_bigram->load(prev_token, user_gram);
+ merge_single_gram(&merged_gram, system_gram, user_gram);
+ }
+ }
+
+ PhraseIndexRanges ranges;
+ memset(ranges, 0, sizeof(ranges));
+ context->m_phrase_index->prepare_ranges(ranges);
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
+
+ for (i = pinyin_len; i >= 1; --i) {
+ g_array_set_size(items, 0);
+
+ ChewingKey * keys = &g_array_index
+ (pinyin_keys, ChewingKey, offset);
+
+ /* do pinyin search. */
+ int retval = context->m_pinyin_table->search
+ (i, keys, ranges);
+
+ if ( !(retval & SEARCH_OK) )
+ continue;
+
+ lookup_candidate_t template_item;
+ _append_items(context, ranges, &template_item, items);
+
+#if 0
+ g_array_sort(items, compare_item_with_token);
+
+ _remove_duplicated_items(items);
+#endif
+
+ _compute_frequency_of_items(context, prev_token, &merged_gram, items);
+
+ /* sort the candidates of the same length by frequency. */
+ g_array_sort(items, compare_item_with_frequency);
+
+ /* transfer the items back to tokens, and save them into candidates */
+ for (size_t k = 0; k < items->len; ++k) {
+ lookup_candidate_t * item = &g_array_index
+ (items, lookup_candidate_t, k);
+ g_array_append_val(instance->m_candidates, *item);
+ }
+
+#if 0
+ if (!(retval & SEARCH_CONTINUED))
+ break;
+#endif
+ }
+
+ g_array_free(items, TRUE);
+ context->m_phrase_index->destroy_ranges(ranges);
+ if (system_gram)
+ delete system_gram;
+ if (user_gram)
+ delete user_gram;
+
+ /* post process to remove duplicated candidates */
+
+ _prepend_sentence_candidate(instance, instance->m_candidates);
+
+ _compute_phrase_strings_of_items(instance, offset, instance->m_candidates);
+
+ _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates);
+
+ return true;
+}
+
+
+static bool _try_divided_table(pinyin_instance_t * instance,
+ PhraseIndexRanges ranges,
+ size_t offset,
+ CandidateVector items){
+ bool found = false;
+
+ pinyin_context_t * & context = instance->m_context;
+ pinyin_option_t & options = context->m_options;
+ ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
+ ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
+
+ assert(pinyin_keys->len == pinyin_key_rests->len);
+ guint num_keys = pinyin_keys->len;
+ assert(offset < num_keys);
+
+ /* handle "^xian$" -> "xi'an" here */
+ ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
+ ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
+ ChewingKeyRest, offset);
+ ChewingKeyRest orig_rest = *rest;
+ guint16 tone = CHEWING_ZERO_TONE;
+
+ const divided_table_item_t * item = NULL;
+
+ /* back up tone */
+ if (options & USE_TONE) {
+ tone = key->m_tone;
+ if (CHEWING_ZERO_TONE != tone) {
+ key->m_tone = CHEWING_ZERO_TONE;
+ rest->m_raw_end --;
+ }
+ }
+
+ item = context->m_full_pinyin_parser->retrieve_divided_item
+ (options, key, rest, instance->m_raw_full_pinyin,
+ strlen(instance->m_raw_full_pinyin));
+
+ if (item) {
+ /* no ops */
+ assert(item->m_new_freq > 0);
+
+ ChewingKey divided_keys[2];
+ const char * pinyin = item->m_new_keys[0];
+ assert(context->m_full_pinyin_parser->
+ parse_one_key(options, divided_keys[0],
+ pinyin, strlen(pinyin)));
+ pinyin = item->m_new_keys[1];
+ assert(context->m_full_pinyin_parser->
+ parse_one_key(options, divided_keys[1],
+ pinyin, strlen(pinyin)));
+
+ gchar * new_pinyins = g_strdup_printf
+ ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
+
+ /* propagate the tone */
+ if (options & USE_TONE) {
+ if (CHEWING_ZERO_TONE != tone) {
+ assert(0 < tone && tone <= 5);
+ divided_keys[1].m_tone = tone;
+
+ gchar * tmp_str = g_strdup_printf
+ ("%s%d", new_pinyins, tone);
+ g_free(new_pinyins);
+ new_pinyins = tmp_str;
+ }
+ }
+
+ /* do pinyin search. */
+ int retval = context->m_pinyin_table->search
+ (2, divided_keys, ranges);
+
+ if (retval & SEARCH_OK) {
+ lookup_candidate_t template_item;
+ template_item.m_candidate_type = DIVIDED_CANDIDATE;
+ template_item.m_orig_rest = orig_rest;
+ template_item.m_new_pinyins = new_pinyins;
+
+ _append_items(context, ranges, &template_item, items);
+ found = true;
+ }
+ g_free(new_pinyins);
+ }
+
+ /* restore tones */
+ if (options & USE_TONE) {
+ if (CHEWING_ZERO_TONE != tone) {
+ key->m_tone = tone;
+ rest->m_raw_end ++;
+ }
+ }
+
+ return found;
+}
+
+static bool _try_resplit_table(pinyin_instance_t * instance,
+ PhraseIndexRanges ranges,
+ size_t offset,
+ CandidateVector items){
+ bool found = false;
+
+ pinyin_context_t * & context = instance->m_context;
+ pinyin_option_t & options = context->m_options;
+ ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
+ ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
+
+ assert(pinyin_keys->len == pinyin_key_rests->len);
+ guint num_keys = pinyin_keys->len;
+ assert(offset + 1 < num_keys);
+
+ guint16 next_tone = CHEWING_ZERO_TONE;
+
+ /* handle "^fa'nan$" -> "fan'an" here */
+ ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests,
+ ChewingKeyRest, offset);
+ ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests,
+ ChewingKeyRest, offset + 1);
+    /* there is a "'" between the two keys; bail out */
+ if (cur_rest->m_raw_end != next_rest->m_raw_begin)
+ return found;
+
+ ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset);
+ ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey,
+ offset + 1);
+
+    /* the current key already carries a tone; bail out */
+ if (CHEWING_ZERO_TONE != cur_key->m_tone)
+ return found;
+
+ ChewingKeyRest orig_rest;
+ orig_rest.m_raw_begin = cur_rest->m_raw_begin;
+ orig_rest.m_raw_end = next_rest->m_raw_end;
+
+    /* back up the tone */
+ if (options & USE_TONE) {
+ next_tone = next_key->m_tone;
+ if (CHEWING_ZERO_TONE != next_tone) {
+ next_key->m_tone = CHEWING_ZERO_TONE;
+ next_rest->m_raw_end --;
+ }
+ }
+
+ /* lookup re-split table */
+ const char * str = instance->m_raw_full_pinyin;
+ const resplit_table_item_t * item_by_orig =
+ context->m_full_pinyin_parser->
+ retrieve_resplit_item_by_original_pinyins
+ (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
+
+ const resplit_table_item_t * item_by_new =
+ context->m_full_pinyin_parser->
+ retrieve_resplit_item_by_resplit_pinyins
+ (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str));
+
+    /* the same pair of pinyins can not appear on both sides of the re-split table. */
+ assert(!(item_by_orig && item_by_new));
+
+ ChewingKey resplit_keys[2];
+ const char * pinyins[2];
+
+ bool tosearch = false;
+ if (item_by_orig && item_by_orig->m_new_freq) {
+ pinyins[0] = item_by_orig->m_new_keys[0];
+ pinyins[1] = item_by_orig->m_new_keys[1];
+
+ assert(context->m_full_pinyin_parser->
+ parse_one_key(options, resplit_keys[0],
+ pinyins[0], strlen(pinyins[0])));
+
+ assert(context->m_full_pinyin_parser->
+ parse_one_key(options, resplit_keys[1],
+ pinyins[1], strlen(pinyins[1])));
+ tosearch = true;
+ }
+
+ if (item_by_new && item_by_new->m_orig_freq) {
+ pinyins[0] = item_by_new->m_orig_keys[0];
+ pinyins[1] = item_by_new->m_orig_keys[1];
+
+ assert(context->m_full_pinyin_parser->
+ parse_one_key(options, resplit_keys[0],
+ pinyins[0], strlen(pinyins[0])));
+
+ assert(context->m_full_pinyin_parser->
+ parse_one_key(options, resplit_keys[1],
+ pinyins[1], strlen(pinyins[1])));
+ tosearch = true;
+ }
+
+ if (tosearch) {
+ gchar * new_pinyins = g_strdup_printf
+ ("%s'%s", pinyins[0], pinyins[1]);
+
+ /* propagate the tone */
+ if (options & USE_TONE) {
+ if (CHEWING_ZERO_TONE != next_tone) {
+ assert(0 < next_tone && next_tone <= 5);
+ resplit_keys[1].m_tone = next_tone;
+
+ gchar * tmp_str = g_strdup_printf
+ ("%s%d", new_pinyins, next_tone);
+ g_free(new_pinyins);
+ new_pinyins = tmp_str;
+ }
+ }
+
+ /* do pinyin search. */
+ int retval = context->m_pinyin_table->search
+ (2, resplit_keys, ranges);
+
+ if (retval & SEARCH_OK) {
+ lookup_candidate_t template_item;
+ template_item.m_candidate_type = RESPLIT_CANDIDATE;
+ template_item.m_orig_rest = orig_rest;
+ template_item.m_new_pinyins = new_pinyins;
+
+ _append_items(context, ranges, &template_item, items);
+ found = true;
+ }
+ g_free(new_pinyins);
+ }
+
+ /* restore tones */
+ if (options & USE_TONE) {
+ if (CHEWING_ZERO_TONE != next_tone) {
+ next_key->m_tone = next_tone;
+ next_rest->m_raw_end ++;
+ }
+ }
+
+ return found;
+}
+
+bool pinyin_guess_full_pinyin_candidates(pinyin_instance_t * instance,
+ size_t offset){
+
+ pinyin_context_t * & context = instance->m_context;
+ pinyin_option_t & options = context->m_options;
+ ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
+
+ _free_candidates(instance->m_candidates);
+
+ size_t pinyin_len = pinyin_keys->len - offset;
+ pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len);
+ ssize_t i;
+
+ /* lookup the previous token here. */
+ phrase_token_t prev_token = null_token;
+
+ if (options & DYNAMIC_ADJUST) {
+ prev_token = _get_previous_token(instance, offset);
+ }
+
+ SingleGram merged_gram;
+ SingleGram * system_gram = NULL, * user_gram = NULL;
+
+ if (options & DYNAMIC_ADJUST) {
+ if (null_token != prev_token) {
+ context->m_system_bigram->load(prev_token, system_gram);
+ context->m_user_bigram->load(prev_token, user_gram);
+ merge_single_gram(&merged_gram, system_gram, user_gram);
+ }
+ }
+
+ PhraseIndexRanges ranges;
+ memset(ranges, 0, sizeof(ranges));
+ context->m_phrase_index->prepare_ranges(ranges);
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
+
+ if (1 == pinyin_len) {
+        /* Because there is only one pinyin left,
+         * the following for-loop will not produce two-character candidates;
+         * this if-branch fills the candidate list with
+         * two-character candidates instead.
+         */
+
+ if (options & USE_DIVIDED_TABLE) {
+ g_array_set_size(items, 0);
+
+ if (_try_divided_table(instance, ranges, offset, items)) {
+
+#if 0
+ g_array_sort(items, compare_item_with_token);
+
+ _remove_duplicated_items(items);
+#endif
+
+ _compute_frequency_of_items(context, prev_token,
+ &merged_gram, items);
+
+ /* sort the candidates of the same length by frequency. */
+ g_array_sort(items, compare_item_with_frequency);
+
+                /* transfer the items into the candidate list */
+ for (i = 0; i < items->len; ++i) {
+ lookup_candidate_t * item = &g_array_index
+ (items, lookup_candidate_t, i);
+ g_array_append_val(instance->m_candidates, *item);
+ }
+ }
+ }
+ }
+
+ for (i = pinyin_len; i >= 1; --i) {
+ bool found = false;
+ g_array_set_size(items, 0);
+
+ if (2 == i) {
+ /* handle fuzzy pinyin segment here. */
+ if (options & USE_DIVIDED_TABLE) {
+ found = _try_divided_table(instance, ranges, offset, items) ||
+ found;
+ }
+ if (options & USE_RESPLIT_TABLE) {
+ found = _try_resplit_table(instance, ranges, offset, items) ||
+ found;
+ }
+ }
+
+ ChewingKey * keys = &g_array_index
+ (pinyin_keys, ChewingKey, offset);
+
+ /* do pinyin search. */
+ int retval = context->m_pinyin_table->search
+ (i, keys, ranges);
+
+ found = (retval & SEARCH_OK) || found;
+
+ if ( !found )
+ continue;
+
+ lookup_candidate_t template_item;
+ _append_items(context, ranges, &template_item, items);
+
+#if 0
+ g_array_sort(items, compare_item_with_token);
+
+ _remove_duplicated_items(items);
+#endif
+
+ _compute_frequency_of_items(context, prev_token, &merged_gram, items);
+
+ g_array_sort(items, compare_item_with_frequency);
+
+ for (size_t k = 0; k < items->len; ++k) {
+ lookup_candidate_t * item = &g_array_index
+ (items, lookup_candidate_t, k);
+ g_array_append_val(instance->m_candidates, *item);
+ }
+
+#if 0
+ if (!(retval & SEARCH_CONTINUED))
+ break;
+#endif
+ }
+
+ g_array_free(items, TRUE);
+ context->m_phrase_index->destroy_ranges(ranges);
+ if (system_gram)
+ delete system_gram;
+ if (user_gram)
+ delete user_gram;
+
+ /* post process to remove duplicated candidates */
+
+ _prepend_sentence_candidate(instance, instance->m_candidates);
+
+ _compute_phrase_strings_of_items(instance, offset, instance->m_candidates);
+
+ _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates);
+
+ return true;
+}
+
+
+int pinyin_choose_candidate(pinyin_instance_t * instance,
+ size_t offset,
+ lookup_candidate_t * candidate){
+ pinyin_context_t * & context = instance->m_context;
+
+ if (DIVIDED_CANDIDATE == candidate->m_candidate_type ||
+ RESPLIT_CANDIDATE == candidate->m_candidate_type) {
+ /* update full pinyin. */
+ gchar * oldpinyins = instance->m_raw_full_pinyin;
+ const ChewingKeyRest rest = candidate->m_orig_rest;
+ oldpinyins[rest.m_raw_begin] = '\0';
+ const gchar * left_part = oldpinyins;
+ const gchar * right_part = oldpinyins + rest.m_raw_end;
+ gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins,
+ right_part, NULL);
+ g_free(oldpinyins);
+ instance->m_raw_full_pinyin = newpinyins;
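+
+        /* Worked example (illustrative values): with the raw full pinyin
+         * "fanan" parsed as "fa" + "nan", a RESPLIT_CANDIDATE whose
+         * m_orig_rest covers "fanan" and whose m_new_pinyins is "fan'an"
+         * rewrites the raw input to "fan'an" before re-parsing below. */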
+
+ /* re-parse the full pinyin. */
+ const gchar * pinyins = instance->m_raw_full_pinyin;
+ int pinyin_len = strlen(pinyins);
+ int parse_len = context->m_full_pinyin_parser->parse
+ (context->m_options, instance->m_pinyin_keys,
+ instance->m_pinyin_key_rests, pinyins, pinyin_len);
+
+ /* Note: there may be some un-parsable input here. */
+ }
+
+ /* sync m_constraints to the length of m_pinyin_keys. */
+ bool retval = context->m_pinyin_lookup->validate_constraint
+ (instance->m_constraints, instance->m_pinyin_keys);
+
+ phrase_token_t token = candidate->m_token;
+ guint8 len = context->m_pinyin_lookup->add_constraint
+ (instance->m_constraints, offset, token);
+
+    /* safeguard: validate the m_constraints again. */
+ retval = context->m_pinyin_lookup->validate_constraint
+ (instance->m_constraints, instance->m_pinyin_keys) && len;
+
+ return offset + len;
+}
+
+bool pinyin_clear_constraint(pinyin_instance_t * instance,
+ size_t offset){
+ pinyin_context_t * & context = instance->m_context;
+
+ bool retval = context->m_pinyin_lookup->clear_constraint
+ (instance->m_constraints, offset);
+
+ return retval;
+}
+
+bool pinyin_lookup_tokens(pinyin_instance_t * instance,
+ const char * phrase, GArray * tokenarray){
+ pinyin_context_t * & context = instance->m_context;
+ FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+
+ glong ucs4_len = 0;
+ ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL);
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index->prepare_tokens(tokens);
+ int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens);
+ int num = reduce_tokens(tokens, tokenarray);
+ phrase_index->destroy_tokens(tokens);
+
+ return SEARCH_OK & retval;
+}
+
+bool pinyin_train(pinyin_instance_t * instance){
+ if (!instance->m_context->m_user_dir)
+ return false;
+
+ pinyin_context_t * & context = instance->m_context;
+ context->m_modified = true;
+
+ bool retval = context->m_pinyin_lookup->train_result2
+ (instance->m_pinyin_keys, instance->m_constraints,
+ instance->m_match_results);
+
+ return retval;
+}
+
+bool pinyin_reset(pinyin_instance_t * instance){
+ g_free(instance->m_raw_full_pinyin);
+ instance->m_raw_full_pinyin = NULL;
+
+ g_array_set_size(instance->m_prefixes, 0);
+ g_array_set_size(instance->m_pinyin_keys, 0);
+ g_array_set_size(instance->m_pinyin_key_rests, 0);
+ g_array_set_size(instance->m_constraints, 0);
+ g_array_set_size(instance->m_match_results, 0);
+ _free_candidates(instance->m_candidates);
+
+ return true;
+}
+
+bool pinyin_get_chewing_string(pinyin_instance_t * instance,
+ ChewingKey * key,
+ gchar ** utf8_str) {
+ *utf8_str = NULL;
+ if (0 == key->get_table_index())
+ return false;
+
+ *utf8_str = key->get_chewing_string();
+ return true;
+}
+
+bool pinyin_get_pinyin_string(pinyin_instance_t * instance,
+ ChewingKey * key,
+ gchar ** utf8_str) {
+ *utf8_str = NULL;
+ if (0 == key->get_table_index())
+ return false;
+
+ *utf8_str = key->get_pinyin_string();
+ return true;
+}
+
+bool pinyin_get_pinyin_strings(pinyin_instance_t * instance,
+ ChewingKey * key,
+ gchar ** shengmu,
+ gchar ** yunmu) {
+ if (0 == key->get_table_index())
+ return false;
+
+ if (shengmu)
+ *shengmu = key->get_shengmu_string();
+ if (yunmu)
+ *yunmu = key->get_yunmu_string();
+ return true;
+}
+
+bool pinyin_token_get_phrase(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint * len,
+ gchar ** utf8_str) {
+ pinyin_context_t * & context = instance->m_context;
+ PhraseItem item;
+ ucs4_t buffer[MAX_PHRASE_LENGTH];
+
+ int retval = context->m_phrase_index->get_phrase_item(token, item);
+ if (ERROR_OK != retval)
+ return false;
+
+ item.get_phrase_string(buffer);
+ guint length = item.get_phrase_length();
+ if (len)
+ *len = length;
+ if (utf8_str)
+ *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
+ return true;
+}
+
+bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint * num){
+ *num = 0;
+ pinyin_context_t * & context = instance->m_context;
+ PhraseItem item;
+
+ int retval = context->m_phrase_index->get_phrase_item(token, item);
+ if (ERROR_OK != retval)
+ return false;
+
+ *num = item.get_n_pronunciation();
+ return true;
+}
+
+bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint nth,
+ ChewingKeyVector keys){
+ g_array_set_size(keys, 0);
+ pinyin_context_t * & context = instance->m_context;
+ PhraseItem item;
+ ChewingKey buffer[MAX_PHRASE_LENGTH];
+ guint32 freq = 0;
+
+ int retval = context->m_phrase_index->get_phrase_item(token, item);
+ if (ERROR_OK != retval)
+ return false;
+
+ item.get_nth_pronunciation(nth, buffer, freq);
+ guint8 len = item.get_phrase_length();
+ g_array_append_vals(keys, buffer, len);
+ return true;
+}
+
+bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint * freq) {
+ *freq = 0;
+ pinyin_context_t * & context = instance->m_context;
+ PhraseItem item;
+
+ int retval = context->m_phrase_index->get_phrase_item(token, item);
+ if (ERROR_OK != retval)
+ return false;
+
+ *freq = item.get_unigram_frequency();
+ return true;
+}
+
+bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint delta){
+ pinyin_context_t * & context = instance->m_context;
+ int retval = context->m_phrase_index->add_unigram_frequency
+ (token, delta);
+ return ERROR_OK == retval;
+}
+
+bool pinyin_get_n_candidate(pinyin_instance_t * instance,
+ guint * num) {
+ *num = instance->m_candidates->len;
+ return true;
+}
+
+bool pinyin_get_candidate(pinyin_instance_t * instance,
+ guint index,
+ lookup_candidate_t ** candidate) {
+ CandidateVector & candidates = instance->m_candidates;
+
+ *candidate = NULL;
+
+ if (index >= candidates->len)
+ return false;
+
+ *candidate = &g_array_index(candidates, lookup_candidate_t, index);
+
+ return true;
+}
+
+bool pinyin_get_candidate_type(pinyin_instance_t * instance,
+ lookup_candidate_t * candidate,
+ lookup_candidate_type_t * type) {
+ *type = candidate->m_candidate_type;
+ return true;
+}
+
+bool pinyin_get_candidate_string(pinyin_instance_t * instance,
+ lookup_candidate_t * candidate,
+ const gchar ** utf8_str) {
+ *utf8_str = candidate->m_phrase_string;
+ return true;
+}
+
+bool pinyin_get_n_pinyin(pinyin_instance_t * instance,
+ guint * num) {
+ *num = 0;
+
+ if (instance->m_pinyin_keys->len !=
+ instance->m_pinyin_key_rests->len)
+ return false;
+
+ *num = instance->m_pinyin_keys->len;
+ return true;
+}
+
+bool pinyin_get_pinyin_key(pinyin_instance_t * instance,
+ guint index,
+ ChewingKey ** key) {
+ ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
+
+ *key = NULL;
+
+ if (index >= pinyin_keys->len)
+ return false;
+
+ *key = &g_array_index(pinyin_keys, ChewingKey, index);
+
+ return true;
+}
+
+bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance,
+ guint index,
+ ChewingKeyRest ** key_rest) {
+ ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
+
+ *key_rest = NULL;
+
+ if (index >= pinyin_key_rests->len)
+ return false;
+
+ *key_rest = &g_array_index(pinyin_key_rests, ChewingKeyRest, index);
+
+ return true;
+}
+
+bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance,
+ ChewingKeyRest * key_rest,
+ guint16 * begin, guint16 * end) {
+ if (begin)
+ *begin = key_rest->m_raw_begin;
+
+ if (end)
+ *end = key_rest->m_raw_end;
+
+ return true;
+}
+
+bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance,
+ ChewingKeyRest * key_rest,
+ guint16 * length) {
+ *length = key_rest->length();
+ return true;
+}
+
+bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance,
+ const gchar ** utf8_str) {
+ *utf8_str = instance->m_raw_full_pinyin;
+ return true;
+}
+
+bool pinyin_get_n_phrase(pinyin_instance_t * instance,
+ guint * num) {
+ *num = instance->m_match_results->len;
+ return true;
+}
+
+bool pinyin_get_phrase_token(pinyin_instance_t * instance,
+ guint index,
+ phrase_token_t * token){
+ MatchResults & match_results = instance->m_match_results;
+
+ *token = null_token;
+
+ if (index >= match_results->len)
+ return false;
+
+ *token = g_array_index(match_results, phrase_token_t, index);
+
+ return true;
+}
+
+
+/**
+ * Note: prefix is the text before the pre-edit string.
+ */
diff --git a/src/pinyin.h b/src/pinyin.h
new file mode 100644
index 0000000..8c39c3d
--- /dev/null
+++ b/src/pinyin.h
@@ -0,0 +1,719 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef PINYIN_H
+#define PINYIN_H
+
+
+#include "novel_types.h"
+#include "pinyin_custom2.h"
+
+
+G_BEGIN_DECLS
+
+typedef struct _ChewingKey ChewingKey;
+typedef struct _ChewingKeyRest ChewingKeyRest;
+
+typedef struct _pinyin_context_t pinyin_context_t;
+typedef struct _pinyin_instance_t pinyin_instance_t;
+typedef struct _lookup_candidate_t lookup_candidate_t;
+
+typedef struct _import_iterator_t import_iterator_t;
+
+typedef enum _lookup_candidate_type_t{
+ BEST_MATCH_CANDIDATE = 1,
+ NORMAL_CANDIDATE,
+ DIVIDED_CANDIDATE,
+ RESPLIT_CANDIDATE,
+ ZOMBIE_CANDIDATE
+} lookup_candidate_type_t;
+
+/**
+ * pinyin_init:
+ * @systemdir: the system wide language model data directory.
+ * @userdir: the user's language model data directory.
+ * @returns: the newly created pinyin context, NULL if failed.
+ *
+ * Create a new pinyin context.
+ *
+ */
+pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir);
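+
+/*
+ * A minimal lifecycle sketch (illustrative; the directories, the phrase
+ * library index and the option set below are placeholders, and error
+ * handling is omitted):
+ *
+ *   pinyin_context_t * context =
+ *       pinyin_init("/usr/share/libpinyin/data", "/home/user/.libpinyin");
+ *   pinyin_load_phrase_library(context, 1);
+ *   pinyin_set_options(context, DYNAMIC_ADJUST | USE_DIVIDED_TABLE);
+ *
+ *   pinyin_instance_t * instance = pinyin_alloc_instance(context);
+ *   ... use the instance ...
+ *   pinyin_free_instance(instance);
+ *
+ *   pinyin_save(context);
+ *   pinyin_fini(context);
+ */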
+
+/**
+ * pinyin_load_phrase_library:
+ * @context: the pinyin context.
+ * @index: the phrase index to be loaded.
+ * @returns: whether the load succeeded.
+ *
+ * Load the sub phrase library of the index.
+ *
+ */
+bool pinyin_load_phrase_library(pinyin_context_t * context,
+ guint8 index);
+
+/**
+ * pinyin_unload_phrase_library:
+ * @context: the pinyin context.
+ * @index: the phrase index to be unloaded.
+ * @returns: whether the unload succeeded.
+ *
+ * Unload the sub phrase library of the index.
+ *
+ */
+bool pinyin_unload_phrase_library(pinyin_context_t * context,
+ guint8 index);
+
+/**
+ * pinyin_begin_add_phrases:
+ * @context: the pinyin context.
+ * @index: the phrase index to be imported.
+ * @returns: the import iterator.
+ *
+ * Begin to add phrases.
+ *
+ */
+import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context,
+ guint8 index);
+
+/**
+ * pinyin_iterator_add_phrase:
+ * @iter: the import iterator.
+ * @phrase: the phrase string.
+ * @pinyin: the pinyin string.
+ * @count: the count of the phrase/pinyin pair, -1 to use the default value.
+ * @returns: whether the add operation succeeded.
+ *
+ * Add a pair of phrase and pinyin with count.
+ *
+ */
+bool pinyin_iterator_add_phrase(import_iterator_t * iter,
+ const char * phrase,
+ const char * pinyin,
+ gint count);
+
+/**
+ * pinyin_end_add_phrases:
+ * @iter: the import iterator.
+ *
+ * End adding phrases.
+ *
+ */
+void pinyin_end_add_phrases(import_iterator_t * iter);
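+
+/*
+ * A sketch of importing a user phrase (the phrase library index and the
+ * phrase/pinyin pair below are placeholders):
+ *
+ *   import_iterator_t * iter = pinyin_begin_add_phrases(context, 15);
+ *   pinyin_iterator_add_phrase(iter, "你好", "ni'hao", -1);
+ *   pinyin_end_add_phrases(iter);
+ *   pinyin_save(context);
+ */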
+
+/**
+ * pinyin_save:
+ * @context: the pinyin context to be saved into user directory.
+ * @returns: whether the save succeeded.
+ *
+ * Save the user's self-learning information of the pinyin context.
+ *
+ */
+bool pinyin_save(pinyin_context_t * context);
+
+/**
+ * pinyin_set_double_pinyin_scheme:
+ * @context: the pinyin context.
+ * @scheme: the double pinyin scheme.
+ * @returns: whether setting the double pinyin scheme succeeded.
+ *
+ * Change the double pinyin scheme of the pinyin context.
+ *
+ */
+bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context,
+ DoublePinyinScheme scheme);
+
+/**
+ * pinyin_set_chewing_scheme:
+ * @context: the pinyin context.
+ * @scheme: the chewing scheme.
+ * @returns: whether setting the chewing scheme succeeded.
+ *
+ * Change the chewing scheme of the pinyin context.
+ *
+ */
+bool pinyin_set_chewing_scheme(pinyin_context_t * context,
+ ChewingScheme scheme);
+
+/**
+ * pinyin_fini:
+ * @context: the pinyin context.
+ *
+ * Finalize the pinyin context.
+ *
+ */
+void pinyin_fini(pinyin_context_t * context);
+
+
+/**
+ * pinyin_mask_out:
+ * @context: the pinyin context.
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: whether the mask out operation is successful.
+ *
+ * Mask out the matched phrase tokens.
+ *
+ */
+bool pinyin_mask_out(pinyin_context_t * context,
+ phrase_token_t mask,
+ phrase_token_t value);
+
+
+/**
+ * pinyin_set_options:
+ * @context: the pinyin context.
+ * @options: the pinyin options of the pinyin context.
+ * @returns: whether setting the options succeeded.
+ *
+ * Set the options of the pinyin context.
+ *
+ */
+bool pinyin_set_options(pinyin_context_t * context,
+ pinyin_option_t options);
+
+/**
+ * pinyin_alloc_instance:
+ * @context: the pinyin context.
+ * @returns: the newly allocated pinyin instance, NULL if failed.
+ *
+ * Allocate a new pinyin instance from the context.
+ *
+ */
+pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context);
+
+/**
+ * pinyin_free_instance:
+ * @instance: the pinyin instance.
+ *
+ * Free the pinyin instance.
+ *
+ */
+void pinyin_free_instance(pinyin_instance_t * instance);
+
+
+/**
+ * pinyin_guess_sentence:
+ * @instance: the pinyin instance.
+ * @returns: whether the sentence is guessed successfully.
+ *
+ * Guess a sentence from the saved pinyin keys in the instance.
+ *
+ */
+bool pinyin_guess_sentence(pinyin_instance_t * instance);
+
+/**
+ * pinyin_guess_sentence_with_prefix:
+ * @instance: the pinyin instance.
+ * @prefix: the prefix before the sentence.
+ * @returns: whether the sentence is guessed successfully.
+ *
+ * Guess a sentence from the saved pinyin keys with a prefix.
+ *
+ */
+bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance,
+ const char * prefix);
+
+/**
+ * pinyin_phrase_segment:
+ * @instance: the pinyin instance.
+ * @sentence: the utf-8 sentence to be segmented.
+ * @returns: whether the sentence is segmented successfully.
+ *
+ * Segment a sentence and save the result in the instance.
+ *
+ */
+bool pinyin_phrase_segment(pinyin_instance_t * instance,
+ const char * sentence);
+
+/**
+ * pinyin_get_sentence:
+ * @instance: the pinyin instance.
+ * @sentence: the saved sentence in the instance.
+ * @returns: whether the sentence is already saved in the instance.
+ *
+ * Get the sentence from the instance.
+ *
+ * Note: the returned sentence should be freed by g_free().
+ *
+ */
+bool pinyin_get_sentence(pinyin_instance_t * instance,
+ char ** sentence);
+
+/**
+ * pinyin_parse_full_pinyin:
+ * @instance: the pinyin instance.
+ * @onepinyin: a single full pinyin to be parsed.
+ * @onekey: the parsed key.
+ * @returns: whether the parse is successful.
+ *
+ * Parse a single full pinyin.
+ *
+ */
+bool pinyin_parse_full_pinyin(pinyin_instance_t * instance,
+ const char * onepinyin,
+ ChewingKey * onekey);
+
+/**
+ * pinyin_parse_more_full_pinyins:
+ * @instance: the pinyin instance.
+ * @pinyins: the full pinyins to be parsed.
+ * @returns: the parsed length of the full pinyins.
+ *
+ * Parse multiple full pinyins and save them in the instance.
+ *
+ */
+size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance,
+ const char * pinyins);
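+
+/*
+ * A minimal sketch of converting full pinyin into a sentence (the input
+ * string is a placeholder; error checks are omitted):
+ *
+ *   pinyin_parse_more_full_pinyins(instance, "nihao");
+ *   pinyin_guess_sentence(instance);
+ *
+ *   char * sentence = NULL;
+ *   pinyin_get_sentence(instance, &sentence);
+ *   ... show the sentence ...
+ *   g_free(sentence);
+ */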
+
+/**
+ * pinyin_parse_double_pinyin:
+ * @instance: the pinyin instance.
+ * @onepinyin: the single double pinyin to be parsed.
+ * @onekey: the parsed key.
+ * @returns: whether the parse is successful.
+ *
+ * Parse a single double pinyin.
+ *
+ */
+bool pinyin_parse_double_pinyin(pinyin_instance_t * instance,
+ const char * onepinyin,
+ ChewingKey * onekey);
+
+/**
+ * pinyin_parse_more_double_pinyins:
+ * @instance: the pinyin instance.
+ * @pinyins: the double pinyins to be parsed.
+ * @returns: the parsed length of the double pinyins.
+ *
+ * Parse multiple double pinyins and save them in the instance.
+ *
+ */
+size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance,
+ const char * pinyins);
+
+/**
+ * pinyin_parse_chewing:
+ * @instance: the pinyin instance.
+ * @onechewing: the single chewing to be parsed.
+ * @onekey: the parsed key.
+ * @returns: whether the parse is successful.
+ *
+ * Parse a single chewing.
+ *
+ */
+bool pinyin_parse_chewing(pinyin_instance_t * instance,
+ const char * onechewing,
+ ChewingKey * onekey);
+
+/**
+ * pinyin_parse_more_chewings:
+ * @instance: the pinyin instance.
+ * @chewings: the chewings to be parsed.
+ * @returns: the parsed length of the chewings.
+ *
+ * Parse multiple chewings and save them in the instance.
+ *
+ */
+size_t pinyin_parse_more_chewings(pinyin_instance_t * instance,
+ const char * chewings);
+
+/**
+ * pinyin_in_chewing_keyboard:
+ * @instance: the pinyin instance.
+ * @key: the input key.
+ * @symbol: the chewing symbol.
+ * @returns: whether the key is in the current chewing scheme.
+ *
+ * Check whether the input key is in the current chewing scheme.
+ *
+ */
+bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance,
+ const char key, const char ** symbol);
+/**
+ * pinyin_guess_candidates:
+ * @instance: the pinyin instance.
+ * @offset: the offset in the pinyin keys.
+ * @returns: whether a list of candidates is obtained.
+ *
+ * Guess the candidates at the offset.
+ *
+ */
+bool pinyin_guess_candidates(pinyin_instance_t * instance,
+ size_t offset);
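+
+/*
+ * A sketch of listing the candidates at an offset (assumes the pinyin
+ * keys were already parsed into the instance):
+ *
+ *   pinyin_guess_candidates(instance, offset);
+ *
+ *   guint num = 0;
+ *   pinyin_get_n_candidate(instance, &num);
+ *   for (guint i = 0; i < num; ++i) {
+ *       lookup_candidate_t * candidate = NULL;
+ *       pinyin_get_candidate(instance, i, &candidate);
+ *
+ *       const gchar * word = NULL;
+ *       pinyin_get_candidate_string(instance, candidate, &word);
+ *       ... show the word ...
+ *   }
+ */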
+
+/**
+ * pinyin_guess_full_pinyin_candidates:
+ * @instance: the pinyin instance.
+ * @offset: the offset in the pinyin keys.
+ * @returns: whether a list of lookup_candidate_t candidates is obtained.
+ *
+ * Guess the full pinyin candidates at the offset.
+ *
+ */
+bool pinyin_guess_full_pinyin_candidates(pinyin_instance_t * instance,
+ size_t offset);
+
+/**
+ * pinyin_choose_candidate:
+ * @instance: the pinyin instance.
+ * @offset: the offset in the pinyin keys.
+ * @candidate: the selected candidate.
+ * @returns: the cursor after the chosen candidate.
+ *
+ * Choose a full pinyin candidate at the offset.
+ *
+ */
+int pinyin_choose_candidate(pinyin_instance_t * instance,
+ size_t offset,
+ lookup_candidate_t * candidate);
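+
+/*
+ * A sketch of committing the user's choice and advancing the cursor
+ * (selected_index stands for whatever the UI selected):
+ *
+ *   lookup_candidate_t * candidate = NULL;
+ *   pinyin_get_candidate(instance, selected_index, &candidate);
+ *
+ *   offset = pinyin_choose_candidate(instance, offset, candidate);
+ *   pinyin_guess_sentence(instance);
+ */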
+
+/**
+ * pinyin_clear_constraint:
+ * @instance: the pinyin instance.
+ * @offset: the offset in the pinyin keys.
+ * @returns: whether the constraint is cleared.
+ *
+ * Clear the previously chosen candidate.
+ *
+ */
+bool pinyin_clear_constraint(pinyin_instance_t * instance,
+ size_t offset);
+
+/**
+ * pinyin_lookup_tokens:
+ * @instance: the pinyin instance.
+ * @phrase: the phrase to be looked up.
+ * @tokenarray: the returned GArray of tokens.
+ * @returns: whether the lookup operation is successful.
+ *
+ * Look up the tokens for the phrase utf8 string.
+ *
+ */
+bool pinyin_lookup_tokens(pinyin_instance_t * instance,
+ const char * phrase, GArray * tokenarray);
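+
+/*
+ * A sketch of resolving a phrase string to its tokens (the phrase is a
+ * placeholder; the GArray holds phrase_token_t values):
+ *
+ *   GArray * tokens = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ *   if (pinyin_lookup_tokens(instance, "你好", tokens)) {
+ *       for (guint i = 0; i < tokens->len; ++i) {
+ *           phrase_token_t token = g_array_index(tokens, phrase_token_t, i);
+ *           guint freq = 0;
+ *           pinyin_token_get_unigram_frequency(instance, token, &freq);
+ *       }
+ *   }
+ *   g_array_free(tokens, TRUE);
+ */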
+
+/**
+ * pinyin_train:
+ * @instance: the pinyin instance.
+ * @returns: whether the sentence is trained.
+ *
+ * Train the current user input sentence.
+ *
+ */
+bool pinyin_train(pinyin_instance_t * instance);
+
+/**
+ * pinyin_reset:
+ * @instance: the pinyin instance.
+ * @returns: whether the pinyin instance is reset.
+ *
+ * Reset the pinyin instance.
+ *
+ */
+bool pinyin_reset(pinyin_instance_t * instance);
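+
+/*
+ * A typical commit sequence after the user accepts a sentence (a sketch;
+ * whether and when to train is the caller's policy):
+ *
+ *   pinyin_train(instance);
+ *   pinyin_reset(instance);
+ *   pinyin_save(context);
+ */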
+
+/**
+ * pinyin_get_chewing_string:
+ * @instance: the pinyin instance.
+ * @key: the chewing key.
+ * @utf8_str: the chewing string.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the chewing string of the key.
+ *
+ */
+bool pinyin_get_chewing_string(pinyin_instance_t * instance,
+ ChewingKey * key,
+ gchar ** utf8_str);
+
+/**
+ * pinyin_get_pinyin_string:
+ * @instance: the pinyin instance.
+ * @key: the pinyin key.
+ * @utf8_str: the pinyin string.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the pinyin string of the key.
+ *
+ */
+bool pinyin_get_pinyin_string(pinyin_instance_t * instance,
+ ChewingKey * key,
+ gchar ** utf8_str);
+
+/**
+ * pinyin_get_pinyin_strings:
+ * @instance: the pinyin instance.
+ * @key: the pinyin key.
+ * @shengmu: the shengmu string.
+ * @yunmu: the yunmu string.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the shengmu and yunmu strings of the key.
+ *
+ */
+bool pinyin_get_pinyin_strings(pinyin_instance_t * instance,
+ ChewingKey * key,
+ gchar ** shengmu,
+ gchar ** yunmu);
+
+/**
+ * pinyin_token_get_phrase:
+ * @instance: the pinyin instance.
+ * @token: the phrase token.
+ * @len: the phrase length.
+ * @utf8_str: the phrase string.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the phrase length and utf8 string.
+ *
+ */
+bool pinyin_token_get_phrase(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint * len,
+ gchar ** utf8_str);
+
+/**
+ * pinyin_token_get_n_pronunciation:
+ * @instance: the pinyin instance.
+ * @token: the phrase token.
+ * @num: the number of pinyins.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the number of the pinyins.
+ *
+ */
+bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint * num);
+
+/**
+ * pinyin_token_get_nth_pronunciation:
+ * @instance: the pinyin instance.
+ * @token: the phrase token.
+ * @nth: the index of the pinyin.
+ * @keys: the GArray of chewing key.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the nth pinyin from the phrase.
+ *
+ */
+bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint nth,
+ ChewingKeyVector keys);
+
+/**
+ * pinyin_token_get_unigram_frequency:
+ * @instance: the pinyin instance.
+ * @token: the phrase token.
+ * @freq: the unigram frequency of the phrase.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the unigram frequency of the phrase.
+ *
+ */
+bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint * freq);
+
+/**
+ * pinyin_token_add_unigram_frequency:
+ * @instance: the pinyin instance.
+ * @token: the phrase token.
+ * @delta: the delta of the unigram frequency.
+ * @returns: whether the add operation is successful.
+ *
+ * Add delta to the unigram frequency of the phrase token.
+ *
+ */
+bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance,
+ phrase_token_t token,
+ guint delta);
+
+/**
+ * pinyin_get_n_candidate:
+ * @instance: the pinyin instance.
+ * @num: the number of the candidates.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the number of the candidates.
+ *
+ */
+bool pinyin_get_n_candidate(pinyin_instance_t * instance,
+ guint * num);
+
+/**
+ * pinyin_get_candidate:
+ * @instance: the pinyin instance.
+ * @index: the index of the candidate.
+ * @candidate: the retrieved candidate.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the candidate of the index from the candidates.
+ *
+ */
+bool pinyin_get_candidate(pinyin_instance_t * instance,
+ guint index,
+ lookup_candidate_t ** candidate);
+
+/**
+ * pinyin_get_candidate_type:
+ * @instance: the pinyin instance.
+ * @candidate: the lookup candidate.
+ * @type: the type of the candidate.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the type of the lookup candidate.
+ *
+ */
+bool pinyin_get_candidate_type(pinyin_instance_t * instance,
+ lookup_candidate_t * candidate,
+ lookup_candidate_type_t * type);
+
+/**
+ * pinyin_get_candidate_string:
+ * @instance: the pinyin instance.
+ * @candidate: the lookup candidate.
+ * @utf8_str: the string of the candidate.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the string of the candidate.
+ *
+ */
+bool pinyin_get_candidate_string(pinyin_instance_t * instance,
+ lookup_candidate_t * candidate,
+ const gchar ** utf8_str);
+
+/**
+ * pinyin_get_n_pinyin:
+ * @instance: the pinyin instance.
+ * @num: the number of the pinyins.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the number of the pinyins.
+ *
+ */
+bool pinyin_get_n_pinyin(pinyin_instance_t * instance,
+ guint * num);
+
+/**
+ * pinyin_get_pinyin_key:
+ * @instance: the pinyin instance.
+ * @index: the index of the pinyin key.
+ * @key: the retrieved pinyin key.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the pinyin key of the index from the pinyin keys.
+ *
+ */
+bool pinyin_get_pinyin_key(pinyin_instance_t * instance,
+ guint index,
+ ChewingKey ** key);
+
+/**
+ * pinyin_get_pinyin_key_rest:
+ * @instance: the pinyin instance.
+ * @index: the index of the pinyin key rest.
+ * @key_rest: the retrieved pinyin key rest.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the pinyin key rest of the index from the pinyin key rests.
+ *
+ */
+bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance,
+ guint index,
+ ChewingKeyRest ** key_rest);
+
+/**
+ * pinyin_get_pinyin_key_rest_positions:
+ * @instance: the pinyin instance.
+ * @key_rest: the pinyin key rest.
+ * @begin: the begin position of the corresponding pinyin key.
+ * @end: the end position of the corresponding pinyin key.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the positions of the pinyin key rest.
+ *
+ */
+bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance,
+ ChewingKeyRest * key_rest,
+ guint16 * begin, guint16 * end);
+
+/**
+ * pinyin_get_pinyin_key_rest_length:
+ * @instance: the pinyin instance.
+ * @key_rest: the pinyin key rest.
+ * @length: the length of the corresponding pinyin key.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the length of the corresponding pinyin key.
+ *
+ */
+bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance,
+ ChewingKeyRest * key_rest,
+ guint16 * length);
+
+/**
+ * pinyin_get_raw_full_pinyin:
+ * @instance: the pinyin instance.
+ * @utf8_str: the modified raw full pinyin after choosing a candidate.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the modified raw full pinyin after choosing a candidate.
+ *
+ */
+bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance,
+ const gchar ** utf8_str);
+
+/**
+ * pinyin_get_n_phrase:
+ * @instance: the pinyin instance.
+ * @num: the number of the phrase tokens.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the number of the phrase tokens.
+ *
+ */
+bool pinyin_get_n_phrase(pinyin_instance_t * instance,
+ guint * num);
+
+/**
+ * pinyin_get_phrase_token:
+ * @instance: the pinyin instance.
+ * @index: the index of the phrase token.
+ * @token: the retrieved phrase token.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the phrase token of the index from the phrase tokens.
+ *
+ */
+bool pinyin_get_phrase_token(pinyin_instance_t * instance,
+ guint index,
+ phrase_token_t * token);
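+
+/*
+ * A sketch of walking the guessed phrase tokens after
+ * pinyin_guess_sentence(); each token can be resolved to its string:
+ *
+ *   guint num = 0;
+ *   pinyin_get_n_phrase(instance, &num);
+ *   for (guint i = 0; i < num; ++i) {
+ *       phrase_token_t token = null_token;
+ *       pinyin_get_phrase_token(instance, i, &token);
+ *
+ *       gchar * phrase = NULL;
+ *       pinyin_token_get_phrase(instance, token, NULL, &phrase);
+ *       ... use the phrase ...
+ *       g_free(phrase);
+ *   }
+ */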
+
+/* hack here. */
+typedef ChewingKey PinyinKey;
+typedef ChewingKeyRest PinyinKeyPos;
+
+
+G_END_DECLS
+
+#endif
diff --git a/src/pinyin_internal.cpp b/src/pinyin_internal.cpp
new file mode 100644
index 0000000..79fb688
--- /dev/null
+++ b/src/pinyin_internal.cpp
@@ -0,0 +1,4 @@
+#include "pinyin_internal.h"
+
+
+/* Place holder for pinyin internal library. */
diff --git a/src/pinyin_internal.h b/src/pinyin_internal.h
new file mode 100644
index 0000000..3f97efa
--- /dev/null
+++ b/src/pinyin_internal.h
@@ -0,0 +1,73 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef PINYIN_INTERNAL_H
+#define PINYIN_INTERNAL_H
+
+#include <stdio.h>
+#include "novel_types.h"
+#include "memory_chunk.h"
+#include "pinyin_custom2.h"
+#include "chewing_key.h"
+#include "pinyin_parser2.h"
+#include "pinyin_phrase2.h"
+#include "chewing_large_table.h"
+#include "phrase_large_table2.h"
+#include "facade_chewing_table.h"
+#include "facade_phrase_table2.h"
+#include "phrase_index.h"
+#include "phrase_index_logger.h"
+#include "ngram.h"
+#include "lookup.h"
+#include "pinyin_lookup2.h"
+#include "phrase_lookup.h"
+#include "tag_utility.h"
+#include "table_info.h"
+
+
+/* training module */
+#include "flexible_ngram.h"
+
+
+/* define filenames */
+#define SYSTEM_TABLE_INFO "table.conf"
+#define USER_TABLE_INFO "user.conf"
+#define SYSTEM_BIGRAM "bigram.db"
+#define USER_BIGRAM "user_bigram.db"
+#define DELETED_BIGRAM "deleted_bigram.db"
+#define SYSTEM_PINYIN_INDEX "pinyin_index.bin"
+#define USER_PINYIN_INDEX "user_pinyin_index.bin"
+#define SYSTEM_PHRASE_INDEX "phrase_index.bin"
+#define USER_PHRASE_INDEX "user_phrase_index.bin"
+
+
+using namespace pinyin;
+
+
+/* the following fixes the build on Debian GNU/kFreeBSD */
+#include <errno.h>
+#ifndef ENODATA
+#define ENODATA ENOENT
+#endif
+
+
+#endif
diff --git a/src/storage/CMakeLists.txt b/src/storage/CMakeLists.txt
new file mode 100644
index 0000000..e33e213
--- /dev/null
+++ b/src/storage/CMakeLists.txt
@@ -0,0 +1,38 @@
+set(
+ CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC"
+)
+
+set(
+ LIBSTORAGE_HEADERS
+ chewing_key.h
+ pinyin_custom2.h
+)
+
+set(
+ LIBSTORAGE_SOURCES
+ phrase_index.cpp
+ phrase_large_table2.cpp
+ ngram.cpp
+ tag_utility.cpp
+ pinyin_parser2.cpp
+ chewing_large_table.cpp
+)
+
+add_library(
+ storage
+ STATIC
+ ${LIBSTORAGE_SOURCES}
+)
+
+target_link_libraries(
+ storage
+ ${GLIB2_LIBRARIES}
+ ${BERKELEY_DB_LIBRARIES}
+)
+
+install(
+ FILES
+ ${LIBSTORAGE_HEADERS}
+ DESTINATION
+ ${DIR_INCLUDE_LIBPINYIN}
+)
diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am
new file mode 100644
index 0000000..d805f18
--- /dev/null
+++ b/src/storage/Makefile.am
@@ -0,0 +1,59 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+INCLUDES = -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ @GLIB2_CFLAGS@
+
+libpinyinincludedir = $(includedir)/libpinyin-@VERSION@
+
+libpinyininclude_HEADERS= pinyin_custom2.h
+
+
+noinst_HEADERS = chewing_enum.h \
+ chewing_key.h \
+ pinyin_parser2.h \
+ phrase_index.h \
+ phrase_index_logger.h \
+ phrase_large_table2.h \
+ ngram.h \
+ flexible_ngram.h \
+ tag_utility.h \
+ pinyin_parser_table.h \
+ double_pinyin_table.h \
+ chewing_table.h \
+ pinyin_phrase2.h \
+ chewing_large_table.h \
+ facade_chewing_table.h \
+ facade_phrase_table2.h \
+ table_info.h
+
+
+noinst_LTLIBRARIES = libstorage.la
+
+libstorage_la_CXXFLAGS = "-fPIC"
+
+libstorage_la_LDFLAGS = -static
+
+libstorage_la_SOURCES = phrase_index.cpp \
+ phrase_large_table2.cpp \
+ ngram.cpp \
+ tag_utility.cpp \
+ pinyin_parser2.cpp \
+ chewing_large_table.cpp \
+ table_info.cpp
+
diff --git a/src/storage/chewing_enum.h b/src/storage/chewing_enum.h
new file mode 100644
index 0000000..e6d212d
--- /dev/null
+++ b/src/storage/chewing_enum.h
@@ -0,0 +1,104 @@
+/* This file is generated by python scripts. Don't edit this file directly.
+ */
+
+#ifndef CHEWING_ENUM_H
+#define CHEWING_ENUM_H
+
+namespace pinyin{
+
+/**
+ * @brief enums of chewing initial element.
+ */
+
+enum ChewingInitial
+{
+CHEWING_ZERO_INITIAL = 0,
+CHEWING_B = 1,
+CHEWING_C = 2,
+CHEWING_CH = 3,
+CHEWING_D = 4,
+CHEWING_F = 5,
+CHEWING_H = 6,
+CHEWING_G = 7,
+CHEWING_K = 8,
+CHEWING_J = 9,
+CHEWING_M = 10,
+CHEWING_N = 11,
+CHEWING_L = 12,
+CHEWING_R = 13,
+CHEWING_P = 14,
+CHEWING_Q = 15,
+CHEWING_S = 16,
+CHEWING_SH = 17,
+CHEWING_T = 18,
+PINYIN_W = 19,
+CHEWING_X = 20,
+PINYIN_Y = 21,
+CHEWING_Z = 22,
+CHEWING_ZH = 23,
+CHEWING_LAST_INITIAL = CHEWING_ZH,
+CHEWING_NUMBER_OF_INITIALS = CHEWING_LAST_INITIAL + 1
+};
+
+
+/**
+ * @brief enums of chewing middle element.
+ */
+
+enum ChewingMiddle
+{
+CHEWING_ZERO_MIDDLE = 0,
+CHEWING_I = 1,
+CHEWING_U = 2,
+CHEWING_V = 3,
+CHEWING_LAST_MIDDLE = CHEWING_V,
+CHEWING_NUMBER_OF_MIDDLES = CHEWING_LAST_MIDDLE + 1
+};
+
+
+/**
+ * @brief enums of chewing final element.
+ */
+enum ChewingFinal
+{
+CHEWING_ZERO_FINAL = 0,
+CHEWING_A = 1,
+CHEWING_AI = 2,
+CHEWING_AN = 3,
+CHEWING_ANG = 4,
+CHEWING_AO = 5,
+CHEWING_E = 6,
+INVALID_EA = 7,
+CHEWING_EI = 8,
+CHEWING_EN = 9,
+CHEWING_ENG = 10,
+CHEWING_ER = 11,
+CHEWING_NG = 12,
+CHEWING_O = 13,
+PINYIN_ONG = 14,
+CHEWING_OU = 15,
+PINYIN_IN = 16,
+PINYIN_ING = 17,
+CHEWING_LAST_FINAL = PINYIN_ING,
+CHEWING_NUMBER_OF_FINALS = CHEWING_LAST_FINAL + 1
+};
+
+
+/**
+ * @brief enums of chewing tone element.
+ */
+enum ChewingTone
+{
+CHEWING_ZERO_TONE = 0,
+CHEWING_1 = 1,
+CHEWING_2 = 2,
+CHEWING_3 = 3,
+CHEWING_4 = 4,
+CHEWING_5 = 5,
+CHEWING_LAST_TONE = CHEWING_5,
+CHEWING_NUMBER_OF_TONES = CHEWING_LAST_TONE + 1
+};
+
+};
+
+#endif
diff --git a/src/storage/chewing_key.h b/src/storage/chewing_key.h
new file mode 100644
index 0000000..f3202e8
--- /dev/null
+++ b/src/storage/chewing_key.h
@@ -0,0 +1,111 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef CHEWING_KEY_H
+#define CHEWING_KEY_H
+
+#include <glib.h>
+#include "chewing_enum.h"
+
+using namespace pinyin;
+
+G_BEGIN_DECLS
+
+/** @file chewing_key.h
+ * @brief the definitions of chewing key related classes and structs.
+ */
+
+
+/** Note: The parsed pinyins are stored in the following two
+ *          GArrays to speed up chewing table lookups,
+ *          as the chewing large table only contains information of struct ChewingKey.
+ */
+
+struct _ChewingKey
+{
+ guint16 m_initial : 5;
+ guint16 m_middle : 2;
+ guint16 m_final : 5;
+ guint16 m_tone : 3;
+
+ _ChewingKey() {
+ m_initial = CHEWING_ZERO_INITIAL;
+ m_middle = CHEWING_ZERO_MIDDLE;
+ m_final = CHEWING_ZERO_FINAL;
+ m_tone = CHEWING_ZERO_TONE;
+ }
+
+ _ChewingKey(ChewingInitial initial, ChewingMiddle middle,
+ ChewingFinal final) {
+ m_initial = initial;
+ m_middle = middle;
+ m_final = final;
+ m_tone = CHEWING_ZERO_TONE;
+ }
+
+public:
+ gint get_table_index();
+
+ /* Note: the return value should be freed by g_free. */
+ gchar * get_pinyin_string();
+ gchar * get_shengmu_string();
+ gchar * get_yunmu_string();
+ gchar * get_chewing_string();
+};
+
+typedef struct _ChewingKey ChewingKey;
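+
+/* For illustration (an assumption based on the enum names above, not an
+ * authoritative mapping): the full pinyin "zhuang" decomposes into
+ * CHEWING_ZH / CHEWING_U / CHEWING_ANG with CHEWING_ZERO_TONE, e.g.
+ *
+ *   ChewingKey key(CHEWING_ZH, CHEWING_U, CHEWING_ANG);
+ *   gchar * pinyin = key.get_pinyin_string();
+ *   ... "zhuang" is expected here ...
+ *   g_free(pinyin);
+ */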
+
+static inline bool operator == (ChewingKey lhs, ChewingKey rhs) {
+ if (lhs.m_initial != rhs.m_initial)
+ return false;
+ if (lhs.m_middle != rhs.m_middle)
+ return false;
+ if (lhs.m_final != rhs.m_final)
+ return false;
+ if (lhs.m_tone != rhs.m_tone)
+ return false;
+ return true;
+}
+
+struct _ChewingKeyRest
+{
+    /* Note: the table index is removed;
+     * please use get_table_index in ChewingKey instead.
+     */
+ guint16 m_raw_begin; /* the begin of the raw input. */
+ guint16 m_raw_end; /* the end of the raw input. */
+
+ _ChewingKeyRest() {
+ /* the 0th item in pinyin parser table is reserved for invalid. */
+ m_raw_begin = 0;
+ m_raw_end = 0;
+ }
+
+ guint16 length() {
+ return m_raw_end - m_raw_begin;
+ }
+};
+
+typedef struct _ChewingKeyRest ChewingKeyRest;
+
+G_END_DECLS
+
+#endif
diff --git a/src/storage/chewing_large_table.cpp b/src/storage/chewing_large_table.cpp
new file mode 100644
index 0000000..2eb8658
--- /dev/null
+++ b/src/storage/chewing_large_table.cpp
@@ -0,0 +1,1047 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "chewing_large_table.h"
+#include <assert.h>
+#include "pinyin_phrase2.h"
+#include "pinyin_parser2.h"
+
+
+/* internal class definition */
+
+namespace pinyin{
+class ChewingLengthIndexLevel{
+
+protected:
+ GArray * m_chewing_array_indexes;
+
+public:
+ /* constructor/destructor */
+ ChewingLengthIndexLevel();
+ ~ChewingLengthIndexLevel();
+
+ /* load/store method */
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset,
+ table_offset_t & end);
+
+ /* search method */
+ int search(pinyin_option_t options, int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const;
+
+ /* add/remove index method */
+ int add_index(int phrase_length, /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token);
+ int remove_index(int phrase_length, /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token);
+
+ /* get length method */
+ int get_length() const;
+
+ /* mask out method */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
+
+
+template<size_t phrase_length>
+class ChewingArrayIndexLevel{
+protected:
+ typedef PinyinIndexItem2<phrase_length> IndexItem;
+
+protected:
+ MemoryChunk m_chunk;
+
+ /* compress consecutive tokens */
+ int convert(pinyin_option_t options,
+ const ChewingKey keys[],
+ IndexItem * begin,
+ IndexItem * end,
+ PhraseIndexRanges ranges) const;
+
+public:
+ /* load/store method */
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset,
+ table_offset_t & end);
+
+ /* search method */
+ int search(pinyin_option_t options, /* in */const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const;
+
+ /* add/remove index method */
+ int add_index(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token);
+ int remove_index(/* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token);
+
+ /* get length method */
+ int get_length() const;
+
+ /* mask out method */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
+
+};
+
+
+using namespace pinyin;
+
+/* class implementation */
+
+ChewingBitmapIndexLevel::ChewingBitmapIndexLevel(pinyin_option_t options)
+ : m_options(options) {
+ memset(m_chewing_length_indexes, 0, sizeof(m_chewing_length_indexes));
+}
+
+void ChewingBitmapIndexLevel::reset() {
+ for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k)
+ for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
+ for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m)
+ for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES;
+ ++n) {
+ ChewingLengthIndexLevel * & length_array =
+ m_chewing_length_indexes[k][l][m][n];
+ if (length_array)
+ delete length_array;
+ length_array = NULL;
+ }
+}
+
+
+/* search method */
+
+int ChewingBitmapIndexLevel::search(int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const {
+ assert(phrase_length > 0);
+ return initial_level_search(phrase_length, keys, ranges);
+}
+
+int ChewingBitmapIndexLevel::initial_level_search (int phrase_length,
+ /* in */ const ChewingKey keys[], /* out */ PhraseIndexRanges ranges) const {
+
+/* macros */
+#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \
+ { \
+ result |= middle_and_final_level_search(ORIGIN, phrase_length, \
+ keys, ranges); \
+ if (m_options & AMBIGUITY) { \
+ result |= middle_and_final_level_search(ANOTHER, \
+ phrase_length, \
+ keys, ranges); \
+ } \
+ return result; \
+ }
+
+ /* deal with ambiguities */
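+    /* For instance, with the PINYIN_AMB_C_CH option enabled, an initial
+     * of CHEWING_C is also searched as CHEWING_CH; the mirrored MATCH
+     * entry handles the opposite direction. */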
+ int result = SEARCH_NONE;
+ const ChewingKey & first_key = keys[0];
+
+ switch(first_key.m_initial) {
+ MATCH(PINYIN_AMB_C_CH, CHEWING_C, CHEWING_CH);
+ MATCH(PINYIN_AMB_C_CH, CHEWING_CH, CHEWING_C);
+ MATCH(PINYIN_AMB_Z_ZH, CHEWING_Z, CHEWING_ZH);
+ MATCH(PINYIN_AMB_Z_ZH, CHEWING_ZH, CHEWING_Z);
+ MATCH(PINYIN_AMB_S_SH, CHEWING_S, CHEWING_SH);
+ MATCH(PINYIN_AMB_S_SH, CHEWING_SH, CHEWING_S);
+ MATCH(PINYIN_AMB_L_R, CHEWING_R, CHEWING_L);
+ MATCH(PINYIN_AMB_L_N, CHEWING_N, CHEWING_L);
+ MATCH(PINYIN_AMB_F_H, CHEWING_F, CHEWING_H);
+ MATCH(PINYIN_AMB_F_H, CHEWING_H, CHEWING_F);
+ MATCH(PINYIN_AMB_G_K, CHEWING_G, CHEWING_K);
+ MATCH(PINYIN_AMB_G_K, CHEWING_K, CHEWING_G);
+
+ case CHEWING_L:
+ {
+ result |= middle_and_final_level_search
+ (CHEWING_L, phrase_length, keys, ranges);
+
+ if (m_options & PINYIN_AMB_L_N)
+ result |= middle_and_final_level_search
+ (CHEWING_N, phrase_length, keys,ranges);
+
+ if (m_options & PINYIN_AMB_L_R)
+ result |= middle_and_final_level_search
+ (CHEWING_R, phrase_length, keys, ranges);
+ return result;
+ }
+ default:
+ {
+ result |= middle_and_final_level_search
+ ((ChewingInitial) first_key.m_initial,
+ phrase_length, keys, ranges);
+ return result;
+ }
+ }
+#undef MATCH
+ return result;
+}
+
+
+int ChewingBitmapIndexLevel::middle_and_final_level_search
+(ChewingInitial initial, int phrase_length, /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const {
+
+/* macros */
+#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \
+ { \
+ result = tone_level_search \
+ (initial, middle, \
+ ORIGIN, phrase_length, keys, ranges); \
+ if (m_options & AMBIGUITY) { \
+ result |= tone_level_search \
+ (initial, middle, \
+ ANOTHER, phrase_length, keys, ranges); \
+ } \
+ return result; \
+ }
+
+ int result = SEARCH_NONE;
+ const ChewingKey & first_key = keys[0];
+ const ChewingMiddle middle = (ChewingMiddle)first_key.m_middle;
+
+ switch(first_key.m_final) {
+ case CHEWING_ZERO_FINAL:
+ {
+ if (middle == CHEWING_ZERO_MIDDLE) { /* in-complete pinyin */
+ if (!(m_options & PINYIN_INCOMPLETE))
+ return result;
+ for (int m = CHEWING_ZERO_MIDDLE;
+ m < CHEWING_NUMBER_OF_MIDDLES; ++m)
+ for (int n = CHEWING_ZERO_FINAL;
+ n < CHEWING_NUMBER_OF_FINALS; ++n) {
+
+ if (CHEWING_ZERO_MIDDLE == m &&
+ CHEWING_ZERO_FINAL == n)
+ continue;
+
+ result |= tone_level_search
+ (initial, (ChewingMiddle) m, (ChewingFinal) n,
+ phrase_length, keys, ranges);
+ }
+ return result;
+ } else { /* normal pinyin */
+ result |= tone_level_search
+ (initial, middle, CHEWING_ZERO_FINAL,
+ phrase_length, keys, ranges);
+ return result;
+ }
+ }
+
+ MATCH(PINYIN_AMB_AN_ANG, CHEWING_AN, CHEWING_ANG);
+ MATCH(PINYIN_AMB_AN_ANG, CHEWING_ANG, CHEWING_AN);
+ MATCH(PINYIN_AMB_EN_ENG, CHEWING_EN, CHEWING_ENG);
+ MATCH(PINYIN_AMB_EN_ENG, CHEWING_ENG, CHEWING_EN);
+ MATCH(PINYIN_AMB_IN_ING, PINYIN_IN, PINYIN_ING);
+ MATCH(PINYIN_AMB_IN_ING, PINYIN_ING, PINYIN_IN);
+
+ default:
+ {
+ result |= tone_level_search
+ (initial, middle, (ChewingFinal) first_key.m_final,
+ phrase_length, keys, ranges);
+ return result;
+ }
+ }
+#undef MATCH
+ return result;
+}
+
+
+int ChewingBitmapIndexLevel::tone_level_search
+(ChewingInitial initial, ChewingMiddle middle, ChewingFinal final,
+ int phrase_length, /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const {
+
+ int result = SEARCH_NONE;
+ const ChewingKey & first_key = keys[0];
+
+ switch (first_key.m_tone) {
+ case CHEWING_ZERO_TONE:
+ {
+ /* deal with zero tone in chewing large table. */
+ for (int i = CHEWING_ZERO_TONE; i < CHEWING_NUMBER_OF_TONES; ++i) {
+ ChewingLengthIndexLevel * phrases =
+ m_chewing_length_indexes
+ [initial][middle][final][(ChewingTone)i];
+ if (phrases)
+ result |= phrases->search
+ (m_options, phrase_length - 1, keys + 1, ranges);
+ }
+ return result;
+ }
+ default:
+ {
+ ChewingLengthIndexLevel * phrases =
+ m_chewing_length_indexes
+ [initial][middle][final][CHEWING_ZERO_TONE];
+ if (phrases)
+ result |= phrases->search
+ (m_options, phrase_length - 1, keys + 1, ranges);
+
+ phrases = m_chewing_length_indexes
+ [initial][middle][final][(ChewingTone) first_key.m_tone];
+ if (phrases)
+ result |= phrases->search
+ (m_options, phrase_length - 1, keys + 1, ranges);
+ return result;
+ }
+ }
+ return result;
+}
+
+
+ChewingLengthIndexLevel::ChewingLengthIndexLevel() {
+ m_chewing_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
+}
+
+ChewingLengthIndexLevel::~ChewingLengthIndexLevel() {
+#define CASE(len) case len: \
+ { \
+ ChewingArrayIndexLevel<len> * & array = g_array_index \
+ (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
+ if (array) \
+ delete array; \
+ array = NULL; \
+ break; \
+ }
+
+ for (guint i = 0; i < m_chewing_array_indexes->len; ++i) {
+ switch (i){
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+ }
+#undef CASE
+ g_array_free(m_chewing_array_indexes, TRUE);
+}
+
+
+int ChewingLengthIndexLevel::search(pinyin_option_t options, int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const {
+ int result = SEARCH_NONE;
+ if (m_chewing_array_indexes->len < phrase_length + 1)
+ return result;
+ if (m_chewing_array_indexes->len > phrase_length + 1)
+ result |= SEARCH_CONTINUED;
+
+#define CASE(len) case len: \
+ { \
+ ChewingArrayIndexLevel<len> * & array = g_array_index \
+ (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
+ if (!array) \
+ return result; \
+ result |= array->search(options, keys, ranges); \
+ return result; \
+ }
+
+ switch (phrase_length) {
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+
+#undef CASE
+}
+
+
+template<size_t phrase_length>
+int ChewingArrayIndexLevel<phrase_length>::search
+(pinyin_option_t options, /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const {
+ IndexItem * chunk_begin = NULL, * chunk_end = NULL;
+ chunk_begin = (IndexItem *) m_chunk.begin();
+ chunk_end = (IndexItem *) m_chunk.end();
+
+ /* do the search */
+ ChewingKey left_keys[phrase_length], right_keys[phrase_length];
+ compute_lower_value2(options, keys, left_keys, phrase_length);
+ compute_upper_value2(options, keys, right_keys, phrase_length);
+
+ IndexItem left(left_keys, -1), right(right_keys, -1);
+
+ IndexItem * begin = std_lite::lower_bound
+ (chunk_begin, chunk_end, left,
+ phrase_exact_less_than2<phrase_length>);
+ IndexItem * end = std_lite::upper_bound
+ (chunk_begin, chunk_end, right,
+ phrase_exact_less_than2<phrase_length>);
+
+ return convert(options, keys, begin, end, ranges);
+}
+
+/* compress consecutive tokens */
+template<size_t phrase_length>
+int ChewingArrayIndexLevel<phrase_length>::convert
+(pinyin_option_t options, const ChewingKey keys[],
+ IndexItem * begin, IndexItem * end,
+ PhraseIndexRanges ranges) const {
+ IndexItem * iter = NULL;
+ PhraseIndexRange cursor;
+ GArray * head, * cursor_head = NULL;
+
+ int result = SEARCH_NONE;
+ /* TODO: check the below code */
+ cursor.m_range_begin = null_token; cursor.m_range_end = null_token;
+ for (iter = begin; iter != end; ++iter) {
+ if (0 != pinyin_compare_with_ambiguities2
+ (options, keys, iter->m_keys, phrase_length))
+ continue;
+
+ phrase_token_t token = iter->m_token;
+ head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)];
+ if (NULL == head)
+ continue;
+
+ result |= SEARCH_OK;
+
+ if (null_token == cursor.m_range_begin) {
+ cursor.m_range_begin = token;
+ cursor.m_range_end = token + 1;
+ cursor_head = head;
+ } else if (cursor.m_range_end == token &&
+ PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_begin) ==
+ PHRASE_INDEX_LIBRARY_INDEX(token)) {
+ ++cursor.m_range_end;
+ } else {
+ g_array_append_val(cursor_head, cursor);
+ cursor.m_range_begin = token; cursor.m_range_end = token + 1;
+ cursor_head = head;
+ }
+ }
+
+ if (null_token == cursor.m_range_begin)
+ return result;
+
+ g_array_append_val(cursor_head, cursor);
+ return result;
+}
+
+
+/* add/remove index method */
+
+int ChewingBitmapIndexLevel::add_index(int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token) {
+ const ChewingKey first_key = keys[0];
+ ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes
+ [first_key.m_initial][first_key.m_middle]
+ [first_key.m_final][first_key.m_tone];
+
+ if (NULL == length_array) {
+ length_array = new ChewingLengthIndexLevel();
+ }
+
+ return length_array->add_index(phrase_length - 1, keys + 1, token);
+}
+
+int ChewingBitmapIndexLevel::remove_index(int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token) {
+ const ChewingKey first_key = keys[0];
+ ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes
+ [first_key.m_initial][first_key.m_middle]
+ [first_key.m_final][first_key.m_tone];
+
+ if (NULL == length_array)
+ return ERROR_REMOVE_ITEM_DONOT_EXISTS;
+
+ int retval = length_array->remove_index(phrase_length - 1, keys + 1, token);
+
+ /* remove empty array. */
+ if (0 == length_array->get_length()) {
+ delete length_array;
+ length_array = NULL;
+ }
+
+ return retval;
+}
+
+int ChewingLengthIndexLevel::add_index(int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token) {
+ if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
+ return ERROR_PHRASE_TOO_LONG;
+
+ if (m_chewing_array_indexes->len <= phrase_length)
+ g_array_set_size(m_chewing_array_indexes, phrase_length + 1);
+
+#define CASE(len) case len: \
+ { \
+ ChewingArrayIndexLevel<len> * & array = g_array_index \
+ (m_chewing_array_indexes, \
+ ChewingArrayIndexLevel<len> *, len); \
+ if (NULL == array) \
+ array = new ChewingArrayIndexLevel<len>; \
+ return array->add_index(keys, token); \
+ }
+
+ switch(phrase_length) {
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+
+#undef CASE
+}
+
+int ChewingLengthIndexLevel::remove_index(int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token) {
+ if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
+ return ERROR_PHRASE_TOO_LONG;
+
+ if (m_chewing_array_indexes->len <= phrase_length)
+ return ERROR_REMOVE_ITEM_DONOT_EXISTS;
+
+#define CASE(len) case len: \
+ { \
+ ChewingArrayIndexLevel<len> * & array = g_array_index \
+ (m_chewing_array_indexes, \
+ ChewingArrayIndexLevel<len> *, len); \
+ if (NULL == array) \
+ return ERROR_REMOVE_ITEM_DONOT_EXISTS; \
+ int retval = array->remove_index(keys, token); \
+ \
+ /* remove empty array. */ \
+ if (0 == array->get_length()) { \
+ delete array; \
+ array = NULL; \
+ \
+ /* shrink self array. */ \
+ g_array_set_size(m_chewing_array_indexes, \
+ get_length()); \
+ } \
+ return retval; \
+ }
+
+ switch (phrase_length) {
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+
+#undef CASE
+}
+
+template<size_t phrase_length>
+int ChewingArrayIndexLevel<phrase_length>::add_index
+(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) {
+ IndexItem * begin, * end;
+
+ IndexItem add_elem(keys, token);
+ begin = (IndexItem *) m_chunk.begin();
+ end = (IndexItem *) m_chunk.end();
+
+ std_lite::pair<IndexItem *, IndexItem *> range;
+ range = std_lite::equal_range
+ (begin, end, add_elem, phrase_exact_less_than2<phrase_length>);
+
+ IndexItem * cur_elem;
+ for (cur_elem = range.first;
+ cur_elem != range.second; ++cur_elem) {
+ if (cur_elem->m_token == token)
+ return ERROR_INSERT_ITEM_EXISTS;
+ if (cur_elem->m_token > token)
+ break;
+ }
+
+ int offset = (cur_elem - begin) * sizeof(IndexItem);
+ m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
+ return ERROR_OK;
+}
+
+template<size_t phrase_length>
+int ChewingArrayIndexLevel<phrase_length>::remove_index
+(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) {
+ IndexItem * begin, * end;
+
+ IndexItem remove_elem(keys, token);
+ begin = (IndexItem *) m_chunk.begin();
+ end = (IndexItem *) m_chunk.end();
+
+ std_lite::pair<IndexItem *, IndexItem *> range;
+ range = std_lite::equal_range
+ (begin, end, remove_elem, phrase_exact_less_than2<phrase_length>);
+
+ IndexItem * cur_elem;
+ for (cur_elem = range.first;
+ cur_elem != range.second; ++cur_elem) {
+ if (cur_elem->m_token == token)
+ break;
+ }
+
+ if (cur_elem == range.second)
+ return ERROR_REMOVE_ITEM_DONOT_EXISTS;
+
+ int offset = (cur_elem - begin) * sizeof(IndexItem);
+ m_chunk.remove_content(offset, sizeof(IndexItem));
+ return ERROR_OK;
+}
+
+
+/* load text method */
+bool ChewingLargeTable::load_text(FILE * infile) {
+ char pinyin[256];
+ char phrase[256];
+ phrase_token_t token;
+ size_t freq;
+
+ while (!feof(infile)) {
+ int num = fscanf(infile, "%s %s %u %ld",
+ pinyin, phrase, &token, &freq);
+
+ if (4 != num)
+ continue;
+
+        if (feof(infile))
+ break;
+
+ glong len = g_utf8_strlen(phrase, -1);
+
+ FullPinyinParser2 parser;
+ ChewingKeyVector keys;
+ ChewingKeyRestVector key_rests;
+
+ keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+
+ pinyin_option_t options = USE_TONE;
+ parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+
+        if (len != keys->len) {
+            fprintf(stderr, "ChewingLargeTable::load_text:%s\t%s\t%u\t%ld\n",
+                    pinyin, phrase, token, freq);
+            /* free the parsed keys before skipping this entry. */
+            g_array_free(keys, TRUE);
+            g_array_free(key_rests, TRUE);
+            continue;
+        }
+
+ add_index(keys->len, (ChewingKey *)keys->data, token);
+
+ g_array_free(keys, TRUE);
+ g_array_free(key_rests, TRUE);
+ }
+
+ return true;
+}
+
+
+/* load/store method */
+
+bool ChewingBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
+ table_offset_t end) {
+ reset();
+ char * begin = (char *) chunk->begin();
+ table_offset_t phrase_begin, phrase_end;
+ table_offset_t * index = (table_offset_t *) (begin + offset);
+ phrase_end = *index;
+
+ for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k)
+ for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
+ for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m)
+ for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) {
+ phrase_begin = phrase_end;
+ index++;
+ phrase_end = *index;
+
+ if (phrase_begin == phrase_end) /* null pointer */
+ continue;
+
+ /* after reset() all phrases are null pointer. */
+ ChewingLengthIndexLevel * phrases = new ChewingLengthIndexLevel;
+ m_chewing_length_indexes[k][l][m][n] = phrases;
+
+ phrases->load(chunk, phrase_begin, phrase_end - 1);
+ assert(phrase_end <= end);
+ assert(*(begin + phrase_end - 1) == c_separate);
+ }
+
+ offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t);
+ assert(c_separate == *(begin + offset));
+ return true;
+}
+
+bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk,
+ table_offset_t offset,
+ table_offset_t & end) {
+ table_offset_t phrase_end;
+ table_offset_t index = offset;
+ offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t);
+
+ /* add '#' */
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+
+ for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k)
+ for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
+ for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m)
+ for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) {
+ ChewingLengthIndexLevel * phrases =
+ m_chewing_length_indexes[k][l][m][n];
+
+ if (NULL == phrases) { /* null pointer */
+ new_chunk->set_content(index, &offset,
+ sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ continue;
+ }
+
+                /* has an end '#' */
+ phrases->store(new_chunk, offset, phrase_end);
+ offset = phrase_end;
+
+ /* add '#' */
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset,
+ sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ }
+
+ end = offset;
+ return true;
+}
+
+bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
+ table_offset_t end) {
+ char * begin = (char *) chunk->begin();
+    guint32 nindex = *((guint32 *)(begin + offset)); /* number of indexes */
+ table_offset_t * index = (table_offset_t *)
+ (begin + offset + sizeof(guint32));
+
+ table_offset_t phrase_begin, phrase_end = *index;
+ g_array_set_size(m_chewing_array_indexes, 0);
+ for (guint32 i = 0; i < nindex; ++i) {
+ phrase_begin = phrase_end;
+ index++;
+ phrase_end = *index;
+
+ if (phrase_begin == phrase_end) {
+ void * null = NULL;
+ g_array_append_val(m_chewing_array_indexes, null);
+ continue;
+ }
+
+#define CASE(len) case len: \
+ { \
+ ChewingArrayIndexLevel<len> * phrase = \
+ new ChewingArrayIndexLevel<len>; \
+ phrase->load(chunk, phrase_begin, phrase_end - 1); \
+ assert(*(begin + phrase_end - 1) == c_separate); \
+ assert(phrase_end <= end); \
+ g_array_append_val(m_chewing_array_indexes, phrase); \
+ break; \
+ }
+
+ switch ( i ){
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+
+#undef CASE
+ }
+
+ /* check '#' */
+ offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
+ assert(c_separate == *(begin + offset));
+ return true;
+}
+
+bool ChewingLengthIndexLevel::store(MemoryChunk * new_chunk,
+ table_offset_t offset,
+ table_offset_t & end) {
+    guint32 nindex = m_chewing_array_indexes->len; /* number of indexes */
+ new_chunk->set_content(offset, &nindex, sizeof(guint32));
+ table_offset_t index = offset + sizeof(guint32);
+
+ offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+
+ table_offset_t phrase_end;
+ for (guint32 i = 0; i < nindex; ++i) {
+#define CASE(len) case len: \
+ { \
+ ChewingArrayIndexLevel<len> * phrase = g_array_index \
+ (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
+ if (NULL == phrase) { \
+ new_chunk->set_content \
+ (index, &offset, sizeof(table_offset_t)); \
+ index += sizeof(table_offset_t); \
+ continue; \
+ } \
+ phrase->store(new_chunk, offset, phrase_end); \
+ offset = phrase_end; \
+ break; \
+ }
+
+ switch ( i ){
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+#undef CASE
+
+ /* add '#' */
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ }
+
+ end = offset;
+ return true;
+}
+
+template<size_t phrase_length>
+bool ChewingArrayIndexLevel<phrase_length>::
+load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end) {
+ char * begin = (char *) chunk->begin();
+ m_chunk.set_chunk(begin + offset, end - offset, NULL);
+ return true;
+}
+
+template<size_t phrase_length>
+bool ChewingArrayIndexLevel<phrase_length>::
+store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
+ new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
+ end = offset + m_chunk.size();
+ return true;
+}
+
+
+/* get length method */
+
+int ChewingLengthIndexLevel::get_length() const {
+ int length = m_chewing_array_indexes->len;
+
+ /* trim trailing zero. */
+ for (int i = length - 1; i >= 0; --i) {
+ void * array = g_array_index(m_chewing_array_indexes, void *, i);
+
+ if (NULL != array)
+ break;
+
+ --length;
+ }
+
+ return length;
+}
+
+template<size_t phrase_length>
+int ChewingArrayIndexLevel<phrase_length>::get_length() const {
+ IndexItem * chunk_begin = NULL, * chunk_end = NULL;
+ chunk_begin = (IndexItem *) m_chunk.begin();
+ chunk_end = (IndexItem *) m_chunk.end();
+
+ return chunk_end - chunk_begin;
+}
+
+
+/* mask out method */
+
+bool ChewingBitmapIndexLevel::mask_out(phrase_token_t mask,
+ phrase_token_t value) {
+ for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k)
+ for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
+ for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m)
+ for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES;
+ ++n) {
+ ChewingLengthIndexLevel * & length_array =
+ m_chewing_length_indexes[k][l][m][n];
+
+ if (NULL == length_array)
+ continue;
+
+ length_array->mask_out(mask, value);
+
+ if (0 == length_array->get_length()) {
+ delete length_array;
+ length_array = NULL;
+ }
+ }
+ return true;
+}
+
+bool ChewingLengthIndexLevel::mask_out(phrase_token_t mask,
+ phrase_token_t value) {
+#define CASE(len) case len: \
+ { \
+ ChewingArrayIndexLevel<len> * & array = g_array_index \
+ (m_chewing_array_indexes, \
+ ChewingArrayIndexLevel<len> *, len); \
+ \
+ if (NULL == array) \
+ continue; \
+ \
+ array->mask_out(mask, value); \
+ \
+ if (0 == array->get_length()) { \
+ delete array; \
+ array = NULL; \
+ } \
+ break; \
+ }
+
+ for (guint i = 0; i < m_chewing_array_indexes->len; ++i) {
+ switch (i){
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+ }
+#undef CASE
+ g_array_set_size(m_chewing_array_indexes, get_length());
+ return true;
+}
+
+template<size_t phrase_length>
+bool ChewingArrayIndexLevel<phrase_length>::mask_out
+(phrase_token_t mask, phrase_token_t value) {
+ IndexItem * begin = NULL, * end = NULL;
+ begin = (IndexItem *) m_chunk.begin();
+ end = (IndexItem *) m_chunk.end();
+
+ for (IndexItem * cur = begin; cur != end; ++cur) {
+ if ((cur->m_token & mask) != value)
+ continue;
+
+ int offset = (cur - begin) * sizeof(IndexItem);
+ m_chunk.remove_content(offset, sizeof(IndexItem));
+
+ /* update chunk end. */
+ end = (IndexItem *) m_chunk.end();
+ --cur;
+ }
+
+ return true;
+}
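Every match found by the search path above is funneled through convert(), which merges consecutive phrase tokens of the same phrase index library into PhraseIndexRange entries. Below is a minimal caller sketch, not part of this commit: it pre-allocates the per-library GArrays, runs one lookup and walks the compressed ranges. It assumes PhraseIndexRanges is a fixed-size array of GArray pointers indexed by phrase index library, as the facade headers further down also treat it; search(), SEARCH_OK and the PhraseIndexRange members come from this commit.

/* usage sketch only -- not part of the patch */
#include "chewing_large_table.h"

using namespace pinyin;

static void dump_matches(const ChewingLargeTable & table,
                         const ChewingKey keys[], int len) {
    PhraseIndexRanges ranges;
    /* one GArray of PhraseIndexRange per phrase index library. */
    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i)
        ranges[i] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));

    int result = table.search(len, keys, ranges);

    if (result & SEARCH_OK) {
        for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
            for (guint j = 0; j < ranges[i]->len; ++j) {
                PhraseIndexRange * range =
                    &g_array_index(ranges[i], PhraseIndexRange, j);
                /* consecutive tokens were merged into [begin, end). */
                printf("library %u: tokens [%u, %u)\n", (guint) i,
                       range->m_range_begin, range->m_range_end);
            }
        }
    }

    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i)
        g_array_free(ranges[i], TRUE);
}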
diff --git a/src/storage/chewing_large_table.h b/src/storage/chewing_large_table.h
new file mode 100644
index 0000000..30ae9aa
--- /dev/null
+++ b/src/storage/chewing_large_table.h
@@ -0,0 +1,154 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef CHEWING_LARGE_TABLE_H
+#define CHEWING_LARGE_TABLE_H
+
+
+#include <stdio.h>
+#include "novel_types.h"
+#include "memory_chunk.h"
+#include "chewing_key.h"
+
+namespace pinyin{
+
+class ChewingLengthIndexLevel;
+
+class ChewingBitmapIndexLevel{
+
+protected:
+ pinyin_option_t m_options;
+
+protected:
+ ChewingLengthIndexLevel * m_chewing_length_indexes
+ [CHEWING_NUMBER_OF_INITIALS][CHEWING_NUMBER_OF_MIDDLES]
+ [CHEWING_NUMBER_OF_FINALS][CHEWING_NUMBER_OF_TONES];
+
+ /* search functions */
+ int initial_level_search(int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const;
+
+ int middle_and_final_level_search(ChewingInitial initial,
+ int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const;
+ int tone_level_search(ChewingInitial initial, ChewingMiddle middle,
+ ChewingFinal final, int phrase_length,
+ /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const;
+
+ void reset();
+
+public:
+ /* constructor/destructor */
+ ChewingBitmapIndexLevel(pinyin_option_t options);
+ ~ChewingBitmapIndexLevel() { reset(); }
+
+ /* set options method */
+ bool set_options(pinyin_option_t options) {
+ m_options = options;
+ return true;
+ }
+
+ /* load/store method */
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset,
+ table_offset_t & end);
+
+ /* search method */
+ int search(int phrase_length, /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const;
+
+ /* add/remove index method */
+ int add_index(int phrase_length, /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token);
+ int remove_index(int phrase_length, /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token);
+
+ /* mask out method */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
+
+
+class ChewingLargeTable{
+protected:
+ ChewingBitmapIndexLevel m_bitmap_table;
+ MemoryChunk * m_chunk;
+
+ void reset(){
+ if (m_chunk) {
+ delete m_chunk; m_chunk = NULL;
+ }
+ }
+
+public:
+ /* constructor/destructor */
+ ChewingLargeTable(pinyin_option_t options):
+ m_bitmap_table(options), m_chunk(NULL) {}
+
+ ~ChewingLargeTable() { reset(); }
+
+ /* set options method */
+ bool set_options(pinyin_option_t options) {
+ return m_bitmap_table.set_options(options);
+ }
+
+ /* load/store method */
+ bool load(MemoryChunk * chunk) {
+ reset();
+ m_chunk = chunk;
+ return m_bitmap_table.load(chunk, 0, chunk->size());
+ }
+
+ bool store(MemoryChunk * new_chunk) {
+ table_offset_t end;
+ return m_bitmap_table.store(new_chunk, 0, end);
+ }
+
+ bool load_text(FILE * file);
+
+ /* search method */
+ int search(int phrase_length, /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const {
+ return m_bitmap_table.search(phrase_length, keys, ranges);
+ }
+
+ /* add/remove index method */
+ int add_index(int phrase_length, /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token) {
+ return m_bitmap_table.add_index(phrase_length, keys, token);
+ }
+
+ int remove_index(int phrase_length, /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token) {
+ return m_bitmap_table.remove_index(phrase_length, keys, token);
+ }
+
+ /* mask out method */
+ bool mask_out(phrase_token_t mask, phrase_token_t value) {
+ return m_bitmap_table.mask_out(mask, value);
+ }
+};
+
+};
+
+#endif
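The interface above exposes the table both as a text importer (load_text) and as a MemoryChunk-backed binary table. A small round-trip sketch, not part of this commit, is shown here; it assumes MemoryChunk's default constructor yields an empty chunk. Note that load() keeps the chunk pointer and the table's reset() deletes it, so the chunk handed to load() must be heap-allocated and ownership passes to the table.

/* usage sketch only -- not part of the patch */
#include "chewing_large_table.h"

using namespace pinyin;

bool rebuild_table(const char * textfile, pinyin_option_t options) {
    ChewingLargeTable builder(options);

    FILE * infile = fopen(textfile, "r");
    if (NULL == infile)
        return false;
    builder.load_text(infile);
    fclose(infile);

    /* serialize the in-memory indexes. */
    MemoryChunk * chunk = new MemoryChunk;
    if (!builder.store(chunk)) {
        delete chunk;
        return false;
    }

    /* reload from the serialized form; the table now owns the chunk. */
    ChewingLargeTable table(options);
    return table.load(chunk);
}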
diff --git a/src/storage/chewing_table.h b/src/storage/chewing_table.h
new file mode 100644
index 0000000..56ceba0
--- /dev/null
+++ b/src/storage/chewing_table.h
@@ -0,0 +1,221 @@
+/* This file is generated by python scripts. Don't edit this file directly.
+ */
+
+#ifndef CHEWING_TABLE_H
+#define CHEWING_TABLE_H
+
+namespace pinyin{
+
+const chewing_symbol_item_t chewing_standard_symbols[] = {
+{',' , "ㄝ"},
+{'-' , "ㄦ"},
+{'.' , "ㄡ"},
+{'/' , "ㄥ"},
+{'0' , "ㄢ"},
+{'1' , "ㄅ"},
+{'2' , "ㄉ"},
+{'5' , "ㄓ"},
+{'8' , "ㄚ"},
+{'9' , "ㄞ"},
+{';' , "ㄤ"},
+{'a' , "ㄇ"},
+{'b' , "ㄖ"},
+{'c' , "ㄏ"},
+{'d' , "ㄎ"},
+{'e' , "ㄍ"},
+{'f' , "ㄑ"},
+{'g' , "ㄕ"},
+{'h' , "ㄘ"},
+{'i' , "ㄛ"},
+{'j' , "ㄨ"},
+{'k' , "ㄜ"},
+{'l' , "ㄠ"},
+{'m' , "ㄩ"},
+{'n' , "ㄙ"},
+{'o' , "ㄟ"},
+{'p' , "ㄣ"},
+{'q' , "ㄆ"},
+{'r' , "ㄐ"},
+{'s' , "ㄋ"},
+{'t' , "ㄔ"},
+{'u' , "ㄧ"},
+{'v' , "ㄒ"},
+{'w' , "ㄊ"},
+{'x' , "ㄌ"},
+{'y' , "ㄗ"},
+{'z' , "ㄈ"},
+{'\0', NULL}
+};
+
+const chewing_tone_item_t chewing_standard_tones[] = {
+{' ' , 1},
+{'3' , 3},
+{'4' , 4},
+{'6' , 2},
+{'7' , 5},
+{'\0', 0}
+};
+
+
+const chewing_symbol_item_t chewing_ginyieh_symbols[] = {
+{'\'' , "ㄩ"},
+{',' , "ㄝ"},
+{'-' , "ㄧ"},
+{'.' , "ㄡ"},
+{'/' , "ㄥ"},
+{'0' , "ㄢ"},
+{'2' , "ㄅ"},
+{'3' , "ㄉ"},
+{'6' , "ㄓ"},
+{'8' , "ㄚ"},
+{'9' , "ㄞ"},
+{';' , "ㄤ"},
+{'=' , "ㄦ"},
+{'[' , "ㄨ"},
+{'b' , "ㄒ"},
+{'c' , "ㄌ"},
+{'d' , "ㄋ"},
+{'e' , "ㄊ"},
+{'f' , "ㄎ"},
+{'g' , "ㄑ"},
+{'h' , "ㄕ"},
+{'i' , "ㄛ"},
+{'j' , "ㄘ"},
+{'k' , "ㄜ"},
+{'l' , "ㄠ"},
+{'m' , "ㄙ"},
+{'n' , "ㄖ"},
+{'o' , "ㄟ"},
+{'p' , "ㄣ"},
+{'r' , "ㄍ"},
+{'s' , "ㄇ"},
+{'t' , "ㄐ"},
+{'u' , "ㄗ"},
+{'v' , "ㄏ"},
+{'w' , "ㄆ"},
+{'x' , "ㄈ"},
+{'y' , "ㄔ"},
+{'\0', NULL}
+};
+
+const chewing_tone_item_t chewing_ginyieh_tones[] = {
+{' ' , 1},
+{'1' , 5},
+{'a' , 3},
+{'q' , 2},
+{'z' , 4},
+{'\0', 0}
+};
+
+const chewing_symbol_item_t chewing_eten_symbols[] = {
+{'\'' , "ㄘ"},
+{',' , "ㄓ"},
+{'-' , "ㄥ"},
+{'.' , "ㄔ"},
+{'/' , "ㄕ"},
+{'0' , "ㄤ"},
+{'7' , "ㄑ"},
+{'8' , "ㄢ"},
+{'9' , "ㄣ"},
+{';' , "ㄗ"},
+{'=' , "ㄦ"},
+{'a' , "ㄚ"},
+{'b' , "ㄅ"},
+{'c' , "ㄒ"},
+{'d' , "ㄉ"},
+{'e' , "ㄧ"},
+{'f' , "ㄈ"},
+{'g' , "ㄐ"},
+{'h' , "ㄏ"},
+{'i' , "ㄞ"},
+{'j' , "ㄖ"},
+{'k' , "ㄎ"},
+{'l' , "ㄌ"},
+{'m' , "ㄇ"},
+{'n' , "ㄋ"},
+{'o' , "ㄛ"},
+{'p' , "ㄆ"},
+{'q' , "ㄟ"},
+{'r' , "ㄜ"},
+{'s' , "ㄙ"},
+{'t' , "ㄊ"},
+{'u' , "ㄩ"},
+{'v' , "ㄍ"},
+{'w' , "ㄝ"},
+{'x' , "ㄨ"},
+{'y' , "ㄡ"},
+{'z' , "ㄠ"},
+{'\0', NULL}
+};
+
+const chewing_tone_item_t chewing_eten_tones[] = {
+{' ' , 1},
+{'1' , 5},
+{'2' , 2},
+{'3' , 3},
+{'4' , 4},
+{'\0', 0}
+};
+
+const chewing_symbol_item_t chewing_ibm_symbols[] = {
+{'-' , "ㄏ"},
+{'0' , "ㄎ"},
+{'1' , "ㄅ"},
+{'2' , "ㄆ"},
+{'3' , "ㄇ"},
+{'4' , "ㄈ"},
+{'5' , "ㄉ"},
+{'6' , "ㄊ"},
+{'7' , "ㄋ"},
+{'8' , "ㄌ"},
+{'9' , "ㄍ"},
+{';' , "ㄠ"},
+{'a' , "ㄧ"},
+{'b' , "ㄥ"},
+{'c' , "ㄣ"},
+{'d' , "ㄩ"},
+{'e' , "ㄒ"},
+{'f' , "ㄚ"},
+{'g' , "ㄛ"},
+{'h' , "ㄜ"},
+{'i' , "ㄗ"},
+{'j' , "ㄝ"},
+{'k' , "ㄞ"},
+{'l' , "ㄟ"},
+{'n' , "ㄦ"},
+{'o' , "ㄘ"},
+{'p' , "ㄙ"},
+{'q' , "ㄐ"},
+{'r' , "ㄓ"},
+{'s' , "ㄨ"},
+{'t' , "ㄔ"},
+{'u' , "ㄖ"},
+{'v' , "ㄤ"},
+{'w' , "ㄑ"},
+{'x' , "ㄢ"},
+{'y' , "ㄕ"},
+{'z' , "ㄡ"},
+{'\0', NULL}
+};
+
+const chewing_tone_item_t chewing_ibm_tones[] = {
+{' ' , 1},
+{',' , 3},
+{'.' , 4},
+{'/' , 5},
+{'m' , 2},
+{'\0', 0}
+};
+
+const char * chewing_tone_table[CHEWING_NUMBER_OF_TONES] = {
+"",
+"ˉ",
+"ˊ",
+"ˇ",
+"ˋ",
+"˙"
+};
+
+};
+
+#endif
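The generated tables above map a keystroke of each keyboard layout to a bopomofo symbol or a tone, terminated by a '\0' sentinel entry. The sketch below, not part of this commit, shows how a layout parser might scan them; the struct member names m_input, m_chewing and m_tone are assumptions (the item structs are declared elsewhere, presumably in chewing_key.h), only the table layout is taken from the patch.

/* usage sketch only -- not part of the patch; member names are assumed. */
static const char * lookup_standard_symbol(char ch) {
    for (const chewing_symbol_item_t * item = chewing_standard_symbols;
         item->m_input != '\0'; ++item) {
        if (item->m_input == ch)
            return item->m_chewing;   /* UTF-8 bopomofo string, e.g. "ㄅ" */
    }
    return NULL;                      /* not a symbol key in this layout */
}

static int lookup_standard_tone(char ch) {
    for (const chewing_tone_item_t * item = chewing_standard_tones;
         item->m_input != '\0'; ++item) {
        if (item->m_input == ch)
            return item->m_tone;      /* 1..5, see chewing_tone_table */
    }
    return 0;                         /* zero tone: no tone key matched */
}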
diff --git a/src/storage/double_pinyin_table.h b/src/storage/double_pinyin_table.h
new file mode 100644
index 0000000..52af618
--- /dev/null
+++ b/src/storage/double_pinyin_table.h
@@ -0,0 +1,371 @@
+/* This file is generated by python scripts. Don't edit this file directly.
+ */
+
+#ifndef DOUBLE_PINYIN_TABLE_H
+#define DOUBLE_PINYIN_TABLE_H
+
+namespace pinyin{
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_mspy_sheng[] = {
+{NULL } /* A */,
+{"b" } /* B */,
+{"c" } /* C */,
+{"d" } /* D */,
+{NULL } /* E */,
+{"f" } /* F */,
+{"g" } /* G */,
+{"h" } /* H */,
+{"ch" } /* I */,
+{"j" } /* J */,
+{"k" } /* K */,
+{"l" } /* L */,
+{"m" } /* M */,
+{"n" } /* N */,
+{"'" } /* O */,
+{"p" } /* P */,
+{"q" } /* Q */,
+{"r" } /* R */,
+{"s" } /* S */,
+{"t" } /* T */,
+{"sh" } /* U */,
+{"zh" } /* V */,
+{"w" } /* W */,
+{"x" } /* X */,
+{"y" } /* Y */,
+{"z" } /* Z */,
+{NULL } /* ; */
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_mspy_yun[] = {
+{{"a" , NULL }} /* A */,
+{{"ou" , NULL }} /* B */,
+{{"iao" , NULL }} /* C */,
+{{"uang" , "iang" }} /* D */,
+{{"e" , NULL }} /* E */,
+{{"en" , NULL }} /* F */,
+{{"eng" , "ng" }} /* G */,
+{{"ang" , NULL }} /* H */,
+{{"i" , NULL }} /* I */,
+{{"an" , NULL }} /* J */,
+{{"ao" , NULL }} /* K */,
+{{"ai" , NULL }} /* L */,
+{{"ian" , NULL }} /* M */,
+{{"in" , NULL }} /* N */,
+{{"uo" , "o" }} /* O */,
+{{"un" , NULL }} /* P */,
+{{"iu" , NULL }} /* Q */,
+{{"uan" , "er" }} /* R */,
+{{"ong" , "iong" }} /* S */,
+{{"ue" , NULL }} /* T */,
+{{"u" , NULL }} /* U */,
+{{"ui" , "ue" }} /* V */,
+{{"ia" , "ua" }} /* W */,
+{{"ie" , NULL }} /* X */,
+{{"uai" , "v" }} /* Y */,
+{{"ei" , NULL }} /* Z */,
+{{"ing" , NULL }} /* ; */
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_zrm_sheng[] = {
+{NULL } /* A */,
+{"b" } /* B */,
+{"c" } /* C */,
+{"d" } /* D */,
+{NULL } /* E */,
+{"f" } /* F */,
+{"g" } /* G */,
+{"h" } /* H */,
+{"ch" } /* I */,
+{"j" } /* J */,
+{"k" } /* K */,
+{"l" } /* L */,
+{"m" } /* M */,
+{"n" } /* N */,
+{"'" } /* O */,
+{"p" } /* P */,
+{"q" } /* Q */,
+{"r" } /* R */,
+{"s" } /* S */,
+{"t" } /* T */,
+{"sh" } /* U */,
+{"zh" } /* V */,
+{"w" } /* W */,
+{"x" } /* X */,
+{"y" } /* Y */,
+{"z" } /* Z */,
+{NULL } /* ; */
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_zrm_yun[] = {
+{{"a" , NULL }} /* A */,
+{{"ou" , NULL }} /* B */,
+{{"iao" , NULL }} /* C */,
+{{"uang" , "iang" }} /* D */,
+{{"e" , NULL }} /* E */,
+{{"en" , NULL }} /* F */,
+{{"eng" , "ng" }} /* G */,
+{{"ang" , NULL }} /* H */,
+{{"i" , NULL }} /* I */,
+{{"an" , NULL }} /* J */,
+{{"ao" , NULL }} /* K */,
+{{"ai" , NULL }} /* L */,
+{{"ian" , NULL }} /* M */,
+{{"in" , NULL }} /* N */,
+{{"uo" , "o" }} /* O */,
+{{"un" , NULL }} /* P */,
+{{"iu" , NULL }} /* Q */,
+{{"uan" , "er" }} /* R */,
+{{"ong" , "iong" }} /* S */,
+{{"ue" , NULL }} /* T */,
+{{"u" , NULL }} /* U */,
+{{"ui" , "v" }} /* V */,
+{{"ia" , "ua" }} /* W */,
+{{"ie" , NULL }} /* X */,
+{{"uai" , "ing" }} /* Y */,
+{{"ei" , NULL }} /* Z */,
+{{NULL , NULL }} /* ; */
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_abc_sheng[] = {
+{"zh" } /* A */,
+{"b" } /* B */,
+{"c" } /* C */,
+{"d" } /* D */,
+{"ch" } /* E */,
+{"f" } /* F */,
+{"g" } /* G */,
+{"h" } /* H */,
+{NULL } /* I */,
+{"j" } /* J */,
+{"k" } /* K */,
+{"l" } /* L */,
+{"m" } /* M */,
+{"n" } /* N */,
+{"'" } /* O */,
+{"p" } /* P */,
+{"q" } /* Q */,
+{"r" } /* R */,
+{"s" } /* S */,
+{"t" } /* T */,
+{NULL } /* U */,
+{"sh" } /* V */,
+{"w" } /* W */,
+{"x" } /* X */,
+{"y" } /* Y */,
+{"z" } /* Z */,
+{NULL } /* ; */
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_abc_yun[] = {
+{{"a" , NULL }} /* A */,
+{{"ou" , NULL }} /* B */,
+{{"in" , "uai" }} /* C */,
+{{"ia" , "ua" }} /* D */,
+{{"e" , NULL }} /* E */,
+{{"en" , NULL }} /* F */,
+{{"eng" , "ng" }} /* G */,
+{{"ang" , NULL }} /* H */,
+{{"i" , NULL }} /* I */,
+{{"an" , NULL }} /* J */,
+{{"ao" , NULL }} /* K */,
+{{"ai" , NULL }} /* L */,
+{{"ue" , "ui" }} /* M */,
+{{"un" , NULL }} /* N */,
+{{"uo" , "o" }} /* O */,
+{{"uan" , NULL }} /* P */,
+{{"ei" , NULL }} /* Q */,
+{{"er" , "iu" }} /* R */,
+{{"ong" , "iong" }} /* S */,
+{{"iang" , "uang" }} /* T */,
+{{"u" , NULL }} /* U */,
+{{"v" , "ue" }} /* V */,
+{{"ian" , NULL }} /* W */,
+{{"ie" , NULL }} /* X */,
+{{"ing" , NULL }} /* Y */,
+{{"iao" , NULL }} /* Z */,
+{{NULL , NULL }} /* ; */
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_zgpy_sheng[] = {
+{"ch" } /* A */,
+{"b" } /* B */,
+{"c" } /* C */,
+{"d" } /* D */,
+{NULL } /* E */,
+{"f" } /* F */,
+{"g" } /* G */,
+{"h" } /* H */,
+{"sh" } /* I */,
+{"j" } /* J */,
+{"k" } /* K */,
+{"l" } /* L */,
+{"m" } /* M */,
+{"n" } /* N */,
+{"'" } /* O */,
+{"p" } /* P */,
+{"q" } /* Q */,
+{"r" } /* R */,
+{"s" } /* S */,
+{"t" } /* T */,
+{"zh" } /* U */,
+{NULL } /* V */,
+{"w" } /* W */,
+{"x" } /* X */,
+{"y" } /* Y */,
+{"z" } /* Z */,
+{NULL } /* ; */
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_zgpy_yun[] = {
+{{"a" , NULL }} /* A */,
+{{"iao" , NULL }} /* B */,
+{{NULL , NULL }} /* C */,
+{{"ie" , NULL }} /* D */,
+{{"e" , NULL }} /* E */,
+{{"ian" , NULL }} /* F */,
+{{"iang" , "uang" }} /* G */,
+{{"ong" , "iong" }} /* H */,
+{{"i" , NULL }} /* I */,
+{{"er" , "iu" }} /* J */,
+{{"ei" , NULL }} /* K */,
+{{"uan" , NULL }} /* L */,
+{{"un" , NULL }} /* M */,
+{{"ue" , "ui" }} /* N */,
+{{"uo" , "o" }} /* O */,
+{{"ai" , NULL }} /* P */,
+{{"ao" , NULL }} /* Q */,
+{{"an" , NULL }} /* R */,
+{{"ang" , NULL }} /* S */,
+{{"eng" , "ng" }} /* T */,
+{{"u" , NULL }} /* U */,
+{{"v" , NULL }} /* V */,
+{{"en" , NULL }} /* W */,
+{{"ia" , "ua" }} /* X */,
+{{"in" , "uai" }} /* Y */,
+{{"ou" , NULL }} /* Z */,
+{{"ing" , NULL }} /* ; */
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_pyjj_sheng[] = {
+{"'" } /* A */,
+{"b" } /* B */,
+{"c" } /* C */,
+{"d" } /* D */,
+{NULL } /* E */,
+{"f" } /* F */,
+{"g" } /* G */,
+{"h" } /* H */,
+{"sh" } /* I */,
+{"j" } /* J */,
+{"k" } /* K */,
+{"l" } /* L */,
+{"m" } /* M */,
+{"n" } /* N */,
+{"'" } /* O */,
+{"p" } /* P */,
+{"q" } /* Q */,
+{"r" } /* R */,
+{"s" } /* S */,
+{"t" } /* T */,
+{"ch" } /* U */,
+{"zh" } /* V */,
+{"w" } /* W */,
+{"x" } /* X */,
+{"y" } /* Y */,
+{"z" } /* Z */,
+{NULL } /* ; */
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_pyjj_yun[] = {
+{{"a" , NULL }} /* A */,
+{{"ia" , "ua" }} /* B */,
+{{"uan" , NULL }} /* C */,
+{{"ao" , NULL }} /* D */,
+{{"e" , NULL }} /* E */,
+{{"an" , NULL }} /* F */,
+{{"ang" , NULL }} /* G */,
+{{"iang" , "uang" }} /* H */,
+{{"i" , NULL }} /* I */,
+{{"ian" , NULL }} /* J */,
+{{"iao" , NULL }} /* K */,
+{{"in" , NULL }} /* L */,
+{{"ie" , NULL }} /* M */,
+{{"iu" , NULL }} /* N */,
+{{"uo" , "o" }} /* O */,
+{{"ou" , NULL }} /* P */,
+{{"er" , "ing" }} /* Q */,
+{{"en" , NULL }} /* R */,
+{{"ai" , NULL }} /* S */,
+{{"eng" , "ng" }} /* T */,
+{{"u" , NULL }} /* U */,
+{{"v" , "ui" }} /* V */,
+{{"ei" , NULL }} /* W */,
+{{"uai" , "ue" }} /* X */,
+{{"ong" , "iong" }} /* Y */,
+{{"un" , NULL }} /* Z */,
+{{NULL , NULL }} /* ; */
+};
+
+const double_pinyin_scheme_shengmu_item_t double_pinyin_xhe_sheng[] = {
+{"'" } /* A */,
+{"b" } /* B */,
+{"c" } /* C */,
+{"d" } /* D */,
+{"'" } /* E */,
+{"f" } /* F */,
+{"g" } /* G */,
+{"h" } /* H */,
+{"ch" } /* I */,
+{"j" } /* J */,
+{"k" } /* K */,
+{"l" } /* L */,
+{"m" } /* M */,
+{"n" } /* N */,
+{"'" } /* O */,
+{"p" } /* P */,
+{"q" } /* Q */,
+{"r" } /* R */,
+{"s" } /* S */,
+{"t" } /* T */,
+{"sh" } /* U */,
+{"zh" } /* V */,
+{"w" } /* W */,
+{"x" } /* X */,
+{"y" } /* Y */,
+{"z" } /* Z */,
+{NULL } /* ; */
+};
+
+const double_pinyin_scheme_yunmu_item_t double_pinyin_xhe_yun[] = {
+{{"a" , NULL }} /* A */,
+{{"in" , NULL }} /* B */,
+{{"ao" , NULL }} /* C */,
+{{"ai" , NULL }} /* D */,
+{{"e" , NULL }} /* E */,
+{{"en" , NULL }} /* F */,
+{{"eng" , "ng" }} /* G */,
+{{"ang" , NULL }} /* H */,
+{{"i" , NULL }} /* I */,
+{{"an" , NULL }} /* J */,
+{{"uai" , "ing" }} /* K */,
+{{"iang" , "uang" }} /* L */,
+{{"ian" , NULL }} /* M */,
+{{"iao" , NULL }} /* N */,
+{{"uo" , "o" }} /* O */,
+{{"ie" , NULL }} /* P */,
+{{"iu" , NULL }} /* Q */,
+{{"uan" , "er" }} /* R */,
+{{"ong" , "iong" }} /* S */,
+{{"ue" , NULL }} /* T */,
+{{"u" , NULL }} /* U */,
+{{"v" , "ui" }} /* V */,
+{{"ei" , NULL }} /* W */,
+{{"ia" , "ua" }} /* X */,
+{{"un" , NULL }} /* Y */,
+{{"ou" , NULL }} /* Z */,
+{{NULL , NULL }} /* ; */
+};
+
+};
+
+#endif
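Each double pinyin scheme above pairs a shengmu table with a yunmu table, indexed by the typed key (A..Z then ';' per the comments), and a yunmu entry can carry up to two candidates. A lookup sketch for the MSPY variant follows; it is not part of this commit, and both the key-to-index mapping for lowercase input and the member names m_shengmu/m_yunmus are assumptions about the item structs. For example, 'v' + 'g' resolves to "zh" with candidates { "eng", "ng" }.

/* usage sketch only -- not part of the patch; member names are assumed. */
static int scheme_index(char ch) {
    if (ch >= 'a' && ch <= 'z')
        return ch - 'a';
    if (ch == ';')
        return 26;
    return -1;
}

static bool resolve_mspy(char first, char second,
                         const char * & shengmu,
                         const char * candidates[2]) {
    int i = scheme_index(first), j = scheme_index(second);
    if (i < 0 || j < 0)
        return false;

    shengmu = double_pinyin_mspy_sheng[i].m_shengmu;        /* NULL: key carries no shengmu */
    candidates[0] = double_pinyin_mspy_yun[j].m_yunmus[0];
    candidates[1] = double_pinyin_mspy_yun[j].m_yunmus[1];  /* NULL: only one candidate */
    return true;
}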
diff --git a/src/storage/facade_chewing_table.h b/src/storage/facade_chewing_table.h
new file mode 100644
index 0000000..474311c
--- /dev/null
+++ b/src/storage/facade_chewing_table.h
@@ -0,0 +1,216 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef FACADE_CHEWING_TABLE_H
+#define FACADE_CHEWING_TABLE_H
+
+#include "novel_types.h"
+#include "chewing_large_table.h"
+
+namespace pinyin{
+
+/**
+ * FacadeChewingTable:
+ *
+ * The facade class of the chewing large table.
+ *
+ */
+
+class FacadeChewingTable{
+private:
+ ChewingLargeTable * m_system_chewing_table;
+ ChewingLargeTable * m_user_chewing_table;
+
+ void reset() {
+ if (m_system_chewing_table) {
+ delete m_system_chewing_table;
+ m_system_chewing_table = NULL;
+ }
+
+ if (m_user_chewing_table) {
+ delete m_user_chewing_table;
+ m_user_chewing_table = NULL;
+ }
+ }
+public:
+ /**
+ * FacadeChewingTable::FacadeChewingTable:
+ *
+ * The constructor of the FacadeChewingTable.
+ *
+ */
+ FacadeChewingTable() {
+ m_system_chewing_table = NULL;
+ m_user_chewing_table = NULL;
+ }
+
+ /**
+ * FacadeChewingTable::~FacadeChewingTable:
+ *
+ * The destructor of the FacadeChewingTable.
+ *
+ */
+ ~FacadeChewingTable() {
+ reset();
+ }
+
+ /**
+ * FacadeChewingTable::set_options:
+ * @options: the pinyin options.
+     * @returns: whether setting the options was successful.
+ *
+ * Set the options of the system and user chewing table.
+ *
+ */
+ bool set_options(pinyin_option_t options) {
+ bool result = false;
+ if (m_system_chewing_table)
+ result = m_system_chewing_table->set_options(options) || result;
+ if (m_user_chewing_table)
+ result = m_user_chewing_table->set_options(options) || result;
+ return result;
+ }
+
+ /**
+ * FacadeChewingTable::load:
+ * @options: the pinyin options.
+ * @system: the memory chunk of the system chewing table.
+ * @user: the memory chunk of the user chewing table.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the system or user chewing table from the memory chunks.
+ *
+ */
+ bool load(pinyin_option_t options, MemoryChunk * system,
+ MemoryChunk * user){
+ reset();
+
+ bool result = false;
+ if (system) {
+ m_system_chewing_table = new ChewingLargeTable(options);
+ result = m_system_chewing_table->load(system) || result;
+ }
+ if (user) {
+ m_user_chewing_table = new ChewingLargeTable(options);
+ result = m_user_chewing_table->load(user) || result;
+ }
+ return result;
+ }
+
+ /**
+ * FacadeChewingTable::store:
+ * @new_user: the memory chunk to store the user chewing table.
+ * @returns: whether the store operation is successful.
+ *
+ * Store the user chewing table to the memory chunk.
+ *
+ */
+ bool store(MemoryChunk * new_user) {
+ if (NULL == m_user_chewing_table)
+ return false;
+ return m_user_chewing_table->store(new_user);
+ }
+
+ /**
+ * FacadeChewingTable::search:
+ * @phrase_length: the length of the phrase to be searched.
+ * @keys: the pinyin key of the phrase to be searched.
+     * @ranges: the array of GArrays to store the matched phrase tokens.
+ * @returns: the search result of enum SearchResult.
+ *
+ * Search the phrase tokens according to the pinyin keys.
+ *
+ */
+ int search(int phrase_length, /* in */ const ChewingKey keys[],
+ /* out */ PhraseIndexRanges ranges) const {
+
+ /* clear ranges. */
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ if (ranges[i])
+ g_array_set_size(ranges[i], 0);
+ }
+
+ int result = SEARCH_NONE;
+
+ if (NULL != m_system_chewing_table)
+ result |= m_system_chewing_table->search
+ (phrase_length, keys, ranges);
+
+ if (NULL != m_user_chewing_table)
+ result |= m_user_chewing_table->search
+ (phrase_length, keys, ranges);
+
+ return result;
+ }
+
+ /**
+ * FacadeChewingTable::add_index:
+ * @phrase_length: the length of the phrase to be added.
+ * @keys: the pinyin keys of the phrase to be added.
+ * @token: the token of the phrase to be added.
+ * @returns: the add result of enum ErrorResult.
+ *
+ * Add the phrase token to the user chewing table.
+ *
+ */
+ int add_index(int phrase_length, /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token) {
+ if (NULL == m_user_chewing_table)
+ return ERROR_NO_USER_TABLE;
+ return m_user_chewing_table->add_index(phrase_length, keys, token);
+ }
+
+ /**
+ * FacadeChewingTable::remove_index:
+ * @phrase_length: the length of the phrase to be removed.
+ * @keys: the pinyin keys of the phrase to be removed.
+ * @token: the token of the phrase to be removed.
+ * @returns: the remove result of enum ErrorResult.
+ *
+ * Remove the phrase token from the user chewing table.
+ *
+ */
+ int remove_index(int phrase_length, /* in */ const ChewingKey keys[],
+ /* in */ phrase_token_t token) {
+ if (NULL == m_user_chewing_table)
+ return ERROR_NO_USER_TABLE;
+ return m_user_chewing_table->remove_index(phrase_length, keys, token);
+ }
+
+ /**
+ * FacadeChewingTable::mask_out:
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: whether the mask out operation is successful.
+ *
+ * Mask out the matched chewing index.
+ *
+ */
+ bool mask_out(phrase_token_t mask, phrase_token_t value) {
+ if (NULL == m_user_chewing_table)
+ return false;
+ return m_user_chewing_table->mask_out(mask, value);
+ }
+};
+
+};
+
+#endif
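A minimal sketch of driving the facade above, not part of this commit: wire a system and a user chewing table into the facade and run one lookup. The caller is assumed to have filled the two MemoryChunks elsewhere; as with ChewingLargeTable::load(), ownership of both chunks passes to the facade's internal tables.

/* usage sketch only -- not part of the patch */
#include "facade_chewing_table.h"

using namespace pinyin;

int facade_lookup(MemoryChunk * system, MemoryChunk * user,
                  pinyin_option_t options,
                  const ChewingKey keys[], int len,
                  PhraseIndexRanges ranges) {
    FacadeChewingTable facade;
    if (!facade.load(options, system, user))
        return SEARCH_NONE;

    /* search() clears the per-library GArrays in ranges, then matches
     * against both the system and the user table. */
    return facade.search(len, keys, ranges);
}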
diff --git a/src/storage/facade_phrase_table2.h b/src/storage/facade_phrase_table2.h
new file mode 100644
index 0000000..3ef1c37
--- /dev/null
+++ b/src/storage/facade_phrase_table2.h
@@ -0,0 +1,203 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef FACADE_PHRASE_TABLE2_H
+#define FACADE_PHRASE_TABLE2_H
+
+#include "phrase_large_table2.h"
+
+namespace pinyin{
+
+/**
+ * FacadePhraseTable2:
+ *
+ * The facade class of the phrase large table2.
+ *
+ */
+
+class FacadePhraseTable2{
+private:
+ PhraseLargeTable2 * m_system_phrase_table;
+ PhraseLargeTable2 * m_user_phrase_table;
+
+ void reset(){
+ if (m_system_phrase_table) {
+ delete m_system_phrase_table;
+ m_system_phrase_table = NULL;
+ }
+
+ if (m_user_phrase_table) {
+ delete m_user_phrase_table;
+ m_user_phrase_table = NULL;
+ }
+ }
+
+public:
+ /**
+ * FacadePhraseTable2::FacadePhraseTable2:
+ *
+ * The constructor of the FacadePhraseTable2.
+ *
+ */
+ FacadePhraseTable2() {
+ m_system_phrase_table = NULL;
+ m_user_phrase_table = NULL;
+ }
+
+ /**
+ * FacadePhraseTable2::~FacadePhraseTable2:
+ *
+ * The destructor of the FacadePhraseTable2.
+ *
+ */
+ ~FacadePhraseTable2() {
+ reset();
+ }
+
+ /**
+ * FacadePhraseTable2::load:
+ * @system: the memory chunk of the system phrase table.
+ * @user: the memory chunk of the user phrase table.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the system or user phrase table from the memory chunks.
+ *
+ */
+ bool load(MemoryChunk * system, MemoryChunk * user) {
+ reset();
+
+ bool result = false;
+ if (system) {
+ m_system_phrase_table = new PhraseLargeTable2;
+ result = m_system_phrase_table->load(system) || result;
+ }
+ if (user) {
+ m_user_phrase_table = new PhraseLargeTable2;
+ result = m_user_phrase_table->load(user) || result;
+ }
+ return result;
+ }
+
+ /**
+ * FacadePhraseTable2::store:
+ * @new_user: the memory chunk to store the user phrase table.
+ * @returns: whether the store operation is successful.
+ *
+ * Store the user phrase table to the memory chunk.
+ *
+ */
+ bool store(MemoryChunk * new_user) {
+ if (NULL == m_user_phrase_table)
+ return false;
+ return m_user_phrase_table->store(new_user);
+ }
+
+ /**
+ * FacadePhraseTable2::search:
+ * @phrase_length: the length of the phrase to be searched.
+ * @phrase: the ucs4 characters of the phrase to be searched.
+     * @tokens: the array of GArrays to store the matched phrase tokens.
+ * @returns: the search result of enum SearchResult.
+ *
+ * Search the phrase tokens according to the ucs4 characters.
+ *
+ */
+ int search(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* out */ PhraseTokens tokens) const {
+ /* clear tokens. */
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ if (tokens[i])
+ g_array_set_size(tokens[i], 0);
+ }
+
+ int result = SEARCH_NONE;
+
+ if (NULL != m_system_phrase_table)
+ result |= m_system_phrase_table->search
+ (phrase_length, phrase, tokens);
+
+ if (NULL != m_user_phrase_table)
+ result |= m_user_phrase_table->search
+ (phrase_length, phrase, tokens);
+
+ return result;
+ }
+
+ /**
+ * FacadePhraseTable2::add_index:
+ * @phrase_length: the length of the phrase to be added.
+ * @phrase: the ucs4 characters of the phrase to be added.
+ * @token: the token of the phrase to be added.
+ * @returns: the add result of enum ErrorResult.
+ *
+ * Add the phrase token to the user phrase table.
+ *
+ */
+ int add_index(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* in */ phrase_token_t token) {
+ if (NULL == m_user_phrase_table)
+ return ERROR_NO_USER_TABLE;
+
+ return m_user_phrase_table->add_index
+ (phrase_length, phrase, token);
+ }
+
+ /**
+ * FacadePhraseTable2::remove_index:
+ * @phrase_length: the length of the phrase to be removed.
+ * @phrase: the ucs4 characters of the phrase to be removed.
+ * @token: the token of the phrase to be removed.
+ * @returns: the remove result of enum ErrorResult.
+ *
+ * Remove the phrase token from the user phrase table.
+ *
+ */
+ int remove_index(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* in */ phrase_token_t token) {
+ if (NULL == m_user_phrase_table)
+ return ERROR_NO_USER_TABLE;
+
+ return m_user_phrase_table->remove_index
+ (phrase_length, phrase, token);
+ }
+
+ /**
+ * FacadePhraseTable2::mask_out:
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: whether the mask out operation is successful.
+ *
+ * Mask out the matched phrase index.
+ *
+ */
+ bool mask_out(phrase_token_t mask, phrase_token_t value) {
+ if (NULL == m_user_phrase_table)
+ return false;
+
+ return m_user_phrase_table->mask_out
+ (mask, value);
+ }
+};
+
+};
+
+
+#endif
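A companion sketch for the phrase-side facade above, not part of this commit: register a phrase in the user phrase table and look it up again by its ucs4 text. The PhraseTokens GArrays are pre-allocated per phrase index library, mirroring the PhraseIndexRanges handling in FacadeChewingTable; it is assumed each per-library GArray collects phrase_token_t values.

/* usage sketch only -- not part of the patch */
#include "facade_phrase_table2.h"

using namespace pinyin;

bool remember_phrase(FacadePhraseTable2 & table,
                     const ucs4_t phrase[], int len,
                     phrase_token_t token) {
    int result = table.add_index(len, phrase, token);
    if (ERROR_OK != result && ERROR_INSERT_ITEM_EXISTS != result)
        return false;   /* e.g. ERROR_NO_USER_TABLE when no user table is loaded */

    PhraseTokens tokens;
    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i)
        tokens[i] = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));

    bool found = (table.search(len, phrase, tokens) & SEARCH_OK) != 0;

    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i)
        g_array_free(tokens[i], TRUE);
    return found;
}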
diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h
new file mode 100644
index 0000000..6cff7ff
--- /dev/null
+++ b/src/storage/flexible_ngram.h
@@ -0,0 +1,719 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+
+#ifndef FLEXIBLE_NGRAM_H
+#define FLEXIBLE_NGRAM_H
+
+#include <db.h>
+#include <errno.h>
+
+/* Note: the expected template parameters are
+ * struct MagicHeader, ArrayHeader and ArrayItem.
+ */
+
+namespace pinyin{
+
+typedef GArray * FlexibleBigramPhraseArray;
+
+/**
+ * FlexibleSingleGram:
+ * @ArrayHeader: the struct ArrayHeader.
+ * @ArrayItem: the struct ArrayItem.
+ *
+ * The flexible single gram is mainly used for training purposes.
+ *
+ */
+
+template<typename ArrayHeader, typename ArrayItem>
+class FlexibleSingleGram{
+ template<typename MH, typename AH,
+ typename AI>
+ friend class FlexibleBigram;
+private:
+ MemoryChunk m_chunk;
+ FlexibleSingleGram(void * buffer, size_t length){
+ m_chunk.set_chunk(buffer, length, NULL);
+ }
+public:
+ /**
+ * ArrayItemWithToken:
+ *
+ * Define the struct ArrayItemWithToken type.
+ *
+ */
+ typedef struct{
+ phrase_token_t m_token;
+ ArrayItem m_item;
+ } ArrayItemWithToken;
+
+private:
+ static bool token_less_than(const ArrayItemWithToken & lhs,
+ const ArrayItemWithToken & rhs){
+ return lhs.m_token < rhs.m_token;
+ }
+
+public:
+ /**
+ * FlexibleSingleGram::FlexibleSingleGram:
+ *
+ * The constructor of the FlexibleSingleGram.
+ *
+ */
+ FlexibleSingleGram(){
+ m_chunk.set_size(sizeof(ArrayHeader));
+ memset(m_chunk.begin(), 0, sizeof(ArrayHeader));
+ }
+
+ /**
+ * FlexibleSingleGram::retrieve_all:
+ * @array: the array to store all items in this single gram.
+ * @returns: whether the retrieve operation is successful.
+ *
+ * Retrieve all items in this single gram.
+ *
+ */
+ bool retrieve_all(/* out */ FlexibleBigramPhraseArray array){
+ const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ const ArrayItemWithToken * end = (const ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken item;
+ for ( const ArrayItemWithToken * cur_item = begin;
+ cur_item != end;
+ ++cur_item){
+ /* Note: optimize this with g_array_append_vals? */
+ item.m_token = cur_item->m_token;
+ item.m_item = cur_item->m_item;
+ g_array_append_val(array, item);
+ }
+
+ return true;
+ }
+
+ /**
+ * FlexibleSingleGram::search:
+ * @range: the token range.
+ * @array: the array to store the array items with token in the range.
+ * @returns: whether the search operation is successful.
+ *
+ * Search the array items with token in the range.
+ *
+ * Note: The array result may contain many items.
+ *
+ */
+ bool search(/* in */ PhraseIndexRange * range,
+ /* out */ FlexibleBigramPhraseArray array){
+ const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ const ArrayItemWithToken * end = (const ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = range->m_range_begin;
+ const ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ ArrayItemWithToken item;
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token >= range->m_range_end )
+ break;
+ item.m_token = cur_item->m_token;
+ item.m_item = cur_item->m_item;
+ g_array_append_val(array, item);
+ }
+
+ return true;
+ }
+
+ /**
+ * FlexibleSingleGram::insert_array_item:
+ * @token: the phrase token to be inserted.
+ * @item: the array item of this token.
+ * @returns: whether the insert operation is successful.
+ *
+ * Insert the array item of the token.
+ *
+ */
+ bool insert_array_item(/* in */ phrase_token_t token,
+ /* in */ const ArrayItem & item){
+ ArrayItemWithToken * begin = (ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ ArrayItemWithToken * end = (ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = token;
+ ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ ArrayItemWithToken insert_item;
+ insert_item.m_token = token;
+ insert_item.m_item = item;
+
+ for ( ; cur_item != end; ++cur_item ){
+ if ( cur_item->m_token > token ){
+ size_t offset = sizeof(ArrayHeader) +
+ sizeof(ArrayItemWithToken) * (cur_item - begin);
+ m_chunk.insert_content(offset, &insert_item,
+ sizeof(ArrayItemWithToken));
+ return true;
+ }
+ if ( cur_item->m_token == token ){
+ return false;
+ }
+ }
+ m_chunk.insert_content(m_chunk.size(), &insert_item,
+ sizeof(ArrayItemWithToken));
+ return true;
+ }
+
+ /**
+ * FlexibleSingleGram::remove_array_item:
+ * @token: the phrase token to be removed.
+ * @item: the content of the removed array item.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove the array item of the token.
+ *
+ */
+ bool remove_array_item(/* in */ phrase_token_t token,
+ /* out */ ArrayItem & item)
+ {
+ /* clear retval */
+ memset(&item, 0, sizeof(ArrayItem));
+
+ const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ const ArrayItemWithToken * end = (const ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = token;
+ const ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token > token )
+ return false;
+ if ( cur_item->m_token == token ){
+ memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem));
+ size_t offset = sizeof(ArrayHeader) +
+ sizeof(ArrayItemWithToken) * (cur_item - begin);
+ m_chunk.remove_content(offset, sizeof(ArrayItemWithToken));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * FlexibleSingleGram::get_array_item:
+ * @token: the phrase token.
+ * @item: the array item of the token.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array item of the token.
+ *
+ */
+ bool get_array_item(/* in */ phrase_token_t token,
+ /* out */ ArrayItem & item)
+ {
+ /* clear retval */
+ memset(&item, 0, sizeof(ArrayItem));
+
+ const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ const ArrayItemWithToken * end = (const ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = token;
+ const ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token > token )
+ return false;
+ if ( cur_item->m_token == token ){
+ memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * FlexibleSingleGram::set_array_item:
+ * @token: the phrase token.
+ * @item: the array item of the token.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the array item of the token.
+ *
+ */
+ bool set_array_item(/* in */ phrase_token_t token,
+ /* in */ const ArrayItem & item){
+ ArrayItemWithToken * begin = (ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ ArrayItemWithToken * end = (ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = token;
+ ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item ){
+ if ( cur_item->m_token > token ){
+ return false;
+ }
+ if ( cur_item->m_token == token ){
+ memcpy(&(cur_item->m_item), &item, sizeof(ArrayItem));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * FlexibleSingleGram::get_array_header:
+ * @header: the array header of this single gram.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array header of this single gram.
+ *
+ */
+ bool get_array_header(/* out */ ArrayHeader & header){
+ /* clear retval */
+ memset(&header, 0, sizeof(ArrayHeader));
+ char * buf_begin = (char *)m_chunk.begin();
+ memcpy(&header, buf_begin, sizeof(ArrayHeader));
+ return true;
+ }
+
+ /**
+ * FlexibleSingleGram::set_array_header:
+ * @header: the array header of this single gram.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the array header of this single gram.
+ *
+ */
+ bool set_array_header(/* in */ const ArrayHeader & header){
+ char * buf_begin = (char *)m_chunk.begin();
+ memcpy(buf_begin, &header, sizeof(ArrayHeader));
+ return true;
+ }
+};
+
+/**
+ * FlexibleBigram:
+ * @MagicHeader: the struct type of the magic header.
+ * @ArrayHeader: the struct type of the array header.
+ * @ArrayItem: the struct type of the array item.
+ *
+ * The flexible bi-gram is mainly used for training purposes.
+ *
+ */
+template<typename MagicHeader, typename ArrayHeader,
+ typename ArrayItem>
+class FlexibleBigram{
+    /* Note: some flexible bi-gram file format checks should be done here. */
+private:
+ DB * m_db;
+
+ phrase_token_t m_magic_header_index[2];
+
+ char m_magic_number[4];
+
+ void reset(){
+ if ( m_db ){
+ m_db->sync(m_db, 0);
+ m_db->close(m_db, 0);
+ m_db = NULL;
+ }
+ }
+
+public:
+ /**
+ * FlexibleBigram::FlexibleBigram:
+ * @magic_number: the 4-byte magic number of the flexible bi-gram.
+ *
+ * The constructor of the FlexibleBigram.
+ *
+ */
+ FlexibleBigram(const char * magic_number){
+ m_db = NULL;
+ m_magic_header_index[0] = null_token;
+ m_magic_header_index[1] = null_token;
+
+ memcpy(m_magic_number, magic_number, sizeof(m_magic_number));
+ }
+
+ /**
+ * FlexibleBigram::~FlexibleBigram:
+ *
+ * The destructor of the FlexibleBigram.
+ *
+ */
+ ~FlexibleBigram(){
+ reset();
+ }
+
+ /**
+ * FlexibleBigram::attach:
+ * @dbfile: the path name of the flexible bi-gram.
+ * @flags: the attach flags for the Berkeley DB.
+ * @returns: whether the attach operation is successful.
+ *
+ * Attach the Berkeley DB file on the filesystem for training purposes.
+ *
+ */
+ bool attach(const char * dbfile, guint32 flags){
+ reset();
+ u_int32_t db_flags = 0;
+
+ if ( flags & ATTACH_READONLY )
+ db_flags |= DB_RDONLY;
+ if ( flags & ATTACH_READWRITE )
+ assert( !(flags & ATTACH_READONLY ) );
+
+ if ( !dbfile )
+ return false;
+ int ret = db_create(&m_db, NULL, 0);
+ if ( ret != 0 )
+ assert(false);
+
+ ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
+ if ( ret != 0 && (flags & ATTACH_CREATE) ) {
+ db_flags |= DB_CREATE;
+ /* Create database file here, and write the signature. */
+ ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
+ if ( ret != 0 )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = m_magic_number;
+ db_data.size = sizeof(m_magic_number);
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(m_magic_number);
+
+ ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+ }
+
+ /* check the signature. */
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(m_magic_number);
+ ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+ if ( sizeof(m_magic_number) != db_data.size )
+ return false;
+ if ( memcmp(db_data.data, m_magic_number,
+ sizeof(m_magic_number)) == 0 )
+ return true;
+ return false;
+ }
+
+ /**
+ * FlexibleBigram::load:
+ * @index: the previous token in the flexible bi-gram.
+ * @single_gram: the single gram of the previous token.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the single gram of the previous token.
+ *
+ */
+ bool load(phrase_token_t index,
+ FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ single_gram = NULL;
+
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0)
+ return false;
+
+ single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem>
+ (db_data.data, db_data.size);
+
+ return true;
+ }
+
+ /**
+ * FlexibleBigram::store:
+ * @index: the previous token in the flexible bi-gram.
+ * @single_gram: the single gram of the previous token.
+ * @returns: whether the store operation is successful.
+ *
+ * Store the single gram of the previous token.
+ *
+ */
+ bool store(phrase_token_t index,
+ FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = single_gram->m_chunk.begin();
+ db_data.size = single_gram->m_chunk.size();
+
+ int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+ }
+
+ /**
+ * FlexibleBigram::remove:
+ * @index: the previous token in the flexible bi-gram.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove the single gram of the previous token.
+ *
+ */
+ bool remove(phrase_token_t index){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ int ret = m_db->del(m_db, NULL, &db_key, 0);
+ return ret == 0;
+ }
+
+ /**
+ * FlexibleBigram::get_all_items:
+ * @items: the GArray to store all previous tokens.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array of all previous tokens for parameter estimation.
+ *
+ */
+ bool get_all_items(GArray * items){
+ g_array_set_size(items, 0);
+
+ if ( !m_db )
+ return false;
+
+ DBC * cursorp = NULL;
+ DBT key, data;
+ int ret;
+
+ /* Get a cursor */
+ m_db->cursor(m_db, NULL, &cursorp, 0);
+
+ if (NULL == cursorp)
+ return false;
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* Iterate over the database, retrieving each record in turn. */
+ while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){
+ if (key.size != sizeof(phrase_token_t)){
+ /* skip magic header. */
+ continue;
+ }
+ phrase_token_t * token = (phrase_token_t *) key.data;
+ g_array_append_val(items, *token);
+ }
+
+ if ( ret != DB_NOTFOUND ){
+ fprintf(stderr, "training db error, exit!\n");
+
+ if (cursorp != NULL)
+ cursorp->c_close(cursorp);
+
+ exit(EIO);
+ }
+
+ /* Cursors must be closed */
+ if (cursorp != NULL)
+ cursorp->c_close(cursorp);
+ return true;
+ }
+
+ /**
+ * FlexibleBigram::get_magic_header:
+ * @header: the magic header.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the magic header of the flexible bi-gram.
+ *
+ */
+ bool get_magic_header(MagicHeader & header){
+ /* clear retval */
+ memset(&header, 0, sizeof(MagicHeader));
+
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = sizeof(m_magic_number);
+ db_data.dlen = sizeof(MagicHeader);
+
+ int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+
+ if ( sizeof(MagicHeader) != db_data.size )
+ return false;
+
+ memcpy(&header, db_data.data, sizeof(MagicHeader));
+ return true;
+ }
+
+ /**
+ * FlexibleBigram::set_magic_header:
+ * @header: the magic header.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the magic header of the flexible bi-gram.
+ *
+ */
+ bool set_magic_header(const MagicHeader & header){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = (void *) &header;
+ db_data.size = sizeof(MagicHeader);
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = sizeof(m_magic_number);
+ db_data.dlen = sizeof(MagicHeader);
+
+ int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+ }
+
+ /**
+ * FlexibleBigram::get_array_header:
+ * @index: the previous token in the flexible bi-gram.
+ * @header: the array header in the single gram of the previous token.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array header in the single gram of the previous token.
+ *
+ */
+ bool get_array_header(phrase_token_t index, ArrayHeader & header){
+ /* clear retval */
+ memset(&header, 0, sizeof(ArrayHeader));
+
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(ArrayHeader);
+ int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+
+ assert(db_data.size == sizeof(ArrayHeader));
+ memcpy(&header, db_data.data, sizeof(ArrayHeader));
+ return true;
+ }
+
+ /**
+ * FlexibleBigram::set_array_header:
+ * @index: the previous token of the flexible bi-gram.
+ * @header: the array header in the single gram of the previous token.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the array header in the single gram of the previous token.
+ *
+ */
+ bool set_array_header(phrase_token_t index, const ArrayHeader & header){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = (void *)&header;
+ db_data.size = sizeof(ArrayHeader);
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(ArrayHeader);
+
+ int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+ }
+
+};
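+
+/* Illustrative sketch (not part of the original sources): a training
+ * round-trip with FlexibleBigram, reusing the hypothetical ExampleHeader,
+ * ExampleItem and bump_freq names above; the magic number, the db path
+ * and prev_token/cur_token are placeholders.
+ *
+ * typedef FlexibleBigram<guint32, ExampleHeader, ExampleItem> ExampleBigram;
+ *
+ * ExampleBigram bigram("EXMP");
+ * if (!bigram.attach("/tmp/example.db", ATTACH_READWRITE | ATTACH_CREATE))
+ *     return;
+ *
+ * FlexibleSingleGram<ExampleHeader, ExampleItem> * gram = NULL;
+ * if (bigram.load(prev_token, gram)) {
+ *     bump_freq(gram, cur_token);
+ *     bigram.store(prev_token, gram);
+ *     delete gram;
+ * }
+ */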
+
+};
+
+#endif
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
new file mode 100644
index 0000000..3964388
--- /dev/null
+++ b/src/storage/ngram.cpp
@@ -0,0 +1,602 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <glib.h>
+#include <glib/gstdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "ngram.h"
+
+using namespace pinyin;
+
+struct SingleGramItem{
+ phrase_token_t m_token;
+ guint32 m_freq;
+};
+
+SingleGram::SingleGram(){
+ m_chunk.set_size(sizeof(guint32));
+ memset(m_chunk.begin(), 0, sizeof(guint32));
+}
+
+SingleGram::SingleGram(void * buffer, size_t length){
+ m_chunk.set_chunk(buffer, length, NULL);
+}
+
+bool SingleGram::get_total_freq(guint32 & total) const{
+ char * buf_begin = (char *)m_chunk.begin();
+ total = *((guint32 *)buf_begin);
+ return true;
+}
+
+bool SingleGram::set_total_freq(guint32 total){
+ char * buf_begin = (char *)m_chunk.begin();
+ *((guint32 *)buf_begin) = total;
+ return true;
+}
+
+guint32 SingleGram::get_length(){
+ /* get the number of items. */
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+ const guint32 length = end - begin;
+
+ if (0 == length) {
+ /* no items here, total freq should be zero. */
+ guint32 total_freq = 0;
+ assert(get_total_freq(total_freq));
+ assert(0 == total_freq);
+ }
+
+ return length;
+}
+
+guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
+ guint32 removed_items = 0;
+
+ guint32 total_freq = 0;
+ assert(get_total_freq(total_freq));
+
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+ for (const SingleGramItem * cur = begin; cur != end; ++cur) {
+ if ((cur->m_token & mask) != value)
+ continue;
+
+ total_freq -= cur->m_freq;
+ size_t offset = sizeof(guint32) +
+ sizeof(SingleGramItem) * (cur - begin);
+ m_chunk.remove_content(offset, sizeof(SingleGramItem));
+
+ /* update chunk end. */
+ end = (const SingleGramItem *) m_chunk.end();
+ ++removed_items;
+ --cur;
+ }
+
+ assert(set_total_freq(total_freq));
+ return removed_items;
+}
+
+bool SingleGram::prune(){
+ assert(false);
+#if 0
+ SingleGramItem * begin = (SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ SingleGramItem * end = (SingleGramItem *)m_chunk.end();
+
+ size_t nitem = 0;
+ for ( SingleGramItem * cur = begin; cur != end; ++cur){
+ cur->m_freq--;
+ nitem++;
+ if ( cur->m_freq == 0 ){
+ size_t offset = sizeof(guint32) + (cur - begin)
+ * sizeof(SingleGramItem) ;
+ m_chunk.remove_content(offset, sizeof(SingleGramItem));
+ }
+ }
+ guint32 total_freq;
+ assert(get_total_freq(total_freq));
+ assert(set_total_freq(total_freq - nitem));
+#endif
+ return true;
+}
+
+static bool token_less_than(const SingleGramItem & lhs, const SingleGramItem & rhs){
+ return lhs.m_token < rhs.m_token;
+}
+
+bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
+ const {
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+ guint32 total_freq;
+ BigramPhraseItemWithCount bigram_item_with_count;
+ assert(get_total_freq(total_freq));
+
+ for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
+ bigram_item_with_count.m_token = cur_item->m_token;
+ bigram_item_with_count.m_count = cur_item->m_freq;
+ bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
+ g_array_append_val(array, bigram_item_with_count);
+ }
+
+ return true;
+}
+
+bool SingleGram::search(/* in */ PhraseIndexRange * range,
+ /* out */ BigramPhraseArray array) const {
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+
+ SingleGramItem compare_item;
+ compare_item.m_token = range->m_range_begin;
+ const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ guint32 total_freq;
+ BigramPhraseItem bigram_item;
+ assert(get_total_freq(total_freq));
+
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token >= range->m_range_end )
+ break;
+ bigram_item.m_token = cur_item->m_token;
+ bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
+ g_array_append_val(array, bigram_item);
+ }
+
+ return true;
+}
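+
+/* Illustrative sketch (not part of the original sources): searching a
+ * single gram for the tokens inside a range; begin_token, end_token and
+ * single_gram are placeholders, and BigramPhraseArray is assumed to be a
+ * GArray of BigramPhraseItem as used above.
+ *
+ * BigramPhraseArray array = g_array_new(FALSE, FALSE,
+ *                                       sizeof(BigramPhraseItem));
+ * PhraseIndexRange range;
+ * range.m_range_begin = begin_token;
+ * range.m_range_end = end_token;
+ * single_gram->search(&range, array);
+ * // each element now carries m_token and its normalized m_freq
+ * g_array_free(array, TRUE);
+ */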
+
+bool SingleGram::insert_freq( /* in */ phrase_token_t token,
+ /* in */ guint32 freq){
+ SingleGramItem * begin = (SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ SingleGramItem * end = (SingleGramItem *) m_chunk.end();
+ SingleGramItem compare_item;
+ compare_item.m_token = token;
+ SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ SingleGramItem insert_item;
+ insert_item.m_token = token;
+ insert_item.m_freq = freq;
+ for ( ; cur_item != end; ++cur_item ){
+ if ( cur_item->m_token > token ){
+ size_t offset = sizeof(guint32) +
+ sizeof(SingleGramItem) * (cur_item - begin);
+ m_chunk.insert_content(offset, &insert_item,
+ sizeof(SingleGramItem));
+ return true;
+ }
+ if ( cur_item->m_token == token ){
+ return false;
+ }
+ }
+ m_chunk.insert_content(m_chunk.size(), &insert_item,
+ sizeof(SingleGramItem));
+ return true;
+}
+
+bool SingleGram::remove_freq( /* in */ phrase_token_t token,
+ /* out */ guint32 & freq){
+ freq = 0;
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+ SingleGramItem compare_item;
+ compare_item.m_token = token;
+ const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item ){
+ if ( cur_item->m_token > token )
+ return false;
+ if ( cur_item->m_token == token ){
+ freq = cur_item -> m_freq;
+ size_t offset = sizeof(guint32) +
+ sizeof(SingleGramItem) * (cur_item - begin);
+ m_chunk.remove_content(offset, sizeof(SingleGramItem));
+ return true;
+ }
+ }
+ return false;
+}
+
+bool SingleGram::get_freq(/* in */ phrase_token_t token,
+ /* out */ guint32 & freq) const {
+ freq = 0;
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+ SingleGramItem compare_item;
+ compare_item.m_token = token;
+ const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token > token )
+ return false;
+ if ( cur_item->m_token == token ){
+ freq = cur_item -> m_freq;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool SingleGram::set_freq( /* in */ phrase_token_t token,
+ /* in */ guint32 freq){
+ SingleGramItem * begin = (SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ SingleGramItem * end = (SingleGramItem *)m_chunk.end();
+ SingleGramItem compare_item;
+ compare_item.m_token = token;
+ SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ for ( ;cur_item != end; ++cur_item){
+ if ( cur_item->m_token > token ){
+ return false;
+ }
+ if ( cur_item->m_token == token ){
+ cur_item -> m_freq = freq;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool Bigram::load_db(const char * dbfile){
+ reset();
+
+ /* create an in-memory db. */
+ int ret = db_create(&m_db, NULL, 0);
+ assert(ret == 0);
+
+ ret = m_db->open(m_db, NULL, NULL, NULL,
+ DB_HASH, DB_CREATE, 0600);
+ if ( ret != 0 )
+ return false;
+
+ /* load db into memory. */
+ DB * tmp_db = NULL;
+ ret = db_create(&tmp_db, NULL, 0);
+ assert(ret == 0);
+
+ if (NULL == tmp_db)
+ return false;
+
+ ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
+ DB_HASH, DB_RDONLY, 0600);
+ if ( ret != 0 )
+ return false;
+
+ DBC * cursorp = NULL;
+ DBT key, data;
+
+ /* Get a cursor */
+ tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
+
+ if (NULL == cursorp)
+ return false;
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* Iterate over the database, retrieving each record in turn. */
+ while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+ int ret = m_db->put(m_db, NULL, &key, &data, 0);
+ assert(ret == 0);
+ }
+ assert (ret == DB_NOTFOUND);
+
+ /* Cursors must be closed */
+ if ( cursorp != NULL )
+ cursorp->c_close(cursorp);
+
+ if ( tmp_db != NULL )
+ tmp_db->close(tmp_db, 0);
+
+ return true;
+}
+
+bool Bigram::save_db(const char * dbfile){
+ DB * tmp_db = NULL;
+
+ int ret = unlink(dbfile);
+ if ( ret != 0 && errno != ENOENT)
+ return false;
+
+ ret = db_create(&tmp_db, NULL, 0);
+ assert(ret == 0);
+
+ if (NULL == tmp_db)
+ return false;
+
+ ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
+ DB_HASH, DB_CREATE, 0600);
+ if ( ret != 0 )
+ return false;
+
+ DBC * cursorp = NULL;
+ DBT key, data;
+ /* Get a cursor */
+ m_db->cursor(m_db, NULL, &cursorp, 0);
+
+ if (NULL == cursorp)
+ return false;
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* Iterate over the database, retrieving each record in turn. */
+ while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+ int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
+ assert(ret == 0);
+ }
+ assert (ret == DB_NOTFOUND);
+
+ /* Cursors must be closed */
+ if ( cursorp != NULL )
+ cursorp->c_close(cursorp);
+
+ if ( tmp_db != NULL )
+ tmp_db->close(tmp_db, 0);
+
+ return true;
+}
+
+bool Bigram::attach(const char * dbfile, guint32 flags){
+ reset();
+ u_int32_t db_flags = 0;
+
+ if ( flags & ATTACH_READONLY )
+ db_flags |= DB_RDONLY;
+ if ( flags & ATTACH_READWRITE )
+ assert( !( flags & ATTACH_READONLY ) );
+ if ( flags & ATTACH_CREATE )
+ db_flags |= DB_CREATE;
+
+ if ( !dbfile )
+ return false;
+ int ret = db_create(&m_db, NULL, 0);
+ if ( ret != 0 )
+ assert(false);
+
+ ret = m_db->open(m_db, NULL, dbfile, NULL,
+ DB_HASH, db_flags, 0644);
+ if ( ret != 0)
+ return false;
+
+ return true;
+}
+
+bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
+ single_gram = NULL;
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+
+ single_gram = new SingleGram(db_data.data, db_data.size);
+ return true;
+}
+
+bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = single_gram->m_chunk.begin();
+ db_data.size = single_gram->m_chunk.size();
+
+ int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+}
+
+bool Bigram::remove(/* in */ phrase_token_t index){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ int ret = m_db->del(m_db, NULL, &db_key, 0);
+ return 0 == ret;
+}
+
+bool Bigram::get_all_items(GArray * items){
+ g_array_set_size(items, 0);
+
+ if ( !m_db )
+ return false;
+
+ DBC * cursorp = NULL;
+ DBT key, data;
+ int ret;
+ /* Get a cursor */
+ m_db->cursor(m_db, NULL, &cursorp, 0);
+
+ if (NULL == cursorp)
+ return false;
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* Iterate over the database, retrieving each record in turn. */
+ while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+ assert(key.size == sizeof(phrase_token_t));
+ phrase_token_t * token = (phrase_token_t *)key.data;
+ g_array_append_val(items, *token);
+ }
+
+ assert (ret == DB_NOTFOUND);
+
+ /* Cursors must be closed */
+ if (cursorp != NULL)
+ cursorp->c_close(cursorp);
+
+ return true;
+}
+
+bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+ if (!get_all_items(items)) {
+ g_array_free(items, TRUE);
+ return false;
+ }
+
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t index = g_array_index(items, phrase_token_t, i);
+
+ if ((index & mask) == value) {
+ assert(remove(index));
+ continue;
+ }
+
+ SingleGram * gram = NULL;
+ assert(load(index, gram));
+
+ int num = gram->mask_out(mask, value);
+ if (0 == num) {
+ delete gram;
+ continue;
+ }
+
+ if (0 == gram->get_length()) {
+ assert(remove(index));
+ } else {
+ assert(store(index, gram));
+ }
+
+ delete gram;
+ }
+
+ g_array_free(items, TRUE);
+ return true;
+}
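+
+/* Illustrative note (not part of the original sources): mask_out removes
+ * every entry whose previous token satisfies (token & mask) == value, and
+ * prunes the matched items inside the remaining single grams the same way.
+ * Passing the library bits of the token layout as mask and a library index
+ * shifted into those bits as value therefore drops one phrase library from
+ * the bi-gram.
+ */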
+
+
+namespace pinyin{
+
+/* merge the original system info and the delta user info */
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+ const SingleGram * user){
+ if (NULL == system && NULL == user)
+ return false;
+
+ MemoryChunk & merged_chunk = merged->m_chunk;
+
+ if (NULL == system) {
+ merged_chunk.set_chunk(user->m_chunk.begin(),
+ user->m_chunk.size(), NULL);
+ return true;
+ }
+
+ if (NULL == user) {
+ merged_chunk.set_chunk(system->m_chunk.begin(),
+ system->m_chunk.size(), NULL);
+ return true;
+ }
+
+ /* clear merged. */
+ merged_chunk.set_size(sizeof(guint32));
+
+ /* merge the origin info and delta info */
+ guint32 system_total, user_total;
+ assert(system->get_total_freq(system_total));
+ assert(user->get_total_freq(user_total));
+ const guint32 merged_total = system_total + user_total;
+ merged_chunk.set_content(0, &merged_total, sizeof(guint32));
+
+ const SingleGramItem * cur_system = (const SingleGramItem *)
+ (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
+ const SingleGramItem * system_end = (const SingleGramItem *)
+ system->m_chunk.end();
+
+ const SingleGramItem * cur_user = (const SingleGramItem *)
+ (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
+ const SingleGramItem * user_end = (const SingleGramItem *)
+ user->m_chunk.end();
+
+ while (cur_system < system_end && cur_user < user_end) {
+
+ if (cur_system->m_token < cur_user->m_token) {
+ /* do append operation here */
+ merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+ cur_system++;
+ } else if (cur_system->m_token > cur_user->m_token) {
+ /* do append operation here */
+ merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+ cur_user++;
+ } else {
+ assert(cur_system->m_token == cur_user->m_token);
+
+ SingleGramItem merged_item;
+ merged_item.m_token = cur_system->m_token;
+ merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
+
+ merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
+ cur_system++; cur_user++;
+ }
+ }
+
+ /* add remained items. */
+ while (cur_system < system_end) {
+ merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+ cur_system++;
+ }
+
+ while (cur_user < user_end) {
+ merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+ cur_user++;
+ }
+
+ return true;
+}
+
+};
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
new file mode 100644
index 0000000..e4045a9
--- /dev/null
+++ b/src/storage/ngram.h
@@ -0,0 +1,329 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef NGRAM_H
+#define NGRAM_H
+
+#include <db.h>
+
+namespace pinyin{
+
+class Bigram;
+
+/** Note:
+ * The system single gram contains the trained freqs.
+ * The user single gram contains the delta freqs.
+ * During the Viterbi beam search, use merge_single_gram to merge the system
+ * single gram and the user single gram.
+ */
+
+
+/**
+ * SingleGram:
+ *
+ * The single gram in the bi-gram.
+ *
+ */
+class SingleGram{
+ friend class Bigram;
+ friend bool merge_single_gram(SingleGram * merged,
+ const SingleGram * system,
+ const SingleGram * user);
+
+private:
+ MemoryChunk m_chunk;
+ SingleGram(void * buffer, size_t length);
+public:
+ /**
+ * SingleGram::SingleGram:
+ *
+ * The constructor of the SingleGram.
+ *
+ */
+ SingleGram();
+ /**
+ * SingleGram::retrieve_all:
+ * @array: the GArray to store the retrieved bi-gram phrase item.
+ * @returns: whether the retrieve operation is successful.
+ *
+ * Retrieve all bi-gram phrase items in this single gram.
+ *
+ */
+ bool retrieve_all(/* out */ BigramPhraseWithCountArray array) const;
+
+ /**
+ * SingleGram::search:
+ * @range: the token range.
+ * @array: the GArray to store the matched bi-gram phrase item.
+ * @returns: whether the search operation is successful.
+ *
+ * Search the bi-gram phrase items according to the token range.
+ *
+ * Note: the result array may contain multiple items.
+ *
+ */
+ bool search(/* in */ PhraseIndexRange * range,
+ /* out */ BigramPhraseArray array) const;
+
+ /**
+ * SingleGram::insert_freq:
+ * @token: the phrase token.
+ * @freq: the freq of this token.
+ * @returns: whether the insert operation is successful.
+ *
+ * Insert the token with the freq.
+ *
+ */
+ bool insert_freq(/* in */ phrase_token_t token,
+ /* in */ guint32 freq);
+
+ /**
+ * SingleGram::remove_freq:
+ * @token: the phrase token.
+ * @freq: the freq of the removed token.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove the token.
+ *
+ */
+ bool remove_freq(/* in */ phrase_token_t token,
+ /* out */ guint32 & freq);
+
+ /**
+ * SingleGram::get_freq:
+ * @token: the phrase token.
+ * @freq: the freq of the token.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the freq of the token.
+ *
+ */
+ bool get_freq(/* in */ phrase_token_t token,
+ /* out */ guint32 & freq) const;
+
+ /**
+ * SingleGram::set_freq:
+ * @token: the phrase token.
+ * @freq: the freq of the token.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the freq of the token.
+ *
+ */
+ bool set_freq(/* in */ phrase_token_t token,
+ /* in */ guint32 freq);
+
+ /**
+ * SingleGram::get_total_freq:
+ * @total: the total freq of this single gram.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the total freq of this single gram.
+ *
+ */
+ bool get_total_freq(guint32 & total) const;
+
+ /**
+ * SingleGram::set_total_freq:
+ * @total: the total freq of this single gram.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the total freq of this single gram.
+ *
+ */
+ bool set_total_freq(guint32 total);
+
+ /**
+ * SingleGram::get_length:
+ * @returns: the number of items in this single gram.
+ *
+ * Get the number of items in this single gram.
+ *
+ */
+ guint32 get_length();
+
+ /**
+ * SingleGram::mask_out:
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: the number of removed items.
+ *
+ * Mask out the matched items in this single gram.
+ *
+ */
+ guint32 mask_out(phrase_token_t mask, phrase_token_t value);
+
+ /**
+ * SingleGram::prune:
+ * @returns: whether the prune operation is successful.
+ *
+ * Obsoleted by Katz k mixture model pruning.
+ *
+ */
+ bool prune();
+};
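+
+/* Illustrative sketch (not part of the original sources): keeping the
+ * per-token freq and the total freq of a SingleGram in sync; add_count
+ * is a hypothetical helper.
+ *
+ * void add_count(SingleGram * gram, phrase_token_t token, guint32 delta){
+ *     guint32 freq = 0, total = 0;
+ *     gram->get_total_freq(total);
+ *     if (gram->get_freq(token, freq))
+ *         gram->set_freq(token, freq + delta);
+ *     else
+ *         gram->insert_freq(token, delta);
+ *     gram->set_total_freq(total + delta);
+ * }
+ */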
+
+
+/**
+ * Bigram:
+ *
+ * The Bi-gram class.
+ *
+ */
+class Bigram{
+private:
+ DB * m_db;
+
+ void reset(){
+ if ( m_db ){
+ m_db->sync(m_db, 0);
+ m_db->close(m_db, 0);
+ m_db = NULL;
+ }
+ }
+
+public:
+ /**
+ * Bigram::Bigram:
+ *
+ * The constructor of the Bigram.
+ *
+ */
+ Bigram(){
+ m_db = NULL;
+ }
+
+ /**
+ * Bigram::~Bigram:
+ *
+ * The destructor of the Bigram.
+ *
+ */
+ ~Bigram(){
+ reset();
+ }
+
+ /**
+ * Bigram::load_db:
+ * @dbfile: the Berkeley DB file name.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the Berkeley DB into memory.
+ *
+ */
+ bool load_db(const char * dbfile);
+
+ /**
+ * Bigram::save_db:
+ * @dbfile: the Berkeley DB file name.
+ * @returns: whether the save operation is successful.
+ *
+ * Save the in-memory Berkeley DB to disk.
+ *
+ */
+ bool save_db(const char * dbfile);
+
+ /**
+ * Bigram::attach:
+ * @dbfile: the Berkeley DB file name.
+ * @flags: the flags of enum ATTACH_FLAG.
+ * @returns: whether the attach operation is successful.
+ *
+ * Attach this Bigram to the Berkeley DB file.
+ *
+ */
+ bool attach(const char * dbfile, guint32 flags);
+
+ /**
+ * Bigram::load:
+ * @index: the previous token in the bi-gram.
+ * @single_gram: the single gram of the previous token.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the single gram of the previous token.
+ *
+ */
+ bool load(/* in */ phrase_token_t index,
+ /* out */ SingleGram * & single_gram);
+
+ /**
+ * Bigram::store:
+ * @index: the previous token in the bi-gram.
+ * @single_gram: the single gram of the previous token.
+ * @returns: whether the store operation is successful.
+ *
+ * Store the single gram of the previous token.
+ *
+ */
+ bool store(/* in */ phrase_token_t index,
+ /* in */ SingleGram * single_gram);
+
+ /**
+ * Bigram::remove:
+ * @index: the previous token in the bi-gram.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove the single gram of the previous token.
+ *
+ */
+ bool remove(/* in */ phrase_token_t index);
+
+ /**
+ * Bigram::get_all_items:
+ * @items: the GArray to store all previous tokens.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array of all previous tokens for parameter estimation.
+ *
+ */
+ bool get_all_items(/* out */ GArray * items);
+
+ /**
+ * Bigram::mask_out:
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: whether the mask out operation is successful.
+ *
+ * Mask out the matched items.
+ *
+ */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
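+
+/* Illustrative sketch (not part of the original sources): recording one
+ * observed bi-gram (prev_token, cur_token) in a user bi-gram file,
+ * reusing the hypothetical add_count helper above; the db path and the
+ * tokens are placeholders.
+ *
+ * Bigram user_bigram;
+ * if (!user_bigram.attach("/tmp/user.db", ATTACH_READWRITE | ATTACH_CREATE))
+ *     return;
+ *
+ * SingleGram * gram = NULL;
+ * if (!user_bigram.load(prev_token, gram))
+ *     gram = new SingleGram;  // first observation of prev_token
+ *
+ * add_count(gram, cur_token, 1);
+ * user_bigram.store(prev_token, gram);
+ * delete gram;
+ */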
+
+/**
+ * merge_single_gram:
+ * @merged: the merged single gram of system and user single gram.
+ * @system: the system single gram to be merged.
+ * @user: the user single gram to be merged.
+ * @returns: whether the merge operation is successful.
+ *
+ * Merge the system and user single gram into one merged single gram.
+ *
+ * Note: Please keep the system and user single grams alive
+ * while using the merged single gram.
+ *
+ */
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+ const SingleGram * user);
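+
+/* Illustrative sketch (not part of the original sources): merging the
+ * system and user single grams of one previous token before a lookup;
+ * system_bigram and user_bigram are placeholder Bigram instances, and the
+ * source grams stay alive while the merged gram is queried.
+ *
+ * SingleGram * system_gram = NULL, * user_gram = NULL;
+ * system_bigram.load(prev_token, system_gram);
+ * user_bigram.load(prev_token, user_gram);
+ *
+ * SingleGram merged;
+ * if (merge_single_gram(&merged, system_gram, user_gram)) {
+ *     // query merged here, e.g. merged.get_freq(token, freq);
+ * }
+ *
+ * if (system_gram) delete system_gram;
+ * if (user_gram) delete user_gram;
+ */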
+
+};
+
+#endif
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp
new file mode 100644
index 0000000..5fe61c2
--- /dev/null
+++ b/src/storage/phrase_index.cpp
@@ -0,0 +1,860 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "phrase_index.h"
+#include "pinyin_custom2.h"
+
+using namespace pinyin;
+
+bool PhraseItem::set_n_pronunciation(guint8 n_prons){
+ m_chunk.set_content(sizeof(guint8), &n_prons, sizeof(guint8));
+ return true;
+}
+
+bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys,
+ guint32 & freq){
+ guint8 phrase_length = get_phrase_length();
+ table_offset_t offset = phrase_item_header +
+ phrase_length * sizeof(ucs4_t) +
+ index * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+
+ bool retval = m_chunk.get_content
+ (offset, keys, phrase_length * sizeof(ChewingKey));
+ if ( !retval )
+ return retval;
+ return m_chunk.get_content
+ (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32));
+}
+
+#if 0
+void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){
+ guint8 phrase_length = get_phrase_length();
+ set_n_pronunciation(get_n_pronunciation() + 1);
+ m_chunk.set_content(m_chunk.size(), keys,
+ phrase_length * sizeof(ChewingKey));
+ m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
+}
+#endif
+
+bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
+ guint8 phrase_length = get_phrase_length();
+ guint8 npron = get_n_pronunciation();
+ size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
+ char * buf_begin = (char *) m_chunk.begin();
+ guint32 total_freq = 0;
+
+ for (int i = 0; i < npron; ++i) {
+ char * chewing_begin = buf_begin + offset +
+ i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+ guint32 * freq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+
+ total_freq += *freq;
+
+ if (0 == pinyin_exact_compare2
+ (keys, (ChewingKey *)chewing_begin, phrase_length)) {
+ /* found the exact match pinyin keys. */
+
+ /* protect against total_freq overflow. */
+ if (delta > 0 && total_freq > total_freq + delta)
+ return false;
+
+ *freq += delta;
+ total_freq += delta;
+ return true;
+ }
+ }
+
+ set_n_pronunciation(npron + 1);
+ m_chunk.set_content(m_chunk.size(), keys,
+ phrase_length * sizeof(ChewingKey));
+ m_chunk.set_content(m_chunk.size(), &delta, sizeof(guint32));
+ return true;
+}
+
+void PhraseItem::remove_nth_pronunciation(size_t index){
+ guint8 phrase_length = get_phrase_length();
+ set_n_pronunciation(get_n_pronunciation() - 1);
+ size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t) +
+ index * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+ m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+}
+
+bool PhraseItem::get_phrase_string(ucs4_t * phrase){
+ guint8 phrase_length = get_phrase_length();
+ return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
+}
+
+bool PhraseItem::set_phrase_string(guint8 phrase_length, ucs4_t * phrase){
+ m_chunk.set_content(0, &phrase_length, sizeof(guint8));
+ m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t));
+ return true;
+}
+
+void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options,
+ ChewingKey * keys,
+ gint32 delta){
+ guint8 phrase_length = get_phrase_length();
+ guint8 npron = get_n_pronunciation();
+ size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t);
+ char * buf_begin = (char *) m_chunk.begin();
+ guint32 total_freq = 0;
+
+ for (int i = 0; i < npron; ++i) {
+ char * chewing_begin = buf_begin + offset +
+ i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+ guint32 * freq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+ total_freq += *freq;
+
+ if (0 == pinyin_compare_with_ambiguities2
+ (options, keys,
+ (ChewingKey *)chewing_begin, phrase_length)) {
+
+ /* protect against total_freq overflow. */
+ if (delta > 0 && total_freq > total_freq + delta)
+ return;
+
+ *freq += delta;
+ total_freq += delta;
+ }
+ }
+}
+
+
+guint32 SubPhraseIndex::get_phrase_index_total_freq(){
+ return m_total_freq;
+}
+
+int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
+ table_offset_t offset;
+ guint32 freq;
+ bool result = m_phrase_index.get_content
+ ((token & PHRASE_MASK)
+ * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
+
+ if ( !result )
+ return ERROR_OUT_OF_RANGE;
+
+ if ( 0 == offset )
+ return ERROR_NO_ITEM;
+
+ result = m_phrase_content.get_content
+ (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
+
+ if ( !result )
+ return ERROR_FILE_CORRUPTION;
+
+ // protect against total_freq overflow.
+ if ( delta > 0 && m_total_freq > m_total_freq + delta )
+ return ERROR_INTEGER_OVERFLOW;
+
+ freq += delta;
+ m_total_freq += delta;
+ m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));
+
+ return ERROR_OK;
+}
+
+int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
+ table_offset_t offset;
+ guint8 phrase_length;
+ guint8 n_prons;
+
+ bool result = m_phrase_index.get_content
+ ((token & PHRASE_MASK)
+ * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
+
+ if ( !result )
+ return ERROR_OUT_OF_RANGE;
+
+ if ( 0 == offset )
+ return ERROR_NO_ITEM;
+
+ result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
+ if ( !result )
+ return ERROR_FILE_CORRUPTION;
+
+ result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
+ if ( !result )
+ return ERROR_FILE_CORRUPTION;
+
+ size_t length = phrase_item_header +
+ phrase_length * sizeof(ucs4_t) +
+ n_prons * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+ item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
+ return ERROR_OK;
+}
+
+int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
+ table_offset_t offset = m_phrase_content.size();
+ if ( 0 == offset )
+ offset = 8;
+ m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
+ m_phrase_index.set_content((token & PHRASE_MASK)
+ * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
+ m_total_freq += item->get_unigram_frequency();
+ return ERROR_OK;
+}
+
+int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
+ PhraseItem old_item;
+
+ int result = get_phrase_item(token, old_item);
+ if (result != ERROR_OK)
+ return result;
+
+ item = new PhraseItem;
+ // implicitly copy data from m_phrase_content.
+ item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size());
+
+ const table_offset_t zero_const = 0;
+ m_phrase_index.set_content((token & PHRASE_MASK)
+ * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t));
+ m_total_freq -= item->get_unigram_frequency();
+ return ERROR_OK;
+}
+
+bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrases ){
+ sub_phrases = new SubPhraseIndex;
+ }
+
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+ bool retval = sub_phrases->load(chunk, 0, chunk->size());
+ if ( !retval )
+ return retval;
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+ return retval;
+}
+
+bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
+ table_offset_t end;
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrases )
+ return false;
+
+ sub_phrases->store(new_chunk, 0, end);
+ return true;
+}
+
+bool FacadePhraseIndex::unload(guint8 phrase_index){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrases )
+ return false;
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+ delete sub_phrases;
+ sub_phrases = NULL;
+ return true;
+}
+
+bool FacadePhraseIndex::diff(guint8 phrase_index, MemoryChunk * oldchunk,
+ MemoryChunk * newlog){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrases )
+ return false;
+
+ SubPhraseIndex old_sub_phrases;
+ old_sub_phrases.load(oldchunk, 0, oldchunk->size());
+ PhraseIndexLogger logger;
+
+ bool retval = sub_phrases->diff(&old_sub_phrases, &logger);
+ logger.store(newlog);
+ return retval;
+}
+
+bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrases )
+ return false;
+
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+ PhraseIndexLogger logger;
+ logger.load(log);
+
+ bool retval = sub_phrases->merge(&logger);
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+
+ return retval;
+}
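+
+/* Illustrative sketch (not part of the original sources): the delta
+ * round-trip built from diff() and merge(); facade, index and oldchunk
+ * are placeholders, where oldchunk holds the previously saved image of
+ * the sub phrase index.
+ *
+ * MemoryChunk * log = new MemoryChunk;
+ * facade.diff(index, oldchunk, log);  // record changes since oldchunk
+ * // ... later, after the original data has been loaded again ...
+ * facade.merge(index, log);           // replay the recorded changes
+ * delete log;
+ */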
+
+bool FacadePhraseIndex::merge_with_mask(guint8 phrase_index,
+ MemoryChunk * log,
+ phrase_token_t mask,
+ phrase_token_t value){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrases )
+ return false;
+
+ /* check mask and value. */
+ phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
+ phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
+ if ((phrase_index & index_mask) != index_value)
+ return false;
+
+ /* unload old sub phrase index */
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+
+ /* calculate the sub phrase index mask and value. */
+ mask &= PHRASE_MASK; value &= PHRASE_MASK;
+
+ /* prepare the new logger. */
+ PhraseIndexLogger oldlogger;
+ oldlogger.load(log);
+ PhraseIndexLogger * newlogger = mask_out_phrase_index_logger
+ (&oldlogger, mask, value);
+
+ bool retval = sub_phrases->merge(newlogger);
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+ delete newlogger;
+
+ return retval;
+}
+
+
+bool SubPhraseIndex::load(MemoryChunk * chunk,
+ table_offset_t offset, table_offset_t end){
+ //save the memory chunk
+ if ( m_chunk ){
+ delete m_chunk;
+ m_chunk = NULL;
+ }
+ m_chunk = chunk;
+
+ char * buf_begin = (char *)chunk->begin();
+ chunk->get_content(offset, &m_total_freq, sizeof(guint32));
+ offset += sizeof(guint32);
+ table_offset_t index_one, index_two, index_three;
+ chunk->get_content(offset, &index_one, sizeof(table_offset_t));
+ offset += sizeof(table_offset_t);
+ chunk->get_content(offset, &index_two, sizeof(table_offset_t));
+ offset += sizeof(table_offset_t);
+ chunk->get_content(offset, &index_three, sizeof(table_offset_t));
+ offset += sizeof(table_offset_t);
+ g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
+ g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
+ g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
+ m_phrase_index.set_chunk(buf_begin + index_one,
+ index_two - 1 - index_one, NULL);
+ m_phrase_content.set_chunk(buf_begin + index_two,
+ index_three - 1 - index_two, NULL);
+ g_return_val_if_fail( index_three <= end, FALSE);
+ return true;
+}
+
+bool SubPhraseIndex::store(MemoryChunk * new_chunk,
+ table_offset_t offset, table_offset_t& end){
+ new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
+ table_offset_t index = offset + sizeof(guint32);
+
+ offset = index + sizeof(table_offset_t) * 3 ;
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
+ offset += m_phrase_index.size();
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+
+ new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
+ offset += m_phrase_content.size();
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ return true;
+}
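+
+/* Illustrative note (not part of the original sources): the chunk layout
+ * written by store() and checked by load() above is:
+ *
+ *   guint32        m_total_freq
+ *   table_offset_t index_one    (start of the phrase index)
+ *   table_offset_t index_two    (start of the phrase content)
+ *   table_offset_t index_three  (end offset)
+ *   c_separate
+ *   m_phrase_index bytes, c_separate
+ *   m_phrase_content bytes, c_separate
+ */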
+
+bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
+ /* diff the header */
+ MemoryChunk oldheader, newheader;
+ guint32 total_freq = oldone->get_phrase_index_total_freq();
+ oldheader.set_content(0, &total_freq, sizeof(guint32));
+ total_freq = get_phrase_index_total_freq();
+ newheader.set_content(0, &total_freq, sizeof(guint32));
+ logger->append_record(LOG_MODIFY_HEADER, null_token,
+ &oldheader, &newheader);
+
+ /* diff phrase items */
+ PhraseIndexRange oldrange, currange, range;
+ oldone->get_range(oldrange); get_range(currange);
+ range.m_range_begin = std_lite::min(oldrange.m_range_begin,
+ currange.m_range_begin);
+ range.m_range_end = std_lite::max(oldrange.m_range_end,
+ currange.m_range_end);
+ PhraseItem olditem, newitem;
+
+ for (phrase_token_t token = range.m_range_begin;
+ token < range.m_range_end; ++token ){
+ bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
+ bool newretval = ERROR_OK == get_phrase_item(token, newitem);
+
+ if ( oldretval ){
+ if ( newretval ) { /* compare phrase item. */
+ if ( olditem == newitem )
+ continue;
+ logger->append_record(LOG_MODIFY_RECORD, token,
+ &(olditem.m_chunk), &(newitem.m_chunk));
+ } else { /* remove phrase item. */
+ logger->append_record(LOG_REMOVE_RECORD, token,
+ &(olditem.m_chunk), NULL);
+ }
+ } else {
+ if ( newretval ){ /* add phrase item. */
+ logger->append_record(LOG_ADD_RECORD, token,
+ NULL, &(newitem.m_chunk));
+ } else { /* both empty. */
+ /* do nothing. */
+ }
+ }
+ }
+
+ return true;
+}
+
+bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
+ LOG_TYPE log_type; phrase_token_t token;
+ MemoryChunk oldchunk, newchunk;
+ PhraseItem olditem, newitem, item, * tmpitem;
+
+ while(logger->has_next_record()){
+ bool retval = logger->next_record
+ (log_type, token, &oldchunk, &newchunk);
+
+ if (!retval)
+ break;
+
+ switch(log_type){
+ case LOG_ADD_RECORD:{
+ assert( 0 == oldchunk.size() );
+ newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+ NULL);
+ add_phrase_item(token, &newitem);
+ break;
+ }
+ case LOG_REMOVE_RECORD:{
+ assert( 0 == newchunk.size() );
+ tmpitem = NULL;
+ remove_phrase_item(token, tmpitem);
+
+ olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+ NULL);
+
+ if (olditem != *tmpitem) {
+ delete tmpitem;
+ return false;
+ }
+
+ delete tmpitem;
+
+ break;
+ }
+ case LOG_MODIFY_RECORD:{
+ get_phrase_item(token, item);
+ olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+ NULL);
+ newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+ NULL);
+ if (item != olditem)
+ return false;
+
+ if (newchunk.size() > item.m_chunk.size() ){ /* increase size. */
+ tmpitem = NULL;
+ remove_phrase_item(token, tmpitem);
+ assert(olditem == *tmpitem);
+ add_phrase_item(token, &newitem);
+ delete tmpitem;
+ } else { /* in place editing. */
+ /* newchunk.size() <= item.m_chunk.size() */
+ /* Hack here: we assume that get_phrase_item points the item
+ * at the actual data position, so changes to the item
+ * are saved in the SubPhraseIndex immediately.
+ */
+ memmove(item.m_chunk.begin(), newchunk.begin(),
+ newchunk.size());
+ }
+ break;
+ }
+ case LOG_MODIFY_HEADER:{
+ guint32 total_freq = get_phrase_index_total_freq();
+ guint32 tmp_freq = 0;
+ assert(null_token == token);
+ assert(oldchunk.size() == newchunk.size());
+ oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
+ if (total_freq != tmp_freq)
+ return false;
+ newchunk.get_content(0, &tmp_freq, sizeof(guint32));
+ m_total_freq = tmp_freq;
+ break;
+ }
+ default:
+ assert(false);
+ }
+ }
+ return true;
+}
+
+bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrases ){
+ sub_phrases = new SubPhraseIndex;
+ }
+
+ char pinyin[256];
+ char phrase[256];
+ phrase_token_t token;
+ size_t freq;
+
+ PhraseItem * item_ptr = new PhraseItem;
+ phrase_token_t cur_token = 0;
+
+ while (!feof(infile)){
+ int num = fscanf(infile, "%s %s %u %ld",
+ pinyin, phrase, &token, &freq);
+
+ if (4 != num)
+ continue;
+
+ if (feof(infile))
+ break;
+
+ assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );
+
+ glong written;
+ ucs4_t * phrase_ucs4 = g_utf8_to_ucs4(phrase, -1, NULL,
+ &written, NULL);
+
+ if ( 0 == cur_token ){
+ cur_token = token;
+ item_ptr->set_phrase_string(written, phrase_ucs4);
+ }
+
+ if ( cur_token != token ){
+ add_phrase_item( cur_token, item_ptr);
+ delete item_ptr;
+ item_ptr = new PhraseItem;
+ cur_token = token;
+ item_ptr->set_phrase_string(written, phrase_ucs4);
+ }
+
+ pinyin_option_t options = USE_TONE;
+ FullPinyinParser2 parser;
+ ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ ChewingKeyRestVector key_rests =
+ g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+
+ parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+
+ if (item_ptr->get_phrase_length() == keys->len) {
+ item_ptr->add_pronunciation((ChewingKey *)keys->data, freq);
+ } else {
+ fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
+ pinyin, phrase);
+ }
+
+ g_array_free(keys, TRUE);
+ g_array_free(key_rests, TRUE);
+ g_free(phrase_ucs4);
+ }
+
+ add_phrase_item( cur_token, item_ptr);
+ delete item_ptr;
+#if 0
+ m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
+#endif
+ return true;
+}
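+
+/* Illustrative note (not part of the original sources): load_text above
+ * consumes one record per line of the form
+ *     pinyin  phrase  token  freq
+ * for example (with made-up values):
+ *     ni'hao  你好  16777218  345
+ * where the token must belong to the phrase_index library, and
+ * consecutive lines sharing a token add pronunciations to one item.
+ */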
+
+int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index,
+ guint8 & max_index){
+ min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0;
+ for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
+ if ( m_sub_phrase_indices[i] ) {
+ min_index = std_lite::min(min_index, i);
+ max_index = std_lite::max(max_index, i);
+ }
+ }
+ return ERROR_OK;
+}
+
+int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){
+ SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index];
+ if ( !sub_phrase )
+ return ERROR_NO_SUB_PHRASE_INDEX;
+
+ int result = sub_phrase->get_range(range);
+ if ( result )
+ return result;
+
+ range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin);
+ range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end);
+ return ERROR_OK;
+}
+
+int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){
+ const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin();
+ const table_offset_t * end = (const table_offset_t *)m_phrase_index.end();
+
+ if (begin == end) {
+ /* skip empty sub phrase index. */
+ range.m_range_begin = 1;
+ range.m_range_end = 1;
+ return ERROR_OK;
+ }
+
+ /* remove trailing zeros. */
+ const table_offset_t * poffset = 0;
+ for (poffset = end - 1; poffset >= begin + 1; --poffset) {
+ if (0 != *poffset)
+ break;
+ }
+
+ range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */
+ range.m_range_end = poffset + 1 - begin; /* removed zeros. */
+
+ return ERROR_OK;
+}
+
+bool FacadePhraseIndex::compact(){
+ for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
+ SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
+ if ( !sub_phrase )
+ continue;
+
+ PhraseIndexRange range;
+ int result = sub_phrase->get_range(range);
+ if ( result != ERROR_OK )
+ continue;
+
+ SubPhraseIndex * new_sub_phrase = new SubPhraseIndex;
+
+ PhraseItem item;
+ for ( phrase_token_t token = range.m_range_begin;
+ token < range.m_range_end;
+ ++token ) {
+ result = sub_phrase->get_phrase_item(token, item);
+ if ( result != ERROR_OK )
+ continue;
+ new_sub_phrase->add_phrase_item(token, &item);
+ }
+
+ delete sub_phrase;
+ m_sub_phrase_indices[index] = new_sub_phrase;
+ }
+ return true;
+}
+
+bool SubPhraseIndex::mask_out(phrase_token_t mask, phrase_token_t value){
+ PhraseIndexRange range;
+ if (ERROR_OK != get_range(range))
+ return false;
+
+ /* calculate mask and value for sub phrase index. */
+ mask &= PHRASE_MASK; value &= PHRASE_MASK;
+
+ for (phrase_token_t token = range.m_range_begin;
+ token < range.m_range_end; ++token) {
+ if ((token & mask) != value)
+ continue;
+
+ PhraseItem * item = NULL;
+ remove_phrase_item(token, item);
+ if (item)
+ delete item;
+ }
+
+ return true;
+}
+
+bool FacadePhraseIndex::mask_out(guint8 phrase_index,
+ phrase_token_t mask,
+ phrase_token_t value){
+ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+ if (!sub_phrases)
+ return false;
+
+ /* check mask and value. */
+ phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask);
+ phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value);
+
+ if ((phrase_index & index_mask ) != index_value)
+ return false;
+
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+ bool retval = sub_phrases->mask_out(mask, value);
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+
+ return retval;
+}
+
+namespace pinyin{
+
+
+static bool _peek_header(PhraseIndexLogger * logger,
+ guint32 & old_total_freq){
+ old_total_freq = 0;
+
+ size_t header_count = 0;
+ LOG_TYPE log_type; phrase_token_t token;
+ MemoryChunk oldchunk, newchunk;
+
+ while (logger->has_next_record()) {
+ bool retval = logger->next_record
+ (log_type, token, &oldchunk, &newchunk);
+
+ if (!retval)
+ break;
+
+ if (LOG_MODIFY_HEADER != log_type)
+ continue;
+
+ ++header_count;
+
+ oldchunk.get_content(0, &old_total_freq, sizeof(guint32));
+ }
+
+ /* 1 for normal case, 0 for corrupted file. */
+ assert(1 >= header_count);
+
+ return 1 == header_count;
+}
+
+bool _compute_new_header(PhraseIndexLogger * logger,
+ phrase_token_t mask,
+ phrase_token_t value,
+ guint32 & new_total_freq) {
+
+ LOG_TYPE log_type; phrase_token_t token;
+ MemoryChunk oldchunk, newchunk;
+ PhraseItem olditem, newitem;
+
+ while(logger->has_next_record()) {
+ bool retval = logger->next_record
+ (log_type, token, &oldchunk, &newchunk);
+
+ if (!retval)
+ break;
+
+ if (LOG_MODIFY_HEADER == log_type)
+ continue;
+
+ if ((token & mask) == value)
+ continue;
+
+ switch(log_type) {
+ case LOG_ADD_RECORD:{
+ assert( 0 == oldchunk.size() );
+ newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+ NULL);
+ new_total_freq += newitem.get_unigram_frequency();
+ break;
+ }
+ case LOG_REMOVE_RECORD:{
+ assert( 0 == newchunk.size() );
+ olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+ NULL);
+ new_total_freq -= olditem.get_unigram_frequency();
+ break;
+ }
+ case LOG_MODIFY_RECORD:{
+ olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+ NULL);
+ new_total_freq -= olditem.get_unigram_frequency();
+
+ newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+ NULL);
+ new_total_freq += newitem.get_unigram_frequency();
+ break;
+ }
+ default:
+ assert(false);
+ }
+ }
+
+ return true;
+}
+
+static bool _write_header(PhraseIndexLogger * logger,
+ guint32 & old_total_freq,
+ guint32 & new_total_freq) {
+ MemoryChunk oldheader, newheader;
+ oldheader.set_content(0, &old_total_freq, sizeof(guint32));
+ newheader.set_content(0, &new_total_freq, sizeof(guint32));
+ logger->append_record(LOG_MODIFY_HEADER, null_token,
+ &oldheader, &newheader);
+ return true;
+}
+
+static bool _mask_out_records(PhraseIndexLogger * oldlogger,
+ phrase_token_t mask,
+ phrase_token_t value,
+ PhraseIndexLogger * newlogger) {
+ LOG_TYPE log_type; phrase_token_t token;
+ MemoryChunk oldchunk, newchunk;
+
+ while(oldlogger->has_next_record()) {
+ bool retval = oldlogger->next_record
+ (log_type, token, &oldchunk, &newchunk);
+
+ if (!retval)
+ break;
+
+ if (LOG_MODIFY_HEADER == log_type)
+ continue;
+
+ if ((token & mask) == value)
+ continue;
+
+ newlogger->append_record(log_type, token, &oldchunk, &newchunk);
+ }
+
+ return true;
+}
+
+PhraseIndexLogger * mask_out_phrase_index_logger
+(PhraseIndexLogger * oldlogger, phrase_token_t mask,
+ phrase_token_t value) {
+ PhraseIndexLogger * newlogger = new PhraseIndexLogger;
+ guint32 old_total_freq = 0, new_total_freq = 0;
+
+ /* peek the header value. */
+ if (!_peek_header(oldlogger, old_total_freq))
+ return newlogger;
+
+ new_total_freq = old_total_freq;
+
+ /* compute the new header based on add/modify/remove records. */
+ oldlogger->rewind();
+ if (!_compute_new_header(oldlogger, mask, value, new_total_freq))
+ return newlogger;
+
+ /* write out the modify header record. */
+ _write_header(newlogger, old_total_freq, new_total_freq);
+
+ /* mask out the matched records. */
+ oldlogger->rewind();
+ _mask_out_records(oldlogger, mask, value, newlogger);
+
+ return newlogger;
+}
+
+};
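
A minimal usage sketch of mask_out_phrase_index_logger defined above, assuming the user delta log has already been loaded into a PhraseIndexLogger; the mask/value pair and the surrounding setup are illustrative, not part of the patch.

    /* rewrite a user delta log: records whose token matches
     * (token & mask) == value are dropped, and a corrected
     * total-frequency header record is written first. */
    PhraseIndexLogger * oldlogger = new PhraseIndexLogger;
    /* ... oldlogger->load(chunk) with the user's delta chunk ... */

    phrase_token_t mask  = PHRASE_MASK;   /* match on the phrase part only */
    phrase_token_t value = 0x2;           /* illustrative value */

    PhraseIndexLogger * newlogger =
        mask_out_phrase_index_logger(oldlogger, mask, value);

    MemoryChunk newchunk;
    newlogger->store(&newchunk);          /* newchunk now holds the rewritten log */

    delete oldlogger;
    delete newlogger;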
diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h
new file mode 100644
index 0000000..e1dad0b
--- /dev/null
+++ b/src/storage/phrase_index.h
@@ -0,0 +1,839 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef PHRASE_INDEX_H
+#define PHRASE_INDEX_H
+
+#include <stdio.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "chewing_key.h"
+#include "pinyin_parser2.h"
+#include "pinyin_phrase2.h"
+#include "memory_chunk.h"
+#include "phrase_index_logger.h"
+
+/**
+ * Phrase Index File Format
+ *
+ * Indirect Index: Index by Token
+ * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ * + Phrase Offset + Phrase Offset + Phrase Offset + ...... +
+ * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ * Phrase Content:
+ * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ * + Phrase Length + number of Pronunciations + Uni-gram Frequency+
+ * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ * + Phrase String(UCS4) + n Pronunciations with Frequency +
+ * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ */
+
+namespace pinyin{
+
+/* Store delta info by phrase index logger in user home directory.
+ */
+
+const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32);
+
+/**
+ * PhraseItem:
+ *
+ * The PhraseItem to access the items in phrase index.
+ *
+ */
+class PhraseItem{
+ friend class SubPhraseIndex;
+ friend bool _compute_new_header(PhraseIndexLogger * logger,
+ phrase_token_t mask,
+ phrase_token_t value,
+ guint32 & new_total_freq);
+
+private:
+ MemoryChunk m_chunk;
+ bool set_n_pronunciation(guint8 n_prouns);
+public:
+ /**
+ * PhraseItem::PhraseItem:
+ *
+ * The constructor of the PhraseItem.
+ *
+ */
+ PhraseItem(){
+ m_chunk.set_size(phrase_item_header);
+ memset(m_chunk.begin(), 0, m_chunk.size());
+ }
+
+#if 0
+ PhraseItem(MemoryChunk & chunk){
+ m_chunk.set_content(0, chunk->begin(), chunk->size());
+ assert ( m_chunk.size() >= phrase_item_header);
+ }
+#endif
+
+ /**
+ * PhraseItem::get_phrase_length:
+ * @returns: the length of this phrase item.
+ *
+ * Get the length of this phrase item.
+ *
+ */
+ guint8 get_phrase_length(){
+ char * buf_begin = (char *)m_chunk.begin();
+ return (*(guint8 *)buf_begin);
+ }
+
+ /**
+ * PhraseItem::get_n_pronunciation:
+ * @returns: the number of the pronunciations.
+ *
+ * Get the number of the pronunciations.
+ *
+ */
+ guint8 get_n_pronunciation(){
+ char * buf_begin = ( char *) m_chunk.begin();
+ return (*(guint8 *)(buf_begin + sizeof(guint8)));
+ }
+
+ /**
+ * PhraseItem::get_unigram_frequency:
+ * @returns: the uni-gram frequency of this phrase item.
+ *
+ * Get the uni-gram frequency of this phrase item.
+ *
+ */
+ guint32 get_unigram_frequency(){
+ char * buf_begin = (char *)m_chunk.begin();
+ return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
+ }
+
+ /**
+ * PhraseItem::get_pronunciation_possibility:
+ * @options: the pinyin options.
+ * @keys: the pronunciation keys.
+ * @returns: the possibility of this phrase item pronounces the pinyin.
+ *
+ * Get the possibility of this phrase item pronounces the pinyin.
+ *
+ */
+ gfloat get_pronunciation_possibility(pinyin_option_t options,
+ ChewingKey * keys){
+ guint8 phrase_length = get_phrase_length();
+ guint8 npron = get_n_pronunciation();
+ size_t offset = phrase_item_header + phrase_length * sizeof (ucs4_t);
+ char * buf_begin = (char *)m_chunk.begin();
+ guint32 matched = 0, total_freq =0;
+ for ( int i = 0 ; i < npron ; ++i){
+ char * chewing_begin = buf_begin + offset +
+ i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
+ guint32 * freq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+ total_freq += *freq;
+ if ( 0 == pinyin_compare_with_ambiguities2
+ (options, keys,
+ (ChewingKey *)chewing_begin,phrase_length) ){
+ matched += *freq;
+ }
+ }
+
+#if 1
+ /* an additional safe guard for chewing. */
+ if ( 0 == total_freq )
+ return 0;
+#endif
+
+ /* gen_chewing_table pre-processes the data to avoid zero frequencies. */
+ gfloat retval = matched / (gfloat) total_freq;
+ return retval;
+ }
+
+ /**
+ * PhraseItem::increase_pronunciation_possibility:
+ * @options: the pinyin options.
+ * @keys: the pronunciation keys.
+ * @delta: the delta to be added to the pronunciation keys.
+ *
+ * Add the delta to the pronunciation of the pronunciation keys.
+ *
+ */
+ void increase_pronunciation_possibility(pinyin_option_t options,
+ ChewingKey * keys,
+ gint32 delta);
+
+ /**
+ * PhraseItem::get_phrase_string:
+ * @phrase: the ucs4 character buffer.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the ucs4 characters of this phrase item.
+ *
+ */
+ bool get_phrase_string(ucs4_t * phrase);
+
+ /**
+ * PhraseItem::set_phrase_string:
+ * @phrase_length: the ucs4 character length of this phrase item.
+ * @phrase: the ucs4 character buffer.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the length and ucs4 characters of this phrase item.
+ *
+ */
+ bool set_phrase_string(guint8 phrase_length, ucs4_t * phrase);
+
+ /**
+ * PhraseItem::get_nth_pronunciation:
+ * @index: the pronunciation index.
+ * @keys: the pronunciation keys.
+ * @freq: the frequency of the pronunciation.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the nth pronunciation of this phrase item.
+ *
+ */
+ bool get_nth_pronunciation(size_t index,
+ /* out */ ChewingKey * keys,
+ /* out */ guint32 & freq);
+
+ /**
+ * PhraseItem::add_pronunciation:
+ * @keys: the pronunciation keys.
+ * @delta: the delta of the frequency of the pronunciation.
+ * @returns: whether the add operation is successful.
+ *
+ * Add one pronunciation.
+ *
+ */
+ bool add_pronunciation(ChewingKey * keys, guint32 delta);
+
+ /**
+ * PhraseItem::remove_nth_pronunciation:
+ * @index: the pronunciation index.
+ *
+ * Remove the nth pronunciation.
+ *
+ * Note: Normally don't change the first pronunciation,
+ * which decides the token number.
+ *
+ */
+ void remove_nth_pronunciation(size_t index);
+
+ bool operator == (const PhraseItem & rhs) const{
+ if (m_chunk.size() != rhs.m_chunk.size())
+ return false;
+ return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
+ m_chunk.size()) == 0;
+ }
+
+ bool operator != (const PhraseItem & rhs) const{
+ return ! (*this == rhs);
+ }
+};
+
+/*
+ * In Sub Phrase Index, token == (token & PHRASE_MASK).
+ */
+
+/**
+ * SubPhraseIndex:
+ *
+ * The SubPhraseIndex class for internal usage.
+ *
+ */
+class SubPhraseIndex{
+private:
+ guint32 m_total_freq;
+ MemoryChunk m_phrase_index;
+ MemoryChunk m_phrase_content;
+ MemoryChunk * m_chunk;
+
+ void reset(){
+ m_total_freq = 0;
+ m_phrase_index.set_size(0);
+ m_phrase_content.set_size(0);
+ if ( m_chunk ){
+ delete m_chunk;
+ m_chunk = NULL;
+ }
+ }
+
+public:
+ /**
+ * SubPhraseIndex::SubPhraseIndex:
+ *
+ * The constructor of the SubPhraseIndex.
+ *
+ */
+ SubPhraseIndex():m_total_freq(0){
+ m_chunk = NULL;
+ }
+
+ /**
+ * SubPhraseIndex::~SubPhraseIndex:
+ *
+ * The destructor of the SubPhraseIndex.
+ *
+ */
+ ~SubPhraseIndex(){
+ reset();
+ }
+
+ /**
+ * SubPhraseIndex::load:
+ * @chunk: the memory chunk of the binary sub phrase index.
+ * @offset: the begin of binary data in the memory chunk.
+ * @end: the end of binary data in the memory chunk.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the sub phrase index from the memory chunk.
+ *
+ */
+ bool load(MemoryChunk * chunk,
+ table_offset_t offset, table_offset_t end);
+
+ /**
+ * SubPhraseIndex::store:
+ * @new_chunk: the new memory chunk to store this sub phrase index.
+ * @offset: the begin of binary data in the memory chunk.
+ * @end: the end of stored binary data in the memory chunk.
+ * @returns: whether the store operation is successful.
+ *
+ * Store the sub phrase index to the new memory chunk.
+ *
+ */
+ bool store(MemoryChunk * new_chunk,
+ table_offset_t offset, table_offset_t & end);
+
+ /**
+ * SubPhraseIndex::diff:
+ * @oldone: the original content of sub phrase index.
+ * @logger: the delta information of user self-learning data.
+ * @returns: whether the diff operation is successful.
+ *
+ * Compare this sub phrase index with the original content of the system
+ * sub phrase index to generate the logger of difference.
+ *
+ * Note: Switch to logger format to reduce user space storage.
+ *
+ */
+ bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger);
+
+ /**
+ * SubPhraseIndex::merge:
+ * @logger: the logger of difference in user home directory.
+ * @returns: whether the merge operation is successful.
+ *
+ * Merge the user logger of difference with this sub phrase index.
+ *
+ */
+ bool merge(PhraseIndexLogger * logger);
+
+ /**
+ * SubPhraseIndex::get_range:
+ * @range: the token range.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the token range in this sub phrase index.
+ *
+ */
+ int get_range(/* out */ PhraseIndexRange & range);
+
+ /**
+ * SubPhraseIndex::get_phrase_index_total_freq:
+ * @returns: the total frequency of this sub phrase index.
+ *
+ * Get the total frequency of this sub phrase index.
+ *
+ * Note: maybe call it "Zero-gram".
+ *
+ */
+ guint32 get_phrase_index_total_freq();
+
+ /**
+ * SubPhraseIndex::add_unigram_frequency:
+ * @token: the phrase token.
+ * @delta: the delta value of the phrase token.
+ * @returns: the status of the add operation.
+ *
+ * Add delta value to the phrase of the token.
+ *
+ * Note: this method is a fast path to add delta value.
+ * Maybe use the get_phrase_item method instead in future.
+ *
+ */
+ int add_unigram_frequency(phrase_token_t token, guint32 delta);
+
+ /**
+ * SubPhraseIndex::get_phrase_item:
+ * @token: the phrase token.
+ * @item: the phrase item of the token.
+ * @returns: the status of the get operation.
+ *
+ * Get the phrase item from this sub phrase index.
+ *
+ * Note: the get_phrase_item method can't change the size of the phrase item,
+ * but it can increment the frequency of a specific pronunciation,
+ * or modify the content in place without growing it.
+ *
+ */
+ int get_phrase_item(phrase_token_t token, PhraseItem & item);
+
+ /**
+ * SubPhraseIndex::add_phrase_item:
+ * @token: the phrase token.
+ * @item: the phrase item of the token.
+ * @returns: the status of the add operation.
+ *
+ * Add the phrase item to this sub phrase index.
+ *
+ */
+ int add_phrase_item(phrase_token_t token, PhraseItem * item);
+
+ /**
+ * SubPhraseIndex::remove_phrase_item:
+ * @token: the phrase token.
+ * @item: the removed phrase item of the token.
+ * @returns: the status of the remove operation.
+ *
+ * Remove the phrase item of the token.
+ *
+ * Note: this remove_phrase_item method will subtract the unigram
+ * frequency of the removed item from m_total_freq.
+ *
+ */
+ int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
+
+ /**
+ * SubPhraseIndex::mask_out:
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: whether the mask out operation is successful.
+ *
+ * Mask out the matched phrase items.
+ *
+ */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
+
+/**
+ * FacadePhraseIndex:
+ *
+ * The facade class of phrase index.
+ *
+ */
+class FacadePhraseIndex{
+private:
+ guint32 m_total_freq;
+ SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
+public:
+ /**
+ * FacadePhraseIndex::FacadePhraseIndex:
+ *
+ * The constructor of the FacadePhraseIndex.
+ *
+ */
+ FacadePhraseIndex(){
+ m_total_freq = 0;
+ memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
+ }
+
+ /**
+ * FacadePhraseIndex::~FacadePhraseIndex:
+ *
+ * The destructor of the FacadePhraseIndex.
+ *
+ */
+ ~FacadePhraseIndex(){
+ for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
+ if ( m_sub_phrase_indices[i] ){
+ delete m_sub_phrase_indices[i];
+ m_sub_phrase_indices[i] = NULL;
+ }
+ }
+ }
+
+ /**
+ * FacadePhraseIndex::load_text:
+ * @phrase_index: the index of sub phrase index to be loaded.
+ * @infile: the textual format file of the phrase table.
+ * @returns: whether the load operation is successful.
+ *
+ * Load one sub phrase index from the textual format file.
+ * Note: in the future, the sub phrase index will be loaded according to the configuration.
+ *
+ */
+ bool load_text(guint8 phrase_index, FILE * infile);
+
+ /**
+ * FacadePhraseIndex::load:
+ * @phrase_index: the index of sub phrase index to be loaded.
+ * @chunk: the memory chunk of sub phrase index to be loaded.
+ * @returns: whether the load operation is successful.
+ *
+ * Load one sub phrase index from the memory chunk.
+ *
+ */
+ bool load(guint8 phrase_index, MemoryChunk * chunk);
+
+ /**
+ * FacadePhraseIndex::store:
+ * @phrase_index: the index of sub phrase index to be stored.
+ * @new_chunk: the memory chunk of sub phrase index to be stored.
+ * @returns: whether the store operation is successful.
+ *
+ * Store one sub phrase index to the memory chunk.
+ *
+ */
+ bool store(guint8 phrase_index, MemoryChunk * new_chunk);
+
+ /**
+ * FacadePhraseIndex::unload:
+ * @phrase_index: the index of sub phrase index to be unloaded.
+ * @returns: whether the unload operation is successful.
+ *
+ * Unload one sub phrase index.
+ *
+ */
+ bool unload(guint8 phrase_index);
+
+
+ /**
+ * FacadePhraseIndex::diff:
+ * @phrase_index: the index of sub phrase index to be differed.
+ * @oldchunk: the original content of sub phrase index.
+ * @newlog: the delta information of user self-learning data.
+ * @returns: whether the diff operation is successful.
+ *
+ * Store user delta information in the logger format.
+ *
+ * Note: the ownership of oldchunk is transferred here.
+ *
+ */
+ bool diff(guint8 phrase_index, MemoryChunk * oldchunk,
+ MemoryChunk * newlog);
+
+ /**
+ * FacadePhraseIndex::merge:
+ * @phrase_index: the index of sub phrase index to be merged.
+ * @log: the logger of difference in user home directory.
+ * @returns: whether the merge operation is successful.
+ *
+ * Merge the user logger of difference with the sub phrase index.
+ *
+ * Note: the ownership of log is transferred here.
+ *
+ */
+ bool merge(guint8 phrase_index, MemoryChunk * log);
+
+ /**
+ * FacadePhraseIndex::merge_with_mask:
+ * @phrase_index: the index of sub phrase index to be merged.
+ * @log: the logger of difference in user home directory.
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: whether the merge operation is successful.
+ *
+ * Merge the user logger of difference with mask operation.
+ *
+ * Note: the ownership of log is transferred here.
+ *
+ */
+ bool merge_with_mask(guint8 phrase_index, MemoryChunk * log,
+ phrase_token_t mask, phrase_token_t value);
+
+ /**
+ * FacadePhraseIndex::compact:
+ * @returns: whether the compact operation is successful.
+ *
+ * Compact the memory usage of all sub phrase indices.
+ *
+ */
+ bool compact();
+
+ /**
+ * FacadePhraseIndex::mask_out:
+ * @phrase_index: the index of sub phrase index.
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: whether the mask out operation is successful.
+ *
+ * Mask out the matched phrase items.
+ *
+ * Note: should call compact() after the mask out operation.
+ *
+ */
+ bool mask_out(guint8 phrase_index,
+ phrase_token_t mask, phrase_token_t value);
+
+ /**
+ * FacadePhraseIndex::get_sub_phrase_range:
+ * @min_index: the minimal sub phrase index.
+ * @max_index: the maximal sub phrase index.
+ * @returns: the status of the get operation.
+ *
+ * Get the minimum and maximum of the sub phrase index.
+ *
+ */
+ int get_sub_phrase_range(guint8 & min_index, guint8 & max_index);
+
+ /**
+ * FacadePhraseIndex::get_range:
+ * @phrase_index: the index of sub phrase index.
+ * @range: the token range of the sub phrase index.
+ * @returns: the status of the get operation.
+ *
+ * Get the token range of the sub phrase index.
+ *
+ */
+ int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range);
+
+ /**
+ * FacadePhraseIndex::get_phrase_index_total_freq:
+ * @returns: the total freq of the facade phrase index.
+ *
+ * Get the total freq of the facade phrase index.
+ *
+ * Note: maybe call it "Zero-gram".
+ *
+ */
+ guint32 get_phrase_index_total_freq(){
+ return m_total_freq;
+ }
+
+ /**
+ * FacadePhraseIndex::add_unigram_frequency:
+ * @token: the phrase token.
+ * @delta: the delta value of the phrase token.
+ * @returns: the status of the add operation.
+ *
+ * Add delta value to the phrase of the token.
+ *
+ */
+ int add_unigram_frequency(phrase_token_t token, guint32 delta){
+ guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+ SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
+ if ( !sub_phrase )
+ return ERROR_NO_SUB_PHRASE_INDEX;
+ m_total_freq += delta;
+ return sub_phrase->add_unigram_frequency(token, delta);
+ }
+
+ /**
+ * FacadePhraseIndex::get_phrase_item:
+ * @token: the phrase token.
+ * @item: the phrase item of the token.
+ * @returns: the status of the get operation.
+ *
+ * Get the phrase item from the facade phrase index.
+ *
+ */
+ int get_phrase_item(phrase_token_t token, PhraseItem & item){
+ guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+ SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
+ if ( !sub_phrase )
+ return ERROR_NO_SUB_PHRASE_INDEX;
+ return sub_phrase->get_phrase_item(token, item);
+ }
+
+ /**
+ * FacadePhraseIndex::add_phrase_item:
+ * @token: the phrase token.
+ * @item: the phrase item of the token.
+ * @returns: the status of the add operation.
+ *
+ * Add the phrase item to the facade phrase index.
+ *
+ */
+ int add_phrase_item(phrase_token_t token, PhraseItem * item){
+ guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+ SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
+ if ( !sub_phrase ){
+ sub_phrase = new SubPhraseIndex;
+ }
+ m_total_freq += item->get_unigram_frequency();
+ return sub_phrase->add_phrase_item(token, item);
+ }
+
+ /**
+ * FacadePhraseIndex::remove_phrase_item:
+ * @token: the phrase token.
+ * @item: the removed phrase item of the token.
+ * @returns: the status of the remove operation.
+ *
+ * Remove the phrase item of the token.
+ *
+ */
+ int remove_phrase_item(phrase_token_t token, PhraseItem * & item){
+ guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+ SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
+ if ( !sub_phrase ){
+ return ERROR_NO_SUB_PHRASE_INDEX;
+ }
+ int result = sub_phrase->remove_phrase_item(token, item);
+ if ( result )
+ return result;
+ m_total_freq -= item->get_unigram_frequency();
+ return result;
+ }
+
+ /**
+ * FacadePhraseIndex::prepare_ranges:
+ * @ranges: the ranges to be prepared.
+ * @returns: whether the prepare operation is successful.
+ *
+ * Prepare the ranges.
+ *
+ */
+ bool prepare_ranges(PhraseIndexRanges ranges) {
+ /* assume memset(ranges, 0, sizeof(ranges)); */
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * & range = ranges[i];
+ assert(NULL == range);
+
+ SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
+ if (sub_phrase) {
+ range = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
+ }
+ }
+ return true;
+ }
+
+ /**
+ * FacadePhraseIndex::clear_ranges:
+ * @ranges: the ranges to be cleared.
+ * @returns: whether the clear operation is successful.
+ *
+ * Clear the ranges.
+ *
+ */
+ bool clear_ranges(PhraseIndexRanges ranges) {
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * range = ranges[i];
+ if (range) {
+ g_array_set_size(range, 0);
+ }
+ }
+ return true;
+ }
+
+ /**
+ * FacadePhraseIndex::destroy_ranges:
+ * @ranges: the ranges to be destroyed.
+ * @returns: whether the destroy operation is successful.
+ *
+ * Destroy the ranges.
+ *
+ */
+ bool destroy_ranges(PhraseIndexRanges ranges) {
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * & range = ranges[i];
+ if (range) {
+ g_array_free(range, TRUE);
+ range = NULL;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * FacadePhraseIndex::prepare_tokens:
+ * @tokens: the tokens to be prepared.
+ * @returns: whether the prepare operation is successful.
+ *
+ * Prepare the tokens.
+ *
+ */
+ bool prepare_tokens(PhraseTokens tokens) {
+ /* assume memset(tokens, 0, sizeof(tokens)); */
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * & token = tokens[i];
+ assert(NULL == token);
+
+ SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
+ if (sub_phrase) {
+ token = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ }
+ }
+ return true;
+ }
+
+ /**
+ * FacadePhraseIndex::clear_tokens:
+ * @tokens: the tokens to be cleared.
+ * @returns: whether the clear operation is successful.
+ *
+ * Clear the tokens.
+ *
+ */
+ bool clear_tokens(PhraseTokens tokens) {
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * token = tokens[i];
+ if (token) {
+ g_array_set_size(token, 0);
+ }
+ }
+ return true;
+ }
+
+ /**
+ * FacadePhraseIndex::destroy_tokens:
+ * @tokens: the tokens to be destroyed.
+ * @returns: whether the destroy operation is successful.
+ *
+ * Destroy the tokens.
+ *
+ */
+ bool destroy_tokens(PhraseTokens tokens) {
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * & token = tokens[i];
+ if (token) {
+ g_array_free(token, TRUE);
+ token = NULL;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * FacadePhraseIndex::create_sub_phrase:
+ * @index: the phrase index to be created.
+ * @returns: the result of the create operation.
+ *
+ * Create the sub phrase index.
+ *
+ */
+ int create_sub_phrase(guint8 index) {
+ SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
+ if (sub_phrase) {
+ return ERROR_ALREADY_EXISTS;
+ }
+
+ sub_phrase = new SubPhraseIndex;
+
+ return ERROR_OK;
+ }
+};
+
+PhraseIndexLogger * mask_out_phrase_index_logger
+(PhraseIndexLogger * oldlogger, phrase_token_t mask, phrase_token_t value);
+
+};
+
+#endif
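
A hedged sketch of the FacadePhraseIndex API declared above: load one sub phrase index from its textual table, look an item up by token, and bump its unigram count. The file name, library number, and control flow are placeholders, not taken from the patch.

    FacadePhraseIndex phrase_index;

    /* load sub phrase index #1 from a textual table (file name is a placeholder). */
    FILE * infile = fopen("gb_char.table", "r");
    if (infile) {
        phrase_index.load_text(1, infile);
        fclose(infile);
    }

    /* take the first token of library #1 and read/update its frequency. */
    PhraseIndexRange range;
    if (ERROR_OK == phrase_index.get_range(1, range)) {
        phrase_token_t token = range.m_range_begin;

        PhraseItem item;
        if (ERROR_OK == phrase_index.get_phrase_item(token, item)) {
            guint32 freq = item.get_unigram_frequency(); /* current unigram count */
            /* e.g. after the user commits this phrase once: */
            phrase_index.add_unigram_frequency(token, 1);
        }
    }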
diff --git a/src/storage/phrase_index_logger.h b/src/storage/phrase_index_logger.h
new file mode 100644
index 0000000..06f933e
--- /dev/null
+++ b/src/storage/phrase_index_logger.h
@@ -0,0 +1,305 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef PHRASE_LOGGER_H
+#define PHRASE_LOGGER_H
+
+#include <assert.h>
+#include "novel_types.h"
+#include "memory_chunk.h"
+
+/**
+ * File Format
+ * Logger Record type: add/remove/modify
+ *
+ * Modify Header: header/null token/len/old data chunk/new data chunk
+ *
+ * Add Record: add/token/len/data chunk
+ * Remove Record: remove/token/len/data chunk
+ * Modify Record: modify/token/old len/new len/old data chunk/new data chunk
+ *
+ */
+
+namespace pinyin{
+
+enum LOG_TYPE{
+ LOG_ADD_RECORD = 1,
+ LOG_REMOVE_RECORD,
+ LOG_MODIFY_RECORD,
+ LOG_MODIFY_HEADER
+};
+
+
+/**
+ * PhraseIndexLogger:
+ *
+ * The logger of phrase index changes.
+ *
+ */
+class PhraseIndexLogger{
+protected:
+ MemoryChunk * m_chunk;
+ size_t m_offset;
+ bool m_error;
+
+ void reset(){
+ if ( m_chunk ){
+ delete m_chunk;
+ m_chunk = NULL;
+ }
+ m_offset = 0;
+ m_error = false;
+ }
+public:
+ /**
+ * PhraseIndexLogger::PhraseIndexLogger:
+ *
+ * The constructor of the PhraseIndexLogger.
+ *
+ */
+ PhraseIndexLogger():m_offset(0), m_error(false){
+ m_chunk = new MemoryChunk;
+ }
+
+ /**
+ * PhraseIndexLogger::~PhraseIndexLogger:
+ *
+ * The destructor of the PhraseIndexLogger.
+ *
+ */
+ ~PhraseIndexLogger(){
+ reset();
+ }
+
+ /**
+ * PhraseIndexLogger::load:
+ * @chunk: the memory chunk of the logs.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the logs from the memory chunk.
+ *
+ */
+ bool load(MemoryChunk * chunk) {
+ reset();
+ m_chunk = chunk;
+ return true;
+ }
+
+ /**
+ * PhraseIndexLogger::store:
+ * @new_chunk: the new memory chunk to store the logs.
+ * @returns: whether the store operation is successful.
+ *
+ * Store the logs to the new memory chunk.
+ *
+ */
+ bool store(MemoryChunk * new_chunk){
+ new_chunk->set_content(0, m_chunk->begin(), m_chunk->size());
+ return true;
+ }
+
+ /**
+ * PhraseIndexLogger::has_next_record:
+ * @returns: whether this logger has next record.
+ *
+ * Whether this logger has next record.
+ *
+ */
+ bool has_next_record(){
+ if (m_error)
+ return false;
+
+ return m_offset < m_chunk->size();
+ }
+
+ /**
+ * PhraseIndexLogger::rewind:
+ * @returns: whether the rewind operation is successful.
+ *
+ * Rewind this logger to the begin of logs.
+ *
+ */
+ bool rewind(){
+ m_offset = 0;
+ return true;
+ }
+
+ /**
+ * PhraseIndexLogger::next_record:
+ * @log_type: the type of this log record.
+ * @token: the token of this log record.
+ * @oldone: the original content of the phrase item.
+ * @newone: the new content of the phrase item.
+ *
+ * Read the next log record.
+ *
+ * Precondition: has_next_record() has returned true.
+ *
+ */
+ bool next_record(LOG_TYPE & log_type, phrase_token_t & token,
+ MemoryChunk * oldone, MemoryChunk * newone){
+ size_t offset = m_offset;
+ m_chunk->get_content(offset, &log_type, sizeof(LOG_TYPE));
+ offset += sizeof(LOG_TYPE);
+ m_chunk->get_content(offset, &token, sizeof(phrase_token_t));
+ offset += sizeof(phrase_token_t);
+
+ oldone->set_size(0); newone->set_size(0);
+
+ switch(log_type){
+ case LOG_ADD_RECORD:{
+ guint16 len = 0;
+ m_chunk->get_content(offset, &len, sizeof(guint16));
+ offset += sizeof(guint16);
+ newone->set_content(0, ((char *)m_chunk->begin()) + offset, len);
+ offset += len;
+ break;
+ }
+ case LOG_REMOVE_RECORD:{
+ guint16 len = 0;
+ m_chunk->get_content(offset, &len, sizeof(guint16));
+ offset += sizeof(guint16);
+ oldone->set_content(0, ((char *)m_chunk->begin()) + offset, len);
+ offset += len;
+ break;
+ }
+ case LOG_MODIFY_RECORD:{
+ guint16 oldlen = 0, newlen = 0;
+ m_chunk->get_content(offset, &oldlen, sizeof(guint16));
+ offset += sizeof(guint16);
+ m_chunk->get_content(offset, &newlen, sizeof(guint16));
+ offset += sizeof(guint16);
+ oldone->set_content(0, ((char *)m_chunk->begin()) + offset,
+ oldlen);
+ offset += oldlen;
+ newone->set_content(0, ((char *)m_chunk->begin()) + offset, newlen);
+ offset += newlen;
+ break;
+ }
+ case LOG_MODIFY_HEADER:{
+ assert(token == null_token);
+ guint16 len = 0;
+ m_chunk->get_content(offset, &len, sizeof(guint16));
+ offset += sizeof(guint16);
+ oldone->set_content(0, ((char *)m_chunk->begin()) + offset,
+ len);
+ offset += len;
+ newone->set_content(0, ((char *)m_chunk->begin()) + offset,
+ len);
+ offset += len;
+ break;
+ }
+ default:
+ m_error = true;
+ return false;
+ }
+
+ m_offset = offset;
+ return true;
+ }
+
+ /**
+ * PhraseIndexLogger::append_record:
+ * @log_type: the type of this log record.
+ * @token: the token of this log record.
+ * @oldone: the original content of the phrase item.
+ * @newone: the new content of the phrase item.
+ *
+ * Append one log record to the logger.
+ *
+ */
+ bool append_record(LOG_TYPE log_type, phrase_token_t token,
+ MemoryChunk * oldone, MemoryChunk * newone){
+
+ MemoryChunk chunk;
+ size_t offset = 0;
+ chunk.set_content(offset, &log_type, sizeof(LOG_TYPE));
+ offset += sizeof(LOG_TYPE);
+ chunk.set_content(offset, &token, sizeof(phrase_token_t));
+ offset += sizeof(phrase_token_t);
+
+ switch(log_type){
+ case LOG_ADD_RECORD:{
+ assert( NULL == oldone );
+ assert( NULL != newone );
+ /* use newone chunk */
+ guint16 len = newone->size();
+ chunk.set_content(offset, &len, sizeof(guint16));
+ offset += sizeof(guint16);
+ chunk.set_content(offset, newone->begin(), newone->size());
+ offset += newone->size();
+ break;
+ }
+ case LOG_REMOVE_RECORD:{
+ assert(NULL != oldone);
+ assert(NULL == newone);
+ /* use oldone chunk */
+ guint16 len = oldone->size();
+ chunk.set_content(offset, &len, sizeof(guint16));
+ offset += sizeof(guint16);
+ chunk.set_content(offset, oldone->begin(), oldone->size());
+ offset += oldone->size();
+ break;
+ }
+ case LOG_MODIFY_RECORD:{
+ assert(NULL != oldone);
+ assert(NULL != newone);
+ guint16 oldlen = oldone->size();
+ guint16 newlen = newone->size();
+ chunk.set_content(offset, &oldlen, sizeof(guint16));
+ offset += sizeof(guint16);
+ chunk.set_content(offset, &newlen, sizeof(guint16));
+ offset += sizeof(guint16);
+ chunk.set_content(offset, oldone->begin(), oldone->size());
+ offset += oldlen;
+ chunk.set_content(offset, newone->begin(), newone->size());
+ offset += newlen;
+ break;
+ }
+ case LOG_MODIFY_HEADER:{
+ assert(NULL != oldone);
+ assert(NULL != newone);
+ assert(null_token == token);
+ guint16 oldlen = oldone->size();
+ guint16 newlen = newone->size();
+ assert(oldlen == newlen);
+ chunk.set_content(offset, &oldlen, sizeof(guint16));
+ offset += sizeof(guint16);
+ chunk.set_content(offset, oldone->begin(), oldone->size());
+ offset += oldlen;
+ chunk.set_content(offset, newone->begin(), newone->size());
+ offset += newlen;
+ break;
+ }
+ default:
+ assert(false);
+ }
+
+ /* store log record. */
+ m_chunk->set_content(m_chunk->size(), chunk.begin(), chunk.size());
+ return true;
+ }
+};
+
+};
+
+#endif
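
A short, hedged example of the PhraseIndexLogger API above: append one add-record and then replay the log. The token and the 4-byte payload are illustrative; real records carry a serialized PhraseItem chunk.

    PhraseIndexLogger logger;

    /* append: LOG_ADD_RECORD takes no old chunk, only the new content. */
    MemoryChunk newone;
    guint32 payload = 100;
    newone.set_content(0, &payload, sizeof(guint32));
    logger.append_record(LOG_ADD_RECORD, /* token */ 5, NULL, &newone);

    /* replay: iterate all records from the beginning. */
    logger.rewind();
    LOG_TYPE log_type; phrase_token_t token;
    MemoryChunk oldchunk, newchunk;
    while (logger.has_next_record()) {
        if (!logger.next_record(log_type, token, &oldchunk, &newchunk))
            break;
        /* for LOG_ADD_RECORD, newchunk holds the added bytes and oldchunk is empty. */
    }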
diff --git a/src/storage/phrase_large_table2.cpp b/src/storage/phrase_large_table2.cpp
new file mode 100644
index 0000000..f7d8ae2
--- /dev/null
+++ b/src/storage/phrase_large_table2.cpp
@@ -0,0 +1,809 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <assert.h>
+#include <string.h>
+#include "phrase_large_table2.h"
+
+
+/* class definition */
+
+namespace pinyin{
+
+class PhraseLengthIndexLevel2{
+protected:
+ GArray * m_phrase_array_indexes;
+public:
+ PhraseLengthIndexLevel2();
+ ~PhraseLengthIndexLevel2();
+
+ /* load/store method */
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
+
+ /* search method */
+ int search(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* out */ PhraseTokens tokens) const;
+
+ /* add_index/remove_index method */
+ int add_index(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* in */ phrase_token_t token);
+ int remove_index(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* in */ phrase_token_t token);
+
+ /* get length method */
+ int get_length() const;
+
+ /* mask out method */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
+
+
+template<size_t phrase_length>
+struct PhraseIndexItem2{
+ phrase_token_t m_token;
+ ucs4_t m_phrase[phrase_length];
+public:
+ PhraseIndexItem2<phrase_length>(const ucs4_t phrase[], phrase_token_t token){
+ memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length);
+ m_token = token;
+ }
+};
+
+
+template<size_t phrase_length>
+class PhraseArrayIndexLevel2{
+protected:
+ typedef PhraseIndexItem2<phrase_length> IndexItem;
+
+protected:
+ MemoryChunk m_chunk;
+public:
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
+
+ /* search method */
+ int search(/* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const;
+
+ /* add_index/remove_index method */
+ int add_index(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token);
+ int remove_index(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token);
+
+ /* get length method */
+ int get_length() const;
+
+ /* mask out method */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
+
+};
+
+using namespace pinyin;
+
+/* class implementation */
+
+template<size_t phrase_length>
+static int phrase_compare2(const PhraseIndexItem2<phrase_length> &lhs,
+ const PhraseIndexItem2<phrase_length> &rhs){
+ ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase;
+ ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase;
+
+ return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length);
+}
+
+template<size_t phrase_length>
+static bool phrase_less_than2(const PhraseIndexItem2<phrase_length> & lhs,
+ const PhraseIndexItem2<phrase_length> & rhs){
+ return 0 > phrase_compare2(lhs, rhs);
+}
+
+PhraseBitmapIndexLevel2::PhraseBitmapIndexLevel2(){
+ memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes));
+}
+
+void PhraseBitmapIndexLevel2::reset(){
+ for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){
+ PhraseLengthIndexLevel2 * & length_array =
+ m_phrase_length_indexes[i];
+ if ( length_array )
+ delete length_array;
+ length_array = NULL;
+ }
+}
+
+
+/* search method */
+
+int PhraseBitmapIndexLevel2::search(int phrase_length,
+ /* in */ const ucs4_t phrase[],
+ /* out */ PhraseTokens tokens) const {
+ assert(phrase_length > 0);
+
+ int result = SEARCH_NONE;
+ /* use the high 8 bits of the lower 16 bits as the bitmap index,
+ * as the higher 16 bits are mostly zero.
+ */
+ guint8 first_key = (phrase[0] & 0xFF00) >> 8;
+
+ PhraseLengthIndexLevel2 * phrase_array = m_phrase_length_indexes[first_key];
+ if ( phrase_array )
+ return phrase_array->search(phrase_length, phrase, tokens);
+ return result;
+}
+
+PhraseLengthIndexLevel2::PhraseLengthIndexLevel2(){
+ m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
+}
+
+PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){
+#define CASE(len) case len: \
+ { \
+ PhraseArrayIndexLevel2<len> * & array = g_array_index \
+ (m_phrase_array_indexes, \
+ PhraseArrayIndexLevel2<len> *, len - 1); \
+ if ( array ) { \
+ delete array; \
+ array = NULL; \
+ } \
+ break; \
+ }
+
+ for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i){
+ switch (i){
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ assert(false);
+ }
+ }
+ g_array_free(m_phrase_array_indexes, TRUE);
+#undef CASE
+}
+
+int PhraseLengthIndexLevel2::search(int phrase_length,
+ /* in */ const ucs4_t phrase[],
+ /* out */ PhraseTokens tokens) const {
+ int result = SEARCH_NONE;
+ if(m_phrase_array_indexes->len < phrase_length)
+ return result;
+ if (m_phrase_array_indexes->len > phrase_length)
+ result |= SEARCH_CONTINUED;
+
+#define CASE(len) case len: \
+ { \
+ PhraseArrayIndexLevel2<len> * array = g_array_index \
+ (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
+ if ( !array ) \
+ return result; \
+ result |= array->search(phrase, tokens); \
+ return result; \
+ }
+
+ switch ( phrase_length ){
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ assert(false);
+ }
+#undef CASE
+}
+
+template<size_t phrase_length>
+int PhraseArrayIndexLevel2<phrase_length>::search
+(/* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const {
+ int result = SEARCH_NONE;
+
+ IndexItem * chunk_begin = NULL, * chunk_end = NULL;
+ chunk_begin = (IndexItem *) m_chunk.begin();
+ chunk_end = (IndexItem *) m_chunk.end();
+
+ /* do the search */
+ IndexItem search_elem(phrase, -1);
+ std_lite::pair<IndexItem *, IndexItem *> range;
+ range = std_lite::equal_range
+ (chunk_begin, chunk_end, search_elem,
+ phrase_less_than2<phrase_length>);
+
+ const IndexItem * const begin = range.first;
+ const IndexItem * const end = range.second;
+ if (begin == end)
+ return result;
+
+ const IndexItem * iter = NULL;
+ GArray * array = NULL;
+
+ for (iter = begin; iter != end; ++iter) {
+ phrase_token_t token = iter->m_token;
+
+ /* filter out disabled sub phrase indices. */
+ array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)];
+ if (NULL == array)
+ continue;
+
+ result |= SEARCH_OK;
+
+ g_array_append_val(array, token);
+ }
+
+ return result;
+}
+
+
+/* add/remove index method */
+
+int PhraseBitmapIndexLevel2::add_index(int phrase_length,
+ /* in */ const ucs4_t phrase[],
+ /* in */ phrase_token_t token){
+ guint8 first_key = (phrase[0] & 0xFF00) >> 8;
+
+ PhraseLengthIndexLevel2 * & length_array =
+ m_phrase_length_indexes[first_key];
+
+ if ( !length_array ){
+ length_array = new PhraseLengthIndexLevel2();
+ }
+ return length_array->add_index(phrase_length, phrase, token);
+}
+
+int PhraseBitmapIndexLevel2::remove_index(int phrase_length,
+ /* in */ const ucs4_t phrase[],
+ /* in */ phrase_token_t token){
+ guint8 first_key = (phrase[0] & 0xFF00) >> 8;
+
+ PhraseLengthIndexLevel2 * & length_array =
+ m_phrase_length_indexes[first_key];
+
+ if (NULL == length_array)
+ return ERROR_REMOVE_ITEM_DONOT_EXISTS;
+
+ int retval = length_array->remove_index(phrase_length, phrase, token);
+
+ /* remove empty array. */
+ if (0 == length_array->get_length()) {
+ delete length_array;
+ length_array = NULL;
+ }
+
+ return retval;
+}
+
+int PhraseLengthIndexLevel2::add_index(int phrase_length,
+ /* in */ const ucs4_t phrase[],
+ /* in */ phrase_token_t token) {
+ if (phrase_length >= MAX_PHRASE_LENGTH)
+ return ERROR_PHRASE_TOO_LONG;
+
+ if (m_phrase_array_indexes->len < phrase_length)
+ g_array_set_size(m_phrase_array_indexes, phrase_length);
+
+#define CASE(len) case len: \
+ { \
+ PhraseArrayIndexLevel2<len> * & array = g_array_index \
+ (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
+ if ( !array ) \
+ array = new PhraseArrayIndexLevel2<len>; \
+ return array->add_index(phrase, token); \
+ }
+
+ switch(phrase_length){
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ assert(false);
+ }
+
+#undef CASE
+}
+
+int PhraseLengthIndexLevel2::remove_index(int phrase_length,
+ /* in */ const ucs4_t phrase[],
+ /* in */ phrase_token_t token) {
+ if (phrase_length >= MAX_PHRASE_LENGTH)
+ return ERROR_PHRASE_TOO_LONG;
+
+ if (m_phrase_array_indexes->len < phrase_length)
+ return ERROR_REMOVE_ITEM_DONOT_EXISTS;
+
+#define CASE(len) case len: \
+ { \
+ PhraseArrayIndexLevel2<len> * & array = g_array_index \
+ (m_phrase_array_indexes, \
+ PhraseArrayIndexLevel2<len> *, len - 1); \
+ if (NULL == array) \
+ return ERROR_REMOVE_ITEM_DONOT_EXISTS; \
+ int retval = array->remove_index(phrase, token); \
+ \
+ /* remove empty array. */ \
+ if (0 == array->get_length()) { \
+ delete array; \
+ array = NULL; \
+ \
+ /* shrink self array. */ \
+ g_array_set_size(m_phrase_array_indexes, \
+ get_length()); \
+ } \
+ return retval; \
+ }
+
+ switch(phrase_length){
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ assert(false);
+ }
+#undef CASE
+}
+
+template<size_t phrase_length>
+int PhraseArrayIndexLevel2<phrase_length>::add_index
+(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token){
+ IndexItem * begin, * end;
+
+ IndexItem add_elem(phrase, token);
+ begin = (IndexItem *) m_chunk.begin();
+ end = (IndexItem *) m_chunk.end();
+
+ std_lite::pair<IndexItem *, IndexItem *> range;
+ range = std_lite::equal_range
+ (begin, end, add_elem, phrase_less_than2<phrase_length>);
+
+ IndexItem * cur_elem;
+ for (cur_elem = range.first;
+ cur_elem != range.second; ++cur_elem) {
+ if (cur_elem->m_token == token)
+ return ERROR_INSERT_ITEM_EXISTS;
+ if (cur_elem->m_token > token)
+ break;
+ }
+
+ int offset = (cur_elem - begin) * sizeof(IndexItem);
+ m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
+ return ERROR_OK;
+}
+
+template<size_t phrase_length>
+int PhraseArrayIndexLevel2<phrase_length>::remove_index
+(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token) {
+ IndexItem * begin, * end;
+
+ IndexItem remove_elem(phrase, token);
+ begin = (IndexItem *) m_chunk.begin();
+ end = (IndexItem *) m_chunk.end();
+
+ std_lite::pair<IndexItem *, IndexItem *> range;
+ range = std_lite::equal_range
+ (begin, end, remove_elem, phrase_less_than2<phrase_length>);
+
+ IndexItem * cur_elem;
+ for (cur_elem = range.first;
+ cur_elem != range.second; ++cur_elem) {
+ if (cur_elem->m_token == token)
+ break;
+ }
+
+ if (cur_elem == range.second)
+ return ERROR_REMOVE_ITEM_DONOT_EXISTS;
+
+ int offset = (cur_elem - begin) * sizeof(IndexItem);
+ m_chunk.remove_content(offset, sizeof(IndexItem));
+ return ERROR_OK;
+}
+
+
+/* load text method */
+
+bool PhraseLargeTable2::load_text(FILE * infile){
+ char pinyin[256];
+ char phrase[256];
+ phrase_token_t token;
+ size_t freq;
+
+ while (!feof(infile)) {
+ int num = fscanf(infile, "%s %s %u %ld",
+ pinyin, phrase, &token, &freq);
+
+ if (4 != num)
+ continue;
+
+ if (feof(infile))
+ break;
+
+ glong phrase_len = g_utf8_strlen(phrase, -1);
+ ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+ add_index(phrase_len, new_phrase, token);
+
+ g_free(new_phrase);
+ }
+ return true;
+}
+
+
+/* load/store method */
+
+bool PhraseBitmapIndexLevel2::load(MemoryChunk * chunk,
+ table_offset_t offset,
+ table_offset_t end){
+ reset();
+ char * buf_begin = (char *) chunk->begin();
+ table_offset_t phrase_begin, phrase_end;
+ table_offset_t * index = (table_offset_t *) (buf_begin + offset);
+ phrase_end = *index;
+
+ for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
+ phrase_begin = phrase_end;
+ index++;
+ phrase_end = *index;
+ if ( phrase_begin == phrase_end ) //null pointer
+ continue;
+
+ /* after reset() all phrases are null pointer. */
+ PhraseLengthIndexLevel2 * phrases = new PhraseLengthIndexLevel2;
+ m_phrase_length_indexes[i] = phrases;
+
+ phrases->load(chunk, phrase_begin, phrase_end - 1);
+ assert( phrase_end <= end );
+ assert( *(buf_begin + phrase_end - 1) == c_separate);
+ }
+ offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
+ assert( c_separate == *(buf_begin + offset) );
+ return true;
+}
+
+bool PhraseBitmapIndexLevel2::store(MemoryChunk * new_chunk,
+ table_offset_t offset,
+ table_offset_t & end){
+ table_offset_t phrase_end;
+ table_offset_t index = offset;
+ offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t);
+ //add '#'
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset +=sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
+ PhraseLengthIndexLevel2 * phrases = m_phrase_length_indexes[i];
+ if ( !phrases ) { //null pointer
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ continue;
+ }
+ phrases->store(new_chunk, offset, phrase_end); //has a end '#'
+ offset = phrase_end;
+ //add '#'
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ }
+ end = offset;
+ return true;
+}
+
+bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk,
+ table_offset_t offset,
+ table_offset_t end) {
+ char * buf_begin = (char *) chunk->begin();
+ guint32 nindex = *((guint32 *)(buf_begin + offset));
+ table_offset_t * index = (table_offset_t *)
+ (buf_begin + offset + sizeof(guint32));
+
+ table_offset_t phrase_begin, phrase_end = *index;
+ g_array_set_size(m_phrase_array_indexes, 0);
+ for (size_t i = 1; i <= nindex; ++i) {
+ phrase_begin = phrase_end;
+ index++;
+ phrase_end = *index;
+ if ( phrase_begin == phrase_end ){
+ void * null = NULL;
+ g_array_append_val(m_phrase_array_indexes, null);
+ continue;
+ }
+
+#define CASE(len) case len: \
+ { \
+ PhraseArrayIndexLevel2<len> * phrase = \
+ new PhraseArrayIndexLevel2<len>; \
+ phrase->load(chunk, phrase_begin, phrase_end - 1); \
+ assert( *(buf_begin + phrase_end - 1) == c_separate ); \
+ assert( phrase_end <= end ); \
+ g_array_append_val(m_phrase_array_indexes, phrase); \
+ break; \
+ }
+ switch ( i ){
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ assert(false);
+ }
+#undef CASE
+ }
+ offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
+ assert ( c_separate == * (buf_begin + offset) );
+ return true;
+}
+
+bool PhraseLengthIndexLevel2::store(MemoryChunk * new_chunk,
+ table_offset_t offset,
+ table_offset_t & end) {
+ guint32 nindex = m_phrase_array_indexes->len;
+ new_chunk->set_content(offset, &nindex, sizeof(guint32));
+ table_offset_t index = offset + sizeof(guint32);
+
+ offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+
+ table_offset_t phrase_end;
+ for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) {
+#define CASE(len) case len: \
+ { \
+ PhraseArrayIndexLevel2<len> * phrase = g_array_index \
+ (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \
+ if ( !phrase ){ \
+ new_chunk->set_content \
+ (index, &offset, sizeof(table_offset_t)); \
+ index += sizeof(table_offset_t); \
+ continue; \
+ } \
+ phrase->store(new_chunk, offset, phrase_end); \
+ offset = phrase_end; \
+ break; \
+ }
+ switch ( i ){
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ assert(false);
+ }
+ //add '#'
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+
+#undef CASE
+ }
+ end = offset;
+ return true;
+}
+
+template<size_t phrase_length>
+bool PhraseArrayIndexLevel2<phrase_length>::
+load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
+ char * buf_begin = (char *) chunk->begin();
+ m_chunk.set_chunk(buf_begin + offset, end - offset, NULL);
+ return true;
+}
+
+template<size_t phrase_length>
+bool PhraseArrayIndexLevel2<phrase_length>::
+store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
+ new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
+ end = offset + m_chunk.size();
+ return true;
+}
+
+
+/* get length method */
+
+int PhraseLengthIndexLevel2::get_length() const {
+ int length = m_phrase_array_indexes->len;
+
+ /* trim trailing zero. */
+ for (int i = length - 1; i >= 0; --i) {
+ void * array = g_array_index(m_phrase_array_indexes, void *, i);
+
+ if (NULL != array)
+ break;
+
+ --length;
+ }
+
+ return length;
+}
+
+template<size_t phrase_length>
+int PhraseArrayIndexLevel2<phrase_length>::get_length() const {
+ IndexItem * chunk_begin = NULL, * chunk_end = NULL;
+ chunk_begin = (IndexItem *) m_chunk.begin();
+ chunk_end = (IndexItem *) m_chunk.end();
+
+ return chunk_end - chunk_begin;
+}
+
+
+/* mask out method */
+
+bool PhraseBitmapIndexLevel2::mask_out(phrase_token_t mask,
+ phrase_token_t value){
+ for (size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) {
+ PhraseLengthIndexLevel2 * & length_array =
+ m_phrase_length_indexes[i];
+
+ if (NULL == length_array)
+ continue;
+
+ length_array->mask_out(mask, value);
+
+ if (0 == length_array->get_length()) {
+ delete length_array;
+ length_array = NULL;
+ }
+ }
+
+ return true;
+}
+
+bool PhraseLengthIndexLevel2::mask_out(phrase_token_t mask,
+ phrase_token_t value){
+#define CASE(len) case len: \
+ { \
+ PhraseArrayIndexLevel2<len> * & array = g_array_index \
+ (m_phrase_array_indexes, \
+ PhraseArrayIndexLevel2<len> *, len - 1); \
+ \
+ if (NULL == array) \
+ continue; \
+ \
+ array->mask_out(mask, value); \
+ \
+ if (0 == array->get_length()) { \
+ delete array; \
+ array = NULL; \
+ } \
+ break; \
+ }
+
+ for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) {
+ switch (i) {
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ default:
+ assert(false);
+ }
+ }
+ /* shrink self array. */
+ g_array_set_size(m_phrase_array_indexes, get_length());
+#undef CASE
+ return true;
+}
+
+template<size_t phrase_length>
+bool PhraseArrayIndexLevel2<phrase_length>::mask_out
+(phrase_token_t mask, phrase_token_t value) {
+ IndexItem * begin = NULL, * end = NULL;
+ begin = (IndexItem *) m_chunk.begin();
+ end = (IndexItem *) m_chunk.end();
+
+ for (IndexItem * cur = begin; cur != end; ++cur) {
+ if ((cur->m_token & mask) != value)
+ continue;
+
+ int offset = (cur - begin) * sizeof(IndexItem);
+ m_chunk.remove_content(offset, sizeof(IndexItem));
+
+ /* update chunk end. */
+ end = (IndexItem *) m_chunk.end();
+ --cur;
+ }
+
+ return true;
+}
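
For reference, the first-level dispatch above keys on bits 8-15 of the first character, which spreads common CJK code points (whose upper 16 bits are zero) across the 256 bitmap slots. A tiny illustration with U+4E2D:

    /* illustration only: how the first-level bitmap key is derived. */
    ucs4_t ch = 0x4E2D;                     /* U+4E2D */
    guint8 first_key = (ch & 0xFF00) >> 8;  /* == 0x4E, selects one of 256 slots */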
diff --git a/src/storage/phrase_large_table2.h b/src/storage/phrase_large_table2.h
new file mode 100644
index 0000000..cf6807c
--- /dev/null
+++ b/src/storage/phrase_large_table2.h
@@ -0,0 +1,157 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef PHRASE_LARGE_TABLE2_H
+#define PHRASE_LARGE_TABLE2_H
+
+#include <stdio.h>
+#include "novel_types.h"
+#include "memory_chunk.h"
+
+namespace pinyin{
+
+const size_t PHRASE_NUMBER_OF_BITMAP_INDEX = 1<<(sizeof(ucs4_t) / 4 * 8);
+
+class PhraseLengthIndexLevel2;
+
+class PhraseBitmapIndexLevel2{
+protected:
+ PhraseLengthIndexLevel2 * m_phrase_length_indexes[PHRASE_NUMBER_OF_BITMAP_INDEX];
+ /* use the third byte of ucs4_t for class PhraseLengthIndexLevel2. */
+ void reset();
+public:
+ PhraseBitmapIndexLevel2();
+ ~PhraseBitmapIndexLevel2(){
+ reset();
+ }
+
+ /* load/store method */
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end);
+
+ /* search method */
+ int search(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* out */ PhraseTokens tokens) const;
+
+ /* add_index/remove_index method */
+ int add_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token);
+
+ int remove_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token);
+
+ /* mask out method */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
+
+
+class PhraseLargeTable2{
+protected:
+ PhraseBitmapIndexLevel2 m_bitmap_table;
+ MemoryChunk * m_chunk;
+
+ void reset(){
+ if ( m_chunk ){
+ delete m_chunk;
+ m_chunk = NULL;
+ }
+ }
+public:
+ PhraseLargeTable2(){
+ m_chunk = NULL;
+ }
+
+ ~PhraseLargeTable2(){
+ reset();
+ }
+
+ /* load/store method */
+ bool load(MemoryChunk * chunk){
+ reset();
+ m_chunk = chunk;
+ return m_bitmap_table.load(chunk, 0, chunk->size());
+ }
+
+ bool store(MemoryChunk * new_chunk){
+ table_offset_t end;
+ return m_bitmap_table.store(new_chunk, 0, end);
+ }
+
+ bool load_text(FILE * file);
+
+ /* search method */
+ int search(int phrase_length, /* in */ const ucs4_t phrase[],
+ /* out */ PhraseTokens tokens) const {
+ return m_bitmap_table.search(phrase_length, phrase, tokens);
+ }
+
+ /* add_index/remove_index method */
+ int add_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token) {
+ return m_bitmap_table.add_index(phrase_length, phrase, token);
+ }
+
+ int remove_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token) {
+ return m_bitmap_table.remove_index(phrase_length, phrase, token);
+ }
+
+ /* mask out method */
+ bool mask_out(phrase_token_t mask, phrase_token_t value) {
+ return m_bitmap_table.mask_out(mask, value);
+ }
+};
+
+
+static inline int reduce_tokens(const PhraseTokens tokens,
+ TokenVector tokenarray) {
+ int num = 0;
+ g_array_set_size(tokenarray, 0);
+
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * array = tokens[i];
+ if (NULL == array)
+ continue;
+
+ num += array->len;
+
+ g_array_append_vals(tokenarray, array->data, array->len);
+ }
+
+ /* the following line will be removed in the future once the code is verified. */
+ assert(0 <= num && num <= 4);
+
+ return num;
+}
+
+/* for compatibility. */
+static inline int get_first_token(const PhraseTokens tokens,
+ /* out */ phrase_token_t & token){
+ token = null_token;
+
+ TokenVector tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ int num = reduce_tokens(tokens, tokenarray);
+ if (num)
+ token = g_array_index(tokenarray, phrase_token_t, 0);
+ g_array_free(tokenarray, TRUE);
+
+ return num;
+}
+
+};
+
+#endif
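
A hedged end-to-end lookup sketch combining PhraseLargeTable2 with the FacadePhraseIndex helpers from phrase_index.h; both containers are assumed to be populated elsewhere, and the UTF-8 literal is illustrative.

    /* assumes #include "phrase_large_table2.h" and "phrase_index.h" */
    FacadePhraseIndex phrase_index;   /* assumed loaded elsewhere */
    PhraseLargeTable2 phrase_table;   /* assumed loaded elsewhere */

    glong len = 0;
    ucs4_t * phrase = g_utf8_to_ucs4("中国", -1, NULL, &len, NULL);

    PhraseTokens tokens;
    memset(tokens, 0, sizeof(tokens));
    phrase_index.prepare_tokens(tokens);   /* arrays only for loaded libraries */

    phrase_token_t token = null_token;
    int retval = phrase_table.search(len, phrase, tokens);
    if (retval & SEARCH_OK)
        get_first_token(tokens, token);    /* compatibility helper declared above */

    phrase_index.destroy_tokens(tokens);
    g_free(phrase);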
diff --git a/src/storage/pinyin_custom2.h b/src/storage/pinyin_custom2.h
new file mode 100644
index 0000000..4685a07
--- /dev/null
+++ b/src/storage/pinyin_custom2.h
@@ -0,0 +1,111 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef PINYIN_CUSTOM2_H
+#define PINYIN_CUSTOM2_H
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+/**
+ * PinyinTableFlag:
+ */
+enum PinyinTableFlag{
+ IS_CHEWING = 1U << 1,
+ IS_PINYIN = 1U << 2,
+ PINYIN_INCOMPLETE = 1U << 3,
+ CHEWING_INCOMPLETE = 1U << 4,
+ USE_TONE = 1U << 5,
+ USE_DIVIDED_TABLE = 1U << 6,
+ USE_RESPLIT_TABLE = 1U << 7,
+ DYNAMIC_ADJUST = 1U << 8
+};
+
+/**
+ * PinyinAmbiguity2:
+ *
+ * The enums of pinyin ambiguities.
+ *
+ */
+enum PinyinAmbiguity2{
+ PINYIN_AMB_C_CH = 1U << 9,
+ PINYIN_AMB_S_SH = 1U << 10,
+ PINYIN_AMB_Z_ZH = 1U << 11,
+ PINYIN_AMB_F_H = 1U << 12,
+ PINYIN_AMB_G_K = 1U << 13,
+ PINYIN_AMB_L_N = 1U << 14,
+ PINYIN_AMB_L_R = 1U << 15,
+ PINYIN_AMB_AN_ANG = 1U << 16,
+ PINYIN_AMB_EN_ENG = 1U << 17,
+ PINYIN_AMB_IN_ING = 1U << 18,
+ PINYIN_AMB_ALL = 0x3FFU << 9
+};
+
+/**
+ * PinyinCorrection2:
+ *
+ * The enums of pinyin corrections.
+ *
+ */
+
+enum PinyinCorrection2{
+ PINYIN_CORRECT_GN_NG = 1U << 21,
+ PINYIN_CORRECT_MG_NG = 1U << 22,
+ PINYIN_CORRECT_IOU_IU = 1U << 23,
+ PINYIN_CORRECT_UEI_UI = 1U << 24,
+ PINYIN_CORRECT_UEN_UN = 1U << 25,
+ PINYIN_CORRECT_UE_VE = 1U << 26,
+ PINYIN_CORRECT_V_U = 1U << 27,
+ PINYIN_CORRECT_ON_ONG = 1U << 28,
+ PINYIN_CORRECT_ALL = 0xFFU << 21
+};
+
+/**
+ * @brief enums of Double Pinyin Schemes.
+ */
+enum DoublePinyinScheme
+{
+ DOUBLE_PINYIN_ZRM = 1,
+ DOUBLE_PINYIN_MS = 2,
+ DOUBLE_PINYIN_ZIGUANG = 3,
+ DOUBLE_PINYIN_ABC = 4,
+ DOUBLE_PINYIN_PYJJ = 6,
+ DOUBLE_PINYIN_XHE = 7,
+ DOUBLE_PINYIN_CUSTOMIZED = 30, /* for user's keyboard */
+ DOUBLE_PINYIN_DEFAULT = DOUBLE_PINYIN_MS
+};
+
+/**
+ * @brief enums of Chewing Schemes.
+ */
+enum ChewingScheme
+{
+ CHEWING_STANDARD = 1,
+ CHEWING_IBM = 2,
+ CHEWING_GINYIEH = 3,
+ CHEWING_ETEN = 4,
+ CHEWING_DEFAULT = CHEWING_STANDARD
+};
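+
+/* Illustrative note, not part of the original header: callers typically OR
+ * flags from the enums above into a single pinyin_option_t word (the option
+ * type taken by the parsers in pinyin_parser2.h). The combination below is
+ * only an example, not a recommended default.
+ *
+ *   pinyin_option_t options = PINYIN_INCOMPLETE | USE_TONE |
+ *       PINYIN_AMB_L_N | PINYIN_CORRECT_UE_VE;
+ */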
+
+G_END_DECLS
+
+#endif
diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp
new file mode 100644
index 0000000..5d406ae
--- /dev/null
+++ b/src/storage/pinyin_parser2.cpp
@@ -0,0 +1,989 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "pinyin_parser2.h"
+#include <ctype.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include "stl_lite.h"
+#include "pinyin_phrase2.h"
+#include "pinyin_custom2.h"
+#include "chewing_key.h"
+#include "pinyin_parser_table.h"
+#include "double_pinyin_table.h"
+#include "chewing_table.h"
+
+
+using namespace pinyin;
+
+static bool check_pinyin_options(pinyin_option_t options, const pinyin_index_item_t * item) {
+ guint32 flags = item->m_flags;
+ assert (flags & IS_PINYIN);
+
+ /* handle incomplete pinyin. */
+ if (flags & PINYIN_INCOMPLETE) {
+ if (!(options & PINYIN_INCOMPLETE))
+ return false;
+ }
+
+ /* handle correct pinyin, currently only one flag per item. */
+ flags &= PINYIN_CORRECT_ALL;
+ options &= PINYIN_CORRECT_ALL;
+
+ if (flags) {
+ if ((flags & options) != flags)
+ return false;
+ }
+
+ return true;
+}
+
+static bool check_chewing_options(pinyin_option_t options, const chewing_index_item_t * item) {
+ guint32 flags = item->m_flags;
+ assert (flags & IS_CHEWING);
+
+ /* handle incomplete chewing. */
+ if (flags & CHEWING_INCOMPLETE) {
+ if (!(options & CHEWING_INCOMPLETE))
+ return false;
+ }
+
+ return true;
+}
+
+
+gint _ChewingKey::get_table_index() {
+ assert(m_initial < CHEWING_NUMBER_OF_INITIALS);
+ assert(m_middle < CHEWING_NUMBER_OF_MIDDLES);
+ assert(m_final < CHEWING_NUMBER_OF_FINALS);
+
+ gint index = chewing_key_table[(m_initial * CHEWING_NUMBER_OF_MIDDLES + m_middle) * CHEWING_NUMBER_OF_FINALS + m_final];
+ return index == -1 ? 0 : index;
+}
+
+gchar * _ChewingKey::get_pinyin_string() {
+ assert(m_tone < CHEWING_NUMBER_OF_TONES);
+ gint index = get_table_index();
+ assert(index < G_N_ELEMENTS(content_table));
+ const content_table_item_t & item = content_table[index];
+
+ if (CHEWING_ZERO_TONE == m_tone) {
+ return g_strdup(item.m_pinyin_str);
+ } else {
+ return g_strdup_printf("%s%d", item.m_pinyin_str, m_tone);
+ }
+}
+
+gchar * _ChewingKey::get_shengmu_string() {
+ gint index = get_table_index();
+ assert(index < G_N_ELEMENTS(content_table));
+ const content_table_item_t & item = content_table[index];
+ return g_strdup(item.m_shengmu_str);
+}
+
+gchar * _ChewingKey::get_yunmu_string() {
+ gint index = get_table_index();
+ assert(index < G_N_ELEMENTS(content_table));
+ const content_table_item_t & item = content_table[index];
+ return g_strdup(item.m_yunmu_str);
+}
+
+gchar * _ChewingKey::get_chewing_string() {
+ assert(m_tone < CHEWING_NUMBER_OF_TONES);
+ gint index = get_table_index();
+ assert(index < G_N_ELEMENTS(content_table));
+ const content_table_item_t & item = content_table[index];
+
+ if (CHEWING_ZERO_TONE == m_tone) {
+ return g_strdup(item.m_chewing_str);
+ } else {
+ return g_strdup_printf("%s%s", item.m_chewing_str,
+ chewing_tone_table[m_tone]);
+ }
+}
+
+
+/* Pinyin Parsers */
+
+/* internal information for pinyin parsers. */
+struct parse_value_t{
+ ChewingKey m_key;
+ ChewingKeyRest m_key_rest;
+ gint16 m_num_keys;
+ gint16 m_parsed_len;
+ gint16 m_last_step;
+
+ /* constructor */
+public:
+ parse_value_t(){
+ m_num_keys = 0;
+ m_parsed_len = 0;
+ m_last_step = -1;
+ }
+};
+
+const guint16 max_full_pinyin_length = 7; /* includes the tone. */
+
+const guint16 max_double_pinyin_length = 3; /* includes the tone. */
+
+const guint16 max_chewing_length = 4; /* includes the tone. */
+
+static bool compare_pinyin_less_than(const pinyin_index_item_t & lhs,
+ const pinyin_index_item_t & rhs){
+ return 0 > strcmp(lhs.m_pinyin_input, rhs.m_pinyin_input);
+}
+
+static inline bool search_pinyin_index(pinyin_option_t options,
+ const char * pinyin,
+ ChewingKey & key){
+ pinyin_index_item_t item;
+ memset(&item, 0, sizeof(item));
+ item.m_pinyin_input = pinyin;
+
+ std_lite::pair<const pinyin_index_item_t *,
+ const pinyin_index_item_t *> range;
+ range = std_lite::equal_range
+ (pinyin_index, pinyin_index + G_N_ELEMENTS(pinyin_index),
+ item, compare_pinyin_less_than);
+
+ guint16 range_len = range.second - range.first;
+ assert(range_len <= 1);
+ if (range_len == 1) {
+ const pinyin_index_item_t * index = range.first;
+
+ if (!check_pinyin_options(options, index))
+ return false;
+
+ key = content_table[index->m_table_index].m_chewing_key;
+ assert(key.get_table_index() == index->m_table_index);
+ return true;
+ }
+
+ return false;
+}
+
+static bool compare_chewing_less_than(const chewing_index_item_t & lhs,
+ const chewing_index_item_t & rhs){
+ return 0 > strcmp(lhs.m_chewing_input, rhs.m_chewing_input);
+}
+
+static inline bool search_chewing_index(pinyin_option_t options,
+ const char * chewing,
+ ChewingKey & key){
+ chewing_index_item_t item;
+ memset(&item, 0, sizeof(item));
+ item.m_chewing_input = chewing;
+
+ std_lite::pair<const chewing_index_item_t *,
+ const chewing_index_item_t *> range;
+ range = std_lite::equal_range
+ (chewing_index, chewing_index + G_N_ELEMENTS(chewing_index),
+ item, compare_chewing_less_than);
+
+ guint16 range_len = range.second - range.first;
+ assert (range_len <= 1);
+
+ if (range_len == 1) {
+ const chewing_index_item_t * index = range.first;
+
+ if (!check_chewing_options(options, index))
+ return false;
+
+ key = content_table[index->m_table_index].m_chewing_key;
+ assert(key.get_table_index() == index->m_table_index);
+ return true;
+ }
+
+ return false;
+}
+
+/* Full Pinyin Parser */
+FullPinyinParser2::FullPinyinParser2 (){
+ m_parse_steps = g_array_new(TRUE, FALSE, sizeof(parse_value_t));
+}
+
+
+bool FullPinyinParser2::parse_one_key (pinyin_option_t options,
+ ChewingKey & key,
+ const char * pinyin, int len) const {
+ /* "'" are not accepted in parse_one_key. */
+ gchar * input = g_strndup(pinyin, len);
+ assert(NULL == strchr(input, '\''));
+
+ guint16 tone = CHEWING_ZERO_TONE; guint16 tone_pos = 0;
+ guint16 parsed_len = len;
+ key = ChewingKey();
+
+ if (options & USE_TONE) {
+ /* find the tone in the last character. */
+ char chr = input[parsed_len - 1];
+ if ( '0' < chr && chr <= '5' ) {
+ tone = chr - '0';
+ parsed_len --;
+ tone_pos = parsed_len;
+ }
+ }
+
+ /* the core pinyin parsing happens here. */
+
+ /* Note: optimize here? */
+ input[parsed_len] = '\0';
+ if (!search_pinyin_index(options, input, key)) {
+ g_free(input);
+ return false;
+ }
+
+ if (options & USE_TONE) {
+ /* post processing tone. */
+ if ( parsed_len == tone_pos ) {
+ if (tone != CHEWING_ZERO_TONE) {
+ key.m_tone = tone;
+ parsed_len ++;
+ }
+ }
+ }
+
+ g_free(input);
+ return parsed_len == len;
+}
+
+
+int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
+ ChewingKeyRestVector & key_rests,
+ const char *str, int len) const {
+ int i;
+ /* clear arrays. */
+ g_array_set_size(keys, 0);
+ g_array_set_size(key_rests, 0);
+
+ /* init m_parse_steps, and prepare dynamic programming. */
+ int step_len = len + 1;
+ g_array_set_size(m_parse_steps, 0);
+ parse_value_t value;
+ for (i = 0; i < step_len; ++i) {
+ g_array_append_val(m_parse_steps, value);
+ }
+
+ size_t next_sep = 0;
+ gchar * input = g_strndup(str, len);
+ parse_value_t * curstep = NULL, * nextstep = NULL;
+
+ for (i = 0; i < len; ++i) {
+ if (input[i] == '\'') {
+ curstep = &g_array_index(m_parse_steps, parse_value_t, i);
+ nextstep = &g_array_index(m_parse_steps, parse_value_t, i + 1);
+
+ /* propagate current step into next step. */
+ nextstep->m_key = ChewingKey();
+ nextstep->m_key_rest = ChewingKeyRest();
+ nextstep->m_num_keys = curstep->m_num_keys;
+ nextstep->m_parsed_len = curstep->m_parsed_len + 1;
+ nextstep->m_last_step = i;
+ next_sep = 0;
+ continue;
+ }
+
+ /* forward to next "'" */
+ if ( 0 == next_sep ) {
+ int k;
+ for (k = i; k < len; ++k) {
+ if (input[k] == '\'')
+ break;
+ }
+ next_sep = k;
+ }
+
+ /* dynamic programming here. */
+ /* for (size_t m = i; m < next_sep; ++m) */
+ {
+ size_t m = i;
+ curstep = &g_array_index(m_parse_steps, parse_value_t, m);
+ size_t try_len = std_lite::min
+ (m + max_full_pinyin_length, next_sep);
+ for (size_t n = m + 1; n < try_len + 1; ++n) {
+ nextstep = &g_array_index(m_parse_steps, parse_value_t, n);
+
+ /* gen next step */
+ const char * onepinyin = input + m;
+ gint16 onepinyinlen = n - m;
+ value = parse_value_t();
+
+ ChewingKey key; ChewingKeyRest rest;
+ bool parsed = parse_one_key
+ (options, key, onepinyin, onepinyinlen);
+ rest.m_raw_begin = m; rest.m_raw_end = n;
+ if (!parsed)
+ continue;
+
+ //printf("onepinyin:%s len:%d\n", onepinyin, onepinyinlen);
+
+ value.m_key = key; value.m_key_rest = rest;
+ value.m_num_keys = curstep->m_num_keys + 1;
+ value.m_parsed_len = curstep->m_parsed_len + onepinyinlen;
+ value.m_last_step = m;
+
+ /* save next step */
+ /* no previous result */
+ if (-1 == nextstep->m_last_step)
+ *nextstep = value;
+ /* prefer the longest pinyin */
+ if (value.m_parsed_len > nextstep->m_parsed_len)
+ *nextstep = value;
+ /* prefer the shortest keys with the same pinyin length */
+ if (value.m_parsed_len == nextstep->m_parsed_len &&
+ value.m_num_keys < nextstep->m_num_keys)
+ *nextstep = value;
+
+ /* handle with the same pinyin length and the number of keys */
+ if (value.m_parsed_len == nextstep->m_parsed_len &&
+ value.m_num_keys == nextstep->m_num_keys) {
+
+#if 0
+ /* prefer the complete pinyin with shengmu
+ * over without shengmu,
+ * ex: "kaneiji" -> "ka'nei'ji".
+ */
+ if ((value.m_key.m_initial != CHEWING_ZERO_INITIAL &&
+ !(value.m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+ value.m_key.m_final == CHEWING_ZERO_FINAL)) &&
+ nextstep->m_key.m_initial == CHEWING_ZERO_INITIAL)
+ *nextstep = value;
+
+ /* prefer the complete pinyin 'er'
+ * over the incomplete pinyin 'r',
+ * ex: "xierqi" -> "xi'er'qi."
+ */
+ if ((value.m_key.m_initial == CHEWING_ZERO_INITIAL &&
+ value.m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+ value.m_key.m_final == CHEWING_ER) &&
+ (nextstep->m_key.m_initial == CHEWING_R &&
+ nextstep->m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+ nextstep->m_key.m_final == CHEWING_ZERO_FINAL))
+ *nextstep = value;
+#endif
+
+ /* prefer the 'a' at the end of clause,
+ * ex: "zheyanga$" -> "zhe'yang'a$".
+ */
+ if (value.m_parsed_len == len &&
+ (nextstep->m_key.m_initial != CHEWING_ZERO_INITIAL &&
+ nextstep->m_key.m_final == CHEWING_A) &&
+ (value.m_key.m_initial == CHEWING_ZERO_INITIAL &&
+ value.m_key.m_middle == CHEWING_ZERO_MIDDLE &&
+ value.m_key.m_final == CHEWING_A))
+ *nextstep = value;
+ }
+ }
+ }
+ }
+
+ /* final step for back tracing. */
+ gint16 parsed_len = final_step(step_len, keys, key_rests);
+
+ /* post processing for re-split table. */
+ if (options & USE_RESPLIT_TABLE) {
+ post_process2(options, keys, key_rests, str, len);
+ }
+
+ g_free(input);
+ return parsed_len;
+}
+
+int FullPinyinParser2::final_step(size_t step_len, ChewingKeyVector & keys,
+ ChewingKeyRestVector & key_rests) const{
+ int i;
+ gint16 parsed_len = 0;
+ parse_value_t * curstep = NULL;
+
+ /* find longest match, which starts from the beginning of input. */
+ for (i = step_len - 1; i >= 0; --i) {
+ curstep = &g_array_index(m_parse_steps, parse_value_t, i);
+ if (i == curstep->m_parsed_len)
+ break;
+ }
+ /* prepare saving. */
+ parsed_len = curstep->m_parsed_len;
+ gint16 num_keys = curstep->m_num_keys;
+ g_array_set_size(keys, num_keys);
+ g_array_set_size(key_rests, num_keys);
+
+ /* save the match. */
+ while (curstep->m_last_step != -1) {
+ gint16 pos = curstep->m_num_keys - 1;
+
+ /* skip "'" */
+ if (0 != curstep->m_key.get_table_index()) {
+ ChewingKey * key = &g_array_index(keys, ChewingKey, pos);
+ ChewingKeyRest * rest = &g_array_index
+ (key_rests, ChewingKeyRest, pos);
+ *key = curstep->m_key; *rest = curstep->m_key_rest;
+ }
+
+ /* step backward */
+ curstep = &g_array_index(m_parse_steps, parse_value_t,
+ curstep->m_last_step);
+ }
+ return parsed_len;
+}
+
+bool FullPinyinParser2::post_process2(pinyin_option_t options,
+ ChewingKeyVector & keys,
+ ChewingKeyRestVector & key_rests,
+ const char * str,
+ int len) const {
+ int i;
+ assert(keys->len == key_rests->len);
+ gint num_keys = keys->len;
+
+ ChewingKey * cur_key = NULL, * next_key = NULL;
+ ChewingKeyRest * cur_rest = NULL, * next_rest = NULL;
+ guint16 next_tone = CHEWING_ZERO_TONE;
+
+ for (i = 0; i < num_keys - 1; ++i) {
+ cur_rest = &g_array_index(key_rests, ChewingKeyRest, i);
+ next_rest = &g_array_index(key_rests, ChewingKeyRest, i + 1);
+
+ /* some "'" here */
+ if (cur_rest->m_raw_end != next_rest->m_raw_begin)
+ continue;
+
+ cur_key = &g_array_index(keys, ChewingKey, i);
+ next_key = &g_array_index(keys, ChewingKey, i + 1);
+
+ /* some tone here */
+ if (CHEWING_ZERO_TONE != cur_key->m_tone)
+ continue;
+
+ /* back up tone */
+ if (options & USE_TONE) {
+ next_tone = next_key->m_tone;
+ if (CHEWING_ZERO_TONE != next_tone) {
+ next_key->m_tone = CHEWING_ZERO_TONE;
+ next_rest->m_raw_end --;
+ }
+ }
+
+ /* lookup re-split table */
+ const resplit_table_item_t * item = NULL;
+
+ item = retrieve_resplit_item_by_original_pinyins
+ (options, cur_key, cur_rest, next_key, next_rest, str, len);
+
+ if (item) {
+ /* no ops */
+ if (item->m_orig_freq >= item->m_new_freq)
+ continue;
+
+ /* do re-split */
+ const char * onepinyin = str + cur_rest->m_raw_begin;
+ size_t len = strlen(item->m_new_keys[0]);
+
+ bool ok = parse_one_key(options, *cur_key, onepinyin, len);
+ assert(ok);
+ cur_rest->m_raw_end = cur_rest->m_raw_begin + len;
+
+ next_rest->m_raw_begin = cur_rest->m_raw_end;
+ onepinyin = str + next_rest->m_raw_begin;
+ len = strlen(item->m_new_keys[1]);
+
+ ok = parse_one_key(options, *next_key, onepinyin, len);
+ assert(ok);
+ }
+
+ /* restore tones */
+ if (options & USE_TONE) {
+ if (CHEWING_ZERO_TONE != next_tone) {
+ next_key->m_tone = next_tone;
+ next_rest->m_raw_end ++;
+ }
+ }
+ }
+
+ return true;
+}
+
+const divided_table_item_t * FullPinyinParser2::retrieve_divided_item
+(pinyin_option_t options, ChewingKey * key, ChewingKeyRest * rest,
+ const char * str, int len) const {
+
+ /* lookup divided table */
+ size_t k;
+ const divided_table_item_t * item = NULL;
+ for (k = 0; k < G_N_ELEMENTS(divided_table); ++k) {
+ item = divided_table + k;
+
+ const char * onepinyin = str + rest->m_raw_begin;
+ size_t len = strlen(item->m_orig_key);
+
+ if (rest->length() != len)
+ continue;
+
+ if (0 == strncmp(onepinyin, item->m_orig_key, len))
+ break;
+ }
+
+ /* found the match */
+ if (k < G_N_ELEMENTS(divided_table)) {
+ /* do divided */
+ item = divided_table + k;
+ return item;
+ }
+
+ return NULL;
+}
+
+
+const resplit_table_item_t * FullPinyinParser2::retrieve_resplit_item_by_original_pinyins
+(pinyin_option_t options,
+ ChewingKey * cur_key, ChewingKeyRest * cur_rest,
+ ChewingKey * next_key, ChewingKeyRest * next_rest,
+ const char * str, int len) const{
+ /* lookup re-split table */
+ size_t k;
+ const resplit_table_item_t * item = NULL;
+
+ for (k = 0; k < G_N_ELEMENTS(resplit_table); ++k) {
+ item = resplit_table + k;
+
+ const char * onepinyin = str + cur_rest->m_raw_begin;
+ size_t len = strlen(item->m_orig_keys[0]);
+
+ if (cur_rest->length() != len)
+ continue;
+
+ if (0 != strncmp(onepinyin, item->m_orig_keys[0], len))
+ continue;
+
+ onepinyin = str + next_rest->m_raw_begin;
+ len = strlen(item->m_orig_keys[1]);
+
+ if (next_rest->length() != len)
+ continue;
+
+ if (0 == strncmp(onepinyin, item->m_orig_keys[1], len))
+ break;
+ }
+
+ /* found the match */
+ if (k < G_N_ELEMENTS(resplit_table)) {
+ item = resplit_table + k;
+ return item;
+ }
+
+ return NULL;
+}
+
+const resplit_table_item_t * FullPinyinParser2::retrieve_resplit_item_by_resplit_pinyins
+(pinyin_option_t options,
+ ChewingKey * cur_key, ChewingKeyRest * cur_rest,
+ ChewingKey * next_key, ChewingKeyRest * next_rest,
+ const char * str, int len) const {
+ /* lookup re-split table by the re-split keys */
+ size_t k;
+ const resplit_table_item_t * item = NULL;
+
+ for (k = 0; k < G_N_ELEMENTS(resplit_table); ++k) {
+ item = resplit_table + k;
+
+ const char * onepinyin = str + cur_rest->m_raw_begin;
+ size_t len = strlen(item->m_new_keys[0]);
+
+ if (cur_rest->length() != len)
+ continue;
+
+ if (0 != strncmp(onepinyin, item->m_new_keys[0], len))
+ continue;
+
+ onepinyin = str + next_rest->m_raw_begin;
+ len = strlen(item->m_new_keys[1]);
+
+ if (next_rest->length() != len)
+ continue;
+
+ if (0 == strncmp(onepinyin, item->m_new_keys[1], len))
+ break;
+ }
+
+ /* found the match */
+ if (k < G_N_ELEMENTS(resplit_table)) {
+ item = resplit_table + k;
+ return item;
+ }
+
+ return NULL;
+}
+
+#define IS_KEY(x) (('a' <= x && x <= 'z') || x == ';')
+
+bool DoublePinyinParser2::parse_one_key(pinyin_option_t options,
+ ChewingKey & key,
+ const char *str, int len) const {
+ options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL);
+
+ if (1 == len) {
+ if (!(options & PINYIN_INCOMPLETE))
+ return false;
+
+ char ch = str[0];
+ if (!IS_KEY(ch))
+ return false;
+
+ int charid = ch == ';' ? 26 : ch - 'a';
+ const char * sheng = m_shengmu_table[charid].m_shengmu;
+ if (NULL == sheng || strcmp(sheng, "'") == 0)
+ return false;
+
+ if (search_pinyin_index(options, sheng, key)) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ ChewingTone tone = CHEWING_ZERO_TONE;
+ options &= ~(PINYIN_INCOMPLETE|CHEWING_INCOMPLETE);
+ options |= PINYIN_CORRECT_UE_VE | PINYIN_CORRECT_V_U;
+
+ /* parse tone */
+ if (3 == len) {
+ if (!(options & USE_TONE))
+ return false;
+ char ch = str[2];
+ if (!('0' < ch && ch <= '5'))
+ return false;
+ tone = (ChewingTone) (ch - '0');
+ }
+
+ if (2 == len || 3 == len) {
+ /* parse shengmu here. */
+ char ch = str[0];
+ if (!IS_KEY(ch))
+ return false;
+
+ int charid = ch == ';' ? 26 : ch - 'a';
+ const char * sheng = m_shengmu_table[charid].m_shengmu;
+ if (NULL == sheng)
+ return false;
+ if (0 == strcmp(sheng, "'"))
+ sheng = "";
+
+ /* parse yunmu here. */
+ ch = str[1];
+ if (!IS_KEY(ch))
+ return false;
+
+ gchar * pinyin = NULL;
+ do {
+
+ charid = ch == ';' ? 26 : ch - 'a';
+ /* first yunmu */
+ const char * yun = m_yunmu_table[charid].m_yunmus[0];
+ if (NULL == yun)
+ break;
+
+ pinyin = g_strdup_printf("%s%s", sheng, yun);
+ if (search_pinyin_index(options, pinyin, key)) {
+ key.m_tone = tone;
+ g_free(pinyin);
+ return true;
+ }
+ g_free(pinyin);
+
+ /* second yunmu */
+ yun = m_yunmu_table[charid].m_yunmus[1];
+ if (NULL == yun)
+ break;
+
+ pinyin = g_strdup_printf("%s%s", sheng, yun);
+ if (search_pinyin_index(options, pinyin, key)) {
+ key.m_tone = tone;
+ g_free(pinyin);
+ return true;
+ }
+ g_free(pinyin);
+ } while(0);
+
+#if 1
+ /* support two letter yunmu from full pinyin */
+ if (0 == strcmp(sheng, "")) {
+ pinyin = g_strndup(str, 2);
+ if (search_pinyin_index(options, pinyin, key)) {
+ key.m_tone = tone;
+ g_free(pinyin);
+ return true;
+ }
+ g_free(pinyin);
+ }
+#endif
+ }
+
+ return false;
+}
+
+
+/* only 'a'-'z', ';' and the tone digits '1'-'5' are accepted here. */
+int DoublePinyinParser2::parse(pinyin_option_t options, ChewingKeyVector & keys,
+ ChewingKeyRestVector & key_rests,
+ const char *str, int len) const {
+ g_array_set_size(keys, 0);
+ g_array_set_size(key_rests, 0);
+
+ int maximum_len = 0; int i;
+ /* probe the longest possible double pinyin string. */
+ for (i = 0; i < len; ++i) {
+ const char ch = str[i];
+ if (!(IS_KEY(ch) || ('0' < ch && ch <= '5')))
+ break;
+ }
+ maximum_len = i;
+
+ /* maximum forward match for double pinyin. */
+ int parsed_len = 0;
+ while (parsed_len < maximum_len) {
+ const char * cur_str = str + parsed_len;
+ i = std_lite::min(maximum_len - parsed_len,
+ (int)max_double_pinyin_length);
+
+ ChewingKey key; ChewingKeyRest key_rest;
+ for (; i > 0; --i) {
+ bool success = parse_one_key(options, key, cur_str, i);
+ if (success)
+ break;
+ }
+
+ if (0 == i) /* no more possible double pinyins. */
+ break;
+
+ key_rest.m_raw_begin = parsed_len; key_rest.m_raw_end = parsed_len + i;
+ parsed_len += i;
+
+ /* save the pinyin */
+ g_array_append_val(keys, key);
+ g_array_append_val(key_rests, key_rest);
+ }
+
+ return parsed_len;
+}
+
+#undef IS_KEY
+
+bool DoublePinyinParser2::set_scheme(DoublePinyinScheme scheme) {
+
+ switch (scheme) {
+ case DOUBLE_PINYIN_ZRM:
+ m_shengmu_table = double_pinyin_zrm_sheng;
+ m_yunmu_table = double_pinyin_zrm_yun;
+ return true;
+ case DOUBLE_PINYIN_MS:
+ m_shengmu_table = double_pinyin_mspy_sheng;
+ m_yunmu_table = double_pinyin_mspy_yun;
+ return true;
+ case DOUBLE_PINYIN_ZIGUANG:
+ m_shengmu_table = double_pinyin_zgpy_sheng;
+ m_yunmu_table = double_pinyin_zgpy_yun;
+ return true;
+ case DOUBLE_PINYIN_ABC:
+ m_shengmu_table = double_pinyin_abc_sheng;
+ m_yunmu_table = double_pinyin_abc_yun;
+ return true;
+ case DOUBLE_PINYIN_PYJJ:
+ m_shengmu_table = double_pinyin_pyjj_sheng;
+ m_yunmu_table = double_pinyin_pyjj_yun;
+ return true;
+ case DOUBLE_PINYIN_XHE:
+ m_shengmu_table = double_pinyin_xhe_sheng;
+ m_yunmu_table = double_pinyin_xhe_yun;
+ return true;
+ case DOUBLE_PINYIN_CUSTOMIZED:
+ assert(FALSE);
+ };
+
+ return false; /* no such scheme. */
+}
+
+/* the chewing string must be freed with g_free. */
+static bool search_chewing_symbols(const chewing_symbol_item_t * symbol_table,
+ const char key, const char ** chewing) {
+ *chewing = NULL;
+ /* just iterate the table, as we only have < 50 items. */
+ while (symbol_table->m_input != '\0') {
+ if (symbol_table->m_input == key) {
+ *chewing = symbol_table->m_chewing;
+ return true;
+ }
+ symbol_table ++;
+ }
+ return false;
+}
+
+static bool search_chewing_tones(const chewing_tone_item_t * tone_table,
+ const char key, char * tone) {
+ *tone = CHEWING_ZERO_TONE;
+ /* just iterate the table, as we only have < 10 items. */
+ while (tone_table->m_input != '\0') {
+ if (tone_table->m_input == key) {
+ *tone = tone_table->m_tone;
+ return true;
+ }
+ tone_table ++;
+ }
+ return false;
+}
+
+
+bool ChewingParser2::parse_one_key(pinyin_option_t options,
+ ChewingKey & key,
+ const char *str, int len) const {
+ options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL);
+ char tone = CHEWING_ZERO_TONE;
+
+ int symbols_len = len;
+ /* probe whether the last key in str is a tone key. */
+ if (options & USE_TONE) {
+ char ch = str[len - 1];
+ /* remove tone from input */
+ if (search_chewing_tones(m_tone_table, ch, &tone))
+ symbols_len --;
+ }
+
+ int i;
+ gchar * chewing = NULL; const char * onechar = NULL;
+
+ /* probe the possible chewing map in the rest of str. */
+ for (i = 0; i < symbols_len; ++i) {
+ if (!search_chewing_symbols(m_symbol_table, str[i], &onechar)) {
+ g_free(chewing);
+ return false;
+ }
+
+ if (!chewing) {
+ chewing = g_strdup(onechar);
+ } else {
+ gchar * tmp = chewing;
+ chewing = g_strconcat(chewing, onechar, NULL);
+ g_free(tmp);
+ }
+ }
+
+ /* search the chewing in the chewing index table. */
+ if (chewing && search_chewing_index(options, chewing, key)) {
+ /* save back tone if available. */
+ key.m_tone = tone;
+ g_free(chewing);
+ return true;
+ }
+
+ g_free(chewing);
+ return false;
+}
+
+
+/* only characters in the chewing keyboard scheme are accepted here. */
+int ChewingParser2::parse(pinyin_option_t options, ChewingKeyVector & keys,
+ ChewingKeyRestVector & key_rests,
+ const char *str, int len) const {
+ g_array_set_size(keys, 0);
+ g_array_set_size(key_rests, 0);
+
+ int maximum_len = 0; int i;
+ /* probe the longest possible chewing string. */
+ for (i = 0; i < len; ++i) {
+ if (!in_chewing_scheme(options, str[i], NULL))
+ break;
+ }
+ maximum_len = i;
+
+ /* maximum forward match for chewing. */
+ int parsed_len = 0;
+ while (parsed_len < maximum_len) {
+ const char * cur_str = str + parsed_len;
+ i = std_lite::min(maximum_len - parsed_len,
+ (int)max_chewing_length);
+
+ ChewingKey key; ChewingKeyRest key_rest;
+ for (; i > 0; --i) {
+ bool success = parse_one_key(options, key, cur_str, i);
+ if (success)
+ break;
+ }
+
+ if (0 == i) /* no more possible chewings. */
+ break;
+
+ key_rest.m_raw_begin = parsed_len; key_rest.m_raw_end = parsed_len + i;
+ parsed_len += i;
+
+ /* save the pinyin. */
+ g_array_append_val(keys, key);
+ g_array_append_val(key_rests, key_rest);
+ }
+
+ return parsed_len;
+}
+
+
+bool ChewingParser2::set_scheme(ChewingScheme scheme) {
+ switch(scheme) {
+ case CHEWING_STANDARD:
+ m_symbol_table = chewing_standard_symbols;
+ m_tone_table = chewing_standard_tones;
+ return true;
+ case CHEWING_IBM:
+ m_symbol_table = chewing_ibm_symbols;
+ m_tone_table = chewing_ibm_tones;
+ return true;
+ case CHEWING_GINYIEH:
+ m_symbol_table = chewing_ginyieh_symbols;
+ m_tone_table = chewing_ginyieh_tones;
+ return true;
+ case CHEWING_ETEN:
+ m_symbol_table = chewing_eten_symbols;
+ m_tone_table = chewing_eten_tones;
+ return true;
+ }
+
+ return false;
+}
+
+
+bool ChewingParser2::in_chewing_scheme(pinyin_option_t options,
+ const char key, const char ** symbol)
+ const {
+ const gchar * chewing = NULL;
+ char tone = CHEWING_ZERO_TONE;
+
+ if (search_chewing_symbols(m_symbol_table, key, &chewing)) {
+ if (symbol)
+ *symbol = chewing;
+ return true;
+ }
+
+ if (!(options & USE_TONE))
+ return false;
+
+ if (search_chewing_tones(m_tone_table, key, &tone)) {
+ if (symbol)
+ *symbol = chewing_tone_table[tone];
+ return true;
+ }
+
+ return false;
+}
diff --git a/src/storage/pinyin_parser2.h b/src/storage/pinyin_parser2.h
new file mode 100644
index 0000000..e40b30c
--- /dev/null
+++ b/src/storage/pinyin_parser2.h
@@ -0,0 +1,361 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef PINYIN_PARSER2_H
+#define PINYIN_PARSER2_H
+
+#include <glib.h>
+#include "novel_types.h"
+#include "chewing_key.h"
+#include "pinyin_custom2.h"
+
+namespace pinyin{
+
+typedef struct {
+ const char * m_pinyin_str;
+ const char * m_shengmu_str;
+ const char * m_yunmu_str;
+ const char * m_chewing_str;
+ ChewingKey m_chewing_key;
+} content_table_item_t;
+
+typedef struct {
+ const char * m_pinyin_input;
+ guint32 m_flags;
+ guint16 m_table_index;
+} pinyin_index_item_t;
+
+typedef struct {
+ const char * m_chewing_input;
+ guint32 m_flags;
+ guint16 m_table_index;
+} chewing_index_item_t;
+
+typedef struct {
+ const char * m_orig_key;
+ guint32 m_orig_freq;
+ const char * m_new_keys[2];
+ guint32 m_new_freq;
+} divided_table_item_t;
+
+typedef struct {
+ const char * m_orig_keys[2];
+ guint32 m_orig_freq;
+ const char * m_new_keys[2];
+ guint32 m_new_freq;
+} resplit_table_item_t;
+
+typedef struct {
+ const char * m_shengmu;
+} double_pinyin_scheme_shengmu_item_t;
+
+typedef struct {
+ const char * m_yunmus[2];
+} double_pinyin_scheme_yunmu_item_t;
+
+typedef struct {
+ const char m_input;
+ const char * m_chewing;
+} chewing_symbol_item_t;
+
+typedef struct {
+ const char m_input;
+ const char m_tone;
+} chewing_tone_item_t;
+
+typedef GArray * ParseValueVector;
+
+
+/**
+ * PinyinParser2:
+ *
+ * Parse the ASCII string into an array of struct ChewingKeys.
+ *
+ */
+class PinyinParser2
+{
+public:
+ /**
+ * PinyinParser2::~PinyinParser2:
+ *
+ * The destructor of the PinyinParser2.
+ *
+ */
+ virtual ~PinyinParser2() {}
+
+public:
+ /**
+ * PinyinParser2::parse_one_key:
+ * @options: the pinyin options from pinyin_custom2.h.
+ * @key: the parsed result of struct ChewingKey.
+ * @str: the input ASCII string.
+ * @len: the length of str.
+ * @returns: whether the entire string is parsed as one key.
+ *
+ * Parse only one struct ChewingKey from a string.
+ *
+ */
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const = 0;
+
+ /**
+ * PinyinParser2::parse:
+ * @options: the pinyin options from pinyin_custom2.h.
+ * @keys: the parsed result of struct ChewingKeys.
+ * @str: the input ASCII string.
+ * @len: the length of str.
+ * @returns: the number of chars that were actually used.
+ *
+ * Parse the ASCII string into an array of struct ChewingKeys.
+ *
+ */
+ virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const = 0;
+
+};
+
+
+/**
+ * FullPinyinParser2:
+ *
+ * Parses the full pinyin string into an array of struct ChewingKeys.
+ *
+ */
+class FullPinyinParser2 : public PinyinParser2
+{
+ /* Note: some internal pointers to full pinyin table. */
+
+protected:
+ ParseValueVector m_parse_steps;
+
+ int final_step(size_t step_len, ChewingKeyVector & keys,
+ ChewingKeyRestVector & key_rests) const;
+
+ bool post_process2(pinyin_option_t options, ChewingKeyVector & keys,
+ ChewingKeyRestVector & key_rests,
+ const char * str, int len) const;
+
+public:
+ const divided_table_item_t * retrieve_divided_item
+ (pinyin_option_t options, ChewingKey * key, ChewingKeyRest * rest,
+ const char * str, int len) const;
+
+ const resplit_table_item_t * retrieve_resplit_item_by_original_pinyins
+ (pinyin_option_t options,
+ ChewingKey * cur_key, ChewingKeyRest * cur_rest,
+ ChewingKey * next_key, ChewingKeyRest * next_rest,
+ const char * str, int len) const;
+ const resplit_table_item_t * retrieve_resplit_item_by_resplit_pinyins
+ (pinyin_option_t options,
+ ChewingKey * cur_key, ChewingKeyRest * cur_rest,
+ ChewingKey * next_key, ChewingKeyRest * next_rest,
+ const char * str, int len) const;
+
+public:
+ FullPinyinParser2();
+ virtual ~FullPinyinParser2() {
+ g_array_free(m_parse_steps, TRUE);
+ }
+
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+
+ /* Note:
+ * the parse method will use dynamic programming to drive parse_one_key.
+ */
+ virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
+};
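+
+/* Illustrative usage sketch, not part of the original header. It assumes the
+ * GArray-based ChewingKeyVector/ChewingKeyRestVector typedefs from
+ * chewing_key.h and an options word built from pinyin_custom2.h flags; the
+ * input string is arbitrary.
+ *
+ *   FullPinyinParser2 parser;
+ *   ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ *   ChewingKeyRestVector rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+ *   pinyin_option_t options = PINYIN_INCOMPLETE | USE_TONE;
+ *   int parsed_len = parser.parse(options, keys, rests, "nihao", 5);
+ *   // keys->len now holds the number of recognized ChewingKey elements.
+ *   g_array_free(keys, TRUE);
+ *   g_array_free(rests, TRUE);
+ */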
+
+
+/**
+ * DoublePinyinParser2:
+ *
+ * Parse the double pinyin string into an array of struct ChewingKeys.
+ *
+ */
+/* The valid input chars of ShuangPin are 'a'-'z' and ';'. */
+class DoublePinyinParser2 : public PinyinParser2
+{
+ /* Note: two internal pointers to double pinyin scheme table. */
+protected:
+ const double_pinyin_scheme_shengmu_item_t * m_shengmu_table;
+ const double_pinyin_scheme_yunmu_item_t * m_yunmu_table;
+
+public:
+ DoublePinyinParser2() {
+ m_shengmu_table = NULL; m_yunmu_table = NULL;
+ set_scheme(DOUBLE_PINYIN_DEFAULT);
+ }
+
+ virtual ~DoublePinyinParser2() {}
+
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+
+ virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
+
+public:
+ bool set_scheme(DoublePinyinScheme scheme);
+};
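+
+/* Illustrative sketch, not part of the original header: switching schemes
+ * before parsing. Which ChewingKey a two-letter code maps to depends entirely
+ * on the selected scheme tables, so no concrete output is assumed here.
+ *
+ *   DoublePinyinParser2 parser;            // constructor selects DOUBLE_PINYIN_MS
+ *   parser.set_scheme(DOUBLE_PINYIN_ZRM);
+ *   ChewingKey key;
+ *   bool ok = parser.parse_one_key(USE_TONE, key, "ni", 2);
+ */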
+
+
+/**
+ * ChewingParser2:
+ *
+ * Parse the chewing string into an array of struct ChewingKeys.
+ *
+ * Several keyboard schemes are supported:
+ * * CHEWING_STANDARD Standard ZhuYin keyboard, which maps 1 to Bo(ㄅ), q to Po(ㄆ), etc.
+ * * CHEWING_IBM IBM ZhuYin keyboard, which maps 1 to Bo(ㄅ), 2 to Po(ㄆ), etc.
+ * * CHEWING_GINYIEH Gin-Yieh ZhuYin keyboard.
+ * * CHEWING_ETEN Eten (倚天) ZhuYin keyboard.
+ *
+ */
+
+/* Note: yunmu shuffle may be supported later;
+ * currently this feature is postponed.
+ */
+class ChewingParser2 : public PinyinParser2
+{
+ /* Note: some internal pointers to chewing scheme table. */
+protected:
+ const chewing_symbol_item_t * m_symbol_table;
+ const chewing_tone_item_t * m_tone_table;
+
+public:
+ ChewingParser2() {
+ m_symbol_table = NULL; m_tone_table = NULL;
+ set_scheme(CHEWING_DEFAULT);
+ }
+
+ virtual ~ChewingParser2() {}
+
+ virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const;
+
+ virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const;
+
+public:
+ bool set_scheme(ChewingScheme scheme);
+ bool in_chewing_scheme(pinyin_option_t options, const char key, const char ** symbol) const;
+};
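+
+/* Illustrative sketch, not part of the original header: the parser defaults
+ * to the standard keyboard (CHEWING_DEFAULT == CHEWING_STANDARD), and
+ * in_chewing_scheme() can probe whether a character belongs to the selected
+ * keyboard layout before feeding it to parse().
+ *
+ *   ChewingParser2 parser;
+ *   parser.set_scheme(CHEWING_ETEN);
+ *   const char * symbol = NULL;
+ *   if (parser.in_chewing_scheme(USE_TONE, 'a', &symbol)) {
+ *       // symbol points at the zhuyin string (or tone mark) mapped to 'a'.
+ *   }
+ */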
+
+
+/* compare pinyins with chewing internal representations. */
+inline int pinyin_compare_initial2(pinyin_option_t options,
+ ChewingInitial lhs,
+ ChewingInitial rhs) {
+ if (lhs == rhs)
+ return 0;
+
+ if ((options & PINYIN_AMB_C_CH) &&
+ ((lhs == CHEWING_C && rhs == CHEWING_CH) ||
+ (lhs == CHEWING_CH && rhs == CHEWING_C)))
+ return 0;
+
+ if ((options & PINYIN_AMB_S_SH) &&
+ ((lhs == CHEWING_S && rhs == CHEWING_SH) ||
+ (lhs == CHEWING_SH && rhs == CHEWING_S)))
+ return 0;
+
+ if ((options & PINYIN_AMB_Z_ZH) &&
+ ((lhs == CHEWING_Z && rhs == CHEWING_ZH) ||
+ (lhs == CHEWING_ZH && rhs == CHEWING_Z)))
+ return 0;
+
+ if ((options & PINYIN_AMB_F_H) &&
+ ((lhs == CHEWING_F && rhs == CHEWING_H) ||
+ (lhs == CHEWING_H && rhs == CHEWING_F)))
+ return 0;
+
+ if ((options & PINYIN_AMB_L_N) &&
+ ((lhs == CHEWING_L && rhs == CHEWING_N) ||
+ (lhs == CHEWING_N && rhs == CHEWING_L)))
+ return 0;
+
+ if ((options & PINYIN_AMB_L_R) &&
+ ((lhs == CHEWING_L && rhs == CHEWING_R) ||
+ (lhs == CHEWING_R && rhs == CHEWING_L)))
+ return 0;
+
+ if ((options & PINYIN_AMB_G_K) &&
+ ((lhs == CHEWING_G && rhs == CHEWING_K) ||
+ (lhs == CHEWING_K && rhs == CHEWING_G)))
+ return 0;
+
+ return (lhs - rhs);
+}
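+
+/* Illustrative note, not part of the original header: with an options word
+ * containing PINYIN_AMB_L_N, the comparison above treats CHEWING_L and
+ * CHEWING_N as equal, e.g.
+ *
+ *   pinyin_compare_initial2(PINYIN_AMB_L_N, CHEWING_L, CHEWING_N) == 0
+ *
+ * while without that flag it falls through to the signed difference of the
+ * two enum values.
+ */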
+
+
+inline int pinyin_compare_middle_and_final2(pinyin_option_t options,
+ ChewingMiddle middle_lhs,
+ ChewingMiddle middle_rhs,
+ ChewingFinal final_lhs,
+ ChewingFinal final_rhs) {
+ if (middle_lhs == middle_rhs && final_lhs == final_rhs)
+ return 0;
+
+ /* both pinyin and chewing incomplete options will enable this. */
+ if (options & (PINYIN_INCOMPLETE | CHEWING_INCOMPLETE)) {
+ if (middle_lhs == CHEWING_ZERO_MIDDLE &&
+ final_lhs == CHEWING_ZERO_FINAL)
+ return 0;
+ if (middle_rhs == CHEWING_ZERO_MIDDLE &&
+ final_rhs == CHEWING_ZERO_FINAL)
+ return 0;
+ }
+
+ /* compare chewing middle first. */
+ int middle_diff = middle_lhs - middle_rhs;
+ if (middle_diff)
+ return middle_diff;
+
+ if ((options & PINYIN_AMB_AN_ANG) &&
+ ((final_lhs == CHEWING_AN && final_rhs == CHEWING_ANG) ||
+ (final_lhs == CHEWING_ANG && final_rhs == CHEWING_AN)))
+ return 0;
+
+ if ((options & PINYIN_AMB_EN_ENG) &&
+ ((final_lhs == CHEWING_EN && final_rhs == CHEWING_ENG) ||
+ (final_lhs == CHEWING_ENG && final_rhs == CHEWING_EN)))
+ return 0;
+
+ if ((options & PINYIN_AMB_IN_ING) &&
+ ((final_lhs == PINYIN_IN && final_rhs == PINYIN_ING) ||
+ (final_lhs == PINYIN_ING && final_rhs == PINYIN_IN)))
+ return 0;
+
+ return (final_lhs - final_rhs);
+}
+
+
+inline int pinyin_compare_tone2(pinyin_option_t options,
+ ChewingTone lhs,
+ ChewingTone rhs) {
+ if (lhs == rhs)
+ return 0;
+ if (lhs == CHEWING_ZERO_TONE)
+ return 0;
+ if (rhs == CHEWING_ZERO_TONE)
+ return 0;
+ return (lhs - rhs);
+}
+
+
+};
+
+#endif
diff --git a/src/storage/pinyin_parser_table.h b/src/storage/pinyin_parser_table.h
new file mode 100644
index 0000000..f633604
--- /dev/null
+++ b/src/storage/pinyin_parser_table.h
@@ -0,0 +1,3393 @@
+/* This file is generated by python scripts. Don't edit this file directly.
+ */
+
+#ifndef PINYIN_PARSER_TABLE_H
+#define PINYIN_PARSER_TABLE_H
+
+namespace pinyin{
+
+const pinyin_index_item_t pinyin_index[] = {
+{"a", IS_CHEWING|IS_PINYIN, 1},
+{"agn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 4},
+{"ai", IS_CHEWING|IS_PINYIN, 2},
+{"amg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 4},
+{"an", IS_CHEWING|IS_PINYIN, 3},
+{"ang", IS_CHEWING|IS_PINYIN, 4},
+{"ao", IS_CHEWING|IS_PINYIN, 5},
+{"b", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 6},
+{"ba", IS_CHEWING|IS_PINYIN, 7},
+{"bagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 10},
+{"bai", IS_CHEWING|IS_PINYIN, 8},
+{"bamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 10},
+{"ban", IS_CHEWING|IS_PINYIN, 9},
+{"bang", IS_CHEWING|IS_PINYIN, 10},
+{"bao", IS_CHEWING|IS_PINYIN, 11},
+{"begn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 14},
+{"bei", IS_CHEWING|IS_PINYIN, 12},
+{"bemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 14},
+{"ben", IS_CHEWING|IS_PINYIN, 13},
+{"beng", IS_CHEWING|IS_PINYIN, 14},
+{"bi", IS_CHEWING|IS_PINYIN, 15},
+{"bian", IS_CHEWING|IS_PINYIN, 16},
+{"biao", IS_CHEWING|IS_PINYIN, 17},
+{"bie", IS_CHEWING|IS_PINYIN, 18},
+{"bign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 20},
+{"bimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 20},
+{"bin", IS_CHEWING|IS_PINYIN, 19},
+{"bing", IS_CHEWING|IS_PINYIN, 20},
+{"bo", IS_CHEWING|IS_PINYIN, 21},
+{"bu", IS_CHEWING|IS_PINYIN, 22},
+{"c", IS_PINYIN|PINYIN_INCOMPLETE, 23},
+{"ca", IS_CHEWING|IS_PINYIN, 24},
+{"cagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 27},
+{"cai", IS_CHEWING|IS_PINYIN, 25},
+{"camg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 27},
+{"can", IS_CHEWING|IS_PINYIN, 26},
+{"cang", IS_CHEWING|IS_PINYIN, 27},
+{"cao", IS_CHEWING|IS_PINYIN, 28},
+{"ce", IS_CHEWING|IS_PINYIN, 29},
+{"cegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 31},
+{"cemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 31},
+{"cen", IS_CHEWING|IS_PINYIN, 30},
+{"ceng", IS_CHEWING|IS_PINYIN, 31},
+{"ch", IS_PINYIN|PINYIN_INCOMPLETE, 32},
+{"cha", IS_CHEWING|IS_PINYIN, 33},
+{"chagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 36},
+{"chai", IS_CHEWING|IS_PINYIN, 34},
+{"chamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 36},
+{"chan", IS_CHEWING|IS_PINYIN, 35},
+{"chang", IS_CHEWING|IS_PINYIN, 36},
+{"chao", IS_CHEWING|IS_PINYIN, 37},
+{"che", IS_CHEWING|IS_PINYIN, 38},
+{"chegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 40},
+{"chemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 40},
+{"chen", IS_CHEWING|IS_PINYIN, 39},
+{"cheng", IS_CHEWING|IS_PINYIN, 40},
+{"chi", IS_CHEWING|IS_PINYIN, 41},
+{"chogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 42},
+{"chomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 42},
+{"chon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 42},
+{"chong", IS_CHEWING|IS_PINYIN, 42},
+{"chou", IS_CHEWING|IS_PINYIN, 43},
+{"chu", IS_CHEWING|IS_PINYIN, 44},
+{"chuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 48},
+{"chuai", IS_CHEWING|IS_PINYIN, 46},
+{"chuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 48},
+{"chuan", IS_CHEWING|IS_PINYIN, 47},
+{"chuang", IS_CHEWING|IS_PINYIN, 48},
+{"chuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 49},
+{"chuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 50},
+{"chui", IS_CHEWING|IS_PINYIN, 49},
+{"chun", IS_CHEWING|IS_PINYIN, 50},
+{"chuo", IS_CHEWING|IS_PINYIN, 51},
+{"ci", IS_CHEWING|IS_PINYIN, 52},
+{"cogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 53},
+{"comg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 53},
+{"con", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 53},
+{"cong", IS_CHEWING|IS_PINYIN, 53},
+{"cou", IS_CHEWING|IS_PINYIN, 54},
+{"cu", IS_CHEWING|IS_PINYIN, 55},
+{"cuan", IS_CHEWING|IS_PINYIN, 56},
+{"cuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 57},
+{"cuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 58},
+{"cui", IS_CHEWING|IS_PINYIN, 57},
+{"cun", IS_CHEWING|IS_PINYIN, 58},
+{"cuo", IS_CHEWING|IS_PINYIN, 59},
+{"d", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 60},
+{"da", IS_CHEWING|IS_PINYIN, 61},
+{"dagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 64},
+{"dai", IS_CHEWING|IS_PINYIN, 62},
+{"damg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 64},
+{"dan", IS_CHEWING|IS_PINYIN, 63},
+{"dang", IS_CHEWING|IS_PINYIN, 64},
+{"dao", IS_CHEWING|IS_PINYIN, 65},
+{"de", IS_CHEWING|IS_PINYIN, 66},
+{"degn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 69},
+{"dei", IS_CHEWING|IS_PINYIN, 67},
+{"demg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 69},
+{"deng", IS_CHEWING|IS_PINYIN, 69},
+{"di", IS_CHEWING|IS_PINYIN, 70},
+{"dia", IS_CHEWING|IS_PINYIN, 71},
+{"dian", IS_CHEWING|IS_PINYIN, 72},
+{"diao", IS_CHEWING|IS_PINYIN, 73},
+{"die", IS_CHEWING|IS_PINYIN, 74},
+{"dign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 76},
+{"dimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 76},
+{"ding", IS_CHEWING|IS_PINYIN, 76},
+{"diou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 77},
+{"diu", IS_CHEWING|IS_PINYIN, 77},
+{"dogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 78},
+{"domg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 78},
+{"don", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 78},
+{"dong", IS_CHEWING|IS_PINYIN, 78},
+{"dou", IS_CHEWING|IS_PINYIN, 79},
+{"du", IS_CHEWING|IS_PINYIN, 80},
+{"duan", IS_CHEWING|IS_PINYIN, 81},
+{"duei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 82},
+{"duen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 83},
+{"dui", IS_CHEWING|IS_PINYIN, 82},
+{"dun", IS_CHEWING|IS_PINYIN, 83},
+{"duo", IS_CHEWING|IS_PINYIN, 84},
+{"e", IS_CHEWING|IS_PINYIN, 85},
+{"egn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 88},
+{"ei", IS_CHEWING|IS_PINYIN, 86},
+{"emg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 88},
+{"en", IS_CHEWING|IS_PINYIN, 87},
+{"er", IS_CHEWING|IS_PINYIN, 89},
+{"f", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 90},
+{"fa", IS_CHEWING|IS_PINYIN, 91},
+{"fagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 93},
+{"famg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 93},
+{"fan", IS_CHEWING|IS_PINYIN, 92},
+{"fang", IS_CHEWING|IS_PINYIN, 93},
+{"fegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 97},
+{"fei", IS_CHEWING|IS_PINYIN, 95},
+{"femg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 97},
+{"fen", IS_CHEWING|IS_PINYIN, 96},
+{"feng", IS_CHEWING|IS_PINYIN, 97},
+{"fo", IS_CHEWING|IS_PINYIN, 98},
+{"fou", IS_CHEWING|IS_PINYIN, 99},
+{"fu", IS_CHEWING|IS_PINYIN, 100},
+{"g", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 101},
+{"ga", IS_CHEWING|IS_PINYIN, 102},
+{"gagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 105},
+{"gai", IS_CHEWING|IS_PINYIN, 103},
+{"gamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 105},
+{"gan", IS_CHEWING|IS_PINYIN, 104},
+{"gang", IS_CHEWING|IS_PINYIN, 105},
+{"gao", IS_CHEWING|IS_PINYIN, 106},
+{"ge", IS_CHEWING|IS_PINYIN, 107},
+{"gegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 110},
+{"gei", IS_CHEWING|IS_PINYIN, 108},
+{"gemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 110},
+{"gen", IS_CHEWING|IS_PINYIN, 109},
+{"geng", IS_CHEWING|IS_PINYIN, 110},
+{"gogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 111},
+{"gomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 111},
+{"gon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 111},
+{"gong", IS_CHEWING|IS_PINYIN, 111},
+{"gou", IS_CHEWING|IS_PINYIN, 112},
+{"gu", IS_CHEWING|IS_PINYIN, 113},
+{"gua", IS_CHEWING|IS_PINYIN, 114},
+{"guagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 117},
+{"guai", IS_CHEWING|IS_PINYIN, 115},
+{"guamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 117},
+{"guan", IS_CHEWING|IS_PINYIN, 116},
+{"guang", IS_CHEWING|IS_PINYIN, 117},
+{"guei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 118},
+{"guen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 119},
+{"gui", IS_CHEWING|IS_PINYIN, 118},
+{"gun", IS_CHEWING|IS_PINYIN, 119},
+{"guo", IS_CHEWING|IS_PINYIN, 120},
+{"h", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 121},
+{"ha", IS_CHEWING|IS_PINYIN, 122},
+{"hagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 125},
+{"hai", IS_CHEWING|IS_PINYIN, 123},
+{"hamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 125},
+{"han", IS_CHEWING|IS_PINYIN, 124},
+{"hang", IS_CHEWING|IS_PINYIN, 125},
+{"hao", IS_CHEWING|IS_PINYIN, 126},
+{"he", IS_CHEWING|IS_PINYIN, 127},
+{"hegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 130},
+{"hei", IS_CHEWING|IS_PINYIN, 128},
+{"hemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 130},
+{"hen", IS_CHEWING|IS_PINYIN, 129},
+{"heng", IS_CHEWING|IS_PINYIN, 130},
+{"hogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 131},
+{"homg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 131},
+{"hon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 131},
+{"hong", IS_CHEWING|IS_PINYIN, 131},
+{"hou", IS_CHEWING|IS_PINYIN, 132},
+{"hu", IS_CHEWING|IS_PINYIN, 133},
+{"hua", IS_CHEWING|IS_PINYIN, 134},
+{"huagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 137},
+{"huai", IS_CHEWING|IS_PINYIN, 135},
+{"huamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 137},
+{"huan", IS_CHEWING|IS_PINYIN, 136},
+{"huang", IS_CHEWING|IS_PINYIN, 137},
+{"huei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 138},
+{"huen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 139},
+{"hui", IS_CHEWING|IS_PINYIN, 138},
+{"hun", IS_CHEWING|IS_PINYIN, 139},
+{"huo", IS_CHEWING|IS_PINYIN, 140},
+{"j", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 141},
+{"ji", IS_CHEWING|IS_PINYIN, 142},
+{"jia", IS_CHEWING|IS_PINYIN, 143},
+{"jiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 145},
+{"jiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 145},
+{"jian", IS_CHEWING|IS_PINYIN, 144},
+{"jiang", IS_CHEWING|IS_PINYIN, 145},
+{"jiao", IS_CHEWING|IS_PINYIN, 146},
+{"jie", IS_CHEWING|IS_PINYIN, 147},
+{"jign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 149},
+{"jimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 149},
+{"jin", IS_CHEWING|IS_PINYIN, 148},
+{"jing", IS_CHEWING|IS_PINYIN, 149},
+{"jiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 150},
+{"jiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 150},
+{"jion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 150},
+{"jiong", IS_CHEWING|IS_PINYIN, 150},
+{"jiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 151},
+{"jiu", IS_CHEWING|IS_PINYIN, 151},
+{"ju", IS_CHEWING|IS_PINYIN, 152},
+{"juan", IS_CHEWING|IS_PINYIN, 153},
+{"jue", IS_CHEWING|IS_PINYIN, 154},
+{"juen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 155},
+{"jun", IS_CHEWING|IS_PINYIN, 155},
+{"jv", IS_PINYIN|PINYIN_CORRECT_V_U, 152},
+{"jvan", IS_PINYIN|PINYIN_CORRECT_V_U, 153},
+{"jve", IS_PINYIN|PINYIN_CORRECT_V_U, 154},
+{"jvn", IS_PINYIN|PINYIN_CORRECT_V_U, 155},
+{"k", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 156},
+{"ka", IS_CHEWING|IS_PINYIN, 157},
+{"kagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 160},
+{"kai", IS_CHEWING|IS_PINYIN, 158},
+{"kamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 160},
+{"kan", IS_CHEWING|IS_PINYIN, 159},
+{"kang", IS_CHEWING|IS_PINYIN, 160},
+{"kao", IS_CHEWING|IS_PINYIN, 161},
+{"ke", IS_CHEWING|IS_PINYIN, 162},
+{"kegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 165},
+{"kemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 165},
+{"ken", IS_CHEWING|IS_PINYIN, 164},
+{"keng", IS_CHEWING|IS_PINYIN, 165},
+{"kogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 166},
+{"komg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 166},
+{"kon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 166},
+{"kong", IS_CHEWING|IS_PINYIN, 166},
+{"kou", IS_CHEWING|IS_PINYIN, 167},
+{"ku", IS_CHEWING|IS_PINYIN, 168},
+{"kua", IS_CHEWING|IS_PINYIN, 169},
+{"kuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 172},
+{"kuai", IS_CHEWING|IS_PINYIN, 170},
+{"kuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 172},
+{"kuan", IS_CHEWING|IS_PINYIN, 171},
+{"kuang", IS_CHEWING|IS_PINYIN, 172},
+{"kuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 173},
+{"kuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 174},
+{"kui", IS_CHEWING|IS_PINYIN, 173},
+{"kun", IS_CHEWING|IS_PINYIN, 174},
+{"kuo", IS_CHEWING|IS_PINYIN, 175},
+{"l", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 176},
+{"la", IS_CHEWING|IS_PINYIN, 177},
+{"lagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 180},
+{"lai", IS_CHEWING|IS_PINYIN, 178},
+{"lamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 180},
+{"lan", IS_CHEWING|IS_PINYIN, 179},
+{"lang", IS_CHEWING|IS_PINYIN, 180},
+{"lao", IS_CHEWING|IS_PINYIN, 181},
+{"le", IS_CHEWING|IS_PINYIN, 182},
+{"legn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 185},
+{"lei", IS_CHEWING|IS_PINYIN, 183},
+{"lemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 185},
+{"leng", IS_CHEWING|IS_PINYIN, 185},
+{"li", IS_CHEWING|IS_PINYIN, 186},
+{"lia", IS_CHEWING|IS_PINYIN, 187},
+{"liagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 189},
+{"liamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 189},
+{"lian", IS_CHEWING|IS_PINYIN, 188},
+{"liang", IS_CHEWING|IS_PINYIN, 189},
+{"liao", IS_CHEWING|IS_PINYIN, 190},
+{"lie", IS_CHEWING|IS_PINYIN, 191},
+{"lign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 193},
+{"limg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 193},
+{"lin", IS_CHEWING|IS_PINYIN, 192},
+{"ling", IS_CHEWING|IS_PINYIN, 193},
+{"liou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 194},
+{"liu", IS_CHEWING|IS_PINYIN, 194},
+{"lo", IS_CHEWING|IS_PINYIN, 195},
+{"logn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 196},
+{"lomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 196},
+{"lon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 196},
+{"long", IS_CHEWING|IS_PINYIN, 196},
+{"lou", IS_CHEWING|IS_PINYIN, 197},
+{"lu", IS_CHEWING|IS_PINYIN, 198},
+{"luan", IS_CHEWING|IS_PINYIN, 199},
+{"lue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 203},
+{"luen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 200},
+{"lun", IS_CHEWING|IS_PINYIN, 200},
+{"luo", IS_CHEWING|IS_PINYIN, 201},
+{"lv", IS_CHEWING|IS_PINYIN, 202},
+{"lve", IS_CHEWING|IS_PINYIN, 203},
+{"m", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 204},
+{"ma", IS_CHEWING|IS_PINYIN, 205},
+{"magn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 208},
+{"mai", IS_CHEWING|IS_PINYIN, 206},
+{"mamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 208},
+{"man", IS_CHEWING|IS_PINYIN, 207},
+{"mang", IS_CHEWING|IS_PINYIN, 208},
+{"mao", IS_CHEWING|IS_PINYIN, 209},
+{"me", IS_CHEWING|IS_PINYIN, 210},
+{"megn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 213},
+{"mei", IS_CHEWING|IS_PINYIN, 211},
+{"memg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 213},
+{"men", IS_CHEWING|IS_PINYIN, 212},
+{"meng", IS_CHEWING|IS_PINYIN, 213},
+{"mi", IS_CHEWING|IS_PINYIN, 214},
+{"mian", IS_CHEWING|IS_PINYIN, 215},
+{"miao", IS_CHEWING|IS_PINYIN, 216},
+{"mie", IS_CHEWING|IS_PINYIN, 217},
+{"mign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 219},
+{"mimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 219},
+{"min", IS_CHEWING|IS_PINYIN, 218},
+{"ming", IS_CHEWING|IS_PINYIN, 219},
+{"miou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 220},
+{"miu", IS_CHEWING|IS_PINYIN, 220},
+{"mo", IS_CHEWING|IS_PINYIN, 221},
+{"mou", IS_CHEWING|IS_PINYIN, 222},
+{"mu", IS_CHEWING|IS_PINYIN, 223},
+{"n", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 224},
+{"na", IS_CHEWING|IS_PINYIN, 225},
+{"nagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 228},
+{"nai", IS_CHEWING|IS_PINYIN, 226},
+{"namg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 228},
+{"nan", IS_CHEWING|IS_PINYIN, 227},
+{"nang", IS_CHEWING|IS_PINYIN, 228},
+{"nao", IS_CHEWING|IS_PINYIN, 229},
+{"ne", IS_CHEWING|IS_PINYIN, 230},
+{"negn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 233},
+{"nei", IS_CHEWING|IS_PINYIN, 231},
+{"nemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 233},
+{"nen", IS_CHEWING|IS_PINYIN, 232},
+{"neng", IS_CHEWING|IS_PINYIN, 233},
+{"ng", IS_CHEWING|IS_PINYIN, 234},
+{"ni", IS_CHEWING|IS_PINYIN, 235},
+{"niagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 238},
+{"niamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 238},
+{"nian", IS_CHEWING|IS_PINYIN, 237},
+{"niang", IS_CHEWING|IS_PINYIN, 238},
+{"niao", IS_CHEWING|IS_PINYIN, 239},
+{"nie", IS_CHEWING|IS_PINYIN, 240},
+{"nign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 242},
+{"nimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 242},
+{"nin", IS_CHEWING|IS_PINYIN, 241},
+{"ning", IS_CHEWING|IS_PINYIN, 242},
+{"niou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 243},
+{"niu", IS_CHEWING|IS_PINYIN, 243},
+{"nogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 244},
+{"nomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 244},
+{"non", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 244},
+{"nong", IS_CHEWING|IS_PINYIN, 244},
+{"nou", IS_CHEWING|IS_PINYIN, 245},
+{"nu", IS_CHEWING|IS_PINYIN, 246},
+{"nuan", IS_CHEWING|IS_PINYIN, 247},
+{"nue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 251},
+{"nuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 248},
+{"nuo", IS_CHEWING|IS_PINYIN, 249},
+{"nv", IS_CHEWING|IS_PINYIN, 250},
+{"nve", IS_CHEWING|IS_PINYIN, 251},
+{"o", IS_CHEWING|IS_PINYIN, 252},
+{"ou", IS_CHEWING|IS_PINYIN, 253},
+{"p", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 254},
+{"pa", IS_CHEWING|IS_PINYIN, 255},
+{"pagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 258},
+{"pai", IS_CHEWING|IS_PINYIN, 256},
+{"pamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 258},
+{"pan", IS_CHEWING|IS_PINYIN, 257},
+{"pang", IS_CHEWING|IS_PINYIN, 258},
+{"pao", IS_CHEWING|IS_PINYIN, 259},
+{"pegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 262},
+{"pei", IS_CHEWING|IS_PINYIN, 260},
+{"pemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 262},
+{"pen", IS_CHEWING|IS_PINYIN, 261},
+{"peng", IS_CHEWING|IS_PINYIN, 262},
+{"pi", IS_CHEWING|IS_PINYIN, 263},
+{"pian", IS_CHEWING|IS_PINYIN, 264},
+{"piao", IS_CHEWING|IS_PINYIN, 265},
+{"pie", IS_CHEWING|IS_PINYIN, 266},
+{"pign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 268},
+{"pimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 268},
+{"pin", IS_CHEWING|IS_PINYIN, 267},
+{"ping", IS_CHEWING|IS_PINYIN, 268},
+{"po", IS_CHEWING|IS_PINYIN, 269},
+{"pou", IS_CHEWING|IS_PINYIN, 270},
+{"pu", IS_CHEWING|IS_PINYIN, 271},
+{"q", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 272},
+{"qi", IS_CHEWING|IS_PINYIN, 273},
+{"qia", IS_CHEWING|IS_PINYIN, 274},
+{"qiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 276},
+{"qiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 276},
+{"qian", IS_CHEWING|IS_PINYIN, 275},
+{"qiang", IS_CHEWING|IS_PINYIN, 276},
+{"qiao", IS_CHEWING|IS_PINYIN, 277},
+{"qie", IS_CHEWING|IS_PINYIN, 278},
+{"qign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 280},
+{"qimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 280},
+{"qin", IS_CHEWING|IS_PINYIN, 279},
+{"qing", IS_CHEWING|IS_PINYIN, 280},
+{"qiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 281},
+{"qiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 281},
+{"qion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 281},
+{"qiong", IS_CHEWING|IS_PINYIN, 281},
+{"qiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 282},
+{"qiu", IS_CHEWING|IS_PINYIN, 282},
+{"qu", IS_CHEWING|IS_PINYIN, 283},
+{"quan", IS_CHEWING|IS_PINYIN, 284},
+{"que", IS_CHEWING|IS_PINYIN, 285},
+{"quen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 286},
+{"qun", IS_CHEWING|IS_PINYIN, 286},
+{"qv", IS_PINYIN|PINYIN_CORRECT_V_U, 283},
+{"qvan", IS_PINYIN|PINYIN_CORRECT_V_U, 284},
+{"qve", IS_PINYIN|PINYIN_CORRECT_V_U, 285},
+{"qvn", IS_PINYIN|PINYIN_CORRECT_V_U, 286},
+{"r", IS_PINYIN|PINYIN_INCOMPLETE, 287},
+{"ragn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 289},
+{"ramg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 289},
+{"ran", IS_CHEWING|IS_PINYIN, 288},
+{"rang", IS_CHEWING|IS_PINYIN, 289},
+{"rao", IS_CHEWING|IS_PINYIN, 290},
+{"re", IS_CHEWING|IS_PINYIN, 291},
+{"regn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 293},
+{"remg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 293},
+{"ren", IS_CHEWING|IS_PINYIN, 292},
+{"reng", IS_CHEWING|IS_PINYIN, 293},
+{"ri", IS_CHEWING|IS_PINYIN, 294},
+{"rogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 295},
+{"romg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 295},
+{"ron", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 295},
+{"rong", IS_CHEWING|IS_PINYIN, 295},
+{"rou", IS_CHEWING|IS_PINYIN, 296},
+{"ru", IS_CHEWING|IS_PINYIN, 297},
+{"ruan", IS_CHEWING|IS_PINYIN, 299},
+{"ruei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 300},
+{"ruen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 301},
+{"rui", IS_CHEWING|IS_PINYIN, 300},
+{"run", IS_CHEWING|IS_PINYIN, 301},
+{"ruo", IS_CHEWING|IS_PINYIN, 302},
+{"s", IS_PINYIN|PINYIN_INCOMPLETE, 303},
+{"sa", IS_CHEWING|IS_PINYIN, 304},
+{"sagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 307},
+{"sai", IS_CHEWING|IS_PINYIN, 305},
+{"samg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 307},
+{"san", IS_CHEWING|IS_PINYIN, 306},
+{"sang", IS_CHEWING|IS_PINYIN, 307},
+{"sao", IS_CHEWING|IS_PINYIN, 308},
+{"se", IS_CHEWING|IS_PINYIN, 309},
+{"segn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 311},
+{"semg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 311},
+{"sen", IS_CHEWING|IS_PINYIN, 310},
+{"seng", IS_CHEWING|IS_PINYIN, 311},
+{"sh", IS_PINYIN|PINYIN_INCOMPLETE, 312},
+{"sha", IS_CHEWING|IS_PINYIN, 313},
+{"shagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 316},
+{"shai", IS_CHEWING|IS_PINYIN, 314},
+{"shamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 316},
+{"shan", IS_CHEWING|IS_PINYIN, 315},
+{"shang", IS_CHEWING|IS_PINYIN, 316},
+{"shao", IS_CHEWING|IS_PINYIN, 317},
+{"she", IS_CHEWING|IS_PINYIN, 318},
+{"shegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 321},
+{"shei", IS_CHEWING|IS_PINYIN, 319},
+{"shemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 321},
+{"shen", IS_CHEWING|IS_PINYIN, 320},
+{"sheng", IS_CHEWING|IS_PINYIN, 321},
+{"shi", IS_CHEWING|IS_PINYIN, 322},
+{"shou", IS_CHEWING|IS_PINYIN, 323},
+{"shu", IS_CHEWING|IS_PINYIN, 324},
+{"shua", IS_CHEWING|IS_PINYIN, 325},
+{"shuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 328},
+{"shuai", IS_CHEWING|IS_PINYIN, 326},
+{"shuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 328},
+{"shuan", IS_CHEWING|IS_PINYIN, 327},
+{"shuang", IS_CHEWING|IS_PINYIN, 328},
+{"shuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 329},
+{"shuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 330},
+{"shui", IS_CHEWING|IS_PINYIN, 329},
+{"shun", IS_CHEWING|IS_PINYIN, 330},
+{"shuo", IS_CHEWING|IS_PINYIN, 331},
+{"si", IS_CHEWING|IS_PINYIN, 332},
+{"sogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 333},
+{"somg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 333},
+{"son", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 333},
+{"song", IS_CHEWING|IS_PINYIN, 333},
+{"sou", IS_CHEWING|IS_PINYIN, 334},
+{"su", IS_CHEWING|IS_PINYIN, 335},
+{"suan", IS_CHEWING|IS_PINYIN, 336},
+{"suei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 337},
+{"suen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 338},
+{"sui", IS_CHEWING|IS_PINYIN, 337},
+{"sun", IS_CHEWING|IS_PINYIN, 338},
+{"suo", IS_CHEWING|IS_PINYIN, 339},
+{"t", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 340},
+{"ta", IS_CHEWING|IS_PINYIN, 341},
+{"tagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 344},
+{"tai", IS_CHEWING|IS_PINYIN, 342},
+{"tamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 344},
+{"tan", IS_CHEWING|IS_PINYIN, 343},
+{"tang", IS_CHEWING|IS_PINYIN, 344},
+{"tao", IS_CHEWING|IS_PINYIN, 345},
+{"te", IS_CHEWING|IS_PINYIN, 346},
+{"tegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 347},
+{"temg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 347},
+{"teng", IS_CHEWING|IS_PINYIN, 347},
+{"ti", IS_CHEWING|IS_PINYIN, 348},
+{"tian", IS_CHEWING|IS_PINYIN, 349},
+{"tiao", IS_CHEWING|IS_PINYIN, 350},
+{"tie", IS_CHEWING|IS_PINYIN, 351},
+{"tign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 352},
+{"timg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 352},
+{"ting", IS_CHEWING|IS_PINYIN, 352},
+{"togn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 353},
+{"tomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 353},
+{"ton", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 353},
+{"tong", IS_CHEWING|IS_PINYIN, 353},
+{"tou", IS_CHEWING|IS_PINYIN, 354},
+{"tu", IS_CHEWING|IS_PINYIN, 355},
+{"tuan", IS_CHEWING|IS_PINYIN, 356},
+{"tuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 357},
+{"tuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 358},
+{"tui", IS_CHEWING|IS_PINYIN, 357},
+{"tun", IS_CHEWING|IS_PINYIN, 358},
+{"tuo", IS_CHEWING|IS_PINYIN, 359},
+{"w", IS_PINYIN|PINYIN_INCOMPLETE, 360},
+{"wa", IS_CHEWING|IS_PINYIN, 361},
+{"wagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 364},
+{"wai", IS_CHEWING|IS_PINYIN, 362},
+{"wamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 364},
+{"wan", IS_CHEWING|IS_PINYIN, 363},
+{"wang", IS_CHEWING|IS_PINYIN, 364},
+{"wegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 367},
+{"wei", IS_CHEWING|IS_PINYIN, 365},
+{"wemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 367},
+{"wen", IS_CHEWING|IS_PINYIN, 366},
+{"weng", IS_CHEWING|IS_PINYIN, 367},
+{"wo", IS_CHEWING|IS_PINYIN, 368},
+{"wu", IS_CHEWING|IS_PINYIN, 369},
+{"x", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 370},
+{"xi", IS_CHEWING|IS_PINYIN, 371},
+{"xia", IS_CHEWING|IS_PINYIN, 372},
+{"xiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 374},
+{"xiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 374},
+{"xian", IS_CHEWING|IS_PINYIN, 373},
+{"xiang", IS_CHEWING|IS_PINYIN, 374},
+{"xiao", IS_CHEWING|IS_PINYIN, 375},
+{"xie", IS_CHEWING|IS_PINYIN, 376},
+{"xign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 378},
+{"ximg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 378},
+{"xin", IS_CHEWING|IS_PINYIN, 377},
+{"xing", IS_CHEWING|IS_PINYIN, 378},
+{"xiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 379},
+{"xiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 379},
+{"xion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 379},
+{"xiong", IS_CHEWING|IS_PINYIN, 379},
+{"xiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 380},
+{"xiu", IS_CHEWING|IS_PINYIN, 380},
+{"xu", IS_CHEWING|IS_PINYIN, 381},
+{"xuan", IS_CHEWING|IS_PINYIN, 382},
+{"xue", IS_CHEWING|IS_PINYIN, 383},
+{"xuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 384},
+{"xun", IS_CHEWING|IS_PINYIN, 384},
+{"xv", IS_PINYIN|PINYIN_CORRECT_V_U, 381},
+{"xvan", IS_PINYIN|PINYIN_CORRECT_V_U, 382},
+{"xve", IS_PINYIN|PINYIN_CORRECT_V_U, 383},
+{"xvn", IS_PINYIN|PINYIN_CORRECT_V_U, 384},
+{"y", IS_PINYIN|PINYIN_INCOMPLETE, 385},
+{"ya", IS_CHEWING|IS_PINYIN, 386},
+{"yagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 389},
+{"yamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 389},
+{"yan", IS_CHEWING|IS_PINYIN, 388},
+{"yang", IS_CHEWING|IS_PINYIN, 389},
+{"yao", IS_CHEWING|IS_PINYIN, 390},
+{"ye", IS_CHEWING|IS_PINYIN, 391},
+{"yi", IS_CHEWING|IS_PINYIN, 392},
+{"yign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 394},
+{"yimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 394},
+{"yin", IS_CHEWING|IS_PINYIN, 393},
+{"ying", IS_CHEWING|IS_PINYIN, 394},
+{"yo", IS_CHEWING|IS_PINYIN, 395},
+{"yogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 396},
+{"yomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 396},
+{"yon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 396},
+{"yong", IS_CHEWING|IS_PINYIN, 396},
+{"you", IS_CHEWING|IS_PINYIN, 397},
+{"yu", IS_CHEWING|IS_PINYIN, 398},
+{"yuan", IS_CHEWING|IS_PINYIN, 399},
+{"yue", IS_CHEWING|IS_PINYIN, 400},
+{"yuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 401},
+{"yun", IS_CHEWING|IS_PINYIN, 401},
+{"yv", IS_PINYIN|PINYIN_CORRECT_V_U, 398},
+{"yvan", IS_PINYIN|PINYIN_CORRECT_V_U, 399},
+{"yve", IS_PINYIN|PINYIN_CORRECT_V_U, 400},
+{"yvn", IS_PINYIN|PINYIN_CORRECT_V_U, 401},
+{"z", IS_PINYIN|PINYIN_INCOMPLETE, 402},
+{"za", IS_CHEWING|IS_PINYIN, 403},
+{"zagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 406},
+{"zai", IS_CHEWING|IS_PINYIN, 404},
+{"zamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 406},
+{"zan", IS_CHEWING|IS_PINYIN, 405},
+{"zang", IS_CHEWING|IS_PINYIN, 406},
+{"zao", IS_CHEWING|IS_PINYIN, 407},
+{"ze", IS_CHEWING|IS_PINYIN, 408},
+{"zegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 411},
+{"zei", IS_CHEWING|IS_PINYIN, 409},
+{"zemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 411},
+{"zen", IS_CHEWING|IS_PINYIN, 410},
+{"zeng", IS_CHEWING|IS_PINYIN, 411},
+{"zh", IS_PINYIN|PINYIN_INCOMPLETE, 412},
+{"zha", IS_CHEWING|IS_PINYIN, 413},
+{"zhagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 416},
+{"zhai", IS_CHEWING|IS_PINYIN, 414},
+{"zhamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 416},
+{"zhan", IS_CHEWING|IS_PINYIN, 415},
+{"zhang", IS_CHEWING|IS_PINYIN, 416},
+{"zhao", IS_CHEWING|IS_PINYIN, 417},
+{"zhe", IS_CHEWING|IS_PINYIN, 418},
+{"zhegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 421},
+{"zhemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 421},
+{"zhen", IS_CHEWING|IS_PINYIN, 420},
+{"zheng", IS_CHEWING|IS_PINYIN, 421},
+{"zhi", IS_CHEWING|IS_PINYIN, 422},
+{"zhogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 423},
+{"zhomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 423},
+{"zhon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 423},
+{"zhong", IS_CHEWING|IS_PINYIN, 423},
+{"zhou", IS_CHEWING|IS_PINYIN, 424},
+{"zhu", IS_CHEWING|IS_PINYIN, 425},
+{"zhua", IS_CHEWING|IS_PINYIN, 426},
+{"zhuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 429},
+{"zhuai", IS_CHEWING|IS_PINYIN, 427},
+{"zhuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 429},
+{"zhuan", IS_CHEWING|IS_PINYIN, 428},
+{"zhuang", IS_CHEWING|IS_PINYIN, 429},
+{"zhuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 430},
+{"zhuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 431},
+{"zhui", IS_CHEWING|IS_PINYIN, 430},
+{"zhun", IS_CHEWING|IS_PINYIN, 431},
+{"zhuo", IS_CHEWING|IS_PINYIN, 432},
+{"zi", IS_CHEWING|IS_PINYIN, 433},
+{"zogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 434},
+{"zomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 434},
+{"zon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 434},
+{"zong", IS_CHEWING|IS_PINYIN, 434},
+{"zou", IS_CHEWING|IS_PINYIN, 435},
+{"zu", IS_CHEWING|IS_PINYIN, 436},
+{"zuan", IS_CHEWING|IS_PINYIN, 437},
+{"zuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 438},
+{"zuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 439},
+{"zui", IS_CHEWING|IS_PINYIN, 438},
+{"zun", IS_CHEWING|IS_PINYIN, 439},
+{"zuo", IS_CHEWING|IS_PINYIN, 440}
+};
+
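+/* Bopomofo (chewing) index: maps each zhuyin syllable string to its parse flags
+   and to the matching row number in the content table below (e.g. "ㄅㄚ" -> row 7, "ba"). */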
+const chewing_index_item_t chewing_index[] = {
+{"ㄅ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 6},
+{"ㄅㄚ", IS_CHEWING|IS_PINYIN, 7},
+{"ㄅㄛ", IS_CHEWING|IS_PINYIN, 21},
+{"ㄅㄞ", IS_CHEWING|IS_PINYIN, 8},
+{"ㄅㄟ", IS_CHEWING|IS_PINYIN, 12},
+{"ㄅㄠ", IS_CHEWING|IS_PINYIN, 11},
+{"ㄅㄢ", IS_CHEWING|IS_PINYIN, 9},
+{"ㄅㄣ", IS_CHEWING|IS_PINYIN, 13},
+{"ㄅㄤ", IS_CHEWING|IS_PINYIN, 10},
+{"ㄅㄥ", IS_CHEWING|IS_PINYIN, 14},
+{"ㄅㄧ", IS_CHEWING|IS_PINYIN, 15},
+{"ㄅㄧㄝ", IS_CHEWING|IS_PINYIN, 18},
+{"ㄅㄧㄠ", IS_CHEWING|IS_PINYIN, 17},
+{"ㄅㄧㄢ", IS_CHEWING|IS_PINYIN, 16},
+{"ㄅㄧㄣ", IS_CHEWING|IS_PINYIN, 19},
+{"ㄅㄧㄥ", IS_CHEWING|IS_PINYIN, 20},
+{"ㄅㄨ", IS_CHEWING|IS_PINYIN, 22},
+{"ㄆ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 254},
+{"ㄆㄚ", IS_CHEWING|IS_PINYIN, 255},
+{"ㄆㄛ", IS_CHEWING|IS_PINYIN, 269},
+{"ㄆㄞ", IS_CHEWING|IS_PINYIN, 256},
+{"ㄆㄟ", IS_CHEWING|IS_PINYIN, 260},
+{"ㄆㄠ", IS_CHEWING|IS_PINYIN, 259},
+{"ㄆㄡ", IS_CHEWING|IS_PINYIN, 270},
+{"ㄆㄢ", IS_CHEWING|IS_PINYIN, 257},
+{"ㄆㄣ", IS_CHEWING|IS_PINYIN, 261},
+{"ㄆㄤ", IS_CHEWING|IS_PINYIN, 258},
+{"ㄆㄥ", IS_CHEWING|IS_PINYIN, 262},
+{"ㄆㄧ", IS_CHEWING|IS_PINYIN, 263},
+{"ㄆㄧㄝ", IS_CHEWING|IS_PINYIN, 266},
+{"ㄆㄧㄠ", IS_CHEWING|IS_PINYIN, 265},
+{"ㄆㄧㄢ", IS_CHEWING|IS_PINYIN, 264},
+{"ㄆㄧㄣ", IS_CHEWING|IS_PINYIN, 267},
+{"ㄆㄧㄥ", IS_CHEWING|IS_PINYIN, 268},
+{"ㄆㄨ", IS_CHEWING|IS_PINYIN, 271},
+{"ㄇ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 204},
+{"ㄇㄚ", IS_CHEWING|IS_PINYIN, 205},
+{"ㄇㄛ", IS_CHEWING|IS_PINYIN, 221},
+{"ㄇㄜ", IS_CHEWING|IS_PINYIN, 210},
+{"ㄇㄞ", IS_CHEWING|IS_PINYIN, 206},
+{"ㄇㄟ", IS_CHEWING|IS_PINYIN, 211},
+{"ㄇㄠ", IS_CHEWING|IS_PINYIN, 209},
+{"ㄇㄡ", IS_CHEWING|IS_PINYIN, 222},
+{"ㄇㄢ", IS_CHEWING|IS_PINYIN, 207},
+{"ㄇㄣ", IS_CHEWING|IS_PINYIN, 212},
+{"ㄇㄤ", IS_CHEWING|IS_PINYIN, 208},
+{"ㄇㄥ", IS_CHEWING|IS_PINYIN, 213},
+{"ㄇㄧ", IS_CHEWING|IS_PINYIN, 214},
+{"ㄇㄧㄝ", IS_CHEWING|IS_PINYIN, 217},
+{"ㄇㄧㄠ", IS_CHEWING|IS_PINYIN, 216},
+{"ㄇㄧㄡ", IS_CHEWING|IS_PINYIN, 220},
+{"ㄇㄧㄢ", IS_CHEWING|IS_PINYIN, 215},
+{"ㄇㄧㄣ", IS_CHEWING|IS_PINYIN, 218},
+{"ㄇㄧㄥ", IS_CHEWING|IS_PINYIN, 219},
+{"ㄇㄨ", IS_CHEWING|IS_PINYIN, 223},
+{"ㄈ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 90},
+{"ㄈㄚ", IS_CHEWING|IS_PINYIN, 91},
+{"ㄈㄛ", IS_CHEWING|IS_PINYIN, 98},
+{"ㄈㄜ", IS_CHEWING, 94},
+{"ㄈㄟ", IS_CHEWING|IS_PINYIN, 95},
+{"ㄈㄡ", IS_CHEWING|IS_PINYIN, 99},
+{"ㄈㄢ", IS_CHEWING|IS_PINYIN, 92},
+{"ㄈㄣ", IS_CHEWING|IS_PINYIN, 96},
+{"ㄈㄤ", IS_CHEWING|IS_PINYIN, 93},
+{"ㄈㄥ", IS_CHEWING|IS_PINYIN, 97},
+{"ㄈㄨ", IS_CHEWING|IS_PINYIN, 100},
+{"ㄉ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 60},
+{"ㄉㄚ", IS_CHEWING|IS_PINYIN, 61},
+{"ㄉㄜ", IS_CHEWING|IS_PINYIN, 66},
+{"ㄉㄞ", IS_CHEWING|IS_PINYIN, 62},
+{"ㄉㄟ", IS_CHEWING|IS_PINYIN, 67},
+{"ㄉㄠ", IS_CHEWING|IS_PINYIN, 65},
+{"ㄉㄡ", IS_CHEWING|IS_PINYIN, 79},
+{"ㄉㄢ", IS_CHEWING|IS_PINYIN, 63},
+{"ㄉㄣ", IS_CHEWING, 68},
+{"ㄉㄤ", IS_CHEWING|IS_PINYIN, 64},
+{"ㄉㄥ", IS_CHEWING|IS_PINYIN, 69},
+{"ㄉㄧ", IS_CHEWING|IS_PINYIN, 70},
+{"ㄉㄧㄚ", IS_CHEWING|IS_PINYIN, 71},
+{"ㄉㄧㄝ", IS_CHEWING|IS_PINYIN, 74},
+{"ㄉㄧㄠ", IS_CHEWING|IS_PINYIN, 73},
+{"ㄉㄧㄡ", IS_CHEWING|IS_PINYIN, 77},
+{"ㄉㄧㄢ", IS_CHEWING|IS_PINYIN, 72},
+{"ㄉㄧㄣ", IS_CHEWING, 75},
+{"ㄉㄧㄥ", IS_CHEWING|IS_PINYIN, 76},
+{"ㄉㄨ", IS_CHEWING|IS_PINYIN, 80},
+{"ㄉㄨㄛ", IS_CHEWING|IS_PINYIN, 84},
+{"ㄉㄨㄟ", IS_CHEWING|IS_PINYIN, 82},
+{"ㄉㄨㄢ", IS_CHEWING|IS_PINYIN, 81},
+{"ㄉㄨㄣ", IS_CHEWING|IS_PINYIN, 83},
+{"ㄉㄨㄥ", IS_CHEWING|IS_PINYIN, 78},
+{"ㄊ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 340},
+{"ㄊㄚ", IS_CHEWING|IS_PINYIN, 341},
+{"ㄊㄜ", IS_CHEWING|IS_PINYIN, 346},
+{"ㄊㄞ", IS_CHEWING|IS_PINYIN, 342},
+{"ㄊㄠ", IS_CHEWING|IS_PINYIN, 345},
+{"ㄊㄡ", IS_CHEWING|IS_PINYIN, 354},
+{"ㄊㄢ", IS_CHEWING|IS_PINYIN, 343},
+{"ㄊㄤ", IS_CHEWING|IS_PINYIN, 344},
+{"ㄊㄥ", IS_CHEWING|IS_PINYIN, 347},
+{"ㄊㄧ", IS_CHEWING|IS_PINYIN, 348},
+{"ㄊㄧㄝ", IS_CHEWING|IS_PINYIN, 351},
+{"ㄊㄧㄠ", IS_CHEWING|IS_PINYIN, 350},
+{"ㄊㄧㄢ", IS_CHEWING|IS_PINYIN, 349},
+{"ㄊㄧㄥ", IS_CHEWING|IS_PINYIN, 352},
+{"ㄊㄨ", IS_CHEWING|IS_PINYIN, 355},
+{"ㄊㄨㄛ", IS_CHEWING|IS_PINYIN, 359},
+{"ㄊㄨㄟ", IS_CHEWING|IS_PINYIN, 357},
+{"ㄊㄨㄢ", IS_CHEWING|IS_PINYIN, 356},
+{"ㄊㄨㄣ", IS_CHEWING|IS_PINYIN, 358},
+{"ㄊㄨㄥ", IS_CHEWING|IS_PINYIN, 353},
+{"ㄋ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 224},
+{"ㄋㄚ", IS_CHEWING|IS_PINYIN, 225},
+{"ㄋㄜ", IS_CHEWING|IS_PINYIN, 230},
+{"ㄋㄞ", IS_CHEWING|IS_PINYIN, 226},
+{"ㄋㄟ", IS_CHEWING|IS_PINYIN, 231},
+{"ㄋㄠ", IS_CHEWING|IS_PINYIN, 229},
+{"ㄋㄡ", IS_CHEWING|IS_PINYIN, 245},
+{"ㄋㄢ", IS_CHEWING|IS_PINYIN, 227},
+{"ㄋㄣ", IS_CHEWING|IS_PINYIN, 232},
+{"ㄋㄤ", IS_CHEWING|IS_PINYIN, 228},
+{"ㄋㄥ", IS_CHEWING|IS_PINYIN, 233},
+{"ㄋㄧ", IS_CHEWING|IS_PINYIN, 235},
+{"ㄋㄧㄚ", IS_CHEWING, 236},
+{"ㄋㄧㄝ", IS_CHEWING|IS_PINYIN, 240},
+{"ㄋㄧㄠ", IS_CHEWING|IS_PINYIN, 239},
+{"ㄋㄧㄡ", IS_CHEWING|IS_PINYIN, 243},
+{"ㄋㄧㄢ", IS_CHEWING|IS_PINYIN, 237},
+{"ㄋㄧㄣ", IS_CHEWING|IS_PINYIN, 241},
+{"ㄋㄧㄤ", IS_CHEWING|IS_PINYIN, 238},
+{"ㄋㄧㄥ", IS_CHEWING|IS_PINYIN, 242},
+{"ㄋㄨ", IS_CHEWING|IS_PINYIN, 246},
+{"ㄋㄨㄛ", IS_CHEWING|IS_PINYIN, 249},
+{"ㄋㄨㄢ", IS_CHEWING|IS_PINYIN, 247},
+{"ㄋㄨㄣ", IS_CHEWING, 248},
+{"ㄋㄨㄥ", IS_CHEWING|IS_PINYIN, 244},
+{"ㄋㄩ", IS_CHEWING|IS_PINYIN, 250},
+{"ㄋㄩㄝ", IS_CHEWING|IS_PINYIN, 251},
+{"ㄌ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 176},
+{"ㄌㄚ", IS_CHEWING|IS_PINYIN, 177},
+{"ㄌㄛ", IS_CHEWING|IS_PINYIN, 195},
+{"ㄌㄜ", IS_CHEWING|IS_PINYIN, 182},
+{"ㄌㄞ", IS_CHEWING|IS_PINYIN, 178},
+{"ㄌㄟ", IS_CHEWING|IS_PINYIN, 183},
+{"ㄌㄠ", IS_CHEWING|IS_PINYIN, 181},
+{"ㄌㄡ", IS_CHEWING|IS_PINYIN, 197},
+{"ㄌㄢ", IS_CHEWING|IS_PINYIN, 179},
+{"ㄌㄣ", IS_CHEWING, 184},
+{"ㄌㄤ", IS_CHEWING|IS_PINYIN, 180},
+{"ㄌㄥ", IS_CHEWING|IS_PINYIN, 185},
+{"ㄌㄧ", IS_CHEWING|IS_PINYIN, 186},
+{"ㄌㄧㄚ", IS_CHEWING|IS_PINYIN, 187},
+{"ㄌㄧㄝ", IS_CHEWING|IS_PINYIN, 191},
+{"ㄌㄧㄠ", IS_CHEWING|IS_PINYIN, 190},
+{"ㄌㄧㄡ", IS_CHEWING|IS_PINYIN, 194},
+{"ㄌㄧㄢ", IS_CHEWING|IS_PINYIN, 188},
+{"ㄌㄧㄣ", IS_CHEWING|IS_PINYIN, 192},
+{"ㄌㄧㄤ", IS_CHEWING|IS_PINYIN, 189},
+{"ㄌㄧㄥ", IS_CHEWING|IS_PINYIN, 193},
+{"ㄌㄨ", IS_CHEWING|IS_PINYIN, 198},
+{"ㄌㄨㄛ", IS_CHEWING|IS_PINYIN, 201},
+{"ㄌㄨㄢ", IS_CHEWING|IS_PINYIN, 199},
+{"ㄌㄨㄣ", IS_CHEWING|IS_PINYIN, 200},
+{"ㄌㄨㄥ", IS_CHEWING|IS_PINYIN, 196},
+{"ㄌㄩ", IS_CHEWING|IS_PINYIN, 202},
+{"ㄌㄩㄝ", IS_CHEWING|IS_PINYIN, 203},
+{"ㄍ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 101},
+{"ㄍㄚ", IS_CHEWING|IS_PINYIN, 102},
+{"ㄍㄜ", IS_CHEWING|IS_PINYIN, 107},
+{"ㄍㄞ", IS_CHEWING|IS_PINYIN, 103},
+{"ㄍㄟ", IS_CHEWING|IS_PINYIN, 108},
+{"ㄍㄠ", IS_CHEWING|IS_PINYIN, 106},
+{"ㄍㄡ", IS_CHEWING|IS_PINYIN, 112},
+{"ㄍㄢ", IS_CHEWING|IS_PINYIN, 104},
+{"ㄍㄣ", IS_CHEWING|IS_PINYIN, 109},
+{"ㄍㄤ", IS_CHEWING|IS_PINYIN, 105},
+{"ㄍㄥ", IS_CHEWING|IS_PINYIN, 110},
+{"ㄍㄨ", IS_CHEWING|IS_PINYIN, 113},
+{"ㄍㄨㄚ", IS_CHEWING|IS_PINYIN, 114},
+{"ㄍㄨㄛ", IS_CHEWING|IS_PINYIN, 120},
+{"ㄍㄨㄞ", IS_CHEWING|IS_PINYIN, 115},
+{"ㄍㄨㄟ", IS_CHEWING|IS_PINYIN, 118},
+{"ㄍㄨㄢ", IS_CHEWING|IS_PINYIN, 116},
+{"ㄍㄨㄣ", IS_CHEWING|IS_PINYIN, 119},
+{"ㄍㄨㄤ", IS_CHEWING|IS_PINYIN, 117},
+{"ㄍㄨㄥ", IS_CHEWING|IS_PINYIN, 111},
+{"ㄎ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 156},
+{"ㄎㄚ", IS_CHEWING|IS_PINYIN, 157},
+{"ㄎㄜ", IS_CHEWING|IS_PINYIN, 162},
+{"ㄎㄞ", IS_CHEWING|IS_PINYIN, 158},
+{"ㄎㄟ", IS_CHEWING, 163},
+{"ㄎㄠ", IS_CHEWING|IS_PINYIN, 161},
+{"ㄎㄡ", IS_CHEWING|IS_PINYIN, 167},
+{"ㄎㄢ", IS_CHEWING|IS_PINYIN, 159},
+{"ㄎㄣ", IS_CHEWING|IS_PINYIN, 164},
+{"ㄎㄤ", IS_CHEWING|IS_PINYIN, 160},
+{"ㄎㄥ", IS_CHEWING|IS_PINYIN, 165},
+{"ㄎㄨ", IS_CHEWING|IS_PINYIN, 168},
+{"ㄎㄨㄚ", IS_CHEWING|IS_PINYIN, 169},
+{"ㄎㄨㄛ", IS_CHEWING|IS_PINYIN, 175},
+{"ㄎㄨㄞ", IS_CHEWING|IS_PINYIN, 170},
+{"ㄎㄨㄟ", IS_CHEWING|IS_PINYIN, 173},
+{"ㄎㄨㄢ", IS_CHEWING|IS_PINYIN, 171},
+{"ㄎㄨㄣ", IS_CHEWING|IS_PINYIN, 174},
+{"ㄎㄨㄤ", IS_CHEWING|IS_PINYIN, 172},
+{"ㄎㄨㄥ", IS_CHEWING|IS_PINYIN, 166},
+{"ㄏ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 121},
+{"ㄏㄚ", IS_CHEWING|IS_PINYIN, 122},
+{"ㄏㄜ", IS_CHEWING|IS_PINYIN, 127},
+{"ㄏㄞ", IS_CHEWING|IS_PINYIN, 123},
+{"ㄏㄟ", IS_CHEWING|IS_PINYIN, 128},
+{"ㄏㄠ", IS_CHEWING|IS_PINYIN, 126},
+{"ㄏㄡ", IS_CHEWING|IS_PINYIN, 132},
+{"ㄏㄢ", IS_CHEWING|IS_PINYIN, 124},
+{"ㄏㄣ", IS_CHEWING|IS_PINYIN, 129},
+{"ㄏㄤ", IS_CHEWING|IS_PINYIN, 125},
+{"ㄏㄥ", IS_CHEWING|IS_PINYIN, 130},
+{"ㄏㄨ", IS_CHEWING|IS_PINYIN, 133},
+{"ㄏㄨㄚ", IS_CHEWING|IS_PINYIN, 134},
+{"ㄏㄨㄛ", IS_CHEWING|IS_PINYIN, 140},
+{"ㄏㄨㄞ", IS_CHEWING|IS_PINYIN, 135},
+{"ㄏㄨㄟ", IS_CHEWING|IS_PINYIN, 138},
+{"ㄏㄨㄢ", IS_CHEWING|IS_PINYIN, 136},
+{"ㄏㄨㄣ", IS_CHEWING|IS_PINYIN, 139},
+{"ㄏㄨㄤ", IS_CHEWING|IS_PINYIN, 137},
+{"ㄏㄨㄥ", IS_CHEWING|IS_PINYIN, 131},
+{"ㄐ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 141},
+{"ㄐㄧ", IS_CHEWING|IS_PINYIN, 142},
+{"ㄐㄧㄚ", IS_CHEWING|IS_PINYIN, 143},
+{"ㄐㄧㄝ", IS_CHEWING|IS_PINYIN, 147},
+{"ㄐㄧㄠ", IS_CHEWING|IS_PINYIN, 146},
+{"ㄐㄧㄡ", IS_CHEWING|IS_PINYIN, 151},
+{"ㄐㄧㄢ", IS_CHEWING|IS_PINYIN, 144},
+{"ㄐㄧㄣ", IS_CHEWING|IS_PINYIN, 148},
+{"ㄐㄧㄤ", IS_CHEWING|IS_PINYIN, 145},
+{"ㄐㄧㄥ", IS_CHEWING|IS_PINYIN, 149},
+{"ㄐㄩ", IS_CHEWING|IS_PINYIN, 152},
+{"ㄐㄩㄝ", IS_CHEWING|IS_PINYIN, 154},
+{"ㄐㄩㄢ", IS_CHEWING|IS_PINYIN, 153},
+{"ㄐㄩㄣ", IS_CHEWING|IS_PINYIN, 155},
+{"ㄐㄩㄥ", IS_CHEWING|IS_PINYIN, 150},
+{"ㄑ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 272},
+{"ㄑㄧ", IS_CHEWING|IS_PINYIN, 273},
+{"ㄑㄧㄚ", IS_CHEWING|IS_PINYIN, 274},
+{"ㄑㄧㄝ", IS_CHEWING|IS_PINYIN, 278},
+{"ㄑㄧㄠ", IS_CHEWING|IS_PINYIN, 277},
+{"ㄑㄧㄡ", IS_CHEWING|IS_PINYIN, 282},
+{"ㄑㄧㄢ", IS_CHEWING|IS_PINYIN, 275},
+{"ㄑㄧㄣ", IS_CHEWING|IS_PINYIN, 279},
+{"ㄑㄧㄤ", IS_CHEWING|IS_PINYIN, 276},
+{"ㄑㄧㄥ", IS_CHEWING|IS_PINYIN, 280},
+{"ㄑㄩ", IS_CHEWING|IS_PINYIN, 283},
+{"ㄑㄩㄝ", IS_CHEWING|IS_PINYIN, 285},
+{"ㄑㄩㄢ", IS_CHEWING|IS_PINYIN, 284},
+{"ㄑㄩㄣ", IS_CHEWING|IS_PINYIN, 286},
+{"ㄑㄩㄥ", IS_CHEWING|IS_PINYIN, 281},
+{"ㄒ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 370},
+{"ㄒㄧ", IS_CHEWING|IS_PINYIN, 371},
+{"ㄒㄧㄚ", IS_CHEWING|IS_PINYIN, 372},
+{"ㄒㄧㄝ", IS_CHEWING|IS_PINYIN, 376},
+{"ㄒㄧㄠ", IS_CHEWING|IS_PINYIN, 375},
+{"ㄒㄧㄡ", IS_CHEWING|IS_PINYIN, 380},
+{"ㄒㄧㄢ", IS_CHEWING|IS_PINYIN, 373},
+{"ㄒㄧㄣ", IS_CHEWING|IS_PINYIN, 377},
+{"ㄒㄧㄤ", IS_CHEWING|IS_PINYIN, 374},
+{"ㄒㄧㄥ", IS_CHEWING|IS_PINYIN, 378},
+{"ㄒㄩ", IS_CHEWING|IS_PINYIN, 381},
+{"ㄒㄩㄝ", IS_CHEWING|IS_PINYIN, 383},
+{"ㄒㄩㄢ", IS_CHEWING|IS_PINYIN, 382},
+{"ㄒㄩㄣ", IS_CHEWING|IS_PINYIN, 384},
+{"ㄒㄩㄥ", IS_CHEWING|IS_PINYIN, 379},
+{"ㄓ", IS_CHEWING|IS_PINYIN, 422},
+{"ㄓㄚ", IS_CHEWING|IS_PINYIN, 413},
+{"ㄓㄜ", IS_CHEWING|IS_PINYIN, 418},
+{"ㄓㄞ", IS_CHEWING|IS_PINYIN, 414},
+{"ㄓㄟ", IS_CHEWING, 419},
+{"ㄓㄠ", IS_CHEWING|IS_PINYIN, 417},
+{"ㄓㄡ", IS_CHEWING|IS_PINYIN, 424},
+{"ㄓㄢ", IS_CHEWING|IS_PINYIN, 415},
+{"ㄓㄣ", IS_CHEWING|IS_PINYIN, 420},
+{"ㄓㄤ", IS_CHEWING|IS_PINYIN, 416},
+{"ㄓㄥ", IS_CHEWING|IS_PINYIN, 421},
+{"ㄓㄨ", IS_CHEWING|IS_PINYIN, 425},
+{"ㄓㄨㄚ", IS_CHEWING|IS_PINYIN, 426},
+{"ㄓㄨㄛ", IS_CHEWING|IS_PINYIN, 432},
+{"ㄓㄨㄞ", IS_CHEWING|IS_PINYIN, 427},
+{"ㄓㄨㄟ", IS_CHEWING|IS_PINYIN, 430},
+{"ㄓㄨㄢ", IS_CHEWING|IS_PINYIN, 428},
+{"ㄓㄨㄣ", IS_CHEWING|IS_PINYIN, 431},
+{"ㄓㄨㄤ", IS_CHEWING|IS_PINYIN, 429},
+{"ㄓㄨㄥ", IS_CHEWING|IS_PINYIN, 423},
+{"ㄔ", IS_CHEWING|IS_PINYIN, 41},
+{"ㄔㄚ", IS_CHEWING|IS_PINYIN, 33},
+{"ㄔㄜ", IS_CHEWING|IS_PINYIN, 38},
+{"ㄔㄞ", IS_CHEWING|IS_PINYIN, 34},
+{"ㄔㄠ", IS_CHEWING|IS_PINYIN, 37},
+{"ㄔㄡ", IS_CHEWING|IS_PINYIN, 43},
+{"ㄔㄢ", IS_CHEWING|IS_PINYIN, 35},
+{"ㄔㄣ", IS_CHEWING|IS_PINYIN, 39},
+{"ㄔㄤ", IS_CHEWING|IS_PINYIN, 36},
+{"ㄔㄥ", IS_CHEWING|IS_PINYIN, 40},
+{"ㄔㄨ", IS_CHEWING|IS_PINYIN, 44},
+{"ㄔㄨㄚ", IS_CHEWING, 45},
+{"ㄔㄨㄛ", IS_CHEWING|IS_PINYIN, 51},
+{"ㄔㄨㄞ", IS_CHEWING|IS_PINYIN, 46},
+{"ㄔㄨㄟ", IS_CHEWING|IS_PINYIN, 49},
+{"ㄔㄨㄢ", IS_CHEWING|IS_PINYIN, 47},
+{"ㄔㄨㄣ", IS_CHEWING|IS_PINYIN, 50},
+{"ㄔㄨㄤ", IS_CHEWING|IS_PINYIN, 48},
+{"ㄔㄨㄥ", IS_CHEWING|IS_PINYIN, 42},
+{"ㄕ", IS_CHEWING|IS_PINYIN, 322},
+{"ㄕㄚ", IS_CHEWING|IS_PINYIN, 313},
+{"ㄕㄜ", IS_CHEWING|IS_PINYIN, 318},
+{"ㄕㄞ", IS_CHEWING|IS_PINYIN, 314},
+{"ㄕㄟ", IS_CHEWING|IS_PINYIN, 319},
+{"ㄕㄠ", IS_CHEWING|IS_PINYIN, 317},
+{"ㄕㄡ", IS_CHEWING|IS_PINYIN, 323},
+{"ㄕㄢ", IS_CHEWING|IS_PINYIN, 315},
+{"ㄕㄣ", IS_CHEWING|IS_PINYIN, 320},
+{"ㄕㄤ", IS_CHEWING|IS_PINYIN, 316},
+{"ㄕㄥ", IS_CHEWING|IS_PINYIN, 321},
+{"ㄕㄨ", IS_CHEWING|IS_PINYIN, 324},
+{"ㄕㄨㄚ", IS_CHEWING|IS_PINYIN, 325},
+{"ㄕㄨㄛ", IS_CHEWING|IS_PINYIN, 331},
+{"ㄕㄨㄞ", IS_CHEWING|IS_PINYIN, 326},
+{"ㄕㄨㄟ", IS_CHEWING|IS_PINYIN, 329},
+{"ㄕㄨㄢ", IS_CHEWING|IS_PINYIN, 327},
+{"ㄕㄨㄣ", IS_CHEWING|IS_PINYIN, 330},
+{"ㄕㄨㄤ", IS_CHEWING|IS_PINYIN, 328},
+{"ㄖ", IS_CHEWING|IS_PINYIN, 294},
+{"ㄖㄜ", IS_CHEWING|IS_PINYIN, 291},
+{"ㄖㄠ", IS_CHEWING|IS_PINYIN, 290},
+{"ㄖㄡ", IS_CHEWING|IS_PINYIN, 296},
+{"ㄖㄢ", IS_CHEWING|IS_PINYIN, 288},
+{"ㄖㄣ", IS_CHEWING|IS_PINYIN, 292},
+{"ㄖㄤ", IS_CHEWING|IS_PINYIN, 289},
+{"ㄖㄥ", IS_CHEWING|IS_PINYIN, 293},
+{"ㄖㄨ", IS_CHEWING|IS_PINYIN, 297},
+{"ㄖㄨㄚ", IS_CHEWING, 298},
+{"ㄖㄨㄛ", IS_CHEWING|IS_PINYIN, 302},
+{"ㄖㄨㄟ", IS_CHEWING|IS_PINYIN, 300},
+{"ㄖㄨㄢ", IS_CHEWING|IS_PINYIN, 299},
+{"ㄖㄨㄣ", IS_CHEWING|IS_PINYIN, 301},
+{"ㄖㄨㄥ", IS_CHEWING|IS_PINYIN, 295},
+{"ㄗ", IS_CHEWING|IS_PINYIN, 433},
+{"ㄗㄚ", IS_CHEWING|IS_PINYIN, 403},
+{"ㄗㄜ", IS_CHEWING|IS_PINYIN, 408},
+{"ㄗㄞ", IS_CHEWING|IS_PINYIN, 404},
+{"ㄗㄟ", IS_CHEWING|IS_PINYIN, 409},
+{"ㄗㄠ", IS_CHEWING|IS_PINYIN, 407},
+{"ㄗㄡ", IS_CHEWING|IS_PINYIN, 435},
+{"ㄗㄢ", IS_CHEWING|IS_PINYIN, 405},
+{"ㄗㄣ", IS_CHEWING|IS_PINYIN, 410},
+{"ㄗㄤ", IS_CHEWING|IS_PINYIN, 406},
+{"ㄗㄥ", IS_CHEWING|IS_PINYIN, 411},
+{"ㄗㄨ", IS_CHEWING|IS_PINYIN, 436},
+{"ㄗㄨㄛ", IS_CHEWING|IS_PINYIN, 440},
+{"ㄗㄨㄟ", IS_CHEWING|IS_PINYIN, 438},
+{"ㄗㄨㄢ", IS_CHEWING|IS_PINYIN, 437},
+{"ㄗㄨㄣ", IS_CHEWING|IS_PINYIN, 439},
+{"ㄗㄨㄥ", IS_CHEWING|IS_PINYIN, 434},
+{"ㄘ", IS_CHEWING|IS_PINYIN, 52},
+{"ㄘㄚ", IS_CHEWING|IS_PINYIN, 24},
+{"ㄘㄜ", IS_CHEWING|IS_PINYIN, 29},
+{"ㄘㄞ", IS_CHEWING|IS_PINYIN, 25},
+{"ㄘㄠ", IS_CHEWING|IS_PINYIN, 28},
+{"ㄘㄡ", IS_CHEWING|IS_PINYIN, 54},
+{"ㄘㄢ", IS_CHEWING|IS_PINYIN, 26},
+{"ㄘㄣ", IS_CHEWING|IS_PINYIN, 30},
+{"ㄘㄤ", IS_CHEWING|IS_PINYIN, 27},
+{"ㄘㄥ", IS_CHEWING|IS_PINYIN, 31},
+{"ㄘㄨ", IS_CHEWING|IS_PINYIN, 55},
+{"ㄘㄨㄛ", IS_CHEWING|IS_PINYIN, 59},
+{"ㄘㄨㄟ", IS_CHEWING|IS_PINYIN, 57},
+{"ㄘㄨㄢ", IS_CHEWING|IS_PINYIN, 56},
+{"ㄘㄨㄣ", IS_CHEWING|IS_PINYIN, 58},
+{"ㄘㄨㄥ", IS_CHEWING|IS_PINYIN, 53},
+{"ㄙ", IS_CHEWING|IS_PINYIN, 332},
+{"ㄙㄚ", IS_CHEWING|IS_PINYIN, 304},
+{"ㄙㄜ", IS_CHEWING|IS_PINYIN, 309},
+{"ㄙㄞ", IS_CHEWING|IS_PINYIN, 305},
+{"ㄙㄠ", IS_CHEWING|IS_PINYIN, 308},
+{"ㄙㄡ", IS_CHEWING|IS_PINYIN, 334},
+{"ㄙㄢ", IS_CHEWING|IS_PINYIN, 306},
+{"ㄙㄣ", IS_CHEWING|IS_PINYIN, 310},
+{"ㄙㄤ", IS_CHEWING|IS_PINYIN, 307},
+{"ㄙㄥ", IS_CHEWING|IS_PINYIN, 311},
+{"ㄙㄨ", IS_CHEWING|IS_PINYIN, 335},
+{"ㄙㄨㄛ", IS_CHEWING|IS_PINYIN, 339},
+{"ㄙㄨㄟ", IS_CHEWING|IS_PINYIN, 337},
+{"ㄙㄨㄢ", IS_CHEWING|IS_PINYIN, 336},
+{"ㄙㄨㄣ", IS_CHEWING|IS_PINYIN, 338},
+{"ㄙㄨㄥ", IS_CHEWING|IS_PINYIN, 333},
+{"ㄚ", IS_CHEWING|IS_PINYIN, 1},
+{"ㄛ", IS_CHEWING|IS_PINYIN, 252},
+{"ㄜ", IS_CHEWING|IS_PINYIN, 85},
+{"ㄞ", IS_CHEWING|IS_PINYIN, 2},
+{"ㄟ", IS_CHEWING|IS_PINYIN, 86},
+{"ㄠ", IS_CHEWING|IS_PINYIN, 5},
+{"ㄡ", IS_CHEWING|IS_PINYIN, 253},
+{"ㄢ", IS_CHEWING|IS_PINYIN, 3},
+{"ㄣ", IS_CHEWING|IS_PINYIN, 87},
+{"ㄤ", IS_CHEWING|IS_PINYIN, 4},
+{"ㄥ", IS_CHEWING, 88},
+{"ㄦ", IS_CHEWING|IS_PINYIN, 89},
+{"ㄧ", IS_CHEWING|IS_PINYIN, 392},
+{"ㄧㄚ", IS_CHEWING|IS_PINYIN, 386},
+{"ㄧㄛ", IS_CHEWING|IS_PINYIN, 395},
+{"ㄧㄝ", IS_CHEWING|IS_PINYIN, 391},
+{"ㄧㄞ", IS_CHEWING, 387},
+{"ㄧㄠ", IS_CHEWING|IS_PINYIN, 390},
+{"ㄧㄡ", IS_CHEWING|IS_PINYIN, 397},
+{"ㄧㄢ", IS_CHEWING|IS_PINYIN, 388},
+{"ㄧㄣ", IS_CHEWING|IS_PINYIN, 393},
+{"ㄧㄤ", IS_CHEWING|IS_PINYIN, 389},
+{"ㄧㄥ", IS_CHEWING|IS_PINYIN, 394},
+{"ㄨ", IS_CHEWING|IS_PINYIN, 369},
+{"ㄨㄚ", IS_CHEWING|IS_PINYIN, 361},
+{"ㄨㄛ", IS_CHEWING|IS_PINYIN, 368},
+{"ㄨㄞ", IS_CHEWING|IS_PINYIN, 362},
+{"ㄨㄟ", IS_CHEWING|IS_PINYIN, 365},
+{"ㄨㄢ", IS_CHEWING|IS_PINYIN, 363},
+{"ㄨㄣ", IS_CHEWING|IS_PINYIN, 366},
+{"ㄨㄤ", IS_CHEWING|IS_PINYIN, 364},
+{"ㄨㄥ", IS_CHEWING|IS_PINYIN, 367},
+{"ㄩ", IS_CHEWING|IS_PINYIN, 398},
+{"ㄩㄝ", IS_CHEWING|IS_PINYIN, 400},
+{"ㄩㄢ", IS_CHEWING|IS_PINYIN, 399},
+{"ㄩㄣ", IS_CHEWING|IS_PINYIN, 401},
+{"ㄩㄥ", IS_CHEWING|IS_PINYIN, 396},
+{"ㄫ", IS_CHEWING|IS_PINYIN, 234}
+};
+
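+/* Content table, addressed by the syllable index tables above: each row holds the
+   full pinyin key, its initial and final strings, the bopomofo form, and the
+   decomposed ChewingKey (initial, middle, final). */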
+const content_table_item_t content_table[] = {
+{"", "", "", "", ChewingKey()},
+{"a", "", "a", "ㄚ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"ai", "", "ai", "ㄞ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"an", "", "an", "ㄢ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"ang", "", "ang", "ㄤ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"ao", "", "ao", "ㄠ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"b", "b", "", "ㄅ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ba", "b", "a", "ㄅㄚ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"bai", "b", "ai", "ㄅㄞ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"ban", "b", "an", "ㄅㄢ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"bang", "b", "ang", "ㄅㄤ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"bao", "b", "ao", "ㄅㄠ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"bei", "b", "ei", "ㄅㄟ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"ben", "b", "en", "ㄅㄣ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"beng", "b", "eng", "ㄅㄥ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"bi", "b", "i", "ㄅㄧ", ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"bian", "b", "ian", "ㄅㄧㄢ", ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AN)},
+{"biao", "b", "iao", "ㄅㄧㄠ", ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AO)},
+{"bie", "b", "ie", "ㄅㄧㄝ", ChewingKey(CHEWING_B, CHEWING_I, CHEWING_E)},
+{"bin", "b", "in", "ㄅㄧㄣ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"bing", "b", "ing", "ㄅㄧㄥ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"bo", "b", "o", "ㄅㄛ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_O)},
+{"bu", "b", "u", "ㄅㄨ", ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"c", "c", "", "ㄘ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ca", "c", "a", "ㄘㄚ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"cai", "c", "ai", "ㄘㄞ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"can", "c", "an", "ㄘㄢ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"cang", "c", "ang", "ㄘㄤ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"cao", "c", "ao", "ㄘㄠ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"ce", "c", "e", "ㄘㄜ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"cen", "c", "en", "ㄘㄣ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"ceng", "c", "eng", "ㄘㄥ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"ch", "ch", "", "ㄔ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"cha", "ch", "a", "ㄔㄚ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"chai", "ch", "ai", "ㄔㄞ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"chan", "ch", "an", "ㄔㄢ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"chang", "ch", "ang", "ㄔㄤ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"chao", "ch", "ao", "ㄔㄠ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"che", "ch", "e", "ㄔㄜ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"chen", "ch", "en", "ㄔㄣ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"cheng", "ch", "eng", "ㄔㄥ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"chi", "ch", "i", "ㄔ", ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"chong", "ch", "ong", "ㄔㄨㄥ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"chou", "ch", "ou", "ㄔㄡ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"chu", "ch", "u", "ㄔㄨ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"chua", "ch", "ua", "ㄔㄨㄚ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_A)},
+{"chuai", "ch", "uai", "ㄔㄨㄞ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AI)},
+{"chuan", "ch", "uan", "ㄔㄨㄢ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AN)},
+{"chuang", "ch", "uang", "ㄔㄨㄤ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ANG)},
+{"chui", "ch", "ui", "ㄔㄨㄟ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EI)},
+{"chun", "ch", "un", "ㄔㄨㄣ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EN)},
+{"chuo", "ch", "uo", "ㄔㄨㄛ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_O)},
+{"ci", "c", "i", "ㄘ", ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"cong", "c", "ong", "ㄘㄨㄥ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"cou", "c", "ou", "ㄘㄡ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"cu", "c", "u", "ㄘㄨ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"cuan", "c", "uan", "ㄘㄨㄢ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_AN)},
+{"cui", "c", "ui", "ㄘㄨㄟ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_EI)},
+{"cun", "c", "un", "ㄘㄨㄣ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_EN)},
+{"cuo", "c", "uo", "ㄘㄨㄛ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_O)},
+{"d", "d", "", "ㄉ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"da", "d", "a", "ㄉㄚ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"dai", "d", "ai", "ㄉㄞ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"dan", "d", "an", "ㄉㄢ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"dang", "d", "ang", "ㄉㄤ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"dao", "d", "ao", "ㄉㄠ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"de", "d", "e", "ㄉㄜ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"dei", "d", "ei", "ㄉㄟ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"den", "d", "en", "ㄉㄣ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"deng", "d", "eng", "ㄉㄥ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"di", "d", "i", "ㄉㄧ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"dia", "d", "ia", "ㄉㄧㄚ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_A)},
+{"dian", "d", "ian", "ㄉㄧㄢ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AN)},
+{"diao", "d", "iao", "ㄉㄧㄠ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AO)},
+{"die", "d", "ie", "ㄉㄧㄝ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_E)},
+{"din", "d", "in", "ㄉㄧㄣ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"ding", "d", "ing", "ㄉㄧㄥ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"diu", "d", "iu", "ㄉㄧㄡ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_OU)},
+{"dong", "d", "ong", "ㄉㄨㄥ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"dou", "d", "ou", "ㄉㄡ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"du", "d", "u", "ㄉㄨ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"duan", "d", "uan", "ㄉㄨㄢ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_AN)},
+{"dui", "d", "ui", "ㄉㄨㄟ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_EI)},
+{"dun", "d", "un", "ㄉㄨㄣ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_EN)},
+{"duo", "d", "uo", "ㄉㄨㄛ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_O)},
+{"e", "", "e", "ㄜ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"ei", "", "ei", "ㄟ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"en", "", "en", "ㄣ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"eng", "", "eng", "ㄥ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"er", "", "er", "ㄦ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ER)},
+{"f", "f", "", "ㄈ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"fa", "f", "a", "ㄈㄚ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"fan", "f", "an", "ㄈㄢ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"fang", "f", "ang", "ㄈㄤ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"fe", "f", "e", "ㄈㄜ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"fei", "f", "ei", "ㄈㄟ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"fen", "f", "en", "ㄈㄣ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"feng", "f", "eng", "ㄈㄥ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"fo", "f", "o", "ㄈㄛ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_O)},
+{"fou", "f", "ou", "ㄈㄡ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"fu", "f", "u", "ㄈㄨ", ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"g", "g", "", "ㄍ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ga", "g", "a", "ㄍㄚ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"gai", "g", "ai", "ㄍㄞ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"gan", "g", "an", "ㄍㄢ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"gang", "g", "ang", "ㄍㄤ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"gao", "g", "ao", "ㄍㄠ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"ge", "g", "e", "ㄍㄜ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"gei", "g", "ei", "ㄍㄟ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"gen", "g", "en", "ㄍㄣ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"geng", "g", "eng", "ㄍㄥ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"gong", "g", "ong", "ㄍㄨㄥ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"gou", "g", "ou", "ㄍㄡ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"gu", "g", "u", "ㄍㄨ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"gua", "g", "ua", "ㄍㄨㄚ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_A)},
+{"guai", "g", "uai", "ㄍㄨㄞ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AI)},
+{"guan", "g", "uan", "ㄍㄨㄢ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AN)},
+{"guang", "g", "uang", "ㄍㄨㄤ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ANG)},
+{"gui", "g", "ui", "ㄍㄨㄟ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_EI)},
+{"gun", "g", "un", "ㄍㄨㄣ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_EN)},
+{"guo", "g", "uo", "ㄍㄨㄛ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_O)},
+{"h", "h", "", "ㄏ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ha", "h", "a", "ㄏㄚ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"hai", "h", "ai", "ㄏㄞ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"han", "h", "an", "ㄏㄢ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"hang", "h", "ang", "ㄏㄤ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"hao", "h", "ao", "ㄏㄠ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"he", "h", "e", "ㄏㄜ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"hei", "h", "ei", "ㄏㄟ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"hen", "h", "en", "ㄏㄣ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"heng", "h", "eng", "ㄏㄥ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"hong", "h", "ong", "ㄏㄨㄥ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"hou", "h", "ou", "ㄏㄡ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"hu", "h", "u", "ㄏㄨ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"hua", "h", "ua", "ㄏㄨㄚ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_A)},
+{"huai", "h", "uai", "ㄏㄨㄞ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AI)},
+{"huan", "h", "uan", "ㄏㄨㄢ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AN)},
+{"huang", "h", "uang", "ㄏㄨㄤ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ANG)},
+{"hui", "h", "ui", "ㄏㄨㄟ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_EI)},
+{"hun", "h", "un", "ㄏㄨㄣ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_EN)},
+{"huo", "h", "uo", "ㄏㄨㄛ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_O)},
+{"j", "j", "", "ㄐ", ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ji", "j", "i", "ㄐㄧ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"jia", "j", "ia", "ㄐㄧㄚ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A)},
+{"jian", "j", "ian", "ㄐㄧㄢ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN)},
+{"jiang", "j", "iang", "ㄐㄧㄤ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ANG)},
+{"jiao", "j", "iao", "ㄐㄧㄠ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AO)},
+{"jie", "j", "ie", "ㄐㄧㄝ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_E)},
+{"jin", "j", "in", "ㄐㄧㄣ", ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"jing", "j", "ing", "ㄐㄧㄥ", ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"jiong", "j", "iong", "ㄐㄩㄥ", ChewingKey(CHEWING_J, CHEWING_I, PINYIN_ONG)},
+{"jiu", "j", "iu", "ㄐㄧㄡ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_OU)},
+{"ju", "j", "u", "ㄐㄩ", ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ZERO_FINAL)},
+{"juan", "j", "uan", "ㄐㄩㄢ", ChewingKey(CHEWING_J, CHEWING_V, CHEWING_AN)},
+{"jue", "j", "ue", "ㄐㄩㄝ", ChewingKey(CHEWING_J, CHEWING_V, CHEWING_E)},
+{"jun", "j", "un", "ㄐㄩㄣ", ChewingKey(CHEWING_J, CHEWING_V, CHEWING_EN)},
+{"k", "k", "", "ㄎ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ka", "k", "a", "ㄎㄚ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"kai", "k", "ai", "ㄎㄞ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"kan", "k", "an", "ㄎㄢ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"kang", "k", "ang", "ㄎㄤ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"kao", "k", "ao", "ㄎㄠ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"ke", "k", "e", "ㄎㄜ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"kei", "k", "ei", "ㄎㄟ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"ken", "k", "en", "ㄎㄣ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"keng", "k", "eng", "ㄎㄥ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"kong", "k", "ong", "ㄎㄨㄥ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"kou", "k", "ou", "ㄎㄡ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"ku", "k", "u", "ㄎㄨ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"kua", "k", "ua", "ㄎㄨㄚ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_A)},
+{"kuai", "k", "uai", "ㄎㄨㄞ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AI)},
+{"kuan", "k", "uan", "ㄎㄨㄢ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AN)},
+{"kuang", "k", "uang", "ㄎㄨㄤ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ANG)},
+{"kui", "k", "ui", "ㄎㄨㄟ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_EI)},
+{"kun", "k", "un", "ㄎㄨㄣ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_EN)},
+{"kuo", "k", "uo", "ㄎㄨㄛ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_O)},
+{"l", "l", "", "ㄌ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"la", "l", "a", "ㄌㄚ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"lai", "l", "ai", "ㄌㄞ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"lan", "l", "an", "ㄌㄢ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"lang", "l", "ang", "ㄌㄤ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"lao", "l", "ao", "ㄌㄠ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"le", "l", "e", "ㄌㄜ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"lei", "l", "ei", "ㄌㄟ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"len", "l", "en", "ㄌㄣ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"leng", "l", "eng", "ㄌㄥ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"li", "l", "i", "ㄌㄧ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"lia", "l", "ia", "ㄌㄧㄚ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_A)},
+{"lian", "l", "ian", "ㄌㄧㄢ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AN)},
+{"liang", "l", "iang", "ㄌㄧㄤ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ANG)},
+{"liao", "l", "iao", "ㄌㄧㄠ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AO)},
+{"lie", "l", "ie", "ㄌㄧㄝ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_E)},
+{"lin", "l", "in", "ㄌㄧㄣ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"ling", "l", "ing", "ㄌㄧㄥ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"liu", "l", "iu", "ㄌㄧㄡ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_OU)},
+{"lo", "l", "o", "ㄌㄛ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_O)},
+{"long", "l", "ong", "ㄌㄨㄥ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"lou", "l", "ou", "ㄌㄡ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"lu", "l", "u", "ㄌㄨ", ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"luan", "l", "uan", "ㄌㄨㄢ", ChewingKey(CHEWING_L, CHEWING_U, CHEWING_AN)},
+{"lun", "l", "un", "ㄌㄨㄣ", ChewingKey(CHEWING_L, CHEWING_U, CHEWING_EN)},
+{"luo", "l", "uo", "ㄌㄨㄛ", ChewingKey(CHEWING_L, CHEWING_U, CHEWING_O)},
+{"lv", "l", "v", "ㄌㄩ", ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ZERO_FINAL)},
+{"lve", "l", "ve", "ㄌㄩㄝ", ChewingKey(CHEWING_L, CHEWING_V, CHEWING_E)},
+{"m", "m", "", "ㄇ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ma", "m", "a", "ㄇㄚ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"mai", "m", "ai", "ㄇㄞ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"man", "m", "an", "ㄇㄢ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"mang", "m", "ang", "ㄇㄤ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"mao", "m", "ao", "ㄇㄠ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"me", "m", "e", "ㄇㄜ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"mei", "m", "ei", "ㄇㄟ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"men", "m", "en", "ㄇㄣ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"meng", "m", "eng", "ㄇㄥ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"mi", "m", "i", "ㄇㄧ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"mian", "m", "ian", "ㄇㄧㄢ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AN)},
+{"miao", "m", "iao", "ㄇㄧㄠ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AO)},
+{"mie", "m", "ie", "ㄇㄧㄝ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_E)},
+{"min", "m", "in", "ㄇㄧㄣ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"ming", "m", "ing", "ㄇㄧㄥ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"miu", "m", "iu", "ㄇㄧㄡ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_OU)},
+{"mo", "m", "o", "ㄇㄛ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_O)},
+{"mou", "m", "ou", "ㄇㄡ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"mu", "m", "u", "ㄇㄨ", ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"n", "n", "", "ㄋ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"na", "n", "a", "ㄋㄚ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"nai", "n", "ai", "ㄋㄞ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"nan", "n", "an", "ㄋㄢ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"nang", "n", "ang", "ㄋㄤ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"nao", "n", "ao", "ㄋㄠ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"ne", "n", "e", "ㄋㄜ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"nei", "n", "ei", "ㄋㄟ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"nen", "n", "en", "ㄋㄣ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"neng", "n", "eng", "ㄋㄥ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"ng", "", "ng", "ㄫ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_NG)},
+{"ni", "n", "i", "ㄋㄧ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"nia", "n", "ia", "ㄋㄧㄚ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_A)},
+{"nian", "n", "ian", "ㄋㄧㄢ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AN)},
+{"niang", "n", "iang", "ㄋㄧㄤ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ANG)},
+{"niao", "n", "iao", "ㄋㄧㄠ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AO)},
+{"nie", "n", "ie", "ㄋㄧㄝ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_E)},
+{"nin", "n", "in", "ㄋㄧㄣ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"ning", "n", "ing", "ㄋㄧㄥ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"niu", "n", "iu", "ㄋㄧㄡ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_OU)},
+{"nong", "n", "ong", "ㄋㄨㄥ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"nou", "n", "ou", "ㄋㄡ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"nu", "n", "u", "ㄋㄨ", ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"nuan", "n", "uan", "ㄋㄨㄢ", ChewingKey(CHEWING_N, CHEWING_U, CHEWING_AN)},
+{"nun", "n", "un", "ㄋㄨㄣ", ChewingKey(CHEWING_N, CHEWING_U, CHEWING_EN)},
+{"nuo", "n", "uo", "ㄋㄨㄛ", ChewingKey(CHEWING_N, CHEWING_U, CHEWING_O)},
+{"nv", "n", "v", "ㄋㄩ", ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ZERO_FINAL)},
+{"nve", "n", "ve", "ㄋㄩㄝ", ChewingKey(CHEWING_N, CHEWING_V, CHEWING_E)},
+{"o", "", "o", "ㄛ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_O)},
+{"ou", "", "ou", "ㄡ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"p", "p", "", "ㄆ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"pa", "p", "a", "ㄆㄚ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"pai", "p", "ai", "ㄆㄞ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"pan", "p", "an", "ㄆㄢ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"pang", "p", "ang", "ㄆㄤ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"pao", "p", "ao", "ㄆㄠ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"pei", "p", "ei", "ㄆㄟ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"pen", "p", "en", "ㄆㄣ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"peng", "p", "eng", "ㄆㄥ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"pi", "p", "i", "ㄆㄧ", ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"pian", "p", "ian", "ㄆㄧㄢ", ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AN)},
+{"piao", "p", "iao", "ㄆㄧㄠ", ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AO)},
+{"pie", "p", "ie", "ㄆㄧㄝ", ChewingKey(CHEWING_P, CHEWING_I, CHEWING_E)},
+{"pin", "p", "in", "ㄆㄧㄣ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"ping", "p", "ing", "ㄆㄧㄥ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"po", "p", "o", "ㄆㄛ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_O)},
+{"pou", "p", "ou", "ㄆㄡ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"pu", "p", "u", "ㄆㄨ", ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"q", "q", "", "ㄑ", ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"qi", "q", "i", "ㄑㄧ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"qia", "q", "ia", "ㄑㄧㄚ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_A)},
+{"qian", "q", "ian", "ㄑㄧㄢ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AN)},
+{"qiang", "q", "iang", "ㄑㄧㄤ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ANG)},
+{"qiao", "q", "iao", "ㄑㄧㄠ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AO)},
+{"qie", "q", "ie", "ㄑㄧㄝ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_E)},
+{"qin", "q", "in", "ㄑㄧㄣ", ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"qing", "q", "ing", "ㄑㄧㄥ", ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"qiong", "q", "iong", "ㄑㄩㄥ", ChewingKey(CHEWING_Q, CHEWING_I, PINYIN_ONG)},
+{"qiu", "q", "iu", "ㄑㄧㄡ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_OU)},
+{"qu", "q", "u", "ㄑㄩ", ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ZERO_FINAL)},
+{"quan", "q", "uan", "ㄑㄩㄢ", ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_AN)},
+{"que", "q", "ue", "ㄑㄩㄝ", ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_E)},
+{"qun", "q", "un", "ㄑㄩㄣ", ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_EN)},
+{"r", "r", "", "ㄖ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ran", "r", "an", "ㄖㄢ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"rang", "r", "ang", "ㄖㄤ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"rao", "r", "ao", "ㄖㄠ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"re", "r", "e", "ㄖㄜ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"ren", "r", "en", "ㄖㄣ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"reng", "r", "eng", "ㄖㄥ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"ri", "r", "i", "ㄖ", ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"rong", "r", "ong", "ㄖㄨㄥ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"rou", "r", "ou", "ㄖㄡ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"ru", "r", "u", "ㄖㄨ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"rua", "r", "ua", "ㄖㄨㄚ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_A)},
+{"ruan", "r", "uan", "ㄖㄨㄢ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_AN)},
+{"rui", "r", "ui", "ㄖㄨㄟ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_EI)},
+{"run", "r", "un", "ㄖㄨㄣ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_EN)},
+{"ruo", "r", "uo", "ㄖㄨㄛ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_O)},
+{"s", "s", "", "ㄙ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"sa", "s", "a", "ㄙㄚ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"sai", "s", "ai", "ㄙㄞ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"san", "s", "an", "ㄙㄢ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"sang", "s", "ang", "ㄙㄤ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"sao", "s", "ao", "ㄙㄠ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"se", "s", "e", "ㄙㄜ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"sen", "s", "en", "ㄙㄣ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"seng", "s", "eng", "ㄙㄥ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"sh", "sh", "", "ㄕ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"sha", "sh", "a", "ㄕㄚ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"shai", "sh", "ai", "ㄕㄞ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"shan", "sh", "an", "ㄕㄢ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"shang", "sh", "ang", "ㄕㄤ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"shao", "sh", "ao", "ㄕㄠ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"she", "sh", "e", "ㄕㄜ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"shei", "sh", "ei", "ㄕㄟ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"shen", "sh", "en", "ㄕㄣ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"sheng", "sh", "eng", "ㄕㄥ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"shi", "sh", "i", "ㄕ", ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"shou", "sh", "ou", "ㄕㄡ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"shu", "sh", "u", "ㄕㄨ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"shua", "sh", "ua", "ㄕㄨㄚ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_A)},
+{"shuai", "sh", "uai", "ㄕㄨㄞ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AI)},
+{"shuan", "sh", "uan", "ㄕㄨㄢ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AN)},
+{"shuang", "sh", "uang", "ㄕㄨㄤ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ANG)},
+{"shui", "sh", "ui", "ㄕㄨㄟ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_EI)},
+{"shun", "sh", "un", "ㄕㄨㄣ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_EN)},
+{"shuo", "sh", "uo", "ㄕㄨㄛ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_O)},
+{"si", "s", "i", "ㄙ", ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"song", "s", "ong", "ㄙㄨㄥ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"sou", "s", "ou", "ㄙㄡ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"su", "s", "u", "ㄙㄨ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"suan", "s", "uan", "ㄙㄨㄢ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_AN)},
+{"sui", "s", "ui", "ㄙㄨㄟ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_EI)},
+{"sun", "s", "un", "ㄙㄨㄣ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_EN)},
+{"suo", "s", "uo", "ㄙㄨㄛ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_O)},
+{"t", "t", "", "ㄊ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ta", "t", "a", "ㄊㄚ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"tai", "t", "ai", "ㄊㄞ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"tan", "t", "an", "ㄊㄢ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"tang", "t", "ang", "ㄊㄤ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"tao", "t", "ao", "ㄊㄠ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"te", "t", "e", "ㄊㄜ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"teng", "t", "eng", "ㄊㄥ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"ti", "t", "i", "ㄊㄧ", ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"tian", "t", "ian", "ㄊㄧㄢ", ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AN)},
+{"tiao", "t", "iao", "ㄊㄧㄠ", ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AO)},
+{"tie", "t", "ie", "ㄊㄧㄝ", ChewingKey(CHEWING_T, CHEWING_I, CHEWING_E)},
+{"ting", "t", "ing", "ㄊㄧㄥ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"tong", "t", "ong", "ㄊㄨㄥ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"tou", "t", "ou", "ㄊㄡ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"tu", "t", "u", "ㄊㄨ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"tuan", "t", "uan", "ㄊㄨㄢ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_AN)},
+{"tui", "t", "ui", "ㄊㄨㄟ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_EI)},
+{"tun", "t", "un", "ㄊㄨㄣ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_EN)},
+{"tuo", "t", "uo", "ㄊㄨㄛ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_O)},
+{"w", "w", "", "PINYIN_W", ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"wa", "w", "a", "ㄨㄚ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_A)},
+{"wai", "w", "ai", "ㄨㄞ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AI)},
+{"wan", "w", "an", "ㄨㄢ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AN)},
+{"wang", "w", "ang", "ㄨㄤ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ANG)},
+{"wei", "w", "ei", "ㄨㄟ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_EI)},
+{"wen", "w", "en", "ㄨㄣ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_EN)},
+{"weng", "w", "eng", "ㄨㄥ", ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"wo", "w", "o", "ㄨㄛ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_O)},
+{"wu", "w", "u", "ㄨ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"x", "x", "", "ㄒ", ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"xi", "x", "i", "ㄒㄧ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"xia", "x", "ia", "ㄒㄧㄚ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_A)},
+{"xian", "x", "ian", "ㄒㄧㄢ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AN)},
+{"xiang", "x", "iang", "ㄒㄧㄤ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ANG)},
+{"xiao", "x", "iao", "ㄒㄧㄠ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AO)},
+{"xie", "x", "ie", "ㄒㄧㄝ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_E)},
+{"xin", "x", "in", "ㄒㄧㄣ", ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"xing", "x", "ing", "ㄒㄧㄥ", ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"xiong", "x", "iong", "ㄒㄩㄥ", ChewingKey(CHEWING_X, CHEWING_I, PINYIN_ONG)},
+{"xiu", "x", "iu", "ㄒㄧㄡ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_OU)},
+{"xu", "x", "u", "ㄒㄩ", ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ZERO_FINAL)},
+{"xuan", "x", "uan", "ㄒㄩㄢ", ChewingKey(CHEWING_X, CHEWING_V, CHEWING_AN)},
+{"xue", "x", "ue", "ㄒㄩㄝ", ChewingKey(CHEWING_X, CHEWING_V, CHEWING_E)},
+{"xun", "x", "un", "ㄒㄩㄣ", ChewingKey(CHEWING_X, CHEWING_V, CHEWING_EN)},
+{"y", "y", "", "PINYIN_Y", ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"ya", "y", "a", "ㄧㄚ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_A)},
+{"yai", "y", "ai", "ㄧㄞ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AI)},
+{"yan", "y", "an", "ㄧㄢ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AN)},
+{"yang", "y", "ang", "ㄧㄤ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ANG)},
+{"yao", "y", "ao", "ㄧㄠ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AO)},
+{"ye", "y", "e", "ㄧㄝ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_E)},
+{"yi", "y", "i", "ㄧ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"yin", "y", "in", "ㄧㄣ", ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_IN)},
+{"ying", "y", "ing", "ㄧㄥ", ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_ING)},
+{"yo", "y", "o", "ㄧㄛ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_O)},
+{"yong", "y", "ong", "ㄩㄥ", ChewingKey(PINYIN_Y, CHEWING_I, PINYIN_ONG)},
+{"you", "y", "ou", "ㄧㄡ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_OU)},
+{"yu", "y", "u", "ㄩ", ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ZERO_FINAL)},
+{"yuan", "y", "uan", "ㄩㄢ", ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_AN)},
+{"yue", "y", "ue", "ㄩㄝ", ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_E)},
+{"yun", "y", "un", "ㄩㄣ", ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_EN)},
+{"z", "z", "", "ㄗ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"za", "z", "a", "ㄗㄚ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"zai", "z", "ai", "ㄗㄞ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"zan", "z", "an", "ㄗㄢ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"zang", "z", "ang", "ㄗㄤ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"zao", "z", "ao", "ㄗㄠ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"ze", "z", "e", "ㄗㄜ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"zei", "z", "ei", "ㄗㄟ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"zen", "z", "en", "ㄗㄣ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"zeng", "z", "eng", "ㄗㄥ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"zh", "zh", "", "ㄓ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)},
+{"zha", "zh", "a", "ㄓㄚ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_A)},
+{"zhai", "zh", "ai", "ㄓㄞ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AI)},
+{"zhan", "zh", "an", "ㄓㄢ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AN)},
+{"zhang", "zh", "ang", "ㄓㄤ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ANG)},
+{"zhao", "zh", "ao", "ㄓㄠ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AO)},
+{"zhe", "zh", "e", "ㄓㄜ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_E)},
+{"zhei", "zh", "ei", "ㄓㄟ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_EI)},
+{"zhen", "zh", "en", "ㄓㄣ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_EN)},
+{"zheng", "zh", "eng", "ㄓㄥ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ENG)},
+{"zhi", "zh", "i", "ㄓ", ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"zhong", "zh", "ong", "ㄓㄨㄥ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"zhou", "zh", "ou", "ㄓㄡ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"zhu", "zh", "u", "ㄓㄨ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"zhua", "zh", "ua", "ㄓㄨㄚ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_A)},
+{"zhuai", "zh", "uai", "ㄓㄨㄞ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AI)},
+{"zhuan", "zh", "uan", "ㄓㄨㄢ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AN)},
+{"zhuang", "zh", "uang", "ㄓㄨㄤ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ANG)},
+{"zhui", "zh", "ui", "ㄓㄨㄟ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_EI)},
+{"zhun", "zh", "un", "ㄓㄨㄣ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_EN)},
+{"zhuo", "zh", "uo", "ㄓㄨㄛ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_O)},
+{"zi", "z", "i", "ㄗ", ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ZERO_FINAL)},
+{"zong", "z", "ong", "ㄗㄨㄥ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, PINYIN_ONG)},
+{"zou", "z", "ou", "ㄗㄡ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_OU)},
+{"zu", "z", "u", "ㄗㄨ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ZERO_FINAL)},
+{"zuan", "z", "uan", "ㄗㄨㄢ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_AN)},
+{"zui", "z", "ui", "ㄗㄨㄟ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_EI)},
+{"zun", "z", "un", "ㄗㄨㄣ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_EN)},
+{"zuo", "z", "uo", "ㄗㄨㄛ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_O)}
+};
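The rows above appear lexicographically sorted by their full-pinyin string, each pairing the pinyin, its initial/final spelling, a bopomofo display string, and an initial/middle/final ChewingKey triple. A minimal sketch of a lookup over such a table, using a hypothetical item type (demo_pinyin_item_t) rather than the real generated one:

// Sketch only: hypothetical row layout mirroring the entries above, plus a
// plain lookup from a full-pinyin string to its row index. The generated
// table looks sorted by pinyin, so a real lookup could binary-search; a
// linear scan keeps this sketch short.
#include <cstring>

struct demo_pinyin_item_t {
    const char * pinyin;    /* full pinyin, e.g. "zhuang" */
    const char * initial;   /* "zh" */
    const char * final;     /* "uang" */
    const char * chewing;   /* bopomofo display string */
    int key_initial, key_middle, key_final; /* ChewingKey(...) triple */
};

/* Returns the index of the matching row, or -1 if the syllable is unknown. */
int demo_lookup(const demo_pinyin_item_t * table, int len, const char * pinyin)
{
    for (int i = 0; i < len; ++i)
        if (0 == strcmp(table[i].pinyin, pinyin))
            return i;
    return -1;
}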
+
+const divided_table_item_t divided_table[] = {
+{"bian", 182478, {"bi", "an"}, 100},
+{"bie", 63919, {"bi", "e"}, 100},
+{"dian", 179799, {"di", "an"}, 100},
+{"jian", 435752, {"ji", "an"}, 200},
+{"jiang", 139834, {"ji", "ang"}, 100},
+{"jie", 294175, {"ji", "e"}, 100},
+{"jue", 119987, {"ju", "e"}, 100},
+{"kuai", 63367, {"ku", "ai"}, 100},
+{"lian", 130021, {"li", "an"}, 100},
+{"liang", 185438, {"li", "ang"}, 100},
+{"liao", 39355, {"li", "ao"}, 100},
+{"luan", 17609, {"lu", "an"}, 100},
+{"qian", 195129, {"qi", "an"}, 100},
+{"qie", 70219, {"qi", "e"}, 100},
+{"shuan", 1114, {"shu", "an"}, 100},
+{"tian", 185905, {"ti", "an"}, 100},
+{"tuan", 17287, {"tu", "an"}, 100},
+{"xian", 280991, {"xi", "an"}, 300},
+{"yuan", 280423, {"yu", "an"}, 100},
+{"zuan", 4016, {"zu", "an"}, 100}
+};
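Each divided_table row pairs an ambiguous full syllable (e.g. "bian") with its two-syllable division ("bi" + "an"); the two numeric columns look like frequency weights for the joined and divided readings. A small sketch, under that assumption and with a hypothetical item type, of how a parser might decide whether to split:

// Sketch only: assumes the two numbers are weights for the joined versus the
// divided reading; prefers the split form only when its weight is larger.
#include <cstring>

struct demo_divided_item_t {
    const char * orig;       /* e.g. "bian" */
    unsigned int orig_freq;  /* weight of keeping one syllable */
    const char * parts[2];   /* e.g. {"bi", "an"} */
    unsigned int split_freq; /* weight of the divided reading */
};

bool demo_prefer_split(const demo_divided_item_t * table, int len,
                       const char * pinyin)
{
    for (int i = 0; i < len; ++i)
        if (0 == strcmp(table[i].orig, pinyin))
            return table[i].split_freq > table[i].orig_freq;
    return false; /* syllable is not in the table: never split */
}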
+
+const resplit_table_item_t resplit_table[] = {
+{{"a", "nan"}, 0, {"an", "an"}, 100},
+{{"an", "gang"}, 0, {"ang", "ang"}, 100},
+{{"ba", "nan"}, 0, {"ban", "an"}, 100},
+{{"ca", "nan"}, 0, {"can", "an"}, 100},
+{{"chan", "gan"}, 0, {"chang", "an"}, 100},
+{{"chan", "ge"}, 0, {"chang", "e"}, 100},
+{{"che", "nai"}, 0, {"chen", "ai"}, 100},
+{{"chen", "gan"}, 0, {"cheng", "an"}, 100},
+{{"chu", "nan"}, 100, {"chun", "an"}, 100},
+{{"dan", "gan"}, 0, {"dang", "an"}, 100},
+{{"e", "nai"}, 0, {"en", "ai"}, 100},
+{{"fa", "nan"}, 100, {"fan", "an"}, 100},
+{{"fan", "gai"}, 0, {"fang", "ai"}, 100},
+{{"fan", "gan"}, 100, {"fang", "an"}, 100},
+{{"fan", "ge"}, 0, {"fang", "e"}, 100},
+{{"ga", "nai"}, 0, {"gan", "ai"}, 100},
+{{"ga", "nen"}, 0, {"gan", "en"}, 100},
+{{"gan", "gao"}, 0, {"gang", "ao"}, 100},
+{{"guan", "gan"}, 100, {"guang", "an"}, 100},
+{{"hu", "nan"}, 100, {"hun", "an"}, 100},
+{{"huan", "gan"}, 0, {"huang", "an"}, 100},
+{{"ji", "ne"}, 0, {"jin", "e"}, 100},
+{{"ji", "nou"}, 0, {"jin", "ou"}, 100},
+{{"jia", "nai"}, 0, {"jian", "ai"}, 100},
+{{"jia", "nan"}, 100, {"jian", "an"}, 100},
+{{"jia", "ne"}, 0, {"jian", "e"}, 100},
+{{"jia", "nou"}, 0, {"jian", "ou"}, 100},
+{{"jian", "gan"}, 100, {"jiang", "an"}, 100},
+{{"jin", "gai"}, 0, {"jing", "ai"}, 100},
+{{"jin", "gan"}, 0, {"jing", "an"}, 100},
+{{"jin", "ge"}, 0, {"jing", "e"}, 100},
+{{"kuan", "gao"}, 0, {"kuang", "ao"}, 100},
+{{"li", "nan"}, 100, {"lin", "an"}, 100},
+{{"lia", "nai"}, 0, {"lian", "ai"}, 100},
+{{"lia", "ne"}, 0, {"lian", "e"}, 100},
+{{"lian", "gan"}, 0, {"liang", "an"}, 100},
+{{"ma", "ne"}, 0, {"man", "e"}, 100},
+{{"men", "gen"}, 0, {"meng", "en"}, 100},
+{{"min", "gan"}, 100, {"ming", "an"}, 100},
+{{"min", "ge"}, 100, {"ming", "e"}, 100},
+{{"na", "nai"}, 0, {"nan", "ai"}, 100},
+{{"na", "nan"}, 0, {"nan", "an"}, 200},
+{{"na", "nao"}, 0, {"nan", "ao"}, 100},
+{{"na", "nou"}, 0, {"nan", "ou"}, 100},
+{{"nin", "gan"}, 0, {"ning", "an"}, 100},
+{{"pa", "nan"}, 0, {"pan", "an"}, 100},
+{{"pen", "gan"}, 0, {"peng", "an"}, 100},
+{{"pin", "gan"}, 0, {"ping", "an"}, 100},
+{{"qi", "nai"}, 0, {"qin", "ai"}, 100},
+{{"qi", "nan"}, 0, {"qin", "an"}, 100},
+{{"qia", "nan"}, 0, {"qian", "an"}, 200},
+{{"qia", "ne"}, 0, {"qian", "e"}, 100},
+{{"qin", "gai"}, 0, {"qing", "ai"}, 100},
+{{"qin", "gan"}, 0, {"qing", "an"}, 100},
+{{"re", "nai"}, 0, {"ren", "ai"}, 100},
+{{"re", "nan"}, 0, {"ren", "an"}, 100},
+{{"san", "gou"}, 0, {"sang", "ou"}, 100},
+{{"shan", "gan"}, 100, {"shang", "an"}, 100},
+{{"she", "nai"}, 0, {"shen", "ai"}, 100},
+{{"she", "nao"}, 0, {"shen", "ao"}, 200},
+{{"wa", "nan"}, 0, {"wan", "an"}, 200},
+{{"wa", "ne"}, 0, {"wan", "e"}, 100},
+{{"wa", "nou"}, 0, {"wan", "ou"}, 100},
+{{"wen", "gan"}, 0, {"weng", "an"}, 100},
+{{"xi", "nai"}, 200, {"xin", "ai"}, 100},
+{{"xi", "nan"}, 100, {"xin", "an"}, 100},
+{{"xia", "nai"}, 0, {"xian", "ai"}, 100},
+{{"xia", "nan"}, 0, {"xian", "an"}, 100},
+{{"xia", "ne"}, 0, {"xian", "e"}, 100},
+{{"xian", "gai"}, 0, {"xiang", "ai"}, 100},
+{{"xian", "gan"}, 200, {"xiang", "an"}, 100},
+{{"xian", "ge"}, 100, {"xiang", "e"}, 100},
+{{"xin", "gai"}, 0, {"xing", "ai"}, 100},
+{{"xin", "gan"}, 200, {"xing", "an"}, 200},
+{{"ya", "nan"}, 0, {"yan", "an"}, 200},
+{{"yi", "nan"}, 300, {"yin", "an"}, 100},
+{{"yi", "ne"}, 0, {"yin", "e"}, 100},
+{{"zhan", "gai"}, 0, {"zhang", "ai"}, 100},
+{{"zhe", "nai"}, 0, {"zhen", "ai"}, 200},
+{{"zhe", "nan"}, 0, {"zhen", "an"}, 100},
+{{"zhen", "gan"}, 100, {"zheng", "an"}, 100},
+{{"zhua", "nan"}, 0, {"zhuan", "an"}, 100}
+};
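Each resplit_table row lists two segmentations of the same letter run (e.g. "fan" + "gan" versus "fang" + "an") together with what appear to be frequency weights, so a parser can move the syllable boundary between adjacent syllables. A hypothetical lookup sketch over such rows:

// Sketch only: finds the row whose current split matches a syllable pair;
// the caller can then compare the two weights to decide whether to resplit.
#include <cstring>

struct demo_resplit_item_t {
    const char * orig[2];   /* current split, e.g. {"fan", "gan"} */
    unsigned int orig_freq;
    const char * alt[2];    /* alternative split, e.g. {"fang", "an"} */
    unsigned int alt_freq;
};

const demo_resplit_item_t *
demo_find_resplit(const demo_resplit_item_t * table, int len,
                  const char * first, const char * second)
{
    for (int i = 0; i < len; ++i)
        if (0 == strcmp(table[i].orig[0], first) &&
            0 == strcmp(table[i].orig[1], second))
            return &table[i];
    return 0; /* no alternative segmentation recorded */
}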
+
+const gint chewing_key_table[CHEWING_NUMBER_OF_INITIALS *
+ CHEWING_NUMBER_OF_MIDDLES *
+ CHEWING_NUMBER_OF_FINALS] = {
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+2 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+3 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+4 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+5 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+85 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+86 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+87 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+88 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+89 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+234 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+252 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+253 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, PINYIN_ING) */,
+6 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+7 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+8 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+9 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+10 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+11 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+12 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+13 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+14 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+21 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+19 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+20 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+15 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AI) */,
+16 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ANG) */,
+17 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AO) */,
+18 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_I, PINYIN_ING) */,
+22 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_B, CHEWING_V, PINYIN_ING) */,
+23 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+24 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+25 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+26 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+27 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+28 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+29 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+30 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+31 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+53 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+54 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+52 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_I, PINYIN_ING) */,
+55 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_AI) */,
+56 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, INVALID_EA) */,
+57 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_EI) */,
+58 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_NG) */,
+59 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_C, CHEWING_V, PINYIN_ING) */,
+32 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+33 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+34 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+35 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+36 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+37 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+38 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+39 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+40 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+42 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+43 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+41 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_I, PINYIN_ING) */,
+44 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ZERO_FINAL) */,
+45 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_A) */,
+46 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AI) */,
+47 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AN) */,
+48 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, INVALID_EA) */,
+49 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EI) */,
+50 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_NG) */,
+51 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_CH, CHEWING_V, PINYIN_ING) */,
+60 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+61 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+62 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+63 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+64 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+65 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+66 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+67 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+68 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+69 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+78 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+79 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+75 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+76 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+70 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ZERO_FINAL) */,
+71 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AI) */,
+72 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ANG) */,
+73 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AO) */,
+74 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, PINYIN_ONG) */,
+77 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_I, PINYIN_ING) */,
+80 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_AI) */,
+81 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, INVALID_EA) */,
+82 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_EI) */,
+83 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_NG) */,
+84 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_D, CHEWING_V, PINYIN_ING) */,
+90 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+91 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+92 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+93 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+94 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+95 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+96 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+97 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+98 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+99 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_I, PINYIN_ING) */,
+100 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_F, CHEWING_V, PINYIN_ING) */,
+121 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+122 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+123 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+124 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+125 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+126 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+127 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+128 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+129 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+130 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+131 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+132 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_I, PINYIN_ING) */,
+133 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ZERO_FINAL) */,
+134 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_A) */,
+135 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AI) */,
+136 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AN) */,
+137 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, INVALID_EA) */,
+138 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_EI) */,
+139 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_NG) */,
+140 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_H, CHEWING_V, PINYIN_ING) */,
+101 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+102 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+103 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+104 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+105 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+106 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+107 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+108 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+109 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+110 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+111 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+112 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_I, PINYIN_ING) */,
+113 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ZERO_FINAL) */,
+114 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_A) */,
+115 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AI) */,
+116 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AN) */,
+117 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, INVALID_EA) */,
+118 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_EI) */,
+119 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_NG) */,
+120 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_G, CHEWING_V, PINYIN_ING) */,
+156 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+157 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+158 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+159 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+160 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+161 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+162 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+163 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+164 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+165 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+166 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+167 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_I, PINYIN_ING) */,
+168 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ZERO_FINAL) */,
+169 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_A) */,
+170 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AI) */,
+171 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AN) */,
+172 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, INVALID_EA) */,
+173 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_EI) */,
+174 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_NG) */,
+175 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_K, CHEWING_V, PINYIN_ING) */,
+141 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+148 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+149 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+142 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ZERO_FINAL) */,
+143 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AI) */,
+144 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN) */,
+145 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ANG) */,
+146 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AO) */,
+147 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_O) */,
+150 /* ChewingKey(CHEWING_J, CHEWING_I, PINYIN_ONG) */,
+151 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_I, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_U, PINYIN_ING) */,
+152 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_AI) */,
+153 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_AO) */,
+154 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_EI) */,
+155 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_J, CHEWING_V, PINYIN_ING) */,
+204 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+205 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+206 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+207 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+208 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+209 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+210 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+211 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+212 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+213 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+221 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+222 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+218 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+219 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+214 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AI) */,
+215 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ANG) */,
+216 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AO) */,
+217 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, PINYIN_ONG) */,
+220 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_I, PINYIN_ING) */,
+223 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_M, CHEWING_V, PINYIN_ING) */,
+224 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+225 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+226 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+227 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+228 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+229 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+230 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+231 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+232 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+233 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+244 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+245 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+241 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+242 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+235 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ZERO_FINAL) */,
+236 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AI) */,
+237 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AN) */,
+238 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ANG) */,
+239 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AO) */,
+240 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, PINYIN_ONG) */,
+243 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_I, PINYIN_ING) */,
+246 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_AI) */,
+247 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_EI) */,
+248 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_NG) */,
+249 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_U, PINYIN_ING) */,
+250 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_AO) */,
+251 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_N, CHEWING_V, PINYIN_ING) */,
+176 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+177 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+178 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+179 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+180 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+181 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+182 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+183 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+184 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+185 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+195 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+196 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+197 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+192 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+193 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+186 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ZERO_FINAL) */,
+187 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AI) */,
+188 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AN) */,
+189 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ANG) */,
+190 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AO) */,
+191 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, PINYIN_ONG) */,
+194 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_I, PINYIN_ING) */,
+198 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_AI) */,
+199 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_EI) */,
+200 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_NG) */,
+201 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_U, PINYIN_ING) */,
+202 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_AO) */,
+203 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_L, CHEWING_V, PINYIN_ING) */,
+287 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+288 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+289 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+290 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+291 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+292 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+293 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+295 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+296 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+294 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_I, PINYIN_ING) */,
+297 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ZERO_FINAL) */,
+298 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_AI) */,
+299 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, INVALID_EA) */,
+300 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_EI) */,
+301 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_NG) */,
+302 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_R, CHEWING_V, PINYIN_ING) */,
+254 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+255 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+256 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+257 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+258 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+259 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+260 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+261 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+262 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+269 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+270 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+267 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+268 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+263 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AI) */,
+264 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ANG) */,
+265 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AO) */,
+266 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_I, PINYIN_ING) */,
+271 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_P, CHEWING_V, PINYIN_ING) */,
+272 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+279 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+280 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+273 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ZERO_FINAL) */,
+274 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AI) */,
+275 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AN) */,
+276 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ANG) */,
+277 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AO) */,
+278 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_O) */,
+281 /* ChewingKey(CHEWING_Q, CHEWING_I, PINYIN_ONG) */,
+282 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_I, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_U, PINYIN_ING) */,
+283 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_AI) */,
+284 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_AO) */,
+285 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_EI) */,
+286 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_Q, CHEWING_V, PINYIN_ING) */,
+303 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+304 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+305 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+306 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+307 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+308 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+309 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+310 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+311 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+333 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+334 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+332 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_I, PINYIN_ING) */,
+335 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_AI) */,
+336 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, INVALID_EA) */,
+337 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_EI) */,
+338 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_NG) */,
+339 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_S, CHEWING_V, PINYIN_ING) */,
+312 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+313 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+314 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+315 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+316 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+317 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+318 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+319 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+320 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+321 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+323 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+322 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_I, PINYIN_ING) */,
+324 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ZERO_FINAL) */,
+325 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_A) */,
+326 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AI) */,
+327 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AN) */,
+328 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, INVALID_EA) */,
+329 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_EI) */,
+330 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_NG) */,
+331 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_SH, CHEWING_V, PINYIN_ING) */,
+340 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+341 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+342 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+343 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+344 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+345 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+346 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+347 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+353 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+354 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+352 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+348 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AI) */,
+349 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ANG) */,
+350 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AO) */,
+351 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_I, PINYIN_ING) */,
+355 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_AI) */,
+356 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, INVALID_EA) */,
+357 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_EI) */,
+358 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_NG) */,
+359 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_T, CHEWING_V, PINYIN_ING) */,
+360 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+367 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_I, PINYIN_ING) */,
+369 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ZERO_FINAL) */,
+361 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_A) */,
+362 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AI) */,
+363 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AN) */,
+364 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, INVALID_EA) */,
+365 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_EI) */,
+366 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_NG) */,
+368 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(PINYIN_W, CHEWING_V, PINYIN_ING) */,
+370 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+377 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+378 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+371 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ZERO_FINAL) */,
+372 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AI) */,
+373 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AN) */,
+374 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ANG) */,
+375 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AO) */,
+376 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_O) */,
+379 /* ChewingKey(CHEWING_X, CHEWING_I, PINYIN_ONG) */,
+380 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_I, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_U, PINYIN_ING) */,
+381 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_AI) */,
+382 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_AO) */,
+383 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_EI) */,
+384 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_X, CHEWING_V, PINYIN_ING) */,
+385 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+393 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+394 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+392 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ZERO_FINAL) */,
+386 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_A) */,
+387 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AI) */,
+388 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AN) */,
+389 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ANG) */,
+390 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AO) */,
+391 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_NG) */,
+395 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_O) */,
+396 /* ChewingKey(PINYIN_Y, CHEWING_I, PINYIN_ONG) */,
+397 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_I, PINYIN_ING) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_AI) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, INVALID_EA) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_EI) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_NG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_U, PINYIN_ING) */,
+398 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_AI) */,
+399 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_AO) */,
+400 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_EI) */,
+401 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(PINYIN_Y, CHEWING_V, PINYIN_ING) */,
+402 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+403 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+404 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+405 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+406 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+407 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+408 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+409 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+410 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+411 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+434 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+435 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+433 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_I, PINYIN_ING) */,
+436 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_AI) */,
+437 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, INVALID_EA) */,
+438 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_EI) */,
+439 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_NG) */,
+440 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_Z, CHEWING_V, PINYIN_ING) */,
+412 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */,
+413 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_A) */,
+414 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AI) */,
+415 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AN) */,
+416 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */,
+417 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AO) */,
+418 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, INVALID_EA) */,
+419 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_EI) */,
+420 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_EN) */,
+421 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_O) */,
+423 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */,
+424 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, PINYIN_ING) */,
+422 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, PINYIN_ING) */,
+425 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ZERO_FINAL) */,
+426 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_A) */,
+427 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AI) */,
+428 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AN) */,
+429 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, INVALID_EA) */,
+430 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_EI) */,
+431 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_NG) */,
+432 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, PINYIN_ING) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_ZERO_FINAL) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_A) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_AI) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_AN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_ANG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_AO) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_E) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, INVALID_EA) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_EI) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_EN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_ENG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_ER) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_NG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_O) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, PINYIN_ONG) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_OU) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, PINYIN_IN) */,
+-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, PINYIN_ING) */
+};
+
+};
+
+#endif
diff --git a/src/storage/pinyin_phrase2.h b/src/storage/pinyin_phrase2.h
new file mode 100644
index 0000000..ba2f32e
--- /dev/null
+++ b/src/storage/pinyin_phrase2.h
@@ -0,0 +1,267 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef PINYIN_PHRASE2_H
+#define PINYIN_PHRASE2_H
+
+#include "novel_types.h"
+#include "chewing_key.h"
+#include "pinyin_custom2.h"
+#include "pinyin_parser2.h"
+
+namespace pinyin{
+
+inline int pinyin_exact_compare2(const ChewingKey * key_lhs,
+ const ChewingKey * key_rhs,
+ int phrase_length){
+ int i;
+ int result;
+
+ /* compare initial */
+ for (i = 0; i < phrase_length; ++i) {
+ result = key_lhs[i].m_initial - key_rhs[i].m_initial;
+ if (0 != result)
+ return result;
+ }
+
+ /* compare middle and final */
+ for (i = 0; i < phrase_length; ++i) {
+ result = key_lhs[i].m_middle - key_rhs[i].m_middle;
+ if (0 != result)
+ return result;
+ result = key_lhs[i].m_final - key_rhs[i].m_final;
+ if (0 != result)
+ return result;
+ }
+
+ /* compare tone */
+ for (i = 0; i < phrase_length; ++i) {
+ result = key_lhs[i].m_tone - key_rhs[i].m_tone;
+ if (0 != result)
+ return result;
+ }
+
+ return 0;
+}
+
+
+inline int pinyin_compare_with_ambiguities2(pinyin_option_t options,
+ const ChewingKey * key_lhs,
+ const ChewingKey * key_rhs,
+ int phrase_length){
+ int i;
+ int result;
+
+ /* compare initial */
+ for (i = 0; i < phrase_length; ++i) {
+ result = pinyin_compare_initial2
+ (options,
+ (ChewingInitial)key_lhs[i].m_initial,
+ (ChewingInitial)key_rhs[i].m_initial);
+ if (0 != result)
+ return result;
+ }
+
+ /* compare middle and final */
+ for (i = 0; i < phrase_length; ++i) {
+ result = pinyin_compare_middle_and_final2
+ (options,
+ (ChewingMiddle)key_lhs[i].m_middle,
+ (ChewingMiddle)key_rhs[i].m_middle,
+ (ChewingFinal) key_lhs[i].m_final,
+ (ChewingFinal) key_rhs[i].m_final);
+ if (0 != result)
+ return result;
+ }
+
+ /* compare tone */
+ for (i = 0; i < phrase_length; ++i) {
+ result = pinyin_compare_tone2
+ (options,
+ (ChewingTone)key_lhs[i].m_tone,
+ (ChewingTone)key_rhs[i].m_tone);
+ if (0 != result)
+ return result;
+ }
+
+ return 0;
+}
+
+/* compute pinyin lower bound */
+inline void compute_lower_value2(pinyin_option_t options,
+ const ChewingKey * in_keys,
+ ChewingKey * out_keys,
+ int phrase_length) {
+ ChewingKey aKey;
+
+ for (int i = 0; i < phrase_length; ++i) {
+ int k; int sel;
+ aKey = in_keys[i];
+
+ /* compute lower initial */
+ sel = aKey.m_initial;
+ for (k = aKey.m_initial - 1; k >= CHEWING_ZERO_INITIAL; --k) {
+ if (0 != pinyin_compare_initial2
+ (options, (ChewingInitial)aKey.m_initial, (ChewingInitial)k))
+ break;
+ else
+ sel = k;
+ }
+ aKey.m_initial = (ChewingInitial)sel;
+
+        /* computing the lower middle is skipped, as there is no fuzzy
+         * pinyin for middles here. if needed in the future, still use
+         * pinyin_compare_middle_and_final2 to check the lower bound.
+         */
+
+        /* as the chewing zero middle is the first item and its value is zero,
+         * there is no need to adjust it for incomplete pinyin.
+         */
+
+ /* compute lower final */
+ sel = aKey.m_final;
+ for (k = aKey.m_final - 1; k >= CHEWING_ZERO_FINAL; --k) {
+ if (0 != pinyin_compare_middle_and_final2
+ (options,
+ (ChewingMiddle)aKey.m_middle, (ChewingMiddle) aKey.m_middle,
+ (ChewingFinal)aKey.m_final, (ChewingFinal)k))
+ break;
+ else
+ sel = k;
+ }
+ aKey.m_final = (ChewingFinal)sel;
+
+ /* compute lower tone */
+ sel = aKey.m_tone;
+ for (k = aKey.m_tone - 1; k >= CHEWING_ZERO_TONE; --k) {
+ if (0 != pinyin_compare_tone2
+ (options, (ChewingTone)aKey.m_tone, (ChewingTone)k))
+ break;
+ else
+ sel = k;
+ }
+ aKey.m_tone = (ChewingTone)sel;
+
+ /* save the result */
+ out_keys[i] = aKey;
+ }
+}
+
+/* compute pinyin upper bound */
+inline void compute_upper_value2(pinyin_option_t options,
+ const ChewingKey * in_keys,
+ ChewingKey * out_keys,
+ int phrase_length) {
+ ChewingKey aKey;
+
+ for (int i = 0; i < phrase_length; ++i) {
+ int k; int sel;
+ aKey = in_keys[i];
+
+ /* compute upper initial */
+ sel = aKey.m_initial;
+ for (k = aKey.m_initial + 1; k <= CHEWING_LAST_INITIAL; ++k) {
+ if (0 != pinyin_compare_initial2
+ (options, (ChewingInitial)aKey.m_initial, (ChewingInitial)k))
+ break;
+ else
+ sel = k;
+ }
+ aKey.m_initial = (ChewingInitial)sel;
+
+        /* adjust the middle for incomplete pinyin. */
+
+ /* compute upper middle */
+ sel = aKey.m_middle;
+ for (k = aKey.m_middle + 1; k <= CHEWING_LAST_MIDDLE; ++k) {
+ if (0 != pinyin_compare_middle_and_final2
+ (options,
+ (ChewingMiddle)aKey.m_middle, (ChewingMiddle)k,
+ (ChewingFinal)aKey.m_final, (ChewingFinal)aKey.m_final))
+ break;
+ else
+ sel = k;
+ }
+ aKey.m_middle = (ChewingMiddle)sel;
+
+ /* compute upper final */
+ sel = aKey.m_final;
+ for (k = aKey.m_final + 1; k <= CHEWING_LAST_FINAL; ++k) {
+ if (0 != pinyin_compare_middle_and_final2
+ (options,
+ (ChewingMiddle)aKey.m_middle, (ChewingMiddle)aKey.m_middle,
+ (ChewingFinal)aKey.m_final, (ChewingFinal)k))
+ break;
+ else
+ sel = k;
+ }
+ aKey.m_final = (ChewingFinal)sel;
+
+ /* compute upper tone */
+ sel = aKey.m_tone;
+ for (k = aKey.m_tone + 1; k <= CHEWING_LAST_TONE; ++k) {
+ if (0 != pinyin_compare_tone2
+ (options, (ChewingTone)aKey.m_tone, (ChewingTone)k))
+ break;
+ else
+ sel = k;
+ }
+ aKey.m_tone = (ChewingTone)sel;
+
+ /* save the result */
+ out_keys[i] = aKey;
+ }
+}
+
+
+template<size_t phrase_length>
+struct PinyinIndexItem2{
+ phrase_token_t m_token;
+ ChewingKey m_keys[phrase_length];
+public:
+ PinyinIndexItem2<phrase_length> (const ChewingKey * keys,
+ phrase_token_t token) {
+ memmove(m_keys, keys, sizeof(ChewingKey) * phrase_length);
+ m_token = token;
+ }
+};
+
+
+/* used to find the element in the phrase array */
+template<size_t phrase_length>
+inline int phrase_exact_compare2(const PinyinIndexItem2<phrase_length> &lhs,
+ const PinyinIndexItem2<phrase_length> &rhs)
+{
+ ChewingKey * keys_lhs = (ChewingKey *) lhs.m_keys;
+ ChewingKey * keys_rhs = (ChewingKey *) rhs.m_keys;
+ return pinyin_exact_compare2(keys_lhs, keys_rhs, phrase_length);
+}
+
+template<size_t phrase_length>
+inline bool phrase_exact_less_than2(const PinyinIndexItem2<phrase_length> &lhs,
+ const PinyinIndexItem2<phrase_length> &rhs)
+{
+ return 0 > phrase_exact_compare2<phrase_length>(lhs, rhs);
+}
+
+};
+
+#endif
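A minimal usage sketch of the bound helpers above (illustrative only, not part of the patch; the option flag and the single key are assumptions supplied by the caller):

    /* derive the key range that matches `key' under the ambiguity options,
       then test candidates with the ambiguity-aware comparator. */
    pinyin_option_t options = PINYIN_AMB_ALL;   /* assumed option flag */
    ChewingKey lower, upper;
    compute_lower_value2(options, &key, &lower, 1);
    compute_upper_value2(options, &key, &upper, 1);
    /* candidates whose keys fall inside [lower, upper] are then checked with
       pinyin_compare_with_ambiguities2(options, &key, &candidate, 1). */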
diff --git a/src/storage/table_info.cpp b/src/storage/table_info.cpp
new file mode 100644
index 0000000..795d93d
--- /dev/null
+++ b/src/storage/table_info.cpp
@@ -0,0 +1,272 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2013 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "table_info.h"
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+using namespace pinyin;
+
+
+static const pinyin_table_info_t reserved_tables[] = {
+ {RESERVED, NULL, NULL, NULL, NOT_USED},
+ {GB_DICTIONARY, "gb_char.table", "gb_char.bin", "gb_char.dbin", SYSTEM_FILE},
+ {GBK_DICTIONARY, "gbk_char.table", "gbk_char.bin", "gbk_char.dbin", SYSTEM_FILE},
+
+ {MERGED_DICTIONARY, "merged.table", "merged.bin", "merged.dbin", SYSTEM_FILE},
+
+ {USER_DICTIONARY, NULL, NULL, "user.bin", USER_FILE}
+};
+
+
+SystemTableInfo::SystemTableInfo() {
+ m_binary_format_version = 0;
+ m_model_data_version = 0;
+ m_lambda = 0.;
+
+ size_t i;
+ for (i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ pinyin_table_info_t * table_info = &m_table_info[i];
+
+ table_info->m_dict_index = i;
+ table_info->m_table_filename = NULL;
+ table_info->m_system_filename = NULL;
+ table_info->m_user_filename = NULL;
+ table_info->m_file_type = NOT_USED;
+ }
+}
+
+SystemTableInfo::~SystemTableInfo() {
+ reset();
+}
+
+void SystemTableInfo::reset() {
+ m_binary_format_version = 0;
+ m_model_data_version = 0;
+ m_lambda = 0.;
+
+ size_t i;
+ for (i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ pinyin_table_info_t * table_info = &m_table_info[i];
+
+ g_free((gchar *)table_info->m_table_filename);
+ table_info->m_table_filename = NULL;
+ g_free((gchar *)table_info->m_system_filename);
+ table_info->m_system_filename = NULL;
+ g_free((gchar *)table_info->m_user_filename);
+ table_info->m_user_filename = NULL;
+
+ table_info->m_file_type = NOT_USED;
+ }
+}
+
+void SystemTableInfo::postfix_tables() {
+ size_t i;
+ for (i = 0; i < G_N_ELEMENTS(reserved_tables); ++i) {
+ const pinyin_table_info_t * postfix = &reserved_tables[i];
+
+ guint8 index = postfix->m_dict_index;
+ pinyin_table_info_t * table_info = &m_table_info[index];
+ assert(table_info->m_dict_index == index);
+
+ table_info->m_table_filename = g_strdup(postfix->m_table_filename);
+ table_info->m_system_filename = g_strdup(postfix->m_system_filename);
+ table_info->m_user_filename = g_strdup(postfix->m_user_filename);
+ table_info->m_file_type = postfix->m_file_type;
+ }
+}
+
+static gchar * to_string(const char * str) {
+ if (0 == strcmp(str, "NULL"))
+ return NULL;
+
+ return g_strdup(str);
+}
+
+static PHRASE_FILE_TYPE to_file_type(const char * str) {
+#define HANDLE(x) { \
+ if (0 == strcmp(str, #x)) \
+ return x; \
+ }
+
+ HANDLE(NOT_USED);
+ HANDLE(SYSTEM_FILE);
+ HANDLE(DICTIONARY);
+ HANDLE(USER_FILE);
+
+ assert(false);
+
+#undef HANDLE
+}
+
+bool SystemTableInfo::load(const char * filename) {
+ reset();
+
+ FILE * input = fopen(filename, "r");
+ if (NULL == input) {
+ fprintf(stderr, "open %s failed.\n", filename);
+ return false;
+ }
+
+ int binver = 0, modelver = 0;
+ gfloat lambda = 0.;
+
+ int num = fscanf(input, "binary format version:%d\n", &binver);
+ if (1 != num) {
+ fclose(input);
+ return false;
+ }
+
+ num = fscanf(input, "model data version:%d\n", &modelver);
+ if (1 != num) {
+ fclose(input);
+ return false;
+ }
+
+ num = fscanf(input, "lambda parameter:%f\n", &lambda);
+ if (1 != num) {
+ fclose(input);
+ return false;
+ }
+
+#if 0
+ printf("binver:%d modelver:%d lambda:%f\n", binver, modelver, lambda);
+#endif
+
+ m_binary_format_version = binver;
+ m_model_data_version = modelver;
+ m_lambda = lambda;
+
+ int index = 0;
+ char tablefile[256], sysfile[256], userfile[256], filetype[256];
+ while (!feof(input)) {
+ num = fscanf(input, "%d %s %s %s %s\n",
+ &index, tablefile, sysfile, userfile, filetype);
+
+ if (5 != num)
+ continue;
+
+ if (!(0 <= index && index < PHRASE_INDEX_LIBRARY_COUNT))
+ continue;
+
+ /* save into m_table_info. */
+ pinyin_table_info_t * table_info = &m_table_info[index];
+ assert(index == table_info->m_dict_index);
+
+ table_info->m_table_filename = to_string(tablefile);
+ table_info->m_system_filename = to_string(sysfile);
+ table_info->m_user_filename = to_string(userfile);
+
+ table_info->m_file_type = to_file_type(filetype);
+ }
+
+ fclose(input);
+
+ /* postfix reserved tables. */
+ postfix_tables();
+ return true;
+}
+
+const pinyin_table_info_t * SystemTableInfo::get_table_info() {
+ return m_table_info;
+}
+
+gfloat SystemTableInfo::get_lambda() {
+ return m_lambda;
+}
+
+
+UserTableInfo::UserTableInfo() {
+ m_binary_format_version = 0;
+ m_model_data_version = 0;
+}
+
+void UserTableInfo::reset() {
+ m_binary_format_version = 0;
+ m_model_data_version = 0;
+}
+
+bool UserTableInfo::load(const char * filename) {
+ reset();
+
+ FILE * input = fopen(filename, "r");
+ if (NULL == input) {
+ fprintf(stderr, "open %s failed.", filename);
+ return false;
+ }
+
+ int binver = 0, modelver = 0;
+
+ int num = fscanf(input, "binary format version:%d\n", &binver);
+ if (1 != num) {
+ fclose(input);
+ return false;
+ }
+
+ num = fscanf(input, "model data version:%d\n", &modelver);
+ if (1 != num) {
+ fclose(input);
+ return false;
+ }
+
+#if 0
+ printf("binver:%d modelver:%d\n", binver, modelver);
+#endif
+
+ m_binary_format_version = binver;
+ m_model_data_version = modelver;
+
+ fclose(input);
+
+ return true;
+}
+
+bool UserTableInfo::save(const char * filename) {
+ FILE * output = fopen(filename, "w");
+ if (NULL == output) {
+ fprintf(stderr, "write %s failed.\n", filename);
+ return false;
+ }
+
+ fprintf(output, "binary format version:%d\n", m_binary_format_version);
+ fprintf(output, "model data version:%d\n", m_model_data_version);
+
+ fclose(output);
+
+ return true;
+}
+
+bool UserTableInfo::is_conform(const SystemTableInfo * sysinfo) {
+ if (sysinfo->m_binary_format_version != m_binary_format_version)
+ return false;
+
+ if (sysinfo->m_model_data_version != m_model_data_version)
+ return false;
+
+ return true;
+}
+
+bool UserTableInfo::make_conform(const SystemTableInfo * sysinfo) {
+ m_binary_format_version = sysinfo->m_binary_format_version;
+ m_model_data_version = sysinfo->m_model_data_version;
+ return true;
+}
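For reference, SystemTableInfo::load above expects a plain-text table.conf shaped like the following (contents are illustrative; the real indexes and file names come from data/table.conf and the reserved_tables entries):

    binary format version:1
    model data version:1
    lambda parameter:0.5
    1 gb_char.table gb_char.bin NULL SYSTEM_FILE
    2 gbk_char.table gbk_char.bin NULL SYSTEM_FILE
    15 NULL NULL user.bin USER_FILE

Each per-table line carries the dictionary index, the text table file, the system binary, the user binary ("NULL" marks an unused slot), and the PHRASE_FILE_TYPE; entries listed in reserved_tables are overwritten by postfix_tables() after parsing.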
diff --git a/src/storage/table_info.h b/src/storage/table_info.h
new file mode 100644
index 0000000..8d7fa05
--- /dev/null
+++ b/src/storage/table_info.h
@@ -0,0 +1,97 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2013 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef TABLE_INFO_H
+#define TABLE_INFO_H
+
+#include "novel_types.h"
+
+
+namespace pinyin{
+
+typedef enum {
+ NOT_USED, /* not used. */
+ SYSTEM_FILE, /* system phrase file. */
+ DICTIONARY, /* professional dictionary. */
+ USER_FILE, /* user only phrase file. */
+} PHRASE_FILE_TYPE;
+
+typedef struct {
+    guint8 m_dict_index; /* for assertion purposes. */
+ const gchar * m_table_filename;
+ const gchar * m_system_filename;
+ const gchar * m_user_filename;
+ PHRASE_FILE_TYPE m_file_type;
+} pinyin_table_info_t;
+
+
+class UserTableInfo;
+
+class SystemTableInfo{
+ friend class UserTableInfo;
+private:
+ int m_binary_format_version;
+ int m_model_data_version;
+ gfloat m_lambda;
+
+ pinyin_table_info_t m_table_info[PHRASE_INDEX_LIBRARY_COUNT];
+
+private:
+ void reset();
+
+ void postfix_tables();
+
+public:
+ SystemTableInfo();
+
+ ~SystemTableInfo();
+
+ bool load(const char * filename);
+
+ const pinyin_table_info_t * get_table_info();
+
+ gfloat get_lambda();
+};
+
+class UserTableInfo{
+private:
+ int m_binary_format_version;
+ int m_model_data_version;
+
+private:
+ void reset();
+
+public:
+ UserTableInfo();
+
+ bool load(const char * filename);
+
+ bool save(const char * filename);
+
+ bool is_conform(const SystemTableInfo * sysinfo);
+
+ bool make_conform(const SystemTableInfo * sysinfo);
+};
+
+};
+
+
+#endif
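A minimal sketch of how the two classes above are intended to interact (file names are illustrative, not part of the patch):

    SystemTableInfo system_info;
    if (!system_info.load("data/table.conf"))
        return false;                      /* system tables unavailable */

    UserTableInfo user_info;
    user_info.load("user/user.conf");      /* a missing file simply fails */
    if (!user_info.is_conform(&system_info)) {
        /* the cached user data is stale: rebuild it here, then record
           the new binary format and model data versions. */
        user_info.make_conform(&system_info);
        user_info.save("user/user.conf");
    }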
diff --git a/src/storage/tag_utility.cpp b/src/storage/tag_utility.cpp
new file mode 100644
index 0000000..081e931
--- /dev/null
+++ b/src/storage/tag_utility.cpp
@@ -0,0 +1,420 @@
+#include <glib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "phrase_large_table2.h"
+#include "tag_utility.h"
+
+namespace pinyin{
+
+/* internal taglib structure */
+struct tag_entry{
+ int m_line_type;
+ char * m_line_tag;
+ int m_num_of_values;
+ char ** m_required_tags;
+ /* char ** m_optional_tags; */
+ /* int m_optional_count = 0; */
+ char ** m_ignored_tags;
+};
+
+tag_entry tag_entry_copy(int line_type, const char * line_tag,
+ int num_of_values,
+ char * required_tags[],
+ char * ignored_tags[]){
+ tag_entry entry;
+ entry.m_line_type = line_type;
+ entry.m_line_tag = g_strdup( line_tag );
+ entry.m_num_of_values = num_of_values;
+ entry.m_required_tags = g_strdupv( required_tags );
+ entry.m_ignored_tags = g_strdupv( ignored_tags );
+ return entry;
+}
+
+tag_entry tag_entry_clone(tag_entry * entry){
+ return tag_entry_copy(entry->m_line_type, entry->m_line_tag,
+ entry->m_num_of_values,
+ entry->m_required_tags, entry->m_ignored_tags);
+}
+
+void tag_entry_reclaim(tag_entry * entry){
+ g_free( entry->m_line_tag );
+ g_strfreev( entry->m_required_tags );
+ g_strfreev(entry->m_ignored_tags);
+}
+
+static bool taglib_free_tag_array(GArray * tag_array){
+ for ( size_t i = 0; i < tag_array->len; ++i) {
+ tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+ tag_entry_reclaim(entry);
+ }
+ g_array_free(tag_array, TRUE);
+ return true;
+}
+
+/* special unichars to be handled in split_line. */
+static gunichar backslash = 0;
+static gunichar quote = 0;
+
+static gboolean split_line_init(){
+ backslash = g_utf8_get_char("\\");
+ quote = g_utf8_get_char("\"");
+ return TRUE;
+}
+
+/* Pointer Array of Array of tag_entry */
+static GPtrArray * g_tagutils_stack = NULL;
+
+bool taglib_init(){
+ assert( g_tagutils_stack == NULL);
+ g_tagutils_stack = g_ptr_array_new();
+ GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
+ g_ptr_array_add(g_tagutils_stack, tag_array);
+
+ /* init split_line. */
+ split_line_init();
+ return true;
+}
+
+bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
+ const char * required_tags, const char * ignored_tags){
+ GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
+ g_tagutils_stack->len - 1);
+
+    /* check for duplicate tag names or line types here. */
+ for ( size_t i = 0; i < tag_array->len; ++i) {
+ tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+ if ( entry->m_line_type == line_type ||
+ strcmp( entry->m_line_tag, line_tag ) == 0 )
+ return false;
+ }
+
+ char ** required = g_strsplit_set(required_tags, ",:", -1);
+ char ** ignored = g_strsplit_set(ignored_tags, ",:", -1);
+
+ tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values,
+ required, ignored);
+ g_array_append_val(tag_array, entry);
+
+ g_strfreev(required);
+ g_strfreev(ignored);
+ return true;
+}
+
+static void ptr_array_entry_free(gpointer data, gpointer user_data){
+ g_free(data);
+}
+
+static gboolean hash_table_key_value_free(gpointer key, gpointer value,
+ gpointer user_data){
+ g_free(key);
+ g_free(value);
+ return TRUE;
+}
+
+/* split the line into tokens. */
+static gchar ** split_line(const gchar * line){
+ /* array for tokens. */
+ GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
+
+ for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
+ gunichar unichar = g_utf8_get_char(cur);
+ const gchar * begin = cur;
+ gchar * token = NULL;
+
+ if ( g_unichar_isspace (unichar) ) {
+ continue;
+ }else if ( unichar == quote ) {
+            /* handles quoted tokens, including escaped '\"'. */
+            /* skip the opening '"'. */
+ begin = cur = g_utf8_next_char(cur);
+ while (*cur) {
+ unichar = g_utf8_get_char(cur);
+ if ( unichar == backslash ) {
+ cur = g_utf8_next_char(cur);
+ g_return_val_if_fail(*cur, NULL);
+ } else if ( unichar == quote ){
+ break;
+ }
+ cur = g_utf8_next_char(cur);
+ }
+ gchar * tmp = g_strndup( begin, cur - begin);
+            /* TODO: switch to our own strdup_escape implementation
+               to transform \" into ". */
+ token = g_strdup_printf("%s", tmp);
+ g_free(tmp);
+ } else {
+ /* handles other tokens. */
+ while(*cur) {
+ unichar = g_utf8_get_char(cur);
+ if ( g_unichar_isgraph(unichar) ) {
+ /* next unichar */
+ cur = g_utf8_next_char(cur);
+ } else {
+                    /* stop at spaces and other non-graphic characters. */
+ break;
+ }
+ }
+ token = g_strndup( begin, cur - begin );
+ }
+
+ g_array_append_val(tokens, token);
+ if ( !*cur )
+ break;
+ }
+
+ return (gchar **)g_array_free(tokens, FALSE);
+}
+
+bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
+ GHashTable * required){
+ /* reset values and required. */
+ g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
+ g_ptr_array_set_size(values, 0);
+ g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
+
+    /* use our own split_line instead of
+       g_strsplit_set to handle special tokens. */
+ char ** tokens = split_line(input_line);
+ int num_of_tokens = g_strv_length(tokens);
+
+ char * line_tag = tokens[0];
+ GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+
+ tag_entry * cur_entry = NULL;
+ /* find line type. */
+ for ( size_t i = 0; i < tag_array->len; ++i) {
+ tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+ if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
+ cur_entry = entry;
+ break;
+ }
+ }
+
+ if ( !cur_entry )
+ return false;
+
+ line_type = cur_entry->m_line_type;
+
+ for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) {
+ g_return_val_if_fail(i < num_of_tokens, false);
+ char * value = g_strdup( tokens[i] );
+ g_ptr_array_add(values, value);
+ }
+
+ int ignored_len = g_strv_length( cur_entry->m_ignored_tags );
+ int required_len = g_strv_length( cur_entry->m_required_tags);
+
+ for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){
+ g_return_val_if_fail(i < num_of_tokens, false);
+ const char * tmp = tokens[i];
+
+ /* check ignored tags. */
+ bool tag_ignored = false;
+ for ( int m = 0; m < ignored_len; ++m) {
+ if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) {
+ tag_ignored = true;
+ break;
+ }
+ }
+
+ if ( tag_ignored ) {
+ ++i;
+ continue;
+ }
+
+ /* check required tags. */
+ bool tag_required = false;
+ for ( int m = 0; m < required_len; ++m) {
+ if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) {
+ tag_required = true;
+ break;
+ }
+ }
+
+        /* warn about unexpected tags. */
+ if ( !tag_required ) {
+ g_warning("un-expected tags:%s.\n", tmp);
+ ++i;
+ continue;
+ }
+
+ char * key = g_strdup(tokens[i]);
+ ++i;
+ g_return_val_if_fail(i < num_of_tokens, false);
+ char * value = g_strdup(tokens[i]);
+ g_hash_table_insert(required, key, value);
+ }
+
+ /* check for all required tags. */
+ for ( int i = 0; i < required_len; ++i) {
+ const char * required_tag_str = cur_entry->m_required_tags[i];
+ gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL);
+ if ( !result ) {
+ g_warning("missed required tags: %s.\n", required_tag_str);
+ g_strfreev(tokens);
+ return false;
+ }
+ }
+
+ g_strfreev(tokens);
+ return true;
+}
+
+bool taglib_remove_tag(int line_type){
+ /* Note: duplicate entry check is in taglib_add_tag. */
+ GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+ for ( size_t i = 0; i < tag_array->len; ++i) {
+ tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+ if (entry->m_line_type != line_type)
+ continue;
+ tag_entry_reclaim(entry);
+ g_array_remove_index(tag_array, i);
+ return true;
+ }
+ return false;
+}
+
+bool taglib_push_state(){
+ assert(g_tagutils_stack->len >= 1);
+ GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
+ GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+ for ( size_t i = 0; i < prev_tag_array->len; ++i) {
+ tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i);
+ tag_entry new_entry = tag_entry_clone(entry);
+ g_array_append_val(next_tag_array, new_entry);
+ }
+ g_ptr_array_add(g_tagutils_stack, next_tag_array);
+ return true;
+}
+
+bool taglib_pop_state(){
+ assert(g_tagutils_stack->len > 1);
+ GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+ g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+ taglib_free_tag_array(tag_array);
+ return true;
+}
+
+bool taglib_fini(){
+ for ( size_t i = 0; i < g_tagutils_stack->len; ++i){
+ GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i);
+ taglib_free_tag_array(tag_array);
+ }
+ g_ptr_array_free(g_tagutils_stack, TRUE);
+ g_tagutils_stack = NULL;
+ return true;
+}
+
+#if 0
+
+static phrase_token_t taglib_special_string_to_token(const char * string){
+ struct token_pair{
+ phrase_token_t token;
+ const char * string;
+ };
+
+ static const token_pair tokens [] = {
+ {sentence_start, "<start>"},
+ {0, NULL}
+ };
+
+ const token_pair * pair = tokens;
+ while (pair->string) {
+ if ( strcmp(string, pair->string ) == 0 )
+ return pair->token;
+ pair++;
+ }
+
+ fprintf(stderr, "error: unknown token:%s.\n", string);
+ return 0;
+}
+
+phrase_token_t taglib_string_to_token(PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ const char * string){
+ phrase_token_t token = null_token;
+ if ( string[0] == '<' ) {
+ return taglib_special_string_to_token(string);
+ }
+
+ glong phrase_len = g_utf8_strlen(string, -1);
+ ucs4_t * phrase = g_utf8_to_ucs4(string, -1, NULL, NULL, NULL);
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index->prepare_tokens(tokens);
+ int result = phrase_table->search(phrase_len, phrase, tokens);
+ int num = get_first_token(tokens, token);
+ phrase_index->destroy_tokens(tokens);
+
+ if ( !(result & SEARCH_OK) )
+ fprintf(stderr, "error: unknown token:%s.\n", string);
+
+ g_free(phrase);
+ return token;
+}
+
+#endif
+
+static const char * taglib_special_token_to_string(phrase_token_t token){
+ struct token_pair{
+ phrase_token_t token;
+ const char * string;
+ };
+
+ static const token_pair tokens [] = {
+ {sentence_start, "<start>"},
+ {0, NULL}
+ };
+
+ const token_pair * pair = tokens;
+ while (pair->token) {
+ if ( token == pair->token )
+ return pair->string;
+ pair++;
+ }
+
+ fprintf(stderr, "error: unknown token:%d.\n", token);
+ return NULL;
+}
+
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+ phrase_token_t token) {
+ PhraseItem item;
+ ucs4_t buffer[MAX_PHRASE_LENGTH];
+
+ gchar * phrase;
+ /* deal with the special phrase index, for "<start>..." */
+ if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
+ return g_strdup(taglib_special_token_to_string(token));
+ }
+
+ int result = phrase_index->get_phrase_item(token, item);
+ if (result != ERROR_OK) {
+ fprintf(stderr, "error: unknown token:%d.\n", token);
+ return NULL;
+ }
+
+ item.get_phrase_string(buffer);
+ guint8 length = item.get_phrase_length();
+ phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
+ return phrase;
+}
+
+bool taglib_validate_token_with_string(FacadePhraseIndex * phrase_index,
+ phrase_token_t token,
+ const char * string){
+ bool result = false;
+
+ char * str = taglib_token_to_string(phrase_index, token);
+ result = (0 == strcmp(str, string));
+ g_free(str);
+
+ return result;
+}
+
+
+};
diff --git a/src/storage/tag_utility.h b/src/storage/tag_utility.h
new file mode 100644
index 0000000..ceb1d6c
--- /dev/null
+++ b/src/storage/tag_utility.h
@@ -0,0 +1,151 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef TAG_UTILITY_H
+#define TAG_UTILITY_H
+
+#include "novel_types.h"
+
+/* Note: the optional tag has been removed from this first implementation.
+ * It may be added back later.
+ */
+
+namespace pinyin{
+
+/**
+ * taglib_init:
+ * @returns: whether the initialize operation is successful.
+ *
+ * Initialize the n-gram tag parse library.
+ *
+ */
+bool taglib_init();
+
+/**
+ * taglib_add_tag:
+ * @line_type: the line type.
+ * @line_tag: the line tag.
+ * @num_of_values: the number of values following the line tag.
+ * @required_tags: the required tags of the line.
+ * @ignored_tags: the ignored tags of the line.
+ * @returns: whether the add operation is successful.
+ *
+ * Add one line tag to the tag parse library.
+ *
+ * Note: the required and ignored tags are separated by ',' or ':' .
+ *
+ */
+bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags);
+
+/**
+ * taglib_read:
+ * @input_line: one input line.
+ * @line_type: the line type.
+ * @values: the values following the line tag.
+ * @required: the required tags of the line type.
+ * @returns: whether the line is parsed ok.
+ *
+ * Parse one input line into line_type, values and required tags.
+ *
+ * Note: values is a pointer array of strings and required maps tag names to values (all const char *).
+ *
+ */
+bool taglib_read(const char * input_line, int & line_type,
+ GPtrArray * values, GHashTable * required);
+
+/**
+ * taglib_remove_tag:
+ * @line_type: the type of the line tag.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove one line tag.
+ *
+ */
+bool taglib_remove_tag(int line_type);
+
+/**
+ * taglib_push_state:
+ * @returns: whether the push operation is successful.
+ *
+ * Push the current state onto the stack.
+ *
+ * Note: the taglib_push/pop_state functions save the current known
+ * tag list on a stack.
+ * Use them when the parsing context changes.
+ */
+bool taglib_push_state();
+
+/**
+ * taglib_pop_state:
+ * @returns: whether the pop operation is successful.
+ *
+ * Pop the current state off the stack.
+ *
+ */
+bool taglib_pop_state();
+
+/**
+ * taglib_fini:
+ * @returns: whether the finish operation is successful.
+ *
+ * Finish the n-gram tag parse library.
+ *
+ */
+bool taglib_fini();
+
+class PhraseLargeTable2;
+class FacadePhraseIndex;
+
+
+/**
+ * taglib_token_to_string:
+ * @phrase_index: the phrase index for phrase string lookup.
+ * @token: the phrase token.
+ * @returns: the phrase string found in phrase index.
+ *
+ * Translate one token into the phrase string.
+ *
+ */
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+ phrase_token_t token);
+
+/**
+ * taglib_validate_token_with_string:
+ * @phrase_index: the phrase index.
+ * @token: the phrase token.
+ * @string: the phrase string.
+ * @returns: whether the token is validated with the phrase string.
+ *
+ * Validate the token with the phrase string.
+ *
+ */
+bool taglib_validate_token_with_string(FacadePhraseIndex * phrase_index,
+ phrase_token_t token,
+ const char * string);
+
+/* Note: the following function is only available when the optional tag exists.
+ bool taglib_report_status(int line_type); */
+
+/* Note: taglib_write is omitted, as printf is more suitable for this. */
+
+};
+
+#endif
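A minimal sketch of the taglib call sequence documented above (the line type value, tag names and input line are illustrative, not part of the patch):

    enum { BEGIN_LINE = 1 };                 /* assumed line type value */

    taglib_init();
    /* a "\data" line carries no positional values; tag "model" is required. */
    taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "");

    int line_type = 0;
    GPtrArray * values = g_ptr_array_new();
    GHashTable * required = g_hash_table_new(g_str_hash, g_str_equal);

    if (taglib_read("\\data model interpolation", line_type, values, required)) {
        const char * model = (const char *) g_hash_table_lookup(required, "model");
        printf("line type:%d model:%s\n", line_type, model);
    }

    taglib_fini();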
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..5783407
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,33 @@
+add_subdirectory(include)
+add_subdirectory(storage)
+add_subdirectory(lookup)
+
+add_executable(
+ test_pinyin
+ test_pinyin.cpp
+)
+
+target_link_libraries(
+ test_pinyin
+ libpinyin
+)
+
+add_executable(
+ test_phrase
+ test_phrase.cpp
+)
+
+target_link_libraries(
+ test_phrase
+ libpinyin
+)
+
+add_executable(
+ test_chewing
+ test_chewing.cpp
+)
+
+target_link_libraries(
+ test_chewing
+ libpinyin
+)
diff --git a/tests/Makefile.am b/tests/Makefile.am
new file mode 100644
index 0000000..8208214
--- /dev/null
+++ b/tests/Makefile.am
@@ -0,0 +1,50 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+AUTOMAKE_OPTIONS = gnu
+SUBDIRS = include storage lookup
+
+MAINTAINERCLEANFILES = Makefile.in
+
+CLEANFILES = *.bak
+
+ACLOCAL = aclocal -I $(ac_aux_dir)
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ @GLIB2_CFLAGS@
+
+noinst_HEADERS = timer.h \
+ tests_helper.h
+
+noinst_PROGRAMS = test_pinyin \
+ test_phrase \
+ test_chewing
+
+test_pinyin_SOURCES = test_pinyin.cpp
+
+test_pinyin_LDADD = ../src/libpinyin.la @GLIB2_LIBS@
+
+test_phrase_SOURCES = test_phrase.cpp
+
+test_phrase_LDADD = ../src/libpinyin.la @GLIB2_LIBS@
+
+test_chewing_SOURCES = test_chewing.cpp
+
+test_chewing_LDADD = ../src/libpinyin.la @GLIB2_LIBS@
diff --git a/tests/include/CMakeLists.txt b/tests/include/CMakeLists.txt
new file mode 100644
index 0000000..f51c87e
--- /dev/null
+++ b/tests/include/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(
+ test_memory_chunk
+ test_memory_chunk.cpp
+)
+
+target_link_libraries(
+ test_memory_chunk
+ libpinyin
+)
\ No newline at end of file
diff --git a/tests/include/Makefile.am b/tests/include/Makefile.am
new file mode 100644
index 0000000..7174bec
--- /dev/null
+++ b/tests/include/Makefile.am
@@ -0,0 +1,31 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ @GLIB2_CFLAGS@
+
+TESTS = test_memory_chunk
+
+noinst_PROGRAMS = test_memory_chunk
+
+test_memory_chunk_SOURCES = test_memory_chunk.cpp
+
+test_memory_chunk_LDADD = @GLIB2_LIBS@
+
diff --git a/tests/include/test_memory_chunk.cpp b/tests/include/test_memory_chunk.cpp
new file mode 100644
index 0000000..9779c8f
--- /dev/null
+++ b/tests/include/test_memory_chunk.cpp
@@ -0,0 +1,64 @@
+#include <stdio.h>
+#include "pinyin_internal.h"
+
+// Test MemoryChunk functionality
+int main(int argc, char * argv[]){
+ MemoryChunk* chunk;
+ chunk = new MemoryChunk();
+ int i = 12;
+ chunk->set_content(0, &i, sizeof(int));
+
+ int * p = (int *)chunk->begin();
+ assert(chunk->size() == sizeof(int));
+ printf("%d\n", *p);
+ printf("%ld\n", chunk->capacity());
+
+ p = & i;
+ chunk->set_chunk(p, sizeof(int), NULL);
+ short t = 5;
+ chunk->set_content(sizeof(int), &t, sizeof(short));
+ assert( sizeof(int) + sizeof(short) == chunk->size());
+ printf("%ld\n", chunk->capacity());
+
+ p = (int *)chunk->begin();
+ short * p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+ printf("%d\t%d\n", *p, *p2);
+
+ chunk->set_content(sizeof(int) + sizeof(short), &t, sizeof(short));
+
+ assert( sizeof(int) + (sizeof(short) << 1) == chunk->size());
+ printf("%ld\n", chunk->capacity());
+ p = (int *)chunk->begin();
+ p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+ printf("%d\t%d\t%d\n", *p, *p2, *(p2 + 1));
+
+ chunk->set_size(sizeof(int) + sizeof(short) *3);
+ p = (int *)chunk->begin();
+ p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
+
+ chunk->set_content(0, &i, sizeof(int));
+
+ *(p2+2) = 3;
+ printf("%d\t%d\t%d\t%d\n", *p, *p2, *(p2 + 1), *(p2+2));
+
+ int m = 10;
+ chunk->set_chunk(&m, sizeof(int), NULL);
+ int n = 12;
+ chunk->insert_content(sizeof(int), &n, sizeof(int));
+ n = 11;
+ chunk->insert_content(sizeof(int), &n, sizeof(int));
+
+ int * p3 = (int *)chunk->begin();
+ printf("%d\t%d\t%d\n", *p3, *(p3+1), *(p3+2));
+
+ chunk->remove_content(sizeof(int), sizeof(int));
+ printf("%d\t%d\n", *p3, *(p3+1));
+
+ int tmp;
+ assert(chunk->get_content(sizeof(int), &tmp, sizeof(int)));
+ printf("%d\n", tmp);
+
+ delete chunk;
+
+ return 0;
+}
diff --git a/tests/lookup/CMakeLists.txt b/tests/lookup/CMakeLists.txt
new file mode 100644
index 0000000..3304c47
--- /dev/null
+++ b/tests/lookup/CMakeLists.txt
@@ -0,0 +1,21 @@
+include_directories(..)
+
+add_executable(
+ test_pinyin_lookup
+ test_pinyin_lookup.cpp
+)
+
+target_link_libraries(
+ test_pinyin_lookup
+ libpinyin
+)
+
+add_executable(
+ test_phrase_lookup
+ test_phrase_lookup.cpp
+)
+
+target_link_libraries(
+ test_phrase_lookup
+ libpinyin
+)
diff --git a/tests/lookup/Makefile.am b/tests/lookup/Makefile.am
new file mode 100644
index 0000000..4bcc176
--- /dev/null
+++ b/tests/lookup/Makefile.am
@@ -0,0 +1,34 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/tests \
+ @GLIB2_CFLAGS@
+
+noinst_PROGRAMS = test_pinyin_lookup \
+ test_phrase_lookup
+
+test_pinyin_lookup_SOURCES = test_pinyin_lookup.cpp
+
+test_pinyin_lookup_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+test_phrase_lookup_SOURCES = test_phrase_lookup.cpp
+
+test_phrase_lookup_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
\ No newline at end of file
diff --git a/tests/lookup/test_phrase_lookup.cpp b/tests/lookup/test_phrase_lookup.cpp
new file mode 100644
index 0000000..c7bfd19
--- /dev/null
+++ b/tests/lookup/test_phrase_lookup.cpp
@@ -0,0 +1,118 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "tests_helper.h"
+
+
+bool try_phrase_lookup(PhraseLookup * phrase_lookup,
+ ucs4_t * ucs4_str, glong ucs4_len){
+ char * result_string = NULL;
+ MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ phrase_lookup->get_best_match(ucs4_len, ucs4_str, results);
+#if 0
+ for ( size_t i = 0; i < results->len; ++i) {
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ if ( *token == null_token )
+ continue;
+ printf("%d:%d\t", i, *token);
+ }
+ printf("\n");
+#endif
+ phrase_lookup->convert_to_utf8(results, result_string);
+ if (result_string)
+ printf("%s\n", result_string);
+ else
+ fprintf(stderr, "Error: Un-segmentable sentence encountered!\n");
+ g_array_free(results, TRUE);
+ g_free(result_string);
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load("../../data/table.conf");
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* init phrase table */
+ FacadePhraseTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load("../../data/phrase_index.bin");
+ phrase_table.load(chunk, NULL);
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ /* init phrase index */
+ FacadePhraseIndex phrase_index;
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ /* init bi-gram */
+ Bigram system_bigram;
+ system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
+ Bigram user_bigram;
+
+ gfloat lambda = system_table_info.get_lambda();
+
+ /* init phrase lookup */
+ PhraseLookup phrase_lookup(lambda,
+ &phrase_table, &phrase_index,
+ &system_bigram, &user_bigram);
+
+ /* try one sentence */
+ char * linebuf = NULL;
+ size_t size = 0;
+ ssize_t read;
+ while( (read = getline(&linebuf, &size, stdin)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if ( strcmp ( linebuf, "quit" ) == 0)
+ break;
+
+ /* check non-ucs4 characters */
+ const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+ glong len = 0;
+ ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+ if ( len != num_of_chars ) {
+ fprintf(stderr, "non-ucs4 characters are not accepted.\n");
+ g_free(sentence);
+ continue;
+ }
+
+ try_phrase_lookup(&phrase_lookup, sentence, len);
+ g_free(sentence);
+ }
+
+ free(linebuf);
+ return 0;
+}
diff --git a/tests/lookup/test_pinyin_lookup.cpp b/tests/lookup/test_pinyin_lookup.cpp
new file mode 100644
index 0000000..3175db0
--- /dev/null
+++ b/tests/lookup/test_pinyin_lookup.cpp
@@ -0,0 +1,126 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "timer.h"
+#include <string.h>
+#include "pinyin_internal.h"
+#include "tests_helper.h"
+
+size_t bench_times = 100;
+
+int main( int argc, char * argv[]){
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load("../../data/table.conf");
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ pinyin_option_t options =
+ USE_TONE | USE_RESPLIT_TABLE | PINYIN_CORRECT_ALL | PINYIN_AMB_ALL;
+ FacadeChewingTable largetable;
+
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load("../../data/pinyin_index.bin");
+ largetable.load(options, chunk, NULL);
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ FacadePhraseIndex phrase_index;
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram system_bigram;
+ system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
+ Bigram user_bigram;
+ user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE);
+
+ gfloat lambda = system_table_info.get_lambda();
+
+ PinyinLookup2 pinyin_lookup(lambda, options,
+ &largetable, &phrase_index,
+ &system_bigram, &user_bigram);
+
+ /* prepare the prefixes for get_best_match. */
+ TokenVector prefixes = g_array_new
+ (FALSE, FALSE, sizeof(phrase_token_t));
+ g_array_append_val(prefixes, sentence_start);
+
+ CandidateConstraints constraints = g_array_new
+ (TRUE, FALSE, sizeof(lookup_constraint_t));
+
+ MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+ char* linebuf = NULL; size_t size = 0; ssize_t read;
+ while( (read = getline(&linebuf, &size, stdin)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if ( strcmp ( linebuf, "quit" ) == 0)
+ break;
+
+ FullPinyinParser2 parser;
+ ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ ChewingKeyRestVector key_rests =
+ g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+ parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));
+
+ if ( 0 == keys->len ) /* invalid pinyin */
+ continue;
+
+ /* initialize constraints. */
+ g_array_set_size(constraints, keys->len);
+ for ( size_t i = 0; i < constraints->len; ++i){
+ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+
+ guint32 start_time = record_time();
+ for ( size_t i = 0; i < bench_times; ++i)
+ pinyin_lookup.get_best_match(prefixes, keys, constraints, results);
+ print_time(start_time, bench_times);
+ for ( size_t i = 0; i < results->len; ++i){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ if ( null_token == *token)
+ continue;
+ printf("pos:%ld,token:%d\t", i, *token);
+ }
+ printf("\n");
+ char * sentence = NULL;
+ pinyin_lookup.convert_to_utf8(results, sentence);
+ printf("%s\n", sentence);
+
+ g_array_free(keys, TRUE);
+ g_array_free(key_rests, TRUE);
+ g_free(sentence);
+ }
+
+ g_array_free(prefixes, TRUE);
+ g_array_free(constraints, TRUE);
+ g_array_free(results, TRUE);
+
+ free(linebuf);
+ return 0;
+}
diff --git a/tests/storage/CMakeLists.txt b/tests/storage/CMakeLists.txt
new file mode 100644
index 0000000..96b12fc
--- /dev/null
+++ b/tests/storage/CMakeLists.txt
@@ -0,0 +1,71 @@
+include_directories(..)
+
+add_executable(
+ test_parser2
+ test_parser2.cpp
+)
+
+target_link_libraries(
+ test_parser2
+ libpinyin
+)
+
+add_executable(
+ test_chewing_table
+ test_chewing_table.cpp
+)
+
+target_link_libraries(
+ test_chewing_table
+ libpinyin
+)
+
+add_executable(
+ test_phrase_index
+ test_phrase_index.cpp
+)
+
+target_link_libraries(
+ test_phrase_index
+ libpinyin
+)
+
+add_executable(
+ test_phrase_index_logger
+ test_phrase_index_logger.cpp
+)
+
+target_link_libraries(
+ test_phrase_index_logger
+ libpinyin
+)
+
+add_executable(
+ test_phrase_table
+ test_phrase_table.cpp
+)
+
+target_link_libraries(
+ test_phrase_table
+ libpinyin
+)
+
+add_executable(
+ test_ngram
+ test_ngram.cpp
+)
+
+target_link_libraries(
+ test_ngram
+ libpinyin
+)
+
+add_executable(
+ test_flexible_ngram
+ test_flexible_ngram.cpp
+)
+
+target_link_libraries(
+ test_flexible_ngram
+ libpinyin
+)
diff --git a/tests/storage/Makefile.am b/tests/storage/Makefile.am
new file mode 100644
index 0000000..b7ed8b6
--- /dev/null
+++ b/tests/storage/Makefile.am
@@ -0,0 +1,71 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/tests \
+ @GLIB2_CFLAGS@
+
+TESTS = test_phrase_index_logger \
+ test_ngram \
+ test_flexible_ngram
+
+noinst_PROGRAMS = test_phrase_index \
+ test_phrase_index_logger \
+ test_phrase_table \
+ test_ngram \
+ test_flexible_ngram \
+ test_parser2 \
+ test_chewing_table \
+ test_table_info
+
+
+test_phrase_index_SOURCES = test_phrase_index.cpp
+
+test_phrase_index_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+test_phrase_index_logger_SOURCES = test_phrase_index_logger.cpp
+
+test_phrase_index_logger_LDADD = ../../src/libpinyin_internal.la \
+ @GLIB2_LIBS@
+
+test_phrase_table_SOURCES = test_phrase_table.cpp
+
+test_phrase_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+test_ngram_SOURCES = test_ngram.cpp
+
+test_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+test_flexible_ngram_SOURCES = test_flexible_ngram.cpp
+
+test_flexible_ngram_LDADD = ../../src/libpinyin_internal.la \
+ @GLIB2_LIBS@
+
+test_parser2_SOURCES = test_parser2.cpp
+
+test_parser2_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+test_chewing_table_SOURCES = test_chewing_table.cpp
+
+test_chewing_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+test_table_info_SOURCES = test_table_info.cpp
+
+test_table_info_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/tests/storage/test_chewing_table.cpp b/tests/storage/test_chewing_table.cpp
new file mode 100644
index 0000000..f3d0f5d
--- /dev/null
+++ b/tests/storage/test_chewing_table.cpp
@@ -0,0 +1,148 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "timer.h"
+#include <string.h>
+#include "pinyin_internal.h"
+#include "tests_helper.h"
+
+size_t bench_times = 1000;
+
+int main(int argc, char * argv[]) {
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load("../../data/table.conf");
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE;
+ ChewingLargeTable largetable(options);
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_table(phrase_files, &largetable, NULL, &phrase_index))
+ exit(ENOENT);
+
+ MemoryChunk * new_chunk = new MemoryChunk;
+ largetable.store(new_chunk);
+ largetable.load(new_chunk);
+
+ char* linebuf = NULL; size_t size = 0; ssize_t read;
+ while ((read = getline(&linebuf, &size, stdin)) != -1) {
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if ( strcmp ( linebuf, "quit" ) == 0)
+ break;
+
+ FullPinyinParser2 parser;
+ ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ ChewingKeyRestVector key_rests =
+ g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+
+ parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));
+ if (0 == keys->len) {
+ fprintf(stderr, "Invalid input.\n");
+ continue;
+ }
+
+ guint32 start = record_time();
+ PhraseIndexRanges ranges;
+ memset(ranges, 0, sizeof(PhraseIndexRanges));
+
+ phrase_index.prepare_ranges(ranges);
+
+ for (size_t i = 0; i < bench_times; ++i) {
+ phrase_index.clear_ranges(ranges);
+ largetable.search(keys->len, (ChewingKey *)keys->data, ranges);
+ }
+ print_time(start, bench_times);
+
+ phrase_index.clear_ranges(ranges);
+ largetable.search(keys->len, (ChewingKey *)keys->data, ranges);
+
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * & range = ranges[i];
+ if (!range)
+ continue;
+
+ if (range->len)
+ printf("range items number:%d\n", range->len);
+
+ for (size_t k = 0; k < range->len; ++k) {
+ PhraseIndexRange * onerange =
+ &g_array_index(range, PhraseIndexRange, k);
+ printf("start:%d\tend:%d\n", onerange->m_range_begin,
+ onerange->m_range_end);
+
+ PhraseItem item;
+ for ( phrase_token_t token = onerange->m_range_begin;
+ token != onerange->m_range_end; ++token){
+
+ phrase_index.get_phrase_item( token, item);
+
+ /* get phrase string */
+ ucs4_t buffer[MAX_PHRASE_LENGTH + 1];
+ item.get_phrase_string(buffer);
+ char * string = g_ucs4_to_utf8
+ ( buffer, item.get_phrase_length(),
+ NULL, NULL, NULL);
+ printf("%s\t", string);
+ g_free(string);
+
+ ChewingKey chewing_buffer[MAX_PHRASE_LENGTH];
+ size_t npron = item.get_n_pronunciation();
+ guint32 freq;
+ for (size_t m = 0; m < npron; ++m){
+ item.get_nth_pronunciation(m, chewing_buffer, freq);
+ for (size_t n = 0; n < item.get_phrase_length();
+ ++n){
+ gchar * pinyins =
+ chewing_buffer[n].get_pinyin_string();
+ printf("%s'", pinyins);
+ g_free(pinyins);
+ }
+ printf("\b\t%d\t", freq);
+ }
+ }
+ printf("\n");
+ }
+ g_array_set_size(range, 0);
+ }
+
+ phrase_index.destroy_ranges(ranges);
+ g_array_free(keys, TRUE);
+ g_array_free(key_rests, TRUE);
+ }
+
+ if (linebuf)
+ free(linebuf);
+
+ /* mask out all index items. */
+ largetable.mask_out(0x0, 0x0);
+
+ return 0;
+}
diff --git a/tests/storage/test_flexible_ngram.cpp b/tests/storage/test_flexible_ngram.cpp
new file mode 100644
index 0000000..d7d7950
--- /dev/null
+++ b/tests/storage/test_flexible_ngram.cpp
@@ -0,0 +1,138 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "pinyin_internal.h"
+
+int main(int argc, char * argv[]) {
+ FlexibleSingleGram<guint32, guint32> single_gram;
+ typedef FlexibleSingleGram<guint32, guint32>::ArrayItemWithToken array_item_t;
+
+ const guint32 total_freq = 16;
+ assert(single_gram.set_array_header(total_freq));
+
+ phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3 };
+ guint32 freqs[6] = { 1, 2, 4, 8, 16, 32};
+
+ guint32 freq;
+
+ for ( size_t i = 0; i < G_N_ELEMENTS(tokens); ++i ){
+ if ( single_gram.get_array_item(tokens[i], freq) )
+ assert(single_gram.set_array_item(tokens[i], freqs[i]));
+ else
+ assert(single_gram.insert_array_item(tokens[i], freqs[i]));
+ }
+
+ single_gram.get_array_item(3, freq);
+ assert(freq == 32);
+
+ printf("--------------------------------------------------------\n");
+ PhraseIndexRange range;
+ FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(array_item_t));
+ range.m_range_begin = 0; range.m_range_end = 8;
+ single_gram.search(&range, array);
+ for ( size_t i = 0; i < array->len; ++i ){
+ array_item_t * item = &g_array_index(array, array_item_t, i);
+ printf("item:%d:%d\n", item->m_token, item->m_item);
+ }
+
+ assert(single_gram.get_array_header(freq));
+ assert(freq == total_freq);
+
+ FlexibleBigram<guint32, guint32, guint32> bigram("TEST");
+ assert(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE));
+ bigram.store(1, &single_gram);
+ assert(single_gram.insert_array_item(5, 8));
+ assert(single_gram.remove_array_item(1, freq));
+ assert(single_gram.set_array_header(32));
+ assert(single_gram.get_array_header(freq));
+ printf("new array header:%d\n", freq);
+ bigram.store(2, &single_gram);
+
+ for (int m = 1; m <= 2; ++m ){
+ printf("--------------------------------------------------------\n");
+ FlexibleSingleGram<guint32, guint32> * train_gram;
+ bigram.load(m, train_gram);
+ g_array_set_size(array, 0);
+ range.m_range_begin = 0; range.m_range_end = 8;
+ train_gram->search(&range, array);
+ for ( size_t i = 0; i < array->len; ++i ){
+ array_item_t * item = &g_array_index(array, array_item_t, i);
+ printf("item:%d:%d\n", item->m_token, item->m_item);
+ }
+ delete train_gram;
+ }
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram.get_all_items(items);
+ printf("-----------------------items----------------------------\n");
+ for ( size_t i = 0; i < items->len; ++i ){
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ printf("item:%d\n", *token);
+ }
+
+ printf("-----------------------magic header---------------------\n");
+ bigram.set_magic_header(total_freq);
+ bigram.get_magic_header(freq);
+ assert(total_freq == freq);
+ printf("magic header:%d\n", freq);
+
+ printf("-----------------------array header---------------------\n");
+ for ( int i = 1; i <= 2; ++i){
+ bigram.get_array_header(i, freq);
+ printf("single gram: %d, freq:%d\n", i, freq);
+ }
+
+ bigram.set_array_header(1, 1);
+
+ printf("-----------------------array header---------------------\n");
+ for ( int i = 1; i <= 2; ++i){
+ bigram.get_array_header(i, freq);
+ printf("single gram: %d, freq:%d\n", i, freq);
+ }
+
+ for (int m = 1; m <= 2; ++m ){
+ printf("--------------------------------------------------------\n");
+ FlexibleSingleGram<guint32, guint32> * train_gram;
+ bigram.load(m, train_gram);
+ g_array_set_size(array, 0);
+ range.m_range_begin = 0; range.m_range_end = 8;
+ train_gram->search(&range, array);
+ for ( size_t i = 0; i < array->len; ++i ){
+ array_item_t * item = &g_array_index(array, array_item_t, i);
+ printf("item:%d:%d\n", item->m_token, item->m_item);
+ }
+ delete train_gram;
+ }
+
+ assert(bigram.remove(1));
+
+ bigram.get_all_items(items);
+ printf("-----------------------items----------------------------\n");
+ for ( size_t i = 0; i < items->len; ++i ){
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ printf("item:%d\n", *token);
+ }
+
+ g_array_free(items, TRUE);
+ g_array_free(array, TRUE);
+ return 0;
+}
diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp
new file mode 100644
index 0000000..f82cf1f
--- /dev/null
+++ b/tests/storage/test_ngram.cpp
@@ -0,0 +1,87 @@
+#include <stdio.h>
+#include "pinyin_internal.h"
+
+
+int main(int argc, char * argv[]){
+ SingleGram single_gram;
+
+ const guint32 total_freq = 16;
+ assert(single_gram.set_total_freq(total_freq));
+
+ phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3};
+ guint32 freqs[6] = { 1, 2, 4, 8, 16, 32};
+
+ guint32 freq;
+
+ for(size_t i = 0; i < 6 ;++i){
+ if ( single_gram.get_freq(tokens[i], freq))
+ assert(single_gram.set_freq(tokens[i], freqs[i]));
+ else
+ assert(single_gram.insert_freq(tokens[i], freqs[i]));
+ }
+
+ single_gram.get_freq(3, freq);
+ assert(freq == 32);
+
+ printf("--------------------------------------------------------\n");
+ PhraseIndexRange range;
+ BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem));
+ range.m_range_begin = 0; range.m_range_end = 8;
+ single_gram.search(&range,array);
+ for ( size_t i = 0; i < array->len; ++i){
+ BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+ printf("item:%d:%f\n", item->m_token, item->m_freq);
+ }
+
+ assert(single_gram.get_total_freq(freq));
+ assert(freq == total_freq);
+
+ Bigram bigram;
+ assert(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE));
+ bigram.store(1, &single_gram);
+ assert(single_gram.insert_freq(5, 8));
+ assert(single_gram.remove_freq(1, freq));
+ single_gram.set_total_freq(32);
+
+ bigram.store(2, &single_gram);
+
+
+ SingleGram * gram = NULL;
+ for ( int m = 1; m <= 2; ++m ){
+ printf("--------------------------------------------------------\n");
+ bigram.load(m, gram);
+ g_array_set_size(array, 0);
+ range.m_range_begin = 0; range.m_range_end = 8;
+ gram->search(&range,array);
+ for ( size_t i = 0; i < array->len; ++i){
+ BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
+ printf("item:%d:%f\n", item->m_token, item->m_freq);
+ }
+ delete gram;
+ }
+
+ printf("--------------------------------------------------------\n");
+ assert(single_gram.get_total_freq(freq));
+ printf("total_freq:%d\n", freq);
+
+ g_array_free(array, TRUE);
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram.get_all_items(items);
+
+ printf("----------------------system----------------------------\n");
+ for ( size_t i = 0; i < items->len; ++i){
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ printf("item:%d\n", *token);
+ }
+
+ assert(bigram.load_db("/tmp/test.db"));
+ assert(bigram.save_db("/tmp/test.db"));
+
+ g_array_free(items, TRUE);
+
+ /* mask out all index items. */
+ bigram.mask_out(0x0, 0x0);
+
+ return 0;
+}
diff --git a/tests/storage/test_parser2.cpp b/tests/storage/test_parser2.cpp
new file mode 100644
index 0000000..638cd96
--- /dev/null
+++ b/tests/storage/test_parser2.cpp
@@ -0,0 +1,144 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "timer.h"
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pinyin_parser2.h"
+
+
+static const gchar * parsername = "";
+static gboolean incomplete = FALSE;
+
+static GOptionEntry entries[] =
+{
+ {"parser", 'p', 0, G_OPTION_ARG_STRING, &parsername, "parser", "fullpinyin doublepinyin chewing"},
+ {"incomplete", 'i', 0, G_OPTION_ARG_NONE, &incomplete, "incomplete pinyin", NULL},
+ {NULL}
+};
+
+#if 0
+ " -s <scheme> specify scheme for doublepinyin/chewing.\n"
+ " schemes for doublepinyin: zrm, ms, ziguang, abc, pyjj, xhe.\n"
+ " schemes for chewing: standard, ibm, ginyieh, eten.\n"
+#endif
+
+
+size_t bench_times = 1000;
+
+using namespace pinyin;
+
+
+int main(int argc, char * argv[]) {
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- test pinyin parser");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE | USE_RESPLIT_TABLE;
+ if (incomplete)
+ options |= PINYIN_INCOMPLETE | CHEWING_INCOMPLETE;
+
+ PinyinParser2 * parser = NULL;
+ ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ ChewingKeyRestVector key_rests =
+ g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
+
+ /* create the parser */
+ if (strcmp("fullpinyin", parsername) == 0) {
+ parser = new FullPinyinParser2();
+ } else if (strcmp("doublepinyin", parsername) == 0) {
+ parser = new DoublePinyinParser2();
+ } else if (strcmp("chewing", parsername) == 0) {
+ parser = new ChewingParser2();
+ }
+
+ if (!parser)
+ parser = new FullPinyinParser2();
+
+ char* linebuf = NULL; size_t size = 0; ssize_t read;
+ while( (read = getline(&linebuf, &size, stdin)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if ( strcmp ( linebuf, "quit" ) == 0)
+ break;
+
+#if 0
+ ChewingKey key;
+ bool success = parser->parse_one_key(options, key,
+ linebuf, strlen(linebuf));
+ if (success) {
+ gchar * pinyins = key.get_pinyin_string();
+ printf("pinyin:%s\n", pinyins);
+ g_free(pinyins);
+ }
+#endif
+
+#if 1
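+ /* benchmark: parse the same input line bench_times times and report the average cost. */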
+ int len = 0;
+ guint32 start_time = record_time();
+ for ( size_t i = 0; i < bench_times; ++i)
+ len = parser->parse(options, keys, key_rests,
+ linebuf, strlen(linebuf));
+
+ print_time(start_time, bench_times);
+
+ printf("parsed %d chars, %d keys.\n", len, keys->len);
+
+ assert(keys->len == key_rests->len);
+
+ for (size_t i = 0; i < keys->len; ++i) {
+ ChewingKey * key =
+ &g_array_index(keys, ChewingKey, i);
+ ChewingKeyRest * key_rest =
+ &g_array_index(key_rests, ChewingKeyRest, i);
+
+ gchar * pinyins = key->get_pinyin_string();
+ printf("%s %d %d\t", pinyins,
+ key_rest->m_raw_begin, key_rest->m_raw_end);
+ g_free(pinyins);
+ }
+ printf("\n");
+#endif
+
+ }
+
+ if (linebuf)
+ free(linebuf);
+
+ delete parser;
+
+ g_array_free(key_rests, TRUE);
+ g_array_free(keys, TRUE);
+
+ return 0;
+}
diff --git a/tests/storage/test_phrase_index.cpp b/tests/storage/test_phrase_index.cpp
new file mode 100644
index 0000000..79a3ca4
--- /dev/null
+++ b/tests/storage/test_phrase_index.cpp
@@ -0,0 +1,122 @@
+#include "timer.h"
+#include <stdio.h>
+#include <errno.h>
+#include "pinyin_internal.h"
+#include "tests_helper.h"
+
+size_t bench_times = 100000;
+
+int main(int argc, char * argv[]){
+ PhraseItem phrase_item;
+ ucs4_t string1 = 2;
+ ChewingKey key1 = ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ENG);
+ ChewingKey key2 = ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ANG);
+
+
+ phrase_item.set_phrase_string(1, &string1);
+ phrase_item.add_pronunciation(&key1, 100);
+ phrase_item.add_pronunciation(&key2, 300);
+
+ assert(phrase_item.get_phrase_length() == 1);
+
+ ChewingKey key3;
+ guint32 freq;
+ phrase_item.get_nth_pronunciation(0, &key3, freq);
+ assert(key3 == key1);
+ assert(freq == 100);
+ phrase_item.get_nth_pronunciation(1, &key3, freq);
+ assert(key3 == key2);
+ assert(freq == 300);
+
+ pinyin_option_t options = 0;
+ gfloat poss = phrase_item.get_pronunciation_possibility(options, &key1);
+ printf("pinyin possiblitiy:%f\n", poss);
+
+ assert(phrase_item.get_unigram_frequency() == 0);
+
+ ucs4_t string2;
+ phrase_item.get_phrase_string(&string2);
+ assert(string1 == string2);
+
+ FacadePhraseIndex phrase_index_test;
+ assert(!phrase_index_test.add_phrase_item(1, &phrase_item));
+
+ MemoryChunk* chunk = new MemoryChunk;
+ assert(phrase_index_test.store(0, chunk));
+ assert(phrase_index_test.load(0, chunk));
+
+ PhraseItem item2;
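+ /* key2 holds 300 of the 400 pronunciation counts, so its possibility below is 300 / 400 = 0.75. */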
+ guint32 time = record_time();
+ for ( size_t i = 0; i < bench_times; ++i){
+ phrase_index_test.get_phrase_item(1, item2);
+ assert(item2.get_unigram_frequency() == 0);
+ assert(item2.get_n_pronunciation() == 2);
+ assert(item2.get_phrase_length() == 1);
+ assert(item2.get_pronunciation_possibility(options, &key2) == 0.75);
+ }
+ print_time(time, bench_times);
+
+ {
+ PhraseItem item3;
+ phrase_index_test.get_phrase_item(1, item3);
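+ /* adding 200 to key1 raises its possibility to (100 + 200) / (400 + 200) = 0.5. */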
+ item3.increase_pronunciation_possibility(options, &key1, 200);
+ assert(item3.get_pronunciation_possibility(options, &key1) == 0.5) ;
+ }
+
+ {
+ PhraseItem item5;
+ phrase_index_test.get_phrase_item(1, item5);
+ gfloat poss = item5.get_pronunciation_possibility(options, &key1);
+ printf("pinyin poss:%f\n", poss);
+ assert(poss == 0.5);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load("../../data/table.conf");
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_table(phrase_files, NULL, NULL, &phrase_index))
+ exit(ENOENT);
+
+ phrase_index.compact();
+
+ MemoryChunk* store1 = new MemoryChunk;
+ phrase_index.store(1, store1);
+ phrase_index.load(1, store1);
+
+ MemoryChunk* store2 = new MemoryChunk;
+ phrase_index.store(2, store2);
+ phrase_index.load(2, store2);
+
+ phrase_index.compact();
+
+ phrase_index.get_phrase_item(16870553, item2);
+ assert( item2.get_phrase_length() == 14);
+ assert( item2.get_n_pronunciation() == 1);
+
+ ucs4_t buf[1024];
+ item2.get_phrase_string(buf);
+ char * string = g_ucs4_to_utf8( buf, 14, NULL, NULL, NULL);
+ printf("%s\n", string);
+ g_free(string);
+
+ guint32 delta = 3;
+ phrase_index.add_unigram_frequency(16870553, delta);
+ phrase_index.get_phrase_item(16870553, item2);
+ assert( item2.get_unigram_frequency() == 3);
+
+ phrase_index.get_phrase_item(16777222, item2);
+ assert(item2.get_phrase_length() == 1);
+ assert(item2.get_n_pronunciation() == 2);
+
+ return 0;
+}
diff --git a/tests/storage/test_phrase_index_logger.cpp b/tests/storage/test_phrase_index_logger.cpp
new file mode 100644
index 0000000..c423c40
--- /dev/null
+++ b/tests/storage/test_phrase_index_logger.cpp
@@ -0,0 +1,67 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "pinyin_internal.h"
+
+
+/* TODO: check whether gb_char.bin and gb_char2.bin should be the same. */
+
+int main(int argc, char * argv[]){
+ FacadePhraseIndex phrase_index;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ phrase_index.load(1, chunk);
+
+ PhraseIndexRange range;
+ assert(ERROR_OK == phrase_index.get_range(1, range));
+ for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) {
+ phrase_index.add_unigram_frequency(i, 1);
+ }
+
+ printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq());
+
+ MemoryChunk * new_chunk = new MemoryChunk;
+ phrase_index.store(1, new_chunk);
+ new_chunk->save("/tmp/gb_char.bin");
+ delete new_chunk;
+
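+ /* diff the updated in-memory index against the original chunk to obtain a change log (delta). */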
+ chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ new_chunk = new MemoryChunk;
+ assert(phrase_index.diff(1, chunk, new_chunk));
+ new_chunk->save("/tmp/gb_char.dbin");
+ delete new_chunk;
+
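+ /* reload the original data and replay the delta; the merged result should match the updated index. */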
+ chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ phrase_index.load(1, chunk);
+ new_chunk = new MemoryChunk;
+ new_chunk->load("/tmp/gb_char.dbin");
+ assert(phrase_index.merge(1, new_chunk));
+ chunk = new MemoryChunk;
+ phrase_index.store(1, chunk);
+ chunk->save("/tmp/gb_char2.bin");
+ delete chunk;
+
+ printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq());
+
+ return 0;
+}
diff --git a/tests/storage/test_phrase_table.cpp b/tests/storage/test_phrase_table.cpp
new file mode 100644
index 0000000..a9c8ed5
--- /dev/null
+++ b/tests/storage/test_phrase_table.cpp
@@ -0,0 +1,86 @@
+#include "timer.h"
+#include <string.h>
+#include "pinyin_internal.h"
+#include "tests_helper.h"
+
+size_t bench_times = 1000;
+
+int main(int argc, char * argv[]){
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load("../../data/table.conf");
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ PhraseLargeTable2 largetable;
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_table(phrase_files, NULL, &largetable, &phrase_index))
+ exit(ENOENT);
+
+ MemoryChunk * chunk = new MemoryChunk;
+ largetable.store(chunk);
+ largetable.load(chunk);
+
+ char* linebuf = NULL; size_t size = 0; ssize_t read;
+ while ((read = getline(&linebuf, &size, stdin)) != -1) {
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if ( strcmp ( linebuf, "quit" ) == 0)
+ break;
+
+ glong phrase_len = g_utf8_strlen(linebuf, -1);
+ ucs4_t * new_phrase = g_utf8_to_ucs4(linebuf, -1, NULL, NULL, NULL);
+
+ if (0 == phrase_len)
+ continue;
+
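+ /* tokens holds one GArray slot per phrase index library; prepare_tokens sets them up. */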
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index.prepare_tokens(tokens);
+
+ guint32 start = record_time();
+ for (size_t i = 0; i < bench_times; ++i){
+ phrase_index.clear_tokens(tokens);
+ largetable.search(phrase_len, new_phrase, tokens);
+ }
+ print_time(start, bench_times);
+
+ phrase_index.clear_tokens(tokens);
+ int retval = largetable.search(phrase_len, new_phrase, tokens);
+
+ if (retval & SEARCH_OK) {
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ GArray * array = tokens[i];
+ if (NULL == array)
+ continue;
+
+ for (size_t k = 0; k < array->len; ++k) {
+ phrase_token_t token = g_array_index
+ (array, phrase_token_t, k);
+
+ printf("token:%d\t", token);
+ }
+ }
+ printf("\n");
+ }
+
+ phrase_index.destroy_tokens(tokens);
+ g_free(new_phrase);
+ }
+
+ if ( linebuf )
+ free(linebuf);
+
+ /* mask out all index items. */
+ largetable.mask_out(0x0, 0x0);
+
+ return 0;
+}
diff --git a/tests/storage/test_table_info.cpp b/tests/storage/test_table_info.cpp
new file mode 100644
index 0000000..68b4735
--- /dev/null
+++ b/tests/storage/test_table_info.cpp
@@ -0,0 +1,84 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2013 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include "pinyin_internal.h"
+
+
+int main(int argc, char * argv[]) {
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load("../../data/table.conf");
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ printf("lambda:%f\n", system_table_info.get_lambda());
+
+ size_t i;
+ for (i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info =
+ system_table_info.get_table_info() + i;
+
+ assert(i == table_info->m_dict_index);
+ printf("table index:%d\n", table_info->m_dict_index);
+
+ switch(table_info->m_file_type) {
+ case NOT_USED:
+ printf("not used.\n");
+ break;
+
+ case SYSTEM_FILE:
+ printf("system file:%s %s %s.\n", table_info->m_table_filename,
+ table_info->m_system_filename, table_info->m_user_filename);
+ break;
+
+ case DICTIONARY:
+ printf("dictionary:%s %s %s.\n", table_info->m_table_filename,
+ table_info->m_system_filename, table_info->m_user_filename);
+ break;
+
+ case USER_FILE:
+ printf("user file:%s.\n", table_info->m_user_filename);
+ break;
+
+ default:
+ assert(false);
+ }
+ }
+
+ UserTableInfo user_table_info;
+ retval = user_table_info.is_conform(&system_table_info);
+ assert(!retval);
+
+ user_table_info.make_conform(&system_table_info);
+ retval = user_table_info.is_conform(&system_table_info);
+ assert(retval);
+
+ assert(user_table_info.save("/tmp/user.conf"));
+ assert(user_table_info.load("/tmp/user.conf"));
+
+ retval = user_table_info.is_conform(&system_table_info);
+ assert(retval);
+
+ return 0;
+}
diff --git a/tests/test_chewing.cpp b/tests/test_chewing.cpp
new file mode 100644
index 0000000..5a5701f
--- /dev/null
+++ b/tests/test_chewing.cpp
@@ -0,0 +1,68 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "pinyin.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int main(int argc, char * argv[]){
+ pinyin_context_t * context =
+ pinyin_init("../data", "../data");
+
+ pinyin_instance_t * instance = pinyin_alloc_instance(context);
+
+ char* linebuf = NULL;
+ size_t size = 0;
+ ssize_t read;
+ while( (read = getline(&linebuf, &size, stdin)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if ( strcmp ( linebuf, "quit" ) == 0)
+ break;
+
+ pinyin_parse_more_chewings
+ (instance, linebuf);
+ pinyin_guess_sentence(instance);
+
+ char * sentence = NULL;
+ pinyin_get_sentence (instance, &sentence);
+ if (sentence)
+ printf("%s\n", sentence);
+ g_free(sentence);
+
+ pinyin_train(instance);
+ pinyin_reset(instance);
+ pinyin_save(context);
+ }
+
+ pinyin_free_instance(instance);
+
+ pinyin_mask_out(context, 0x0, 0x0);
+ pinyin_save(context);
+ pinyin_fini(context);
+
+ free(linebuf);
+ return 0;
+}
diff --git a/tests/test_phrase.cpp b/tests/test_phrase.cpp
new file mode 100644
index 0000000..6e5ef3b
--- /dev/null
+++ b/tests/test_phrase.cpp
@@ -0,0 +1,74 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "pinyin.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int main(int argc, char * argv[]){
+ pinyin_context_t * context =
+ pinyin_init("../data", "../data");
+
+ pinyin_instance_t * instance = pinyin_alloc_instance(context);
+
+ char* linebuf = NULL;
+ size_t size = 0;
+ ssize_t read;
+ while( (read = getline(&linebuf, &size, stdin)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if ( strcmp ( linebuf, "quit" ) == 0)
+ break;
+
+ pinyin_phrase_segment(instance, linebuf);
+ guint len = 0;
+ pinyin_get_n_phrase(instance, &len);
+
+ for ( size_t i = 0; i < len; ++i ){
+ phrase_token_t token = null_token;
+ pinyin_get_phrase_token(instance, i, &token);
+
+ if ( null_token == token )
+ continue;
+
+ char * word = NULL;
+ pinyin_token_get_phrase(instance, token, NULL, &word);
+ printf("%s\t", word);
+ g_free(word);
+ }
+ printf("\n");
+
+ pinyin_save(context);
+ }
+
+ pinyin_free_instance(instance);
+
+ pinyin_mask_out(context, 0x0, 0x0);
+ pinyin_save(context);
+ pinyin_fini(context);
+
+ free(linebuf);
+ return 0;
+}
diff --git a/tests/test_pinyin.cpp b/tests/test_pinyin.cpp
new file mode 100644
index 0000000..f94263b
--- /dev/null
+++ b/tests/test_pinyin.cpp
@@ -0,0 +1,97 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "pinyin.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int main(int argc, char * argv[]){
+ pinyin_context_t * context =
+ pinyin_init("../data", "../data");
+
+ pinyin_option_t options =
+ PINYIN_CORRECT_ALL | USE_DIVIDED_TABLE | USE_RESPLIT_TABLE |
+ DYNAMIC_ADJUST;
+ pinyin_set_options(context, options);
+
+ pinyin_instance_t * instance = pinyin_alloc_instance(context);
+
+ char * prefixbuf = NULL; size_t prefixsize = 0;
+ char * linebuf = NULL; size_t linesize = 0;
+ ssize_t read;
+
+ while( TRUE ){
+ fprintf(stdout, "prefix:");
+ fflush(stdout);
+
+ if ((read = getline(&prefixbuf, &prefixsize, stdin)) == -1)
+ break;
+
+ if ( '\n' == prefixbuf[strlen(prefixbuf) - 1] ) {
+ prefixbuf[strlen(prefixbuf) - 1] = '\0';
+ }
+
+ fprintf(stdout, "pinyin:");
+ fflush(stdout);
+
+ if ((read = getline(&linebuf, &linesize, stdin)) == -1)
+ break;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if ( strcmp ( linebuf, "quit" ) == 0)
+ break;
+
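+ /* parse the full pinyin input, guess the sentence using the prefix as context, then list the candidates. */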
+ pinyin_parse_more_full_pinyins(instance, linebuf);
+ pinyin_guess_sentence_with_prefix(instance, prefixbuf);
+ pinyin_guess_full_pinyin_candidates(instance, 0);
+
+ guint len = 0;
+ pinyin_get_n_candidate(instance, &len);
+ for (size_t i = 0; i < len; ++i) {
+ lookup_candidate_t * candidate = NULL;
+ pinyin_get_candidate(instance, i, &candidate);
+
+ const char * word = NULL;
+ pinyin_get_candidate_string(instance, candidate, &word);
+
+ printf("%s\t", word);
+ }
+ printf("\n");
+
+ pinyin_train(instance);
+ pinyin_reset(instance);
+ pinyin_save(context);
+ }
+
+ pinyin_free_instance(instance);
+
+ pinyin_mask_out(context, 0x0, 0x0);
+ pinyin_save(context);
+ pinyin_fini(context);
+
+ free(prefixbuf); free(linebuf);
+ return 0;
+}
diff --git a/tests/tests_helper.h b/tests/tests_helper.h
new file mode 100644
index 0000000..431dbc8
--- /dev/null
+++ b/tests/tests_helper.h
@@ -0,0 +1,86 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef TESTS_HELPER_H
+#define TESTS_HELPER_H
+
+static bool load_phrase_index(const pinyin_table_info_t * phrase_files,
+ FacadePhraseIndex * phrase_index){
+ MemoryChunk * chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (SYSTEM_FILE != table_info->m_file_type)
+ continue;
+
+ const char * binfile = table_info->m_system_filename;
+
+ gchar * filename = g_build_filename("..", "..", "data",
+ binfile, NULL);
+ chunk = new MemoryChunk;
+ bool retval = chunk->load(filename);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", binfile);
+ delete chunk;
+ return false;
+ }
+
+ phrase_index->load(i, chunk);
+ g_free(filename);
+ }
+ return true;
+}
+
+static bool load_phrase_table(const pinyin_table_info_t * phrase_files,
+ ChewingLargeTable * chewing_table,
+ PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index){
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (SYSTEM_FILE != table_info->m_file_type)
+ continue;
+
+ const char * tablename = table_info->m_table_filename;
+
+ gchar * filename = g_build_filename("..", "..", "data",
+ tablename, NULL);
+ FILE * tablefile = fopen(filename, "r");
+ if (NULL == tablefile) {
+ fprintf(stderr, "open %s failed!\n", tablename);
+ return false;
+ }
+ g_free(filename);
+
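+ /* the same text table feeds the chewing table, the phrase table and the phrase index, rewinding between loads. */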
+ if (chewing_table)
+ chewing_table->load_text(tablefile);
+ fseek(tablefile, 0L, SEEK_SET);
+ if (phrase_table)
+ phrase_table->load_text(tablefile);
+ fseek(tablefile, 0L, SEEK_SET);
+ if (phrase_index)
+ phrase_index->load_text(i, tablefile);
+ fclose(tablefile);
+ }
+ return true;
+}
+
+#endif
diff --git a/tests/timer.h b/tests/timer.h
new file mode 100644
index 0000000..d3f0822
--- /dev/null
+++ b/tests/timer.h
@@ -0,0 +1,48 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef TIMER_H
+#define TIMER_H
+
+#include <sys/time.h>
+#include <stdio.h>
+#include <glib.h>
+
+
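+/* wall-clock timestamp in microseconds, truncated to 32 bits; intended for short benchmark intervals only. */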
+static guint32 record_time ()
+{
+ timeval tv;
+ gettimeofday (&tv, NULL);
+ return (guint32) tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+static void print_time (guint32 old_time, guint32 times)
+{
+ timeval tv;
+ gettimeofday (&tv, NULL);
+
+ guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time;
+
+ printf("Spent %d us for %d operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted );
+}
+
+
+#endif
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
new file mode 100644
index 0000000..dbd7855
--- /dev/null
+++ b/utils/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(segment)
+add_subdirectory(storage)
+add_subdirectory(training) \ No newline at end of file
diff --git a/utils/Makefile.am b/utils/Makefile.am
new file mode 100644
index 0000000..bc0f3e5
--- /dev/null
+++ b/utils/Makefile.am
@@ -0,0 +1,27 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+AUTOMAKE_OPTIONS = gnu
+SUBDIRS = storage segment training
+
+MAINTAINERCLEANFILES = Makefile.in
+
+CLEANFILES = *.bak
+
+ACLOCAL = aclocal -I $(ac_aux_dir)
+
+noinst_HEADERS = utils_helper.h
diff --git a/utils/segment/CMakeLists.txt b/utils/segment/CMakeLists.txt
new file mode 100644
index 0000000..82e4deb
--- /dev/null
+++ b/utils/segment/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_executable(
+ spseg
+ spseg.cpp
+)
+
+target_link_libraries(
+ spseg
+ libpinyin
+)
+
+add_executable(
+ ngseg
+ ngseg.cpp
+)
+
+target_link_libraries(
+ ngseg
+ libpinyin
+) \ No newline at end of file
diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am
new file mode 100644
index 0000000..579d6e4
--- /dev/null
+++ b/utils/segment/Makefile.am
@@ -0,0 +1,39 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
+
+noinst_PROGRAMS = spseg ngseg mergeseq
+
+spseg_SOURCES = spseg.cpp
+
+spseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+ngseg_SOURCES = ngseg.cpp
+
+ngseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+mergeseq_SOURCES = mergeseq.cpp
+
+mergeseq_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp
new file mode 100644
index 0000000..1a26064
--- /dev/null
+++ b/utils/segment/mergeseq.cpp
@@ -0,0 +1,278 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2013 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <string.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){
+ printf("Usage: mergeseq [-o outputfile] [inputfile]\n");
+}
+
+
+static gboolean gen_extra_enter = FALSE;
+static gchar * outputfile = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+ {NULL}
+};
+
+
+/* data structure definition. */
+typedef struct{
+ phrase_token_t m_token;
+ gint m_token_len;
+} TokenInfo;
+
+
+/* GArray of ucs4 characters. */
+typedef GArray * UnicodeCharVector;
+/* GArray of TokenInfo. */
+typedef GArray * TokenInfoVector;
+
+gint calculate_sequence_length(TokenInfoVector tokeninfos) {
+ gint len = 0;
+
+ size_t i = 0;
+ for (i = 0; i < tokeninfos->len; ++i) {
+ TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, i);
+ len += token_info->m_token_len;
+ }
+
+ return len;
+}
+
+/* if merge sequence found, merge and output it,
+ * if not, just output the first token;
+ * pop the first token or sequence.
+ */
+bool merge_sequence(FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos) {
+ assert(tokeninfos->len > 0);
+
+ bool found = false;
+ TokenInfo * token_info = NULL;
+ phrase_token_t token = null_token;
+
+ ucs4_t * ucs4_str = (ucs4_t *) unichars->data;
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index->prepare_tokens(tokens);
+
+ /* search the merge sequence. */
+ size_t index = tokeninfos->len;
+ gint seq_len = calculate_sequence_length(tokeninfos);
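+ /* try the longest character sequence first, dropping trailing tokens until the phrase table matches. */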
+ while (seq_len > 0) {
+ /* do phrase table search. */
+ int retval = phrase_table->search(seq_len, ucs4_str, tokens);
+
+ if (retval & SEARCH_OK) {
+ int num = get_first_token(tokens, token);
+ found = true;
+ break;
+ }
+
+ --index;
+ token_info = &g_array_index(tokeninfos, TokenInfo, index);
+ seq_len -= token_info->m_token_len;
+ }
+
+ phrase_index->destroy_tokens(tokens);
+
+ /* push the merged sequence back. */
+ if (found) {
+ /* pop up the origin sequence. */
+ g_array_remove_range(tokeninfos, 0, index);
+
+ TokenInfo info;
+ info.m_token = token;
+ info.m_token_len = seq_len;
+ g_array_prepend_val(tokeninfos, info);
+ }
+
+ return found;
+}
+
+bool pop_first_token(UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos,
+ FILE * output) {
+ ucs4_t * ucs4_str = (ucs4_t *) unichars->data;
+
+ /* pop it. */
+ TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, 0);
+ phrase_token_t token = token_info->m_token;
+ gint token_len = token_info->m_token_len;
+
+ glong read = 0;
+ gchar * utf8_str = g_ucs4_to_utf8(ucs4_str, token_len, &read, NULL, NULL);
+ assert(read == token_len);
+ fprintf(output, "%d %s\n", token, utf8_str);
+ g_free(utf8_str);
+
+ g_array_remove_range(unichars, 0, token_len);
+ g_array_remove_index(tokeninfos, 0);
+
+ return true;
+}
+
+bool feed_line(FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos,
+ const char * linebuf,
+ FILE * output) {
+
+ TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf);
+
+ if (null_token == token) {
+ /* empty the queue. */
+ while (0 != tokeninfos->len) {
+ merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+ pop_first_token(unichars, tokeninfos, output);
+ }
+
+ assert(0 == unichars->len);
+ assert(0 == tokeninfos->len);
+
+ /* restore the null token line. */
+ fprintf(output, "%s\n", linebuf);
+
+ return false;
+ }
+
+ PhraseItem item;
+ phrase_index->get_phrase_item(token, item);
+ gint len = item.get_phrase_length();
+
+ TokenInfo info;
+ info.m_token = token;
+ info.m_token_len = len;
+ g_array_append_val(tokeninfos, info);
+
+ ucs4_t buffer[MAX_PHRASE_LENGTH];
+ item.get_phrase_string(buffer);
+ g_array_append_vals(unichars, buffer, len);
+
+ /* probe merge sequence. */
+ len = calculate_sequence_length(tokeninfos);
+ while (len >= MAX_PHRASE_LENGTH) {
+ merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+ pop_first_token(unichars, tokeninfos, output);
+ len = calculate_sequence_length(tokeninfos);
+ }
+
+ return true;
+}
+
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ FILE * output = stdout;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- merge word sequence");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (outputfile) {
+ output = fopen(outputfile, "w");
+ if (NULL == output) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ if (argc > 2) {
+ fprintf(stderr, "too many arguments.\n");
+ exit(EINVAL);
+ }
+
+ if (2 == argc) {
+ input = fopen(argv[1], "r");
+ if (NULL == input) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* init phrase table */
+ FacadePhraseTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk, NULL);
+
+ /* init phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+ GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo));
+
+ char * linebuf = NULL; size_t size = 0; ssize_t read;
+ while( (read = getline(&linebuf, &size, input)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ if (0 == strlen(linebuf))
+ continue;
+
+ feed_line(&phrase_table, &phrase_index,
+ unichars, tokeninfos,
+ linebuf, output);
+ }
+
+ g_array_free(unichars, TRUE);
+ g_array_free(tokeninfos, TRUE);
+ free(linebuf);
+ fclose(input);
+ fclose(output);
+ return 0;
+}
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
new file mode 100644
index 0000000..03fe5b4
--- /dev/null
+++ b/utils/segment/ngseg.cpp
@@ -0,0 +1,261 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){
+ printf("Usage: ngseg [--generate-extra-enter] [-o outputfile] [inputfile]\n");
+}
+
+
+static gboolean gen_extra_enter = FALSE;
+static gchar * outputfile = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+ {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
+ {NULL}
+};
+
+
+/* n-gram based sentence segment. */
+
+/* Note:
+ * Currently libpinyin supports ucs4 characters.
+ * This is a pre-processing tool for raw corpus;
+ * it skips non-Chinese characters.
+ */
+
+/* TODO:
+ * Try to add punctuation marks and English support,
+ * such as ',', '.', '?', '!', <english>, and other punctuation.
+ */
+
+enum CONTEXT_STATE{
+ CONTEXT_INIT,
+ CONTEXT_SEGMENTABLE,
+ CONTEXT_UNKNOWN
+};
+
+bool deal_with_segmentable(PhraseLookup * phrase_lookup,
+ GArray * current_ucs4,
+ FILE * output){
+ char * result_string = NULL;
+ MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ phrase_lookup->get_best_match(current_ucs4->len,
+ (ucs4_t *) current_ucs4->data, results);
+
+ phrase_lookup->convert_to_utf8(results, result_string);
+
+ if (result_string) {
+ fprintf(output, "%s\n", result_string);
+ } else {
+ char * tmp_string = g_ucs4_to_utf8
+ ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
+ NULL, NULL, NULL);
+ fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
+ tmp_string);
+ g_array_free(results, TRUE);
+ return false;
+ }
+ g_array_free(results, TRUE);
+ g_free(result_string);
+ return true;
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+ char * result_string = g_ucs4_to_utf8
+ ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
+ NULL, NULL, NULL);
+ fprintf(output, "%d %s\n", null_token, result_string);
+ g_free(result_string);
+ return true;
+}
+
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ FILE * output = stdout;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- n-gram segment");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (outputfile) {
+ output = fopen(outputfile, "w");
+ if (NULL == output) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ if (argc > 2) {
+ fprintf(stderr, "too many arguments.\n");
+ exit(EINVAL);
+ }
+
+ if (2 == argc) {
+ input = fopen(argv[1], "r");
+ if (NULL == input) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* init phrase table */
+ FacadePhraseTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk, NULL);
+
+ /* init phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ /* init bi-gram */
+ Bigram system_bigram;
+ system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
+ Bigram user_bigram;
+
+ gfloat lambda = system_table_info.get_lambda();
+
+ /* init phrase lookup */
+ PhraseLookup phrase_lookup(lambda,
+ &phrase_table, &phrase_index,
+ &system_bigram, &user_bigram);
+
+
+ CONTEXT_STATE state, next_state;
+ GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index.prepare_tokens(tokens);
+
+ /* split the sentence */
+ char * linebuf = NULL; size_t size = 0; ssize_t read;
+ while( (read = getline(&linebuf, &size, input)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ /* check non-ucs4 characters */
+ const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+ glong len = 0;
+ ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+ if ( len != num_of_chars ) {
+ fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+ fprintf(output, "%d \n", null_token);
+ continue;
+ }
+
+ /* only new-line persists. */
+ if ( 0 == num_of_chars ) {
+ fprintf(output, "%d \n", null_token);
+ continue;
+ }
+
+ state = CONTEXT_INIT;
+ int result = phrase_table.search( 1, sentence, tokens);
+ g_array_append_val( current_ucs4, sentence[0]);
+ if ( result & SEARCH_OK )
+ state = CONTEXT_SEGMENTABLE;
+ else
+ state = CONTEXT_UNKNOWN;
+
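+ /* walk the sentence, grouping consecutive characters by whether they hit the phrase table; flush a group on every state change. */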
+ for ( int i = 1; i < num_of_chars; ++i) {
+ int result = phrase_table.search( 1, sentence + i, tokens);
+ if ( result & SEARCH_OK )
+ next_state = CONTEXT_SEGMENTABLE;
+ else
+ next_state = CONTEXT_UNKNOWN;
+
+ if ( state == next_state ){
+ g_array_append_val(current_ucs4, sentence[i]);
+ continue;
+ }
+
+ assert ( state != next_state );
+ if ( state == CONTEXT_SEGMENTABLE )
+ deal_with_segmentable(&phrase_lookup, current_ucs4, output);
+
+ if ( state == CONTEXT_UNKNOWN )
+ deal_with_unknown(current_ucs4, output);
+
+ /* save the current character */
+ g_array_set_size(current_ucs4, 0);
+ g_array_append_val(current_ucs4, sentence[i]);
+ state = next_state;
+ }
+
+ if ( current_ucs4->len ) {
+ /* this seems always true. */
+ if ( state == CONTEXT_SEGMENTABLE )
+ deal_with_segmentable(&phrase_lookup, current_ucs4, output);
+
+ if ( state == CONTEXT_UNKNOWN )
+ deal_with_unknown(current_ucs4, output);
+ g_array_set_size(current_ucs4, 0);
+ }
+
+ /* print extra enter */
+ if ( gen_extra_enter )
+ fprintf(output, "%d \n", null_token);
+
+ g_free(sentence);
+ }
+ phrase_index.destroy_tokens(tokens);
+
+ /* print enter at file tail */
+ fprintf(output, "%d \n", null_token);
+ g_array_free(current_ucs4, TRUE);
+ free(linebuf);
+ fclose(input);
+ fclose(output);
+ return 0;
+}
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
new file mode 100644
index 0000000..b543cc5
--- /dev/null
+++ b/utils/segment/spseg.cpp
@@ -0,0 +1,343 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010,2013 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){
+ printf("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n");
+}
+
+static gboolean gen_extra_enter = FALSE;
+static gchar * outputfile = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+ {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
+ {NULL}
+};
+
+
+/* graph shortest path sentence segment. */
+
+/* Note:
+ * Currently libpinyin only supports ucs4 characters. As this is a
+ * pre-processor tool for raw corpus, it skips all sentences
+ * that contain non-ucs4 characters.
+ */
+
+enum CONTEXT_STATE{
+ CONTEXT_INIT,
+ CONTEXT_SEGMENTABLE,
+ CONTEXT_UNKNOWN
+};
+
+struct SegmentStep{
+ phrase_token_t m_handle;
+ ucs4_t * m_phrase;
+ size_t m_phrase_len;
+ // the cost W is the number of words; a null handle (single character) still counts as one word.
+ guint m_nword;
+ // backtrace information: -1 means one step backward.
+ gint m_backward_nstep;
+public:
+ SegmentStep(){
+ m_handle = null_token;
+ m_phrase = NULL;
+ m_phrase_len = 0;
+ m_nword = UINT_MAX;
+ m_backward_nstep = -0;
+ }
+};
+
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
+
+/* Note: do not free phrase, as it is used by strings (array of segment). */
+bool segment(FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ GArray * current_ucs4,
+ GArray * strings /* Array of SegmentStep. */){
+ ucs4_t * phrase = (ucs4_t *)current_ucs4->data;
+ guint phrase_len = current_ucs4->len;
+
+ /* Prepare for shortest path segment dynamic programming. */
+ GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+ SegmentStep step;
+ for ( glong i = 0; i < phrase_len + 1; ++i ){
+ g_array_append_val(steps, step);
+ }
+
+ SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0);
+ first_step->m_nword = 0;
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index->prepare_tokens(tokens);
+
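+ /* shortest-path DP: steps[k].m_nword is the minimum word count over all segmentations of the first k characters. */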
+ for ( glong i = 0; i < phrase_len + 1; ++i ) {
+ SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+ size_t nword = step_begin->m_nword;
+ for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
+ size_t len = k - i;
+ ucs4_t * cur_phrase = phrase + i;
+
+ phrase_token_t token = null_token;
+ int result = phrase_table->search(len, cur_phrase, tokens);
+ int num = get_first_token(tokens, token);
+
+ if ( !(result & SEARCH_OK) ){
+ token = null_token;
+ if ( 1 != len )
+ continue;
+ }
+ ++nword;
+
+ SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+ if ( nword < step_end->m_nword ) {
+ step_end->m_handle = token;
+ step_end->m_phrase = cur_phrase;
+ step_end->m_phrase_len = len;
+ step_end->m_nword = nword;
+ step_end->m_backward_nstep = i - k;
+ }
+ if ( !(result & SEARCH_CONTINUED) )
+ break;
+ }
+ }
+ phrase_index->destroy_tokens(tokens);
+
+ return backtrace(steps, phrase_len, strings);
+}
+
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
+ /* backtracing to get the result. */
+ size_t cur_step = phrase_len;
+ g_array_set_size(strings, 0);
+ while ( cur_step ){
+ SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step);
+ g_array_append_val(strings, *step);
+ cur_step = cur_step + step->m_backward_nstep;
+ /* intended to avoid leaking internal information. */
+ step->m_nword = 0; step->m_backward_nstep = 0;
+ }
+
+ /* reverse the strings. */
+ for ( size_t i = 0; i < strings->len / 2; ++i ) {
+ SegmentStep * head, * tail;
+ head = &g_array_index(strings, SegmentStep, i);
+ tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i );
+ SegmentStep tmp;
+ tmp = *head;
+ *head = *tail;
+ *tail = tmp;
+ }
+
+ g_array_free(steps, TRUE);
+ return true;
+}
+
+bool deal_with_segmentable(FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ GArray * current_ucs4,
+ FILE * output){
+
+ /* do segment stuff. */
+ GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+ segment(phrase_table, phrase_index, current_ucs4, strings);
+
+ /* print out the split phrase. */
+ for ( glong i = 0; i < strings->len; ++i ) {
+ SegmentStep * step = &g_array_index(strings, SegmentStep, i);
+ char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
+ fprintf(output, "%d %s\n", step->m_handle, string);
+ g_free(string);
+ }
+
+ g_array_free(strings, TRUE);
+ return true;
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+ char * result_string = g_ucs4_to_utf8
+ ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
+ NULL, NULL, NULL);
+ fprintf(output, "%d %s\n", null_token, result_string);
+ g_free(result_string);
+ return true;
+}
+
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ FILE * output = stdout;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- shortest path segment");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (outputfile) {
+ output = fopen(outputfile, "w");
+ if (NULL == output) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ if (argc > 2) {
+ fprintf(stderr, "too many arguments.\n");
+ exit(EINVAL);
+ }
+
+ if (2 == argc) {
+ input = fopen(argv[1], "r");
+ if (NULL == input) {
+ perror("open file failed");
+ exit(EINVAL);
+ }
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* init phrase table */
+ FacadePhraseTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk, NULL);
+
+ /* init phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ CONTEXT_STATE state, next_state;
+ GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index.prepare_tokens(tokens);
+
+ char * linebuf = NULL; size_t size = 0; ssize_t read;
+ while( (read = getline(&linebuf, &size, input)) != -1 ){
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ /* check non-ucs4 characters. */
+ const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+ glong len = 0;
+ ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+ if ( len != num_of_chars ) {
+ fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+ fprintf(output, "%d \n", null_token);
+ continue;
+ }
+
+ /* only new-line persists. */
+ if ( 0 == num_of_chars ) {
+ fprintf(output, "%d \n", null_token);
+ continue;
+ }
+
+ state = CONTEXT_INIT;
+ int result = phrase_table.search( 1, sentence, tokens);
+ g_array_append_val( current_ucs4, sentence[0]);
+ if ( result & SEARCH_OK )
+ state = CONTEXT_SEGMENTABLE;
+ else
+ state = CONTEXT_UNKNOWN;
+
+ for ( int i = 1; i < num_of_chars; ++i) {
+ int result = phrase_table.search( 1, sentence + i, tokens);
+ if ( result & SEARCH_OK )
+ next_state = CONTEXT_SEGMENTABLE;
+ else
+ next_state = CONTEXT_UNKNOWN;
+
+ if ( state == next_state ){
+ g_array_append_val(current_ucs4, sentence[i]);
+ continue;
+ }
+
+ assert ( state != next_state );
+ if ( state == CONTEXT_SEGMENTABLE )
+ deal_with_segmentable(&phrase_table, &phrase_index,
+ current_ucs4, output);
+
+ if ( state == CONTEXT_UNKNOWN )
+ deal_with_unknown(current_ucs4, output);
+
+ /* save the current character */
+ g_array_set_size(current_ucs4, 0);
+ g_array_append_val(current_ucs4, sentence[i]);
+ state = next_state;
+ }
+
+ if ( current_ucs4->len ) {
+ /* this seems always true. */
+ if ( state == CONTEXT_SEGMENTABLE )
+ deal_with_segmentable(&phrase_table, &phrase_index,
+ current_ucs4, output);
+
+ if ( state == CONTEXT_UNKNOWN )
+ deal_with_unknown(current_ucs4, output);
+ g_array_set_size(current_ucs4, 0);
+ }
+
+ /* print extra enter */
+ if ( gen_extra_enter )
+ fprintf(output, "%d \n", null_token);
+
+ g_free(sentence);
+ }
+ phrase_index.destroy_tokens(tokens);
+
+ /* print a final null_token separator at the end of the file */
+ fprintf(output, "%d \n", null_token);
+ g_array_free(current_ucs4, TRUE);
+ free(linebuf);
+ fclose(input);
+ fclose(output);
+ return 0;
+}
diff --git a/utils/storage/CMakeLists.txt b/utils/storage/CMakeLists.txt
new file mode 100644
index 0000000..63cabcd
--- /dev/null
+++ b/utils/storage/CMakeLists.txt
@@ -0,0 +1,29 @@
+add_executable(
+ gen_binary_files
+ gen_binary_files.cpp
+)
+
+target_link_libraries(
+ gen_binary_files
+ libpinyin
+)
+
+add_executable(
+ import_interpolation
+ import_interpolation.cpp
+)
+
+target_link_libraries(
+ import_interpolation
+ libpinyin
+)
+
+add_executable(
+ export_interpolation
+ export_interpolation.cpp
+)
+
+target_link_libraries(
+ export_interpolation
+ libpinyin
+)
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
new file mode 100644
index 0000000..db63488
--- /dev/null
+++ b/utils/storage/Makefile.am
@@ -0,0 +1,45 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
+
+bin_PROGRAMS = gen_binary_files \
+ import_interpolation
+
+noinst_PROGRAMS = export_interpolation \
+ gen_pinyin_table
+
+gen_binary_files_SOURCES = gen_binary_files.cpp
+
+gen_binary_files_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+import_interpolation_SOURCES = import_interpolation.cpp
+
+import_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+export_interpolation_SOURCES = export_interpolation.cpp
+
+export_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_pinyin_table_SOURCES = gen_pinyin_table.cpp
+
+gen_pinyin_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
new file mode 100644
index 0000000..c43eefb
--- /dev/null
+++ b/utils/storage/export_interpolation.cpp
@@ -0,0 +1,144 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <assert.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+/* export interpolation model as textual format */
+
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
+
+bool begin_data(FILE * output){
+ fprintf(output, "\\data model interpolation\n");
+ return true;
+}
+
+bool end_data(FILE * output){
+ fprintf(output, "\\end\n");
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * output = stdout;
+ const char * bigram_filename = SYSTEM_BIGRAM;
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ bigram.attach(bigram_filename, ATTACH_READONLY);
+
+ begin_data(output);
+
+ gen_unigram(output, &phrase_index);
+ gen_bigram(output, &phrase_index, &bigram);
+
+ end_data(output);
+ return 0;
+}
+
+bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
+ fprintf(output, "\\1-gram\n");
+ for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {
+
+ PhraseIndexRange range;
+ int result = phrase_index->get_range(i, range);
+ if (ERROR_OK != result )
+ continue;
+
+ PhraseItem item;
+ for (phrase_token_t token = range.m_range_begin;
+ token < range.m_range_end; token++) {
+ int result = phrase_index->get_phrase_item(token, item);
+
+ if ( result == ERROR_NO_ITEM )
+ continue;
+ assert( result == ERROR_OK);
+
+ size_t freq = item.get_unigram_frequency();
+ if ( 0 == freq )
+ continue;
+ char * phrase = taglib_token_to_string(phrase_index, token);
+ if ( phrase )
+ fprintf(output, "\\item %d %s count %zu\n", token, phrase, freq);
+
+ g_free(phrase);
+ }
+ }
+ return true;
+}
+
+bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){
+ fprintf(output, "\\2-gram\n");
+
+ /* Retrieve all user items. */
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+ bigram->get_all_items(items);
+
+ PhraseItem item;
+
+ for(size_t i = 0; i < items->len; i++){
+ phrase_token_t token = g_array_index(items, phrase_token_t, i);
+ SingleGram * single_gram = NULL;
+ bigram->load(token, single_gram);
+
+ BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+ single_gram->retrieve_all(array);
+ for(size_t j = 0; j < array->len; j++) {
+ BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
+
+ char * word1 = taglib_token_to_string(phrase_index, token);
+ char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+ guint32 freq = item->m_count;
+
+ if ( word1 && word2)
+ fprintf(output, "\\item %d %s %d %s count %d\n",
+ token, word1, item->m_token, word2, freq);
+
+ g_free(word1); g_free(word2);
+ }
+
+ g_array_free(array, TRUE);
+ delete single_gram;
+ }
+
+ g_array_free(items, TRUE);
+ return true;
+}
diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp
new file mode 100644
index 0000000..4216b44
--- /dev/null
+++ b/utils/storage/gen_binary_files.cpp
@@ -0,0 +1,115 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+ {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+ {NULL}
+};
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate binary files");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ bool retval = system_table_info.load(filename);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+ g_free(filename);
+
+ /* generate pinyin index and phrase table */
+ pinyin_option_t options = USE_TONE;
+ ChewingLargeTable chewing_table(options);
+ PhraseLargeTable2 phrase_table;
+
+ /* generate phrase index */
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
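+ /* Walk every configured table; only SYSTEM_FILE and DICTIONARY tables are
+ loaded here. */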
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+ assert(table_info->m_dict_index == i);
+
+ if (SYSTEM_FILE != table_info->m_file_type &&
+ DICTIONARY != table_info->m_file_type)
+ continue;
+
+ const char * tablename = table_info->m_table_filename;
+
+ filename = g_build_filename(table_dir, tablename, NULL);
+ FILE * tablefile = fopen(filename, "r");
+
+ if (NULL == tablefile) {
+ fprintf(stderr, "open %s failed!\n", tablename);
+ exit(ENOENT);
+ }
+
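+ /* Feed the same text table into all three structures: the chewing index,
+ the phrase table and the phrase index, rewinding the file between passes. */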
+ chewing_table.load_text(tablefile);
+ fseek(tablefile, 0L, SEEK_SET);
+ phrase_table.load_text(tablefile);
+ fseek(tablefile, 0L, SEEK_SET);
+ phrase_index.load_text(i, tablefile);
+ fclose(tablefile);
+ g_free(filename);
+ }
+
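+ /* Serialize each table into a memory chunk, write it to its binary file,
+ and load the chunk back into the table. */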
+ MemoryChunk * new_chunk = new MemoryChunk;
+ chewing_table.store(new_chunk);
+ new_chunk->save(SYSTEM_PINYIN_INDEX);
+ chewing_table.load(new_chunk);
+
+ new_chunk = new MemoryChunk;
+ phrase_table.store(new_chunk);
+ new_chunk->save(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(new_chunk);
+
+ phrase_index.compact();
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ if (!save_dictionary(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp
new file mode 100644
index 0000000..3b541d1
--- /dev/null
+++ b/utils/storage/gen_pinyin_table.cpp
@@ -0,0 +1,330 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <stdio.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+
+
+void print_help(){
+ printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> \n"
+ "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
+ "<OUTPUTFILE> the result output file\n"
+ "<FILEi> input pinyin files\n"
+ "<PHRASE_INDEX> phrase index identifier\n");
+}
+
+
+static gint phrase_index = 0;
+static const gchar * outputfile = "temp.out";
+
+static GOptionEntry entries[] =
+{
+ {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
+ {NULL}
+};
+
+
+using namespace pinyin;
+
+/* map from phrase_item to GArray of chewing_and_freq_item */
+GTree * g_chewing_tree;
+/* Array of GArray of phrase_and_array_item */
+GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
+
+struct phrase_item{
+ size_t length;
+ gunichar * uniphrase;
+};
+
+struct chewing_and_freq_item{
+ ChewingKeyVector keys;
+ ChewingKeyRestVector key_rests;
+ guint32 freq;
+};
+
+struct phrase_and_array_item{
+ phrase_item phrase; /* the key of g_chewing_tree */
+ /* Array of chewing_and_freq_item */
+ GArray * chewing_and_freq_array; /* the value of g_chewing_tree */
+};
+
+
+void feed_file(const char * filename);
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data);
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+ gpointer userdata);
+
+void gen_phrase_file(const char * outputfile, int phrase_index);
+
+
+gint phrase_item_compare(gconstpointer a, gconstpointer b){
+ phrase_item * itema = (phrase_item *) a;
+ phrase_item * itemb = (phrase_item *) b;
+ if ( itema->length != itemb->length )
+ return itema->length - itemb->length;
+ else
+ return memcmp(itema->uniphrase, itemb->uniphrase,
+ sizeof(gunichar) * itema->length);
+}
+
+
+int main(int argc, char * argv[]){
+ int i;
+
+ g_chewing_tree = g_tree_new(phrase_item_compare);
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate pinyin table");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ for (i = 1; i < argc; ++i) {
+ feed_file(argv[i]);
+ }
+
+ printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
+
+ /* store in item array */
+ g_item_array[0] = NULL;
+ for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+ g_item_array[i] = g_array_new
+ (FALSE, TRUE, sizeof(phrase_and_array_item));
+ }
+ g_tree_foreach(g_chewing_tree, store_one_item, NULL);
+
+ /* sort item array */
+ for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
+ g_array_sort_with_data(g_item_array[i], phrase_array_compare, &i);
+ }
+
+ gen_phrase_file(outputfile, phrase_index);
+
+ return 0;
+}
+
+void feed_file ( const char * filename){
+ char phrase[1024], pinyin[1024];
+ guint32 freq;
+
+ FILE * infile = fopen(filename, "r");
+ if ( NULL == infile ){
+ fprintf(stderr, "Can't open file %s.\n", filename);
+ exit(ENOENT);
+ }
+
+ while ( !feof(infile)){
+ /* bound each string read to the 1024-byte buffers; skip short or failed reads. */
+ int num = fscanf(infile, "%1023s %1023s %u",
+ phrase, pinyin, &freq);
+
+ if (3 != num)
+ continue;
+
+ feed_line(phrase, pinyin, freq);
+ }
+
+ fclose(infile);
+}
+
+void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
+ phrase_item * item = new phrase_item;
+ item->length = g_utf8_strlen(phrase, -1);
+
+ /* FIXME: ">" is changed to ">=" to stay consistent with
+ * pinyin_large_table.cpp, which holds the code I don't want to touch. :-)
+ */
+
+ if (item->length >= MAX_PHRASE_LENGTH) {
+ fprintf(stderr, "Too long phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
+ delete item;
+ return;
+ }
+
+ item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
+
+ FullPinyinParser2 parser;
+ ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
+ ChewingKeyRestVector key_rests = g_array_new
+ (FALSE, FALSE, sizeof(ChewingKeyRest));
+
+ pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
+ parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
+ assert(keys->len == key_rests->len);
+
+ if (keys->len != item->length) {
+ fprintf(stderr, "Invalid pinyin:%s\t%s\t%u\n", phrase, pinyin, freq);
+ delete item;
+ return;
+ }
+
+ GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
+
+ chewing_and_freq_item value_item;
+ value_item.keys = keys; value_item.key_rests = key_rests;
+ value_item.freq = freq;
+
+ assert(item->length == value_item.keys->len);
+ if (NULL == array) {
+ array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
+ g_array_append_val(array, value_item);
+ g_tree_insert(g_chewing_tree, item, array);
+ return;
+ }
+
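+ /* Merge a duplicate pronunciation into the existing entry by accumulating
+ its frequency. */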
+ bool found = false;
+ for (size_t i = 0; i < array->len; ++i) {
+ chewing_and_freq_item * cur_item =
+ &g_array_index(array, chewing_and_freq_item, i);
+ int result = pinyin_exact_compare2
+ ((ChewingKey *) value_item.keys->data,
+ (ChewingKey *) cur_item->keys->data,
+ value_item.keys->len);
+
+ if (0 == result) {
+ fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
+ phrase, pinyin, freq);
+ cur_item->freq += freq;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ g_array_append_val(array, value_item);
+ g_tree_insert(g_chewing_tree, item, array);
+ } else {
+ /* clean up */
+ g_array_free(keys, TRUE);
+ g_array_free(key_rests, TRUE);
+ }
+
+ delete item;
+}
+
+
+gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
+ phrase_and_array_item item;
+ item.phrase = *((phrase_item *) key);
+ item.chewing_and_freq_array = (GArray *) value;
+ int len = item.phrase.length;
+ g_array_append_val(g_item_array[len], item);
+ return FALSE;
+}
+
+
+int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
+ gpointer userdata) {
+ int phrase_length = *((int *) userdata);
+ phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs;
+ phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs;
+
+ ChewingKeyVector keys_lhs = g_array_index
+ (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
+ ChewingKeyVector keys_rhs = g_array_index
+ (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
+ return pinyin_exact_compare2((ChewingKey *)keys_lhs->data,
+ (ChewingKey *)keys_rhs->data, phrase_length);
+}
+
+
+void gen_phrase_file(const char * outputfile, int phrase_index){
+ FILE * outfile = fopen(outputfile, "w");
+ if (NULL == outfile ) {
+ fprintf(stderr, "Can't write file %s.\n", outputfile);
+ exit(ENOENT);
+ }
+
+ phrase_token_t token = 1;
+
+ /* phrase length index */
+ for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
+ GArray * item_array = g_item_array[i];
+
+ /* item array index */
+ for (size_t m = 0; m < item_array->len; ++m) {
+ phrase_and_array_item * item = &g_array_index
+ (item_array, phrase_and_array_item, m);
+ phrase_item phrase = item->phrase;
+ GArray * chewing_and_freqs = item->chewing_and_freq_array;
+
+ gchar * phrase_str = g_ucs4_to_utf8
+ (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
+
+ /* iterate each pinyin */
+ for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
+ chewing_and_freq_item * chewing_and_freq =
+ &g_array_index
+ (chewing_and_freqs, chewing_and_freq_item, n);
+
+ ChewingKeyVector keys = chewing_and_freq->keys;
+ ChewingKeyRestVector key_rests = chewing_and_freq->key_rests;
+
+ GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
+ gchar * pinyin = NULL;
+
+ size_t k;
+ for (k = 0; k < keys->len; ++k) {
+ ChewingKey key = g_array_index(keys, ChewingKey, k);
+ ChewingKeyRest key_rest = g_array_index
+ (key_rests, ChewingKeyRest, k);
+
+ //assert (CHEWING_ZERO_TONE != key.m_tone);
+ pinyin = key.get_pinyin_string();
+ g_array_append_val(pinyins, pinyin);
+ }
+ gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
+
+ for (k = 0; k < pinyins->len; ++k) {
+ g_free(g_array_index(pinyins, gchar *, k));
+ }
+ g_array_free(pinyins, TRUE);
+
+ guint32 freq = chewing_and_freq->freq;
+
+ /* enforce a minimum freq of 3 to avoid zero frequencies */
+ if (freq < 3) freq = 3;
+
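+ /* Each output line is: pinyin'pinyin...<TAB>phrase<TAB>token<TAB>freq. */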
+ fprintf(outfile, "%s\t%s\t%d\t%d\n",
+ pinyin_str, phrase_str,
+ PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq);
+
+ g_free(pinyin_str);
+ }
+ g_free(phrase_str);
+ token++;
+ }
+ }
+
+ fclose(outfile);
+}
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
new file mode 100644
index 0000000..205a27a
--- /dev/null
+++ b/utils/storage/import_interpolation.cpp
@@ -0,0 +1,313 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+ {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+ {NULL}
+};
+
+
+enum LINE_TYPE{
+ BEGIN_LINE = 1,
+ END_LINE,
+ GRAM_1_LINE,
+ GRAM_2_LINE,
+ GRAM_1_ITEM_LINE,
+ GRAM_2_ITEM_LINE
+};
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline();
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index);
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * bigram);
+
+static ssize_t my_getline(FILE * input){
+ ssize_t result = getline(&linebuf, &len, input);
+ if ( result == -1 )
+ return result;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+ return result;
+}
+
+bool parse_headline(){
+ /* enter "\data" line */
+ assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
+
+ /* read "\data" line */
+ if ( !taglib_read(linebuf, line_type, values, required) ) {
+ fprintf(stderr, "error: interpolation model expected.\n");
+ return false;
+ }
+
+ assert(line_type == BEGIN_LINE);
+ /* check header */
+ TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+ if ( !( strcmp("interpolation", model) == 0 ) ) {
+ fprintf(stderr, "error: interpolation model expected.\n");
+ return false;
+ }
+ return true;
+}
+
+bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
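+ /* Dispatch on the recognized line type; each sub-parser consumes its own
+ section and leaves the next section header in linebuf, hence the retry label. */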
+ do {
+ retry:
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case END_LINE:
+ goto end;
+ case GRAM_1_LINE:
+ my_getline(input);
+ parse_unigram(input, phrase_table, phrase_index);
+ goto retry;
+ case GRAM_2_LINE:
+ my_getline(input);
+ parse_bigram(input, phrase_table, phrase_index, bigram);
+ goto retry;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1) ;
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", ""));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_1_ITEM_LINE:{
+ /* handle \item in \1-gram */
+ TAGLIB_GET_TOKEN(token, 0);
+ TAGLIB_GET_PHRASE_STRING(word, 1);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token, word));
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ phrase_index->add_unigram_frequency(token, count);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", ""));
+
+ phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL;
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_2_ITEM_LINE:{
+ /* handle \item in \2-gram */
+ /* two tokens */
+ TAGLIB_GET_TOKEN(token1, 0);
+ TAGLIB_GET_PHRASE_STRING(word1, 1);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token1, word1));
+
+ TAGLIB_GET_TOKEN(token2, 2);
+ TAGLIB_GET_PHRASE_STRING(word2, 3);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token2, word2));
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+
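+ /* When the first token changes, store the accumulated single gram and
+ load (or create) the one for the new token. */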
+ if ( last_token != token1 ) {
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+
+ /* safe guard */
+ last_token = null_token;
+ last_single_gram = NULL;
+ }
+ SingleGram * single_gram = NULL;
+ bigram->load(token1, single_gram);
+
+ /* create the new single gram */
+ if ( single_gram == NULL )
+ single_gram = new SingleGram;
+ last_token = token1;
+ last_single_gram = single_gram;
+ }
+
+ /* save the freq */
+ assert(NULL != last_single_gram);
+ guint32 total_freq = 0;
+ assert(last_single_gram->get_total_freq(total_freq));
+ assert(last_single_gram->insert_freq(token2, count));
+ total_freq += count;
+ assert(last_single_gram->set_total_freq(total_freq));
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+ //safe guard
+ last_token = 0;
+ last_single_gram = NULL;
+ }
+
+ taglib_pop_state();
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ const char * bigram_filename = SYSTEM_BIGRAM;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- import interpolation model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ bool retval = system_table_info.load(filename);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+ g_free(filename);
+
+ PhraseLargeTable2 phrase_table;
+
+ MemoryChunk * chunk = new MemoryChunk;
+ retval = chunk->load(SYSTEM_PHRASE_INDEX);
+ if (!retval) {
+ fprintf(stderr, "open phrase_index.bin failed!\n");
+ exit(ENOENT);
+ }
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+ if (!retval) {
+ fprintf(stderr, "open %s failed!\n", bigram_filename);
+ exit(ENOENT);
+ }
+
+ taglib_init();
+
+ values = g_ptr_array_new();
+ required = g_hash_table_new(g_str_hash, g_str_equal);
+
+ /* read first line */
+ ssize_t result = my_getline(input);
+ if ( result == -1 ) {
+ fprintf(stderr, "empty file input.\n");
+ exit(ENODATA);
+ }
+
+ if (!parse_headline())
+ exit(ENODATA);
+
+ result = my_getline(input);
+ if ( result != -1 )
+ parse_body(input, &phrase_table, &phrase_index, &bigram);
+
+ taglib_fini();
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
diff --git a/utils/training/CMakeLists.txt b/utils/training/CMakeLists.txt
new file mode 100644
index 0000000..ee59bcd
--- /dev/null
+++ b/utils/training/CMakeLists.txt
@@ -0,0 +1,129 @@
+add_executable(
+ gen_ngram
+ gen_ngram.cpp
+)
+
+target_link_libraries(
+ gen_ngram
+ libpinyin
+)
+
+add_executable(
+ gen_deleted_ngram
+ gen_deleted_ngram.cpp
+)
+
+target_link_libraries(
+ gen_deleted_ngram
+ libpinyin
+)
+
+add_executable(
+ gen_unigram
+ gen_unigram.cpp
+)
+
+target_link_libraries(
+ gen_unigram
+ libpinyin
+)
+
+add_executable(
+ gen_k_mixture_model
+ gen_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ gen_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ estimate_interpolation
+ estimate_interpolation.cpp
+)
+
+target_link_libraries(
+ estimate_interpolation
+ libpinyin
+)
+
+add_executable(
+ estimate_k_mixture_model
+ estimate_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ estimate_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ merge_k_mixture_model
+ merge_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ merge_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ prune_k_mixture_model
+ prune_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ prune_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ import_k_mixture_model
+ import_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ import_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ export_k_mixture_model
+ export_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ export_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ k_mixture_model_to_interpolation
+ k_mixture_model_to_interpolation.cpp
+)
+
+target_link_libraries(
+ k_mixture_model_to_interpolation
+ libpinyin
+)
+
+add_executable(
+ validate_k_mixture_model
+ validate_k_mixture_model.cpp
+)
+
+target_link_libraries(
+ validate_k_mixture_model
+ libpinyin
+)
+
+add_executable(
+ eval_correction_rate
+ eval_correction_rate.cpp
+)
+
+target_link_libraries(
+ eval_correction_rate
+ libpinyin
+)
\ No newline at end of file
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
new file mode 100644
index 0000000..dc834ec
--- /dev/null
+++ b/utils/training/Makefile.am
@@ -0,0 +1,97 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+INCLUDES = -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ -I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
+ @GLIB2_CFLAGS@
+
+noinst_HEADERS = k_mixture_model.h
+
+bin_PROGRAMS = gen_unigram
+
+noinst_PROGRAMS = gen_ngram \
+ gen_deleted_ngram \
+ gen_k_mixture_model \
+ estimate_interpolation \
+ estimate_k_mixture_model \
+ merge_k_mixture_model \
+ prune_k_mixture_model \
+ import_k_mixture_model \
+ export_k_mixture_model \
+ k_mixture_model_to_interpolation \
+ validate_k_mixture_model \
+ eval_correction_rate
+
+gen_ngram_SOURCES = gen_ngram.cpp
+
+gen_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_deleted_ngram_SOURCES = gen_deleted_ngram.cpp
+
+gen_deleted_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_unigram_SOURCES = gen_unigram.cpp
+
+gen_unigram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+gen_k_mixture_model_SOURCES = gen_k_mixture_model.cpp
+
+gen_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+estimate_interpolation_SOURCES = estimate_interpolation.cpp
+
+estimate_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+estimate_k_mixture_model_SOURCES = estimate_k_mixture_model.cpp
+
+estimate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \
+ @GLIB2_LIBS@
+
+merge_k_mixture_model_SOURCES = merge_k_mixture_model.cpp
+
+merge_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+prune_k_mixture_model_SOURCES = prune_k_mixture_model.cpp
+
+prune_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+import_k_mixture_model_SOURCES = import_k_mixture_model.cpp
+
+import_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+export_k_mixture_model_SOURCES = export_k_mixture_model.cpp
+
+export_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp
+
+k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin_internal.la \
+ @GLIB2_LIBS@
+
+validate_k_mixture_model_SOURCES = validate_k_mixture_model.cpp
+
+validate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \
+ @GLIB2_LIBS@
+
+eval_correction_rate_SOURCES = eval_correction_rate.cpp
+
+eval_correction_rate_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp
new file mode 100644
index 0000000..5cdc680
--- /dev/null
+++ b/utils/training/estimate_interpolation.cpp
@@ -0,0 +1,144 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2008 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <math.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+parameter_t compute_interpolation(SingleGram * deleted_bigram,
+ FacadePhraseIndex * unigram,
+ SingleGram * bigram){
+ bool success;
+ parameter_t lambda = 0, next_lambda = 0.6;
+ parameter_t epsilon = 0.001;
+
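+ /* Deleted interpolation: iterate the lambda estimate over the deleted
+ (held-out) bigram counts until it converges within epsilon. */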
+ while ( fabs(lambda - next_lambda) > epsilon){
+ lambda = next_lambda;
+ next_lambda = 0;
+ guint32 table_num = 0;
+ parameter_t numerator = 0;
+ parameter_t part_of_denominator = 0;
+
+ BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+ deleted_bigram->retrieve_all(array);
+
+ for ( int i = 0; i < array->len; ++i){
+ BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, i);
+ //get the phrase token
+ phrase_token_t token = item->m_token;
+ guint32 deleted_count = item->m_count;
+
+ {
+ guint32 freq = 0;
+ parameter_t elem_poss = 0;
+ if (bigram && bigram->get_freq(token, freq)){
+ guint32 total_freq;
+ assert(bigram->get_total_freq(total_freq));
+ assert(0 != total_freq);
+ elem_poss = freq / (parameter_t) total_freq;
+ }
+ numerator = lambda * elem_poss;
+ }
+
+ {
+ parameter_t elem_poss = 0;
+ PhraseItem item;
+ if (!unigram->get_phrase_item(token, item)){
+ guint32 freq = item.get_unigram_frequency();
+ guint32 total_freq = unigram->get_phrase_index_total_freq();
+ elem_poss = freq / (parameter_t)total_freq;
+ }
+ part_of_denominator = (1 - lambda) * elem_poss;
+ }
+
+ if (0 == (numerator + part_of_denominator))
+ continue;
+
+ next_lambda += deleted_count * (numerator / (numerator + part_of_denominator));
+ }
+ assert(deleted_bigram->get_total_freq(table_num));
+ next_lambda /= table_num;
+
+ g_array_free(array, TRUE);
+ }
+ lambda = next_lambda;
+ return lambda;
+}
+
+int main(int argc, char * argv[]){
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
+
+ Bigram deleted_bigram;
+ deleted_bigram.attach(DELETED_BIGRAM, ATTACH_READONLY);
+
+ GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ deleted_bigram.get_all_items(deleted_items);
+
+ parameter_t lambda_sum = 0;
+ int lambda_count = 0;
+
+ for ( int i = 0; i < deleted_items->len; ++i ){
+ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
+ SingleGram * single_gram = NULL;
+ bigram.load(*token, single_gram);
+
+ SingleGram * deleted_single_gram = NULL;
+ deleted_bigram.load(*token, deleted_single_gram);
+
+ parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram);
+
+ printf("token:%d lambda:%f\n", *token, lambda);
+
+ lambda_sum += lambda;
+ lambda_count ++;
+
+ if (single_gram)
+ delete single_gram;
+ delete deleted_single_gram;
+ }
+
+ printf("average lambda:%f\n", (lambda_sum/lambda_count));
+ g_array_free(deleted_items, TRUE);
+ return 0;
+}
+
diff --git a/utils/training/estimate_k_mixture_model.cpp b/utils/training/estimate_k_mixture_model.cpp
new file mode 100644
index 0000000..c0fa03f
--- /dev/null
+++ b/utils/training/estimate_k_mixture_model.cpp
@@ -0,0 +1,159 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+static const gchar * bigram_filename = "k_mixture_model_ngram.db";
+static const gchar * deleted_bigram_filename = "k_mixture_model_deleted_ngram.db";
+
+static GOptionEntry entries[] =
+{
+ {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "the bigram file", NULL},
+ {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &deleted_bigram_filename, "the deleted bigram file", NULL},
+ {NULL}
+};
+
+
+parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram,
+ KMixtureModelBigram * unigram,
+ KMixtureModelSingleGram * bigram){
+ bool success;
+ parameter_t lambda = 0, next_lambda = 0.6;
+ parameter_t epsilon = 0.001;
+
+ KMixtureModelMagicHeader magic_header;
+ assert(unigram->get_magic_header(magic_header));
+ assert(0 != magic_header.m_total_freq);
+
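+ /* Same deleted-interpolation iteration as in estimate_interpolation.cpp,
+ driven here by the k mixture model counts (m_WC and m_freq). */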
+ while (fabs(lambda - next_lambda) > epsilon){
+ lambda = next_lambda;
+ next_lambda = 0;
+ parameter_t numerator = 0;
+ parameter_t part_of_denominator = 0;
+
+ FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ deleted_bigram->retrieve_all(array);
+
+ for ( size_t i = 0; i < array->len; ++i){
+ KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i);
+ //get the phrase token
+ phrase_token_t token = item->m_token;
+ guint32 deleted_count = item->m_item.m_WC;
+
+ {
+ parameter_t elem_poss = 0;
+ KMixtureModelArrayHeader array_header;
+ KMixtureModelArrayItem array_item;
+ if ( bigram && bigram->get_array_item(token, array_item) ){
+ assert(bigram->get_array_header(array_header));
+ assert(0 != array_header.m_WC);
+ elem_poss = array_item.m_WC / (parameter_t) array_header.m_WC;
+ }
+ numerator = lambda * elem_poss;
+ }
+
+ {
+ parameter_t elem_poss = 0;
+ KMixtureModelArrayHeader array_header;
+ if (unigram->get_array_header(token, array_header)){
+ elem_poss = array_header.m_freq / (parameter_t) magic_header.m_total_freq;
+ }
+ part_of_denominator = (1 - lambda) * elem_poss;
+ }
+ if (0 == (numerator + part_of_denominator))
+ continue;
+
+ next_lambda += deleted_count * (numerator / (numerator + part_of_denominator));
+ }
+ KMixtureModelArrayHeader header;
+ assert(deleted_bigram->get_array_header(header));
+ assert(0 != header.m_WC);
+ next_lambda /= header.m_WC;
+
+ g_array_free(array, TRUE);
+ }
+ lambda = next_lambda;
+ return lambda;
+}
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- estimate k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ /* TODO: magic header signature check here. */
+ KMixtureModelBigram unigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ unigram.attach(bigram_filename, ATTACH_READONLY);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(bigram_filename, ATTACH_READONLY);
+
+ KMixtureModelBigram deleted_bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ deleted_bigram.attach(deleted_bigram_filename, ATTACH_READONLY);
+
+ GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ deleted_bigram.get_all_items(deleted_items);
+
+ parameter_t lambda_sum = 0;
+ int lambda_count = 0;
+
+ for( size_t i = 0; i < deleted_items->len; ++i ){
+ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
+ KMixtureModelSingleGram * single_gram = NULL;
+ bigram.load(*token, single_gram);
+
+ KMixtureModelSingleGram * deleted_single_gram = NULL;
+ deleted_bigram.load(*token, deleted_single_gram);
+
+ KMixtureModelArrayHeader array_header;
+ if (single_gram)
+ assert(single_gram->get_array_header(array_header));
+ KMixtureModelArrayHeader deleted_array_header;
+ assert(deleted_single_gram->get_array_header(deleted_array_header));
+
+ if ( 0 != deleted_array_header.m_WC ) {
+ parameter_t lambda = compute_interpolation(deleted_single_gram, &unigram, single_gram);
+
+ printf("token:%d lambda:%f\n", *token, lambda);
+
+ lambda_sum += lambda;
+ lambda_count ++;
+ }
+
+ if (single_gram)
+ delete single_gram;
+ delete deleted_single_gram;
+ }
+
+ printf("average lambda:%f\n", (lambda_sum/lambda_count));
+ g_array_free(deleted_items, TRUE);
+ return 0;
+}
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp
new file mode 100644
index 0000000..b45781d
--- /dev/null
+++ b/utils/training/eval_correction_rate.cpp
@@ -0,0 +1,211 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){
+ printf("Usage: eval_correction_rate\n");
+}
+
+bool get_possible_pinyin(FacadePhraseIndex * phrase_index,
+ TokenVector tokens, ChewingKeyVector keys){
+ ChewingKey buffer[MAX_PHRASE_LENGTH];
+ size_t key_index; guint32 max_freq;
+ guint32 freq;
+ g_array_set_size(keys, 0);
+
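+ /* For each token, pick its most frequent pronunciation and append those
+ chewing keys to the result. */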
+ for (size_t i = 0; i < tokens->len; ++i){
+ phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i);
+ PhraseItem item;
+ phrase_index->get_phrase_item(*token, item);
+ key_index = 0; max_freq = 0;
+ for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) {
+ freq = 0;
+ assert(item.get_nth_pronunciation(m, buffer, freq));
+ if ( freq > max_freq ) {
+ key_index = m;
+ max_freq = freq;
+ }
+ }
+
+ assert(item.get_nth_pronunciation(key_index, buffer, freq));
+ assert(max_freq == freq);
+ guint8 len = item.get_phrase_length();
+ g_array_append_vals(keys, buffer, len);
+ }
+ return true;
+}
+
+bool get_best_match(PinyinLookup2 * pinyin_lookup,
+ ChewingKeyVector keys, TokenVector tokens){
+ /* prepare the prefixes for get_best_match. */
+ TokenVector prefixes = g_array_new
+ (FALSE, FALSE, sizeof(phrase_token_t));
+ g_array_append_val(prefixes, sentence_start);
+
+ /* initialize constraints. */
+ CandidateConstraints constraints = g_array_new
+ (TRUE, FALSE, sizeof(lookup_constraint_t));
+ g_array_set_size(constraints, keys->len);
+ for ( size_t i = 0; i < constraints->len; ++i ) {
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+
+ bool retval = pinyin_lookup->get_best_match(prefixes, keys, constraints, tokens);
+
+ g_array_free(prefixes, TRUE);
+ g_array_free(constraints, TRUE);
+ return retval;
+}
+
+bool do_one_test(PinyinLookup2 * pinyin_lookup,
+ FacadePhraseIndex * phrase_index,
+ TokenVector tokens){
+ bool retval = false;
+
+ ChewingKeyVector keys = g_array_new(FALSE, TRUE, sizeof(ChewingKey));
+ TokenVector guessed_tokens = g_array_new
+ (FALSE, TRUE, sizeof(phrase_token_t));
+
+ get_possible_pinyin(phrase_index, tokens, keys);
+ get_best_match(pinyin_lookup, keys, guessed_tokens);
+ /* compare the results */
+ char * sentence = NULL; char * guessed_sentence = NULL;
+ pinyin_lookup->convert_to_utf8(tokens, sentence);
+ pinyin_lookup->convert_to_utf8
+ (guessed_tokens, guessed_sentence);
+
+ if ( strcmp(sentence, guessed_sentence) != 0 ) {
+ fprintf(stderr, "test sentence:%s\n", sentence);
+ fprintf(stderr, "guessed sentence:%s\n", guessed_sentence);
+ fprintf(stderr, "the result mis-matches.\n");
+ retval = false;
+ } else {
+ retval = true;
+ }
+
+ g_free(sentence); g_free(guessed_sentence);
+ g_array_free(keys, TRUE);
+ g_array_free(guessed_tokens, TRUE);
+ return retval;
+}
+
+int main(int argc, char * argv[]){
+ const char * evals_text = "evals2.text";
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ pinyin_option_t options = USE_TONE;
+ FacadeChewingTable largetable;
+
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PINYIN_INDEX);
+ largetable.load(options, chunk, NULL);
+
+ FacadePhraseTable2 phrase_table;
+ chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk, NULL);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram system_bigram;
+ system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
+ Bigram user_bigram;
+ user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE);
+
+ gfloat lambda = system_table_info.get_lambda();
+
+ PinyinLookup2 pinyin_lookup(lambda, options,
+ &largetable, &phrase_index,
+ &system_bigram, &user_bigram);
+
+ /* open evals text. */
+ FILE * evals_file = fopen(evals_text, "r");
+ if ( NULL == evals_file ) {
+ fprintf(stderr, "Can't open file:%s\n", evals_text);
+ exit(ENOENT);
+ }
+
+ /* Evaluates the correction rate of test text documents. */
+ size_t tested_count = 0; size_t passed_count = 0;
+ char* linebuf = NULL; size_t size = 0;
+ TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t));
+
+ phrase_token_t token = null_token;
+ while( getline(&linebuf, &size, evals_file) != -1 ) {
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+ if ( null_token == token ) {
+ if ( tokens->len ) { /* one test. */
+ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
+ tested_count ++; passed_count ++;
+ } else {
+ tested_count ++;
+ }
+ g_array_set_size(tokens, 0);
+ }
+ } else {
+ g_array_append_val(tokens, token);
+ }
+ }
+
+ if ( tokens->len ) { /* one test. */
+ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
+ tested_count ++; passed_count ++;
+ } else {
+ tested_count ++;
+ }
+ }
+
+ parameter_t rate = passed_count / (parameter_t) tested_count;
+ printf("correction rate:%f\n", rate);
+
+ g_array_free(tokens, TRUE);
+ fclose(evals_file);
+ free(linebuf);
+
+ return 0;
+}
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
new file mode 100644
index 0000000..e446e79
--- /dev/null
+++ b/utils/training/export_k_mixture_model.cpp
@@ -0,0 +1,156 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+#include "utils_helper.h"
+
+static const gchar * k_mixture_model_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL},
+ {NULL}
+};
+
+
+bool print_k_mixture_model_magic_header(FILE * output,
+ KMixtureModelBigram * bigram){
+ KMixtureModelMagicHeader magic_header;
+ if ( !bigram->get_magic_header(magic_header) ){
+ fprintf(stderr, "no magic header in k mixture model.\n");
+ exit(ENODATA);
+ }
+ fprintf(output, "\\data model \"k mixture model\" count %d N %d "
+ "total_freq %d\n", magic_header.m_WC, magic_header.m_N,
+ magic_header.m_total_freq);
+ return true;
+}
+
+bool print_k_mixture_model_array_headers(FILE * output,
+ KMixtureModelBigram * bigram,
+ FacadePhraseIndex * phrase_index){
+ fprintf(output, "\\1-gram\n");
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram->get_all_items(items);
+
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t token = g_array_index(items, phrase_token_t, i);
+ KMixtureModelArrayHeader array_header;
+ assert(bigram->get_array_header(token, array_header));
+ char * phrase = taglib_token_to_string(phrase_index, token);
+ if ( phrase )
+ fprintf(output, "\\item %d %s count %d freq %d\n",
+ token, phrase, array_header.m_WC, array_header.m_freq);
+
+ g_free(phrase);
+ }
+ return true;
+}
+
+bool print_k_mixture_model_array_items(FILE * output,
+ KMixtureModelBigram * bigram,
+ FacadePhraseIndex * phrase_index){
+ fprintf(output, "\\2-gram\n");
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram->get_all_items(items);
+
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t token = g_array_index(items, phrase_token_t, i);
+ KMixtureModelSingleGram * single_gram = NULL;
+ assert(bigram->load(token, single_gram));
+ FlexibleBigramPhraseArray array = g_array_new
+ (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ single_gram->retrieve_all(array);
+
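+ /* Emit one \item line per bigram pair with its k mixture model counters
+ (m_WC, m_N_n_0, m_n_1, m_Mr). */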
+ for (size_t m = 0; m < array->len; ++m){
+ KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m);
+ char * word1 = taglib_token_to_string(phrase_index, token);
+ char * word2 = taglib_token_to_string(phrase_index, item->m_token);
+
+ if (word1 && word2)
+ fprintf(output, "\\item %d %s %d %s count %d T %d N_n_0 %d n_1 %d Mr %d\n",
+ token, word1, item->m_token, word2,
+ item->m_item.m_WC, item->m_item.m_WC,
+ item->m_item.m_N_n_0, item->m_item.m_n_1,
+ item->m_item.m_Mr);
+
+ g_free(word1); g_free(word2);
+ }
+
+ g_array_free(array, TRUE);
+ delete single_gram;
+ }
+
+ g_array_free(items, TRUE);
+ return true;
+}
+
+bool end_data(FILE * output){
+ fprintf(output, "\\end\n");
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * output = stdout;
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- export k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ if (!bigram.attach(k_mixture_model_filename, ATTACH_READONLY)) {
+ fprintf(stderr, "open %s failed.\n", k_mixture_model_filename);
+ exit(ENOENT);
+ }
+
+ print_k_mixture_model_magic_header(output, &bigram);
+ print_k_mixture_model_array_headers(output, &bigram, &phrase_index);
+ print_k_mixture_model_array_items(output, &bigram, &phrase_index);
+
+ end_data(output);
+
+ return 0;
+}
diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp
new file mode 100644
index 0000000..b6f96fa
--- /dev/null
+++ b/utils/training/gen_deleted_ngram.cpp
@@ -0,0 +1,128 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007, 2011 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static gboolean train_pi_gram = TRUE;
+static const gchar * bigram_filename = DELETED_BIGRAM;
+
+static GOptionEntry entries[] =
+{
+ {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL},
+ {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "deleted bi-gram file", NULL},
+ {NULL}
+};
+
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate deleted n-gram");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ /* load phrase table. */
+ PhraseLargeTable2 phrase_table;
+ MemoryChunk * new_chunk = new MemoryChunk;
+ new_chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(new_chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENODATA);
+
+ Bigram bigram;
+ bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+
+ char* linebuf = NULL; size_t size = 0;
+ phrase_token_t last_token, cur_token = last_token = 0;
+ while( getline(&linebuf, &size, stdin) ){
+ if ( feof(stdin) )
+ break;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+ last_token = cur_token;
+ cur_token = token;
+
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !train_pi_gram )
+ continue;
+ last_token = sentence_start;
+ }
+
+ /* train bi-gram */
+ SingleGram * single_gram = NULL;
+ bigram.load(last_token, single_gram);
+
+ if ( NULL == single_gram ){
+ single_gram = new SingleGram;
+ }
+ guint32 freq, total_freq;
+ //increase freq
+ if (single_gram->get_freq(cur_token, freq))
+ assert(single_gram->set_freq(cur_token, freq + 1));
+ else
+ assert(single_gram->insert_freq(cur_token, 1));
+ //increase total freq
+ single_gram->get_total_freq(total_freq);
+ single_gram->set_total_freq(total_freq + 1);
+
+ bigram.store(last_token, single_gram);
+ delete single_gram;
+ }
+
+ free(linebuf);
+ return 0;
+}
diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp
new file mode 100644
index 0000000..2dfb3d1
--- /dev/null
+++ b/utils/training/gen_k_mixture_model.cpp
@@ -0,0 +1,411 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include <glib.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+#include "k_mixture_model.h"
+
+/* Hash of first token -> (hash of second token -> word pair count). */
+typedef GHashTable * HashofDocument;
+typedef GHashTable * HashofSecondWord;
+
+typedef GHashTable * HashofUnigram;
+
+
+void print_help(){
+ printf("Usage: gen_k_mixture_model [--skip-pi-gram-training]\n"
+ " [--maximum-occurs-allowed <INT>]\n"
+ " [--maximum-increase-rates-allowed <FLOAT>]\n"
+ " [--k-mixture-model-file <FILENAME>]\n"
+ " {<FILENAME>}+\n");
+}
+
+
+static gint g_maximum_occurs = 20;
+static parameter_t g_maximum_increase_rates = 3.;
+static gboolean g_train_pi_gram = TRUE;
+static const gchar * g_k_mixture_model_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &g_train_pi_gram, "skip pi-gram training", NULL},
+ {"maximum-occurs-allowed", 0, 0, G_OPTION_ARG_INT, &g_maximum_occurs, "maximum occurs allowed", NULL},
+ {"maximum-increase-rates-allowed", 0, 0, G_OPTION_ARG_DOUBLE, &g_maximum_increase_rates, "maximum increase rates allowed", NULL},
+ {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &g_k_mixture_model_filename, "k mixture model file", NULL},
+ {NULL}
+};
+
+
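+/* Rough data flow of this tool: read_document() scans one pre-segmented
+ * document and fills hash_of_document (first token -> (second token ->
+ * pair count within that document)) plus hash_of_unigram (token -> count).
+ * train_second_word() and train_single_gram() then fold those per-document
+ * counts into the on-disk k mixture model, and post_processing_unigram()
+ * adds the accumulated unigram counts to the array headers and the magic
+ * header.
+ */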
+bool read_document(PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ FILE * document,
+ HashofDocument hash_of_document,
+ HashofUnigram hash_of_unigram){
+
+ char * linebuf = NULL; size_t size = 0;
+ phrase_token_t last_token, cur_token = last_token = 0;
+
+ while ( getline(&linebuf, &size, document) ){
+ if ( feof(document) )
+ break;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf);
+
+ last_token = cur_token;
+ cur_token = token;
+
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ gpointer value = NULL;
+ gboolean lookup_result = g_hash_table_lookup_extended
+ (hash_of_unigram, GUINT_TO_POINTER(cur_token),
+ NULL, &value);
+ if ( !lookup_result ){
+ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token),
+ GUINT_TO_POINTER(1));
+ } else {
+ guint32 freq = GPOINTER_TO_UINT(value);
+ freq ++;
+ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token),
+ GUINT_TO_POINTER(freq));
+ }
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !g_train_pi_gram )
+ continue;
+ last_token = sentence_start;
+ }
+
+ /* remember the (last_token, cur_token) word pair. */
+ HashofSecondWord hash_of_second_word = NULL;
+ lookup_result = g_hash_table_lookup_extended
+ (hash_of_document, GUINT_TO_POINTER(last_token),
+ NULL, &value);
+ if ( !lookup_result ){
+ hash_of_second_word = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+ } else {
+ hash_of_second_word = (HashofSecondWord) value;
+ }
+
+ value = NULL;
+ lookup_result = g_hash_table_lookup_extended
+ (hash_of_second_word, GUINT_TO_POINTER(cur_token),
+ NULL, &value);
+ guint32 count = 0;
+ if ( lookup_result ) {
+ count = GPOINTER_TO_UINT(value);
+ }
+ count ++;
+ g_hash_table_insert(hash_of_second_word,
+ GUINT_TO_POINTER(cur_token),
+ GUINT_TO_POINTER(count));
+ g_hash_table_insert(hash_of_document,
+ GUINT_TO_POINTER(last_token),
+ hash_of_second_word);
+ }
+
+ free(linebuf);
+
+ return true;
+}
+
+static void train_word_pair(HashofUnigram hash_of_unigram,
+ KMixtureModelSingleGram * single_gram,
+ phrase_token_t token2, guint32 count){
+ KMixtureModelArrayItem array_item;
+
+ bool exists = single_gram->get_array_item(token2, array_item);
+ if ( exists ) {
+ guint32 maximum_occurs_allowed = std_lite::max
+ ((guint32)g_maximum_occurs,
+ (guint32)ceil(array_item.m_Mr * g_maximum_increase_rates));
+ /* The count exceeds the maximum number of occurrences allowed
+ * for the word or phrase in a single document.
+ */
+ if ( count > maximum_occurs_allowed ){
+ gpointer value = NULL;
+ assert( g_hash_table_lookup_extended
+ (hash_of_unigram, GUINT_TO_POINTER(token2),
+ NULL, &value) );
+ guint32 freq = GPOINTER_TO_UINT(value);
+ freq -= count;
+ if ( freq > 0 ) {
+ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2),
+ GUINT_TO_POINTER(freq));
+ } else if ( freq == 0 ) {
+ assert(g_hash_table_steal(hash_of_unigram,
+ GUINT_TO_POINTER(token2)));
+ } else {
+ assert(false);
+ }
+ return;
+ }
+ array_item.m_WC += count;
+ /* array_item.m_T += count; the same as m_WC. */
+ array_item.m_N_n_0 ++;
+ if ( 1 == count )
+ array_item.m_n_1 ++;
+ array_item.m_Mr = std_lite::max(array_item.m_Mr, count);
+ assert(single_gram->set_array_item(token2, array_item));
+ } else { /* item doesn't exist. */
+ /* the same as above. */
+ if ( count > g_maximum_occurs ){
+ gpointer value = NULL;
+ assert( g_hash_table_lookup_extended
+ (hash_of_unigram, GUINT_TO_POINTER(token2),
+ NULL, &value) );
+ guint32 freq = GPOINTER_TO_UINT(value);
+ freq -= count;
+ if ( freq > 0 ) {
+ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2),
+ GUINT_TO_POINTER(freq));
+ } else if ( freq == 0 ) {
+ assert(g_hash_table_steal(hash_of_unigram,
+ GUINT_TO_POINTER(token2)));
+ } else {
+ assert(false);
+ }
+ return;
+ }
+ memset(&array_item, 0, sizeof(KMixtureModelArrayItem));
+ array_item.m_WC = count;
+ /* array_item.m_T = count; the same as m_WC. */
+ array_item.m_N_n_0 = 1;
+ if ( 1 == count )
+ array_item.m_n_1 = 1;
+ array_item.m_Mr = count;
+ assert(single_gram->insert_array_item(token2, array_item));
+ }
+
+ /* save delta in the array header. */
+ KMixtureModelArrayHeader array_header;
+ single_gram->get_array_header(array_header);
+ array_header.m_WC += count;
+ single_gram->set_array_header(array_header);
+}
+
+bool train_single_gram(HashofUnigram hash_of_unigram,
+ HashofDocument hash_of_document,
+ KMixtureModelSingleGram * single_gram,
+ phrase_token_t token1,
+ guint32 & delta){
+ assert(NULL != single_gram);
+ delta = 0; /* delta in WC of single_gram. */
+ KMixtureModelArrayHeader array_header;
+ assert(single_gram->get_array_header(array_header));
+ guint32 saved_array_header_WC = array_header.m_WC;
+
+ HashofSecondWord hash_of_second_word = NULL;
+ gpointer key, value = NULL;
+ assert(g_hash_table_lookup_extended
+ (hash_of_document, GUINT_TO_POINTER(token1),
+ NULL, &value));
+ hash_of_second_word = (HashofSecondWord) value;
+ assert(NULL != hash_of_second_word);
+
+ /* train word pair */
+ GHashTableIter iter;
+ g_hash_table_iter_init(&iter, hash_of_second_word);
+ while (g_hash_table_iter_next(&iter, &key, &value)) {
+ phrase_token_t token2 = GPOINTER_TO_UINT(key);
+ guint32 count = GPOINTER_TO_UINT(value);
+ train_word_pair(hash_of_unigram, single_gram, token2, count);
+ }
+
+ assert(single_gram->get_array_header(array_header));
+ delta = array_header.m_WC - saved_array_header_WC;
+ return true;
+}
+
+static bool train_second_word(HashofUnigram hash_of_unigram,
+ KMixtureModelBigram * bigram,
+ HashofDocument hash_of_document,
+ phrase_token_t token1){
+ guint32 delta = 0;
+
+ KMixtureModelSingleGram * single_gram = NULL;
+ bool exists = bigram->load(token1, single_gram);
+ if ( !exists )
+ single_gram = new KMixtureModelSingleGram;
+ train_single_gram(hash_of_unigram, hash_of_document,
+ single_gram, token1, delta);
+
+ if ( 0 == delta ){ /* nothing counted; see the maximum occurrences allowed above. */
+ delete single_gram;
+ return false;
+ }
+
+ /* save the single gram. */
+ assert(bigram->store(token1, single_gram));
+ delete single_gram;
+
+ KMixtureModelMagicHeader magic_header;
+ if (!bigram->get_magic_header(magic_header)){
+ /* the first time the new k mixture model file is accessed. */
+ memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader));
+ }
+
+ if ( magic_header.m_WC + delta < magic_header.m_WC ){
+ fprintf(stderr, "the m_WC integer in magic header overflows.\n");
+ return false;
+ }
+ magic_header.m_WC += delta;
+ assert(bigram->set_magic_header(magic_header));
+
+ return true;
+}
+
+/* Note: this method is a post-processing method, run this last. */
+static bool post_processing_unigram(KMixtureModelBigram * bigram,
+ HashofUnigram hash_of_unigram){
+ GHashTableIter iter;
+ gpointer key, value;
+ guint32 total_freq = 0;
+
+ g_hash_table_iter_init(&iter, hash_of_unigram);
+ while (g_hash_table_iter_next(&iter, &key, &value)){
+ guint32 token = GPOINTER_TO_UINT(key);
+ guint32 freq = GPOINTER_TO_UINT(value);
+ KMixtureModelArrayHeader array_header;
+ bool result = bigram->get_array_header(token, array_header);
+ array_header.m_freq += freq;
+ total_freq += freq;
+ bigram->set_array_header(token, array_header);
+ }
+
+ KMixtureModelMagicHeader magic_header;
+ assert(bigram->get_magic_header(magic_header));
+ if ( magic_header.m_total_freq + total_freq < magic_header.m_total_freq ){
+ fprintf(stderr, "the m_total_freq in magic header overflows.\n");
+ return false;
+ }
+ magic_header.m_total_freq += total_freq;
+ assert(bigram->set_magic_header(magic_header));
+
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ int i = 1;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ PhraseLargeTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+ while ( i < argc ){
+ const char * filename = argv[i];
+ FILE * document = fopen(filename, "r");
+ if ( NULL == document ){
+ int err_saved = errno;
+ fprintf(stderr, "can't open file: %s.\n", filename);
+ fprintf(stderr, "error:%s.\n", strerror(err_saved));
+ exit(err_saved);
+ }
+
+ HashofDocument hash_of_document = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+ HashofUnigram hash_of_unigram = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+
+ assert(read_document(&phrase_table, &phrase_index, document,
+ hash_of_document, hash_of_unigram));
+ fclose(document);
+ document = NULL;
+
+ GHashTableIter iter;
+ gpointer key, value;
+
+ /* train the document, and convert it to k mixture model. */
+ g_hash_table_iter_init(&iter, hash_of_document);
+ while (g_hash_table_iter_next(&iter, &key, &value)) {
+ phrase_token_t token1 = GPOINTER_TO_UINT(key);
+ train_second_word(hash_of_unigram, &bigram,
+ hash_of_document, token1);
+ }
+
+ KMixtureModelMagicHeader magic_header;
+ assert(bigram.get_magic_header(magic_header));
+ magic_header.m_N ++;
+ assert(bigram.set_magic_header(magic_header));
+
+ post_processing_unigram(&bigram, hash_of_unigram);
+
+ /* free the resources of hash_of_document */
+ g_hash_table_iter_init(&iter, hash_of_document);
+ while (g_hash_table_iter_next(&iter, &key, &value)) {
+ HashofSecondWord second_word = (HashofSecondWord) value;
+ g_hash_table_iter_steal(&iter);
+ g_hash_table_unref(second_word);
+ }
+ g_hash_table_unref(hash_of_document);
+ hash_of_document = NULL;
+
+ g_hash_table_unref(hash_of_unigram);
+ hash_of_unigram = NULL;
+
+ ++i;
+ }
+
+ return 0;
+}
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
new file mode 100644
index 0000000..1947959
--- /dev/null
+++ b/utils/training/gen_ngram.cpp
@@ -0,0 +1,136 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007, 2011 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static gboolean train_pi_gram = TRUE;
+static const gchar * bigram_filename = SYSTEM_BIGRAM;
+
+static GOptionEntry entries[] =
+{
+ {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL},
+ {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "bi-gram file", NULL},
+ {NULL}
+};
+
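+/* This tool mirrors gen_deleted_ngram.cpp but targets the system bi-gram:
+ * besides training (last_token, cur_token) pairs it also increases the
+ * uni-gram frequency of every seen token in the phrase index, which is
+ * written back through save_phrase_index() at the end.
+ */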
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- generate n-gram");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ PhraseLargeTable2 phrase_table;
+ /* init phrase table */
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ Bigram bigram;
+ bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+
+ char* linebuf = NULL; size_t size = 0;
+ phrase_token_t last_token, cur_token = last_token = 0;
+ while( getline(&linebuf, &size, input) ){
+ if ( feof(input) )
+ break;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+
+ TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
+
+ last_token = cur_token;
+ cur_token = token;
+
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ /* training uni-gram */
+ phrase_index.add_unigram_frequency(cur_token, 1);
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !train_pi_gram )
+ continue;
+ last_token = sentence_start;
+ }
+
+ /* train bi-gram */
+ SingleGram * single_gram = NULL;
+ bigram.load(last_token, single_gram);
+
+ if ( NULL == single_gram ){
+ single_gram = new SingleGram;
+ }
+ guint32 freq, total_freq;
+ /* increase freq */
+ if (single_gram->get_freq(cur_token, freq))
+ assert(single_gram->set_freq(cur_token, freq + 1));
+ else
+ assert(single_gram->insert_freq(cur_token, 1));
+ /* increase total freq */
+ single_gram->get_total_freq(total_freq);
+ single_gram->set_total_freq(total_freq + 1);
+
+ bigram.store(last_token, single_gram);
+ delete single_gram;
+ }
+
+ free(linebuf);
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
new file mode 100644
index 0000000..f4c51af
--- /dev/null
+++ b/utils/training/gen_unigram.cpp
@@ -0,0 +1,111 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+static const gchar * table_dir = ".";
+
+static GOptionEntry entries[] =
+{
+ {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL},
+ {NULL}
+};
+
+/* increase all unigram frequencies by a constant. */
+
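+/* Concretely: every token of every SYSTEM_FILE or DICTIONARY phrase index
+ * gets its uni-gram frequency increased by the constant freq (currently 1),
+ * and the updated indexes and dictionaries are written back at the end via
+ * save_phrase_index() and save_dictionary().
+ */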
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- increase uni-gram");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+ bool retval = system_table_info.load(filename);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+ g_free(filename);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ /* Note: please increase this value when the corpus size becomes larger,
+ * to avoid zero values when computing unigram frequencies in
+ * floating-point format.
+ */
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+ assert(table_info->m_dict_index == i);
+
+ if (SYSTEM_FILE != table_info->m_file_type &&
+ DICTIONARY != table_info->m_file_type)
+ continue;
+
+ guint32 freq = 1;
+#if 0
+ /* skip GBK_DICTIONARY. */
+ if (GBK_DICTIONARY == table_info->m_dict_index)
+ freq = 1;
+#endif
+
+ const char * binfile = table_info->m_system_filename;
+
+ MemoryChunk * chunk = new MemoryChunk;
+ bool retval = chunk->load(binfile);
+ if (!retval) {
+ fprintf(stderr, "load %s failed!\n", binfile);
+ exit(ENOENT);
+ }
+
+ phrase_index.load(i, chunk);
+
+ PhraseIndexRange range;
+ int result = phrase_index.get_range(i, range);
+ if ( result == ERROR_OK ) {
+ for (size_t token = range.m_range_begin;
+ token <= range.m_range_end; ++token) {
+ phrase_index.add_unigram_frequency(token, freq);
+ }
+ }
+ }
+
+ if (!save_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ if (!save_dictionary(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ return 0;
+}
diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp
new file mode 100644
index 0000000..40870cf
--- /dev/null
+++ b/utils/training/import_k_mixture_model.cpp
@@ -0,0 +1,322 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+#include "k_mixture_model.h"
+
+static const gchar * k_mixture_model_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL},
+ {NULL}
+};
+
+
+enum LINE_TYPE{
+ BEGIN_LINE = 1,
+ END_LINE,
+ GRAM_1_LINE,
+ GRAM_2_LINE,
+ GRAM_1_ITEM_LINE,
+ GRAM_2_ITEM_LINE
+};
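+
+/* The expected input is the textual model written by export_k_mixture_model:
+ * a "\data" head line carrying the model name and the count/N/total_freq
+ * tags, a "\1-gram" section of "\item" lines (token, phrase string, and
+ * count/freq tags), a "\2-gram" section of "\item" lines (two token/phrase
+ * pairs plus count/T/N_n_0/n_1/Mr tags), and a final "\end" line.  The
+ * exact field layout is defined by the taglib_add_tag() calls below.
+ */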
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline(KMixtureModelBigram * bigram);
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram);
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram);
+
+
+static ssize_t my_getline(FILE * input){
+ ssize_t result = getline(&linebuf, &len, input);
+ if ( result == -1 )
+ return result;
+
+ if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
+ linebuf[strlen(linebuf) - 1] = '\0';
+ }
+ return result;
+}
+
+bool parse_headline(KMixtureModelBigram * bigram){
+ /* enter "\data" line */
+ assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", ""));
+
+ /* read "\data" line */
+ if ( !taglib_read(linebuf, line_type, values, required) ) {
+ fprintf(stderr, "error: k mixture model expected.\n");
+ return false;
+ }
+
+ assert(line_type == BEGIN_LINE);
+ /* check header */
+ TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+ if ( !( strcmp("k mixture model", model) == 0 ) ) {
+ fprintf(stderr, "error: k mixture model expected.\n");
+ return false;
+ }
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ TAGLIB_GET_TAGVALUE(glong, N, atol);
+ TAGLIB_GET_TAGVALUE(glong, total_freq, atol);
+
+ KMixtureModelMagicHeader magic_header;
+ memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader));
+ magic_header.m_WC = count; magic_header.m_N = N;
+ magic_header.m_total_freq = total_freq;
+ bigram->set_magic_header(magic_header);
+
+ return true;
+}
+
+bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
+ do {
+ retry:
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case END_LINE:
+ goto end;
+ case GRAM_1_LINE:
+ my_getline(input);
+ parse_unigram(input, phrase_table, phrase_index, bigram);
+ goto retry;
+ case GRAM_2_LINE:
+ my_getline(input);
+ parse_bigram(input, phrase_table, phrase_index, bigram);
+ goto retry;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1) ;
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", ""));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_1_ITEM_LINE:{
+ /* handle \item in \1-gram */
+ TAGLIB_GET_TOKEN(token, 0);
+ TAGLIB_GET_PHRASE_STRING(word, 1);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token, word));
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ TAGLIB_GET_TAGVALUE(glong, freq, atol);
+
+ KMixtureModelArrayHeader array_header;
+ memset(&array_header, 0, sizeof(KMixtureModelArrayHeader));
+ array_header.m_WC = count; array_header.m_freq = freq;
+ bigram->set_array_header(token, array_header);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ KMixtureModelBigram * bigram){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+ "count:T:N_n_0:n_1:Mr", ""));
+
+ phrase_token_t last_token = null_token;
+ KMixtureModelSingleGram * last_single_gram = NULL;
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_2_ITEM_LINE:{
+ /* handle \item in \2-gram */
+ /* two tokens */
+ TAGLIB_GET_TOKEN(token1, 0);
+ TAGLIB_GET_PHRASE_STRING(word1, 1);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token1, word1));
+
+ TAGLIB_GET_TOKEN(token2, 2);
+ TAGLIB_GET_PHRASE_STRING(word2, 3);
+ assert(taglib_validate_token_with_string
+ (phrase_index, token2, word2));
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ TAGLIB_GET_TAGVALUE(glong, T, atol);
+ assert(count == T);
+ TAGLIB_GET_TAGVALUE(glong, N_n_0, atol);
+ TAGLIB_GET_TAGVALUE(glong, n_1, atol);
+ TAGLIB_GET_TAGVALUE(glong, Mr, atol);
+
+ KMixtureModelArrayItem array_item;
+ memset(&array_item, 0, sizeof(KMixtureModelArrayItem));
+ array_item.m_WC = count; array_item.m_N_n_0 = N_n_0;
+ array_item.m_n_1 = n_1; array_item.m_Mr = Mr;
+
+ if ( last_token != token1 ) {
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+ /* safe guard */
+ last_token = null_token;
+ last_single_gram = NULL;
+ }
+ KMixtureModelSingleGram * single_gram = NULL;
+ bigram->load(token1, single_gram);
+
+ /* create the new single gram */
+ if ( single_gram == NULL )
+ single_gram = new KMixtureModelSingleGram;
+ last_token = token1;
+ last_single_gram = single_gram;
+ }
+
+ assert(NULL != last_single_gram);
+ assert(last_single_gram->insert_array_item(token2, array_item));
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+ /* safe guard */
+ last_token = null_token;
+ last_single_gram = NULL;
+ }
+
+ taglib_pop_state();
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- import k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ SystemTableInfo system_table_info;
+
+ bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+ if (!retval) {
+ fprintf(stderr, "load table.conf failed.\n");
+ exit(ENOENT);
+ }
+
+ PhraseLargeTable2 phrase_table;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load(SYSTEM_PHRASE_INDEX);
+ phrase_table.load(chunk);
+
+ FacadePhraseIndex phrase_index;
+
+ const pinyin_table_info_t * phrase_files =
+ system_table_info.get_table_info();
+
+ if (!load_phrase_index(phrase_files, &phrase_index))
+ exit(ENOENT);
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+ taglib_init();
+
+ /* prepare to read n-gram model */
+ values = g_ptr_array_new();
+ required = g_hash_table_new(g_str_hash, g_str_equal);
+
+ ssize_t result = my_getline(input);
+ if ( result == -1 ) {
+ fprintf(stderr, "empty file input.\n");
+ exit(ENODATA);
+ }
+
+ if (!parse_headline(&bigram))
+ exit(ENODATA);
+
+ result = my_getline(input);
+ if ( result != -1 )
+ parse_body(input, &phrase_table, &phrase_index, &bigram);
+
+ taglib_fini();
+
+ return 0;
+}
diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h
new file mode 100644
index 0000000..ad8d3d8
--- /dev/null
+++ b/utils/training/k_mixture_model.h
@@ -0,0 +1,172 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef K_MIXTURE_MODEL
+#define K_MIXTURE_MODEL
+
+#include <math.h>
+#include "novel_types.h"
+#include "flexible_ngram.h"
+
+namespace pinyin{
+
+typedef guint32 corpus_count_t;
+
+/* Note: storage parameters: N, T, n_r.
+ * N: the total number of documents.
+ * T: the total number of instances of the word or phrase.
+ * n_r: the number of documents having exactly <b>r</b> occurrences.
+ * only n_0, n_1 are used here.
+ */
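+
+/* A small worked example with hypothetical counts, to make the formulas
+ * below concrete: take N = 100 documents, n_0 = 60, n_1 = 25, T = 80.
+ * Then alpha = 1 - 60/100 = 0.4, gamma = 1 - 25/(100 - 60) = 0.375 and
+ * B = (80 - 25)/(100 - 60 - 25) = 55/15 ~= 3.67, so that
+ * Pr(k = 0) = 1 - alpha = 0.6,
+ * Pr(k = 1) = alpha * (1 - gamma) = 0.25,
+ * Pr(k >= 2) = alpha * gamma = 0.15 (spread geometrically through B).
+ * The three masses sum to 1, as expected for a distribution over the
+ * per-document occurrence count k.
+ */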
+
+static inline parameter_t compute_alpha(corpus_count_t N, corpus_count_t n_0){
+ parameter_t alpha = 1 - n_0 / (parameter_t) N;
+ return alpha;
+}
+
+static inline parameter_t compute_gamma(corpus_count_t N,
+ corpus_count_t n_0,
+ corpus_count_t n_1){
+ parameter_t gamma = 1 - n_1 / (parameter_t) (N - n_0);
+ return gamma;
+}
+
+static inline parameter_t compute_B(corpus_count_t N,
+ corpus_count_t T,
+ corpus_count_t n_0,
+ corpus_count_t n_1){
+ /* Note: re-check whether this if statement can be removed. */
+ /* Note that B_2 is no less than 2 in the paper. */
+#if 1
+ if ( 0 == T - n_1 && 0 == N - n_0 - n_1 )
+ return 2;
+#endif
+
+ parameter_t B = (T - n_1 ) / (parameter_t) (N - n_0 - n_1);
+ return B;
+}
+
+/* three parameters model */
+static inline parameter_t compute_Pr_G_3(corpus_count_t k,
+ parameter_t alpha,
+ parameter_t gamma,
+ parameter_t B){
+ if ( k == 0 )
+ return 1 - alpha;
+
+ if ( k == 1 )
+ return alpha * (1 - gamma);
+
+ if ( k > 1 ) {
+ return (alpha * gamma / (B - 1)) * pow((1 - 1 / (B - 1)) , k - 2);
+ }
+
+ assert(false);
+}
+
+static inline parameter_t compute_Pr_G_3_with_count(corpus_count_t k,
+ corpus_count_t N,
+ corpus_count_t T,
+ corpus_count_t n_0,
+ corpus_count_t n_1){
+ parameter_t alpha = compute_alpha(N, n_0);
+ parameter_t gamma = compute_gamma(N, n_0, n_1);
+ parameter_t B = compute_B(N, T, n_0, n_1);
+
+ return compute_Pr_G_3(k, alpha, gamma, B);
+}
+
+/* two parameters model */
+static inline parameter_t compute_Pr_G_2(corpus_count_t k,
+ parameter_t alpha,
+ parameter_t B){
+ parameter_t gamma = 1 - 1 / (B - 1);
+ return compute_Pr_G_3(k, alpha, gamma, B);
+}
+
+static inline parameter_t compute_Pr_G_2_with_count(corpus_count_t k,
+ corpus_count_t N,
+ corpus_count_t T,
+ corpus_count_t n_0,
+ corpus_count_t n_1){
+ parameter_t alpha = compute_alpha(N, n_0);
+ parameter_t B = compute_B(N, T, n_0, n_1);
+ return compute_Pr_G_2(k, alpha, B);
+}
+
+#define K_MIXTURE_MODEL_MAGIC_NUMBER "KMMP"
+
+typedef struct{
+ /* the total number of instances of all words. */
+ guint32 m_WC;
+ /* the total number of documents. */
+ guint32 m_N;
+ /* the total freq of uni-gram. */
+ guint32 m_total_freq;
+} KMixtureModelMagicHeader;
+
+typedef struct{
+ /* the total number of instances of word W1. */
+ guint32 m_WC;
+ /* the freq of uni-gram. see m_total_freq in magic header also. */
+ guint32 m_freq;
+} KMixtureModelArrayHeader;
+
+typedef struct{
+ /* the total number of all W1,W2 word pair. */
+ guint32 m_WC;
+
+ /* the total number of instances of the word or phrase.
+ (two word phrase) */
+ /* guint32 m_T; use m_WC instead:
+ m_T is an alias of m_WC and always holds the same value. */
+
+ /* n_r: the number of documents having exactly r occurrences. */
+ /* guint32 m_n_0;
+ Note: compute this value using the following equation.
+ m_n_0 = KMixtureModelMagicHeader.m_N - m_N_n_0;
+ m_N_n_0 is the number of documents that contain the word or phrase.
+ (two word phrase) */
+ guint32 m_N_n_0;
+ guint32 m_n_1;
+
+ /* the maximum number of instances of the word or phrase
+ (two word phrase) seen in any single previous document. */
+ guint32 m_Mr;
+} KMixtureModelArrayItem;
+
+typedef FlexibleBigram<KMixtureModelMagicHeader,
+ KMixtureModelArrayHeader,
+ KMixtureModelArrayItem>
+KMixtureModelBigram;
+
+typedef FlexibleSingleGram<KMixtureModelArrayHeader,
+ KMixtureModelArrayItem>
+KMixtureModelSingleGram;
+
+typedef KMixtureModelSingleGram::ArrayItemWithToken
+KMixtureModelArrayItemWithToken;
+
+};
+
+
+#endif
diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp
new file mode 100644
index 0000000..c5a66ec
--- /dev/null
+++ b/utils/training/k_mixture_model_to_interpolation.cpp
@@ -0,0 +1,214 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+enum LINE_TYPE{
+ BEGIN_LINE = 1,
+ END_LINE,
+ GRAM_1_LINE,
+ GRAM_2_LINE,
+ GRAM_1_ITEM_LINE,
+ GRAM_2_ITEM_LINE
+};
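+
+/* This converter reads the textual k mixture model from stdin and rewrites
+ * it as an interpolation model text on stdout: the head line becomes
+ * "\data model interpolation", uni-gram items keep only token, word and
+ * count (the "<start>" token and zero-frequency entries are dropped), and
+ * bi-gram items keep the two token/word pairs plus count, discarding the
+ * k mixture specific T/N_n_0/n_1/Mr tags.
+ */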
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_headline(FILE * input, FILE * output);
+
+bool parse_unigram(FILE * input, FILE * output);
+
+bool parse_bigram(FILE * input, FILE * output);
+
+static ssize_t my_getline(FILE * input){
+ ssize_t result = getline(&linebuf, &len, input);
+ if ( result == -1 )
+ return result;
+
+ linebuf[strlen(linebuf) - 1] = '\0';
+ return result;
+}
+
+bool parse_headline(FILE * input, FILE * output) {
+ /* enter "\data" line */
+ assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model",
+ "count:N:total_freq"));
+
+ /* read "\data" line */
+ if ( !taglib_read(linebuf, line_type, values, required) ) {
+ fprintf(stderr, "error: k mixture model expected.\n");
+ return false;
+ }
+
+ assert(line_type == BEGIN_LINE);
+ TAGLIB_GET_TAGVALUE(const char *, model, (const char *));
+ if ( !( strcmp("k mixture model", model) == 0 ) ){
+ fprintf(stderr, "error: k mixture model expected.\n");
+ return false;
+ }
+
+ /* print header */
+ fprintf(output, "\\data model interpolation\n");
+
+ return true;
+}
+
+bool parse_body(FILE * input, FILE * output){
+ taglib_push_state();
+
+ assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+ assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+ assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
+ do {
+ retry:
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case END_LINE:
+ fprintf(output, "\\end\n");
+ goto end;
+ case GRAM_1_LINE:
+ fprintf(output, "\\1-gram\n");
+ my_getline(input);
+ parse_unigram(input, output);
+ goto retry;
+ case GRAM_2_LINE:
+ fprintf(output, "\\2-gram\n");
+ my_getline(input);
+ parse_bigram(input, output);
+ goto retry;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_unigram(FILE * input, FILE * output){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count"));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case GRAM_1_ITEM_LINE: {
+ /* handle \item in \1-gram */
+ TAGLIB_GET_TOKEN(token, 0);
+ TAGLIB_GET_PHRASE_STRING(word, 1);
+
+ /* remove the "<start>" token from the uni-gram section of the interpolation model */
+ if ( sentence_start == token )
+ break;
+
+ TAGLIB_GET_TAGVALUE(glong, freq, atol);
+
+ /* ignore zero unigram freq item */
+ if ( 0 != freq )
+ fprintf(output, "\\item %d %s count %ld\n", token, word, freq);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+bool parse_bigram(FILE * input, FILE * output){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4,
+ "count", "T:N_n_0:n_1:Mr"));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch (line_type) {
+ case GRAM_2_ITEM_LINE:{
+ /* handle \item in \2-gram */
+ /* two strings */
+ TAGLIB_GET_TOKEN(token1, 0);
+ TAGLIB_GET_PHRASE_STRING(word1, 1);
+
+ TAGLIB_GET_TOKEN(token2, 2);
+ TAGLIB_GET_PHRASE_STRING(word2, 3);
+
+ TAGLIB_GET_TAGVALUE(glong, count, atol);
+ fprintf(output, "\\item %d %s %d %s count %ld\n",
+ token1, word1, token2, word2, count);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ taglib_pop_state();
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ FILE * input = stdin;
+ FILE * output = stdout;
+
+ taglib_init();
+
+ values = g_ptr_array_new();
+ required = g_hash_table_new(g_str_hash, g_str_equal);
+
+ ssize_t result = my_getline(input);
+ if ( result == -1 ) {
+ fprintf(stderr, "empty file input.\n");
+ exit(ENODATA);
+ }
+
+ if (!parse_headline(input, output))
+ exit(ENODATA);
+
+ result = my_getline(input);
+ if ( result != -1 )
+ parse_body(input, output);
+
+ taglib_fini();
+
+ return 0;
+}
diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp
new file mode 100644
index 0000000..ab08010
--- /dev/null
+++ b/utils/training/merge_k_mixture_model.cpp
@@ -0,0 +1,239 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+void print_help(){
+ printf("Usage: merge_k_mixture_model [--result-file <RESULT_FILENAME>]\n");
+ printf(" {<SOURCE_FILENAME>}+\n");
+}
+
+static const gchar * result_filename = NULL;
+
+static GOptionEntry entries[] =
+{
+ {"result-file", 0, 0, G_OPTION_ARG_FILENAME, &result_filename, "merged result file", NULL},
+ {NULL}
+};
+
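+/* merge_two_phrase_array() below is a plain two-pointer merge of two
+ * token-sorted item arrays: items present in only one array are copied
+ * through unchanged, while items sharing a token get their m_WC, m_N_n_0
+ * and m_n_1 counters summed and their m_Mr set to the maximum of the two.
+ */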
+static bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first,
+ /* in */ FlexibleBigramPhraseArray second,
+ /* out */ FlexibleBigramPhraseArray & merged ){
+ /* avoid doing an empty merge. */
+ assert( NULL != first && NULL != second && NULL != merged );
+
+ /* merge two arrays. */
+ guint first_index, second_index = first_index = 0;
+ KMixtureModelArrayItemWithToken * first_item,
+ * second_item = first_item = NULL;
+ while ( first_index < first->len && second_index < second->len ){
+ first_item = &g_array_index(first, KMixtureModelArrayItemWithToken,
+ first_index);
+ second_item = &g_array_index(second, KMixtureModelArrayItemWithToken,
+ second_index);
+ if ( first_item->m_token > second_item->m_token ) {
+ g_array_append_val(merged, *second_item);
+ second_index ++;
+ } else if ( first_item->m_token < second_item->m_token ) {
+ g_array_append_val(merged, *first_item);
+ first_index ++;
+ } else /* first_item->m_token == second_item->m_token */ {
+ KMixtureModelArrayItemWithToken merged_item;
+ memset(&merged_item, 0, sizeof(KMixtureModelArrayItemWithToken));
+ merged_item.m_token = first_item->m_token;/* same as second_item */
+ merged_item.m_item.m_WC = first_item->m_item.m_WC +
+ second_item->m_item.m_WC;
+ /* merged_item.m_item.m_T = first_item->m_item.m_T +
+ second_item->m_item.m_T; */
+ merged_item.m_item.m_N_n_0 = first_item->m_item.m_N_n_0 +
+ second_item->m_item.m_N_n_0;
+ merged_item.m_item.m_n_1 = first_item->m_item.m_n_1 +
+ second_item->m_item.m_n_1;
+ merged_item.m_item.m_Mr = std_lite::max(first_item->m_item.m_Mr,
+ second_item->m_item.m_Mr);
+ g_array_append_val(merged, merged_item);
+ first_index ++; second_index ++;
+ }
+ }
+
+ /* add remained items. */
+ while ( first_index < first->len ){
+ first_item = &g_array_index(first, KMixtureModelArrayItemWithToken,
+ first_index);
+ g_array_append_val(merged, *first_item);
+ first_index++;
+ }
+
+ while ( second_index < second->len ){
+ second_item = &g_array_index(second, KMixtureModelArrayItemWithToken,
+ second_index);
+ g_array_append_val(merged, *second_item);
+ second_index++;
+ }
+
+ return true;
+}
+
+static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target,
+ /* in */ KMixtureModelBigram * new_one ){
+
+ KMixtureModelMagicHeader target_magic_header;
+ KMixtureModelMagicHeader new_magic_header;
+ KMixtureModelMagicHeader merged_magic_header;
+
+ memset(&merged_magic_header, 0, sizeof(KMixtureModelMagicHeader));
+ if (!target->get_magic_header(target_magic_header)) {
+ memset(&target_magic_header, 0, sizeof(KMixtureModelMagicHeader));
+ }
+ assert(new_one->get_magic_header(new_magic_header));
+ if ( target_magic_header.m_WC + new_magic_header.m_WC <
+ std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){
+ fprintf(stderr, "the m_WC integer in magic header overflows.\n");
+ return false;
+ }
+ if ( target_magic_header.m_total_freq + new_magic_header.m_total_freq <
+ std_lite::max( target_magic_header.m_total_freq,
+ new_magic_header.m_total_freq ) ){
+ fprintf(stderr, "the m_total_freq in magic header overflows.\n");
+ return false;
+ }
+
+ merged_magic_header.m_WC = target_magic_header.m_WC +
+ new_magic_header.m_WC;
+ merged_magic_header.m_N = target_magic_header.m_N +
+ new_magic_header.m_N;
+ merged_magic_header.m_total_freq = target_magic_header.m_total_freq +
+ new_magic_header.m_total_freq;
+
+ assert(target->set_magic_header(merged_magic_header));
+ return true;
+}
+
+static bool merge_array_items( /* in & out */ KMixtureModelBigram * target,
+ /* in */ KMixtureModelBigram * new_one ){
+
+ GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ new_one->get_all_items(new_items);
+
+ for ( size_t i = 0; i < new_items->len; ++i ){
+ phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i);
+ KMixtureModelSingleGram * target_single_gram = NULL;
+ KMixtureModelSingleGram * new_single_gram = NULL;
+
+ assert(new_one->load(*token, new_single_gram));
+ bool exists_in_target = target->load(*token, target_single_gram);
+ if ( !exists_in_target ){
+ target->store(*token, new_single_gram);
+ delete new_single_gram;
+ continue;
+ }
+
+ /* word count in array header in parallel with array items */
+ KMixtureModelArrayHeader target_array_header;
+ KMixtureModelArrayHeader new_array_header;
+ KMixtureModelArrayHeader merged_array_header;
+
+ assert(new_one->get_array_header(*token, new_array_header));
+ assert(target->get_array_header(*token, target_array_header));
+ memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader));
+
+ merged_array_header.m_WC = target_array_header.m_WC +
+ new_array_header.m_WC;
+ merged_array_header.m_freq = target_array_header.m_freq +
+ new_array_header.m_freq;
+ /* end of word count in array header computing. */
+
+ assert(NULL != target_single_gram);
+ KMixtureModelSingleGram * merged_single_gram =
+ new KMixtureModelSingleGram;
+
+ FlexibleBigramPhraseArray target_array =
+ g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ target_single_gram->retrieve_all(target_array);
+
+ FlexibleBigramPhraseArray new_array =
+ g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ new_single_gram->retrieve_all(new_array);
+ FlexibleBigramPhraseArray merged_array =
+ g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+
+ assert(merge_two_phrase_array(target_array, new_array, merged_array));
+
+ g_array_free(target_array, TRUE);
+ g_array_free(new_array, TRUE);
+ delete target_single_gram; delete new_single_gram;
+
+ for ( size_t m = 0; m < merged_array->len; ++m ){
+ KMixtureModelArrayItemWithToken * item =
+ &g_array_index(merged_array,
+ KMixtureModelArrayItemWithToken, m);
+ merged_single_gram->insert_array_item(item->m_token, item->m_item);
+ }
+
+ assert(merged_single_gram->set_array_header(merged_array_header));
+ assert(target->store(*token, merged_single_gram));
+ delete merged_single_gram;
+ g_array_free(merged_array, TRUE);
+ }
+
+ g_array_free(new_items, TRUE);
+ return true;
+}
+
+bool merge_two_k_mixture_model( /* in & out */ KMixtureModelBigram * target,
+ /* in */ KMixtureModelBigram * new_one ){
+ assert(NULL != target);
+ assert(NULL != new_one);
+ return merge_array_items(target, new_one) &&
+ merge_magic_header(target, new_one);
+}
+
+int main(int argc, char * argv[]){
+ int i = 1;
+
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- merge k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ KMixtureModelBigram target(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ target.attach(result_filename, ATTACH_READWRITE|ATTACH_CREATE);
+
+ while (i < argc){
+ const char * new_filename = argv[i];
+ KMixtureModelBigram new_one(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ new_one.attach(new_filename, ATTACH_READONLY);
+ if ( !merge_two_k_mixture_model(&target, &new_one) )
+ exit(EOVERFLOW);
+ ++i;
+ }
+
+ return 0;
+}
diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp
new file mode 100644
index 0000000..40dfb87
--- /dev/null
+++ b/utils/training/prune_k_mixture_model.cpp
@@ -0,0 +1,192 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+
+#include <errno.h>
+#include <locale.h>
+#include <limits.h>
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+
+void print_help(){
+ printf("Usage: prune_k_mixture_model -k <INT> --CDF <DOUBLE> <FILENAME>\n");
+}
+
+static gint g_prune_k = 3;
+static parameter_t g_prune_poss = 0.99;
+
+static GOptionEntry entries[] =
+{
+ {"pruneK", 'k', 0, G_OPTION_ARG_INT, &g_prune_k, "k parameter", NULL},
+ {"CDF", 0, 0, G_OPTION_ARG_DOUBLE, &g_prune_poss, "CDF parameter", NULL},
+ {NULL}
+};
+
+
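+/* Pruning criterion, roughly: for each (token1, token2) array item the
+ * code sums Pr(k) for k = 0 .. pruneK-1 with compute_Pr_G_3_with_count()
+ * and keeps remained_poss = 1 - sum, i.e. the probability of seeing the
+ * pair at least pruneK times in a document.  Items whose remaining mass
+ * falls below the --CDF threshold are removed, and the word counts in the
+ * array header and the magic header are reduced accordingly.
+ */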
+bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header,
+ KMixtureModelSingleGram * & bigram,
+ FlexibleBigramPhraseArray removed_array){
+ bool success;
+
+ FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ bigram->retrieve_all(array);
+
+ for ( size_t i = 0; i < array->len; ++i) {
+ KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i);
+ phrase_token_t token = item->m_token;
+ parameter_t remained_poss = 1; parameter_t one_poss = 0;
+ bool errors = false;
+ for ( size_t k = 0; k < g_prune_k; ++k){
+ one_poss = compute_Pr_G_3_with_count
+ (k, magic_header->m_N, item->m_item.m_WC,
+ magic_header->m_N - item->m_item.m_N_n_0,
+ item->m_item.m_n_1);
+ if ( !(0 <= one_poss && one_poss <= 1) )
+ errors = true;
+ remained_poss -= one_poss;
+ }
+
+ if ( fabs(remained_poss) < DBL_EPSILON )
+ remained_poss = 0.;
+
+ /* an invalid probability. */
+ if ( errors || !(0 <= remained_poss && remained_poss <= 1) ) {
+ fprintf(stderr, "an invalid probability was encountered:%f.\n",
+ remained_poss);
+ fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n",
+ g_prune_k, magic_header->m_N, item->m_item.m_WC,
+ magic_header->m_N - item->m_item.m_N_n_0,
+ item->m_item.m_n_1);
+ exit(EDOM);
+ }
+
+ if ( remained_poss < g_prune_poss ) {
+ /* prune this word or phrase. */
+ KMixtureModelArrayItem removed_item;
+ bigram->remove_array_item(token, removed_item);
+ assert( memcmp(&removed_item, &(item->m_item),
+ sizeof(KMixtureModelArrayItem)) == 0 );
+
+ KMixtureModelArrayItemWithToken removed_item_with_token;
+ removed_item_with_token.m_token = token;
+ removed_item_with_token.m_item = removed_item;
+ g_array_append_val(removed_array, removed_item_with_token);
+
+ KMixtureModelArrayHeader array_header;
+ bigram->get_array_header(array_header);
+ guint32 removed_count = removed_item.m_WC;
+ array_header.m_WC -= removed_count;
+ bigram->set_array_header(array_header);
+ magic_header->m_WC -= removed_count;
+ magic_header->m_total_freq -= removed_count;
+ }
+ }
+
+ return true;
+}
+
+int main(int argc, char * argv[]){
+ setlocale(LC_ALL, "");
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- prune k mixture model");
+ g_option_context_add_main_entries(context, entries, NULL);
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (2 != argc) {
+ fprintf(stderr, "wrong arguments.\n");
+ exit(EINVAL);
+ }
+
+ const gchar * bigram_filename = argv[1];
+
+ /* TODO: magic header signature check here. */
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(bigram_filename, ATTACH_READWRITE);
+
+ KMixtureModelMagicHeader magic_header;
+ if (!bigram.get_magic_header(magic_header)) {
+ fprintf(stderr, "no magic header in k mixture model.\n");
+ exit(ENODATA);
+ }
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram.get_all_items(items);
+
+ /* print prune progress */
+ size_t progress = 0; size_t onestep = items->len / 20;
+ for ( size_t i = 0; i < items->len; ++i ){
+ if ( progress >= onestep ) {
+ progress = 0; fprintf(stderr, "*");
+ }
+ progress ++;
+
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ KMixtureModelSingleGram * single_gram = NULL;
+ bigram.load(*token, single_gram);
+
+ FlexibleBigramPhraseArray removed_array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+
+ prune_k_mixture_model(&magic_header, single_gram, removed_array);
+ bigram.store(*token, single_gram);
+
+ delete single_gram;
+
+ /* post processing for unigram reduce */
+ for (size_t m = 0; m < removed_array->len; ++m ){
+ KMixtureModelArrayItemWithToken * item =
+ &g_array_index(removed_array,
+ KMixtureModelArrayItemWithToken, m);
+ KMixtureModelArrayHeader array_header;
+ assert(bigram.get_array_header(item->m_token, array_header));
+ array_header.m_freq -= item->m_item.m_WC;
+ assert(array_header.m_freq >= 0);
+ assert(bigram.set_array_header(item->m_token, array_header));
+ }
+
+ g_array_free(removed_array, TRUE);
+ removed_array = NULL;
+ }
+
+ fprintf(stderr, "\n");
+
+ bigram.set_magic_header(magic_header);
+
+ /* post processing clean up zero items */
+ KMixtureModelArrayHeader array_header;
+ for ( size_t i = 0; i < items->len; ++i ){
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ assert(bigram.get_array_header(*token, array_header));
+ if ( 0 == array_header.m_WC && 0 == array_header.m_freq )
+ assert(bigram.remove(*token));
+ }
+
+ g_array_free(items, TRUE);
+
+ return 0;
+}
diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp
new file mode 100644
index 0000000..7c057b9
--- /dev/null
+++ b/utils/training/validate_k_mixture_model.cpp
@@ -0,0 +1,174 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "pinyin_internal.h"
+#include "k_mixture_model.h"
+
+void print_help(){
+ printf("Usage: validate_k_mixture_model <FILENAME>\n");
+}
+
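+/* Consistency checks performed here: validate_unigram() verifies that the
+ * word count and the total freq in the magic header are non-zero, equal to
+ * each other, and each equal to the corresponding sum over the per-token
+ * array headers; validate_bigram() verifies for every first token that the
+ * word count in its array header matches the sum of the word counts of its
+ * array items.
+ */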
+bool validate_unigram(KMixtureModelBigram * bigram){
+ KMixtureModelMagicHeader magic_header;
+ if( !bigram->get_magic_header(magic_header) ){
+ fprintf(stderr, "no magic header in k mixture model.\n");
+ return false;
+ }
+
+ guint32 expected_word_count = magic_header.m_WC;
+ if ( 0 == expected_word_count ){
+ fprintf(stderr, "word count in magic header is unexpected zero.\n");
+ return false;
+ }
+ guint32 expected_total_freq = magic_header.m_total_freq;
+ if ( 0 == expected_total_freq ){
+ fprintf(stderr, "total freq in magic header is unexpected zero.\n");
+ return false;
+ }
+
+ if ( expected_word_count != expected_total_freq ){
+ fprintf(stderr, "the word count doesn't match the total freq.\n");
+ return false;
+ }
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram->get_all_items(items);
+
+ guint32 word_count = 0; guint32 total_freq = 0;
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ KMixtureModelArrayHeader array_header;
+ assert(bigram->get_array_header(*token, array_header));
+ word_count += array_header.m_WC;
+ total_freq += array_header.m_freq;
+ }
+
+ if ( word_count != expected_word_count ){
+ fprintf(stderr, "word count in magic header:%d\n",
+ expected_word_count);
+ fprintf(stderr, "sum of word count in array headers:%d\n", word_count);
+ fprintf(stderr, "the sum differs from word count.\n");
+ return false;
+ }
+ if ( total_freq != expected_total_freq ){
+ fprintf(stderr, "total freq in magic header:%d\n",
+ expected_total_freq);
+ fprintf(stderr, "sum of freqs in array headers:%d\n", total_freq);
+ fprintf(stderr, "the total freq differs from sum of freqs.\n");
+ return false;
+ }
+
+ g_array_free(items, TRUE);
+ return true;
+}
+
+bool validate_bigram(KMixtureModelBigram * bigram){
+ bool result = true;
+
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram->get_all_items(items);
+
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
+ KMixtureModelSingleGram * single_gram = NULL;
+ assert(bigram->load(*token, single_gram));
+
+ FlexibleBigramPhraseArray array = g_array_new
+ (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
+ single_gram->retrieve_all(array);
+
+ KMixtureModelArrayHeader array_header;
+ assert(single_gram->get_array_header(array_header));
+
+ guint32 expected_sum = array_header.m_WC;
+ guint32 freq = array_header.m_freq;
+ if ( 0 == expected_sum ){
+ if ( 0 != array->len ){
+ fprintf(stderr, "in the array header of token %d:\n", *token);
+ fprintf(stderr, "word count is zero but has array items.\n");
+ result = false;
+ }
+ if ( 0 != freq ){
+ /* free the retrieved items before skipping this token,
+ otherwise the array leaks. */
+ g_array_free(array, TRUE);
+ delete single_gram;
+ continue;
+ } else {
+ fprintf(stderr, "in the array header of token %d:\n", *token);
+ fprintf(stderr, "both word count and freq are "
+ "unexpected zero.\n");
+ result = false;
+ }
+ }
+
+ guint32 sum = 0;
+ for (size_t m = 0; m < array->len; ++m) {
+ KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m);
+
+ sum += item->m_item.m_WC;
+ }
+
+ if ( sum != expected_sum ){
+ fprintf(stderr, "word count in array header:%d\n", expected_sum);
+ fprintf(stderr, "sum of word count in array items:%d\n", sum);
+ fprintf(stderr, "the sum differs from word count.\n");
+ result = false;
+ }
+
+ g_array_free(array, TRUE);
+ delete single_gram;
+ }
+
+ g_array_free(items, TRUE);
+ return result;
+}
+
+int main(int argc, char * argv[]){
+
+ GError * error = NULL;
+ GOptionContext * context;
+
+ context = g_option_context_new("- validate k mixture model");
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_print("option parsing failed:%s\n", error->message);
+ exit(EINVAL);
+ }
+
+ if (2 != argc) {
+ fprintf(stderr, "wrong arguments.\n");
+ print_help();
+ exit(EINVAL);
+ }
+
+ const char * k_mixture_model_filename = argv[1];
+
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
+ bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
+
+ if (!validate_unigram(&bigram)) {
+ fprintf(stderr, "k mixture model validation failed.\n");
+ exit(ENODATA);
+ }
+
+ if (!validate_bigram(&bigram)) {
+ fprintf(stderr, "k mixture model validation failed.\n");
+ exit(ENODATA);
+ }
+
+ return 0;
+}
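
Note the access-mode asymmetry between the two tools: prune_k_mixture_model attaches the model with ATTACH_READWRITE, while the validator above only needs ATTACH_READONLY, so it can be pointed at a model file without risking modification. A training pipeline that wanted to embed the same checks instead of spawning this binary could wrap them as sketched below; this assumes validate_unigram()/validate_bigram() were moved into a shared header, which this patch does not do (they are file-local here).

    /* Illustrative only: run both consistency checks against one file. */
    static bool validate_k_mixture_model_file(const char * filename) {
        KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
        bigram.attach(filename, ATTACH_READONLY);

        return validate_unigram(&bigram) && validate_bigram(&bigram);
    }
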
diff --git a/utils/utils_helper.h b/utils/utils_helper.h
new file mode 100644
index 0000000..b91067b
--- /dev/null
+++ b/utils/utils_helper.h
@@ -0,0 +1,147 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef UTILS_HELPER_H
+#define UTILS_HELPER_H
+
+
+#define TAGLIB_GET_TOKEN(var, index) \
+ phrase_token_t var = null_token; \
+ { \
+ const char * string = (const char *) g_ptr_array_index \
+ (values, index); \
+ var = atoi(string); \
+ }
+
+#define TAGLIB_GET_PHRASE_STRING(var, index) \
+ const char * var = NULL; \
+ { \
+ var = (const char *) g_ptr_array_index \
+ (values, index); \
+ }
+
+#define TAGLIB_GET_TAGVALUE(type, var, conv) \
+ type var; \
+ { \
+ gpointer value = NULL; \
+ assert(g_hash_table_lookup_extended \
+ (required, #var, NULL, &value)); \
+ var = conv((const char *)value); \
+ }
+
+#define TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, var, line) \
+ phrase_token_t var = null_token; \
+ do { \
+ if (0 == strlen(line)) \
+ break; \
+ \
+ gchar ** strs = g_strsplit_set(line, " \t", 2); \
+ if (2 != g_strv_length(strs)) \
+ assert(false); \
+ \
+ phrase_token_t _token = atoi(strs[0]); \
+ const char * phrase = strs[1]; \
+ if (null_token != _token) \
+ assert(taglib_validate_token_with_string \
+ (phrase_index, _token, phrase)); \
+ \
+ var = _token; \
+ \
+ g_strfreev(strs); \
+ } while(false);
+
+
+static bool load_phrase_index(const pinyin_table_info_t * phrase_files,
+ FacadePhraseIndex * phrase_index) {
+ MemoryChunk * chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (SYSTEM_FILE != table_info->m_file_type)
+ continue;
+
+ const char * binfile = table_info->m_system_filename;
+
+ chunk = new MemoryChunk;
+ bool retval = chunk->load(binfile);
+ if (!retval) {
+ fprintf(stderr, "load %s failed!\n", binfile);
+ delete chunk;
+ return false;
+ }
+
+ phrase_index->load(i, chunk);
+ }
+ return true;
+}
+
+static bool save_phrase_index(const pinyin_table_info_t * phrase_files,
+ FacadePhraseIndex * phrase_index) {
+ MemoryChunk * new_chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (SYSTEM_FILE != table_info->m_file_type)
+ continue;
+
+ const char * binfile = table_info->m_system_filename;
+
+ new_chunk = new MemoryChunk;
+ phrase_index->store(i, new_chunk);
+ bool retval = new_chunk->save(binfile);
+ if (!retval) {
+ fprintf(stderr, "save %s failed.", binfile);
+ delete new_chunk;
+ return false;
+ }
+
+ phrase_index->load(i, new_chunk);
+ }
+ return true;
+}
+
+static bool save_dictionary(const pinyin_table_info_t * phrase_files,
+ FacadePhraseIndex * phrase_index) {
+ MemoryChunk * new_chunk = NULL;
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const pinyin_table_info_t * table_info = phrase_files + i;
+
+ if (DICTIONARY != table_info->m_file_type)
+ continue;
+
+ const char * binfile = table_info->m_system_filename;
+
+ new_chunk = new MemoryChunk;
+ phrase_index->store(i, new_chunk);
+ bool retval = new_chunk->save(binfile);
+ if (!retval) {
+ fprintf(stderr, "save %s failed.", binfile);
+ delete new_chunk;
+ return false;
+ }
+
+ phrase_index->load(i, new_chunk);
+ }
+ return true;
+}
+
+#endif
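
The TAGLIB_* macros above are deliberately unhygienic: they expand references to variables that must already exist under exactly these names in the calling scope, namely values (a GPtrArray of column strings from the tag parser) and required (a GHashTable of required tag values), plus whatever line buffer is handed to TAGLIB_PARSE_SEGMENTED_LINE. A hypothetical caller, shown only to illustrate the expected scope (the function name and column layout are made up, not part of the patch):

    /* Hypothetical usage sketch for the TAGLIB_* macros; "values" and
     * "required" must be in scope under exactly those names, since the
     * macros reference them directly rather than taking them as arguments. */
    static void example_tag_handler(FacadePhraseIndex * phrase_index,
                                    GPtrArray * values, GHashTable * required,
                                    const char * line) {
        TAGLIB_GET_TOKEN(token, 0);                 /* token <- atoi(values[0]) */
        TAGLIB_GET_PHRASE_STRING(phrase_string, 1); /* phrase_string <- values[1] */
        TAGLIB_GET_TAGVALUE(glong, count, atol);    /* count <- atol(required["count"]) */

        /* parse "token<TAB>phrase" and cross-check it against the phrase index. */
        TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, header_token, line);

        printf("%u %s %ld %u\n", token, phrase_string, count, header_token);
    }
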