diff options
author | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
commit | b78429d78df745dd327b6dada6b9bd71ea5df84e (patch) | |
tree | 82c4625db8674c66d69fd566fce8efc347e3cb3a | |
download | libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.gz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.xz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.zip |
import libpinyin code
144 files changed, 29200 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1d8db03 --- /dev/null +++ b/.gitignore @@ -0,0 +1,85 @@ +configure +Makefile +Makefile.in +aclocal.m4 +config.log +INSTALL +autom4te.cache +config.guess +config.h +config.h.in +config.status +config.sub +depcomp +install-sh +libtool +ltmain.sh +missing +stamp-h1 +libltdl +*~ +*.o +*.lo +*.pyc +.deps +.libs +tags +TAGS +cscope.* +*.la +libpinyin.pc +libpinyin.spec +libpinyin.so* +src/lookup/liblookup.a +src/storage/libstorage.a +tests/include/test_memory_chunk +tests/lookup/test_phrase_lookup +tests/storage/test_flexible_ngram +tests/storage/test_ngram +tests/storage/test_parser +tests/storage/test_parser2 +tests/storage/test_phrase_index +tests/storage/test_phrase_index_logger +tests/storage/test_phrase_table +tests/test_chewing +tests/test_phrase +tests/test_pinyin +tests/lookup/test_pinyin_lookup +tests/storage/test_chewing_table +tests/storage/test_pinyin_table +utils/segment/ngseg +utils/segment/spseg +utils/storage/export_interpolation +utils/storage/gen_binary_files +utils/storage/gen_pinyin_table +utils/storage/gen_chewing_table +utils/storage/gen_zhuyin_map +utils/storage/import_interpolation +utils/training/estimate_interpolation +utils/training/estimate_k_mixture_model +utils/training/eval_correction_rate +utils/training/export_k_mixture_model +utils/training/gen_deleted_ngram +utils/training/gen_k_mixture_model +utils/training/gen_ngram +utils/training/gen_unigram +utils/training/import_k_mixture_model +utils/training/k_mixture_model_to_interpolation +utils/training/merge_k_mixture_model +utils/training/prune_k_mixture_model +utils/training/validate_k_mixture_model +data/bigram.db +data/gb_char.bin +data/gb_char.dbin +data/gb_char.table +data/gbk_char.bin +data/gbk_char.dbin +data/gbk_char.table +data/interpolation.text +data/phrase_index.bin +data/pinyin_index.bin +data/user.db +CMakeFiles +CMakeCache.txt +cmake_install.cmake +CTestTestfile.cmake @@ -0,0 +1,2 @@ +James Su 
2002,2003,2006 <suzhe@tsinghua.edu.cn> +Peng Wu 2006-2007 2010-2011 <alexepico@gmail.com> diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..09421f6 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,151 @@ +## Copyright (C) 2011 BYVoid +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +######## Project settings +cmake_minimum_required(VERSION 2.8) +set (PACKAGE_NAME libpinyin) +project (${PACKAGE_NAME} CXX C) +enable_testing() + +######## Package information +set (PACKAGE_URL https://github.com/libpinyin/libpinyin) +set (PACKAGE_BUGREPORT https://github.com/libpinyin/libpinyin/issues) +set (LIBPINYIN_VERSION_MAJOR 0) +set (LIBPINYIN_VERSION_MINOR 7) +set (LIBPINYIN_VERSION_REVISION 0) +set (LIBPINYIN_BINARY_VERSION 2.0) + +if (CMAKE_BUILD_TYPE MATCHES Debug) + set (version_suffix .Debug) +endif (CMAKE_BUILD_TYPE MATCHES Debug) + +set ( + LIBPINYIN_VERSION + ${LIBPINYIN_VERSION_MAJOR}.${LIBPINYIN_VERSION_MINOR}.${LIBPINYIN_VERSION_REVISION}${version_suffix} +) + +set (VERSION ${LIBPINYIN_VERSION}) + +######## Validation + +include(CheckIncludeFileCXX) +check_include_file_cxx(locale.h HAVE_LOCALE_H) +check_include_file_cxx(libintl.h HAVE_LIBINTL_H) +check_include_file_cxx(stdlib.h HAVE_STDLIB_H) +check_include_file_cxx(string.h HAVE_STRING_H) +check_include_file_cxx(sys/time.h 
HAVE_SYS_TIME_H) +check_include_file_cxx(unistd.h HAVE_UNISTD_H) + +include(CheckFunctionExists) +check_function_exists(gettimeofday HAVE_GETTIMEOFDAY) +check_function_exists(malloc HAVE_MALLOC) +check_function_exists(memcmp HAVE_MEMCMP) +check_function_exists(memmove HAVE_MEMMOVE) +check_function_exists(memset HAVE_MEMSET) +check_function_exists(realloc HAVE_REALLOC) +check_function_exists(setlocale HAVE_SETLOCALE) +check_function_exists(stat HAVE_STAT) + +include(CheckTypeSize) +check_type_size(size_t SIZE_OF_SIZE_T) + +set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) +find_package(GLIB2 REQUIRED) +find_package(BerkeleyDB REQUIRED) + +######## Windows + +if (WIN32) + set(CMAKE_SHARED_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX}) + set(CMAKE_STATIC_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX}) +endif (WIN32) + +######## Directory + +set (DIR_PREFIX ${CMAKE_INSTALL_PREFIX}) +set (DIR_LIBRARY ${DIR_PREFIX}/${CMAKE_SHARED_LIBRARY_PREFIX}) +set (DIR_LIBRARY_STATIC ${DIR_PREFIX}/${CMAKE_STATIC_LIBRARY_PREFIX}) +set (DIR_INCLUDE ${DIR_PREFIX}/include) +set (DIR_SHARE ${DIR_PREFIX}/share) +set (DIR_BIN ${DIR_PREFIX}/bin) +set (DIR_ETC ${DIR_PREFIX}/etc) + +if (DEFINED CMAKE_INSTALL_LIBDIR) + set (DIR_LIBRARY ${CMAKE_INSTALL_LIBDIR}) + set (DIR_LIBRARY_STATIC ${CMAKE_INSTALL_LIBDIR}) +endif (DEFINED CMAKE_INSTALL_LIBDIR) + +if (DEFINED SHARE_INSTALL_PREFIX) + set (DIR_SHARE ${SHARE_INSTALL_PREFIX}) +endif (DEFINED SHARE_INSTALL_PREFIX) + +if (DEFINED INCLUDE_INSTALL_DIR) + set (DIR_INCLUDE ${INCLUDE_INSTALL_DIR}) +endif (DEFINED INCLUDE_INSTALL_DIR) + +if (DEFINED SYSCONF_INSTALL_DIR) + set (DIR_ETC ${SYSCONF_INSTALL_DIR}) +endif (DEFINED SYSCONF_INSTALL_DIR) + +set (DIR_SHARE_LIBPINYIN ${DIR_SHARE}/libpinyin) +set (DIR_INCLUDE_LIBPINYIN ${DIR_INCLUDE}/libpinyin-${LIBPINYIN_BINARY_VERSION}) + +######## Configuration + +set (prefix ${DIR_PREFIX}) +set (exec_prefix ${DIR_PREFIX}) +set (libdir ${DIR_LIBRARY}) +set (includedir ${DIR_INCLUDE}) +set (datadir ${DIR_SHARE}) + 
+configure_file( + libpinyin.pc.in + libpinyin.pc + @ONLY +) + +install( + FILES + ${CMAKE_BINARY_DIR}/libpinyin.pc + DESTINATION + ${DIR_LIBRARY}/pkgconfig +) + +######## Definition + +if (CMAKE_BUILD_TYPE MATCHES Debug) + add_definitions( + -O0 + -g3 + ) +endif (CMAKE_BUILD_TYPE MATCHES Debug) + +include_directories( + ${GLIB2_INCLUDE_DIR} + ${PROJECT_SOURCE_DIR}/src + ${PROJECT_SOURCE_DIR}/src/include + ${PROJECT_SOURCE_DIR}/src/storage + ${PROJECT_SOURCE_DIR}/src/lookup + ${PROJECT_SOURCE_DIR}/utils + ${PROJECT_SOURCE_DIR}/tests +) + +######## Subdirectories + +add_subdirectory(src) +add_subdirectory(tests) +add_subdirectory(utils) +add_subdirectory(data) @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. 
+ + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. 
+ +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..f773af9 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,37 @@ +version 0.6.91 +* support ucs4 characters; +* support guess sentence with prefix; +* initially support fuzzy pinyin segment. + +version 0.6.0 +* the first official release of 0.6.x. + +version 0.5.92 +* fixes new parsers and chewing large table; +* improves pinyin_save. + +version 0.5.91 +* some code re-factor and simplify; +* fixes the self-learning work around. + +version 0.5.0 +* the first official release of 0.5.x. + +version 0.4.93 +* fixes some bugs in new parsers. + +version 0.4.92 +* enable parallel make. + +version 0.4.91 +* New parsers for full pinyin/double pinyin/chewing. + * libpinyin now fully supports all pinyin auto corrections in +ibus-pinyin; + * libpinyin now better supports an/ang, en/eng, in/ing fuzzy +pinyin match. + +version 0.3.0 +* the first official release of 0.3.x. + +version 0.2.99 +* import from pinyin. diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..aac12f0 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,30 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. 
+## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +EXTRA_DIST = COPYING + +AUTOMAKE_OPTIONS = gnu +SUBDIRS = src tests utils data doc + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I . + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = libpinyin.pc @@ -0,0 +1,4 @@ +libpinyin +Library to deal with pinyin. + +The libpinyin project aims to provide the algorithms core for intelligent sentence-based Chinese pinyin input methods. diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 0000000..0eb6f85 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# Run this to generate all the initial makefiles, etc. + +srcdir=`dirname $0` +test -z "$srcdir" && srcdir=. + +PKG_NAME="libpinyin" + +(test -f $srcdir/configure.ac \ + && test -f $srcdir/README ) || { + echo -n "**Error**: Directory "\`$srcdir\'" does not look like the" + echo " top-level $PKG_NAME directory" + exit 1 +} + +which gnome-autogen.sh || { + echo "You need to install gnome-common from the GNOME CVS" + exit 1 +} + +(test -f $srcdir/ChangeLog) || { + touch $srcdir/ChangeLog +} + +CFLAGS=${CFLAGS-"-Wall -Werror"} + +ACLOCAL_FLAGS="$ACLOCAL_FLAGS" +REQUIRED_AUTOMAKE_VERSION=1.8 + +. 
gnome-autogen.sh "$@" diff --git a/cmake/FindBerkeleyDB.cmake b/cmake/FindBerkeleyDB.cmake new file mode 100644 index 0000000..749f166 --- /dev/null +++ b/cmake/FindBerkeleyDB.cmake @@ -0,0 +1,25 @@ +# - Try to find Berkeley DB +# Once done this will define +# +# BERKELEY_DB_FOUND - system has Berkeley DB +# BERKELEY_DB_INCLUDE_DIR - the Berkeley DB include directory +# BERKELEY_DB_LIBRARIES - Link these to use Berkeley DB +# BERKELEY_DB_DEFINITIONS - Compiler switches required for using Berkeley DB + +# Copyright (c) 2006, Alexander Dymo, <adymo@kdevelop.org> +# +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. + +FIND_PATH(BERKELEY_DB_INCLUDE_DIR db.h + /usr/include/db4 + /usr/local/include/db4 +) + +FIND_LIBRARY(BERKELEY_DB_LIBRARIES NAMES db ) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Berkeley "Could not find Berkeley DB >= 4.1" BERKELEY_DB_INCLUDE_DIR BERKELEY_DB_LIBRARIES) +# show the BERKELEY_DB_INCLUDE_DIR and BERKELEY_DB_LIBRARIES variables only in the advanced view +MARK_AS_ADVANCED(BERKELEY_DB_INCLUDE_DIR BERKELEY_DB_LIBRARIES ) + diff --git a/cmake/FindGLIB2.cmake b/cmake/FindGLIB2.cmake new file mode 100644 index 0000000..8c55991 --- /dev/null +++ b/cmake/FindGLIB2.cmake @@ -0,0 +1,53 @@ +# - Try to find the GLIB2 libraries +# Once done this will define +# +# GLIB2_FOUND - system has glib2 +# GLIB2_INCLUDE_DIR - the glib2 include directory +# GLIB2_LIBRARIES - glib2 library + +# Copyright (c) 2008 Laurent Montel, <montel@kde.org> +# +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+ + +if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) + # Already in cache, be silent + set(GLIB2_FIND_QUIETLY TRUE) +endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) + +find_package(PkgConfig) +pkg_check_modules(PC_LibGLIB2 QUIET glib-2.0) + +find_path(GLIB2_MAIN_INCLUDE_DIR + NAMES glib.h + HINTS ${PC_LibGLIB2_INCLUDEDIR} + PATH_SUFFIXES glib-2.0) + +find_library(GLIB2_LIBRARY + NAMES glib-2.0 + HINTS ${PC_LibGLIB2_LIBDIR} +) + +set(GLIB2_LIBRARIES ${GLIB2_LIBRARY}) + +# search the glibconfig.h include dir under the same root where the library is found +get_filename_component(glib2LibDir "${GLIB2_LIBRARIES}" PATH) + +find_path(GLIB2_INTERNAL_INCLUDE_DIR glibconfig.h + PATH_SUFFIXES glib-2.0/include + HINTS ${PC_LibGLIB2_INCLUDEDIR} "${glib2LibDir}" ${CMAKE_SYSTEM_LIBRARY_PATH}) + +set(GLIB2_INCLUDE_DIR "${GLIB2_MAIN_INCLUDE_DIR}") + +# not sure if this include dir is optional or required +# for now it is optional +if(GLIB2_INTERNAL_INCLUDE_DIR) + set(GLIB2_INCLUDE_DIR ${GLIB2_INCLUDE_DIR} "${GLIB2_INTERNAL_INCLUDE_DIR}") +endif(GLIB2_INTERNAL_INCLUDE_DIR) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GLIB2 DEFAULT_MSG GLIB2_LIBRARIES GLIB2_MAIN_INCLUDE_DIR) + +mark_as_advanced(GLIB2_INCLUDE_DIR GLIB2_LIBRARIES) + diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..34fe28a --- /dev/null +++ b/configure.ac @@ -0,0 +1,105 @@ +# -*- Autoconf -*- +# Process this file with autoconf to produce a configure script. 
+ + +m4_define([libpinyin_major_version], [0]) +m4_define([libpinyin_minor_version], [9]) +m4_define([libpinyin_micro_version], [93]) +m4_define([libpinyin_abi_current], [4]) +m4_define([libpinyin_abi_revision], [0]) + +m4_define([libpinyin_version], + [libpinyin_major_version.libpinyin_minor_version.libpinyin_micro_version]) +m4_define([libpinyin_binary_version], + [libpinyin_abi_current.libpinyin_abi_revision]) + +AC_PREREQ(2.60) +AC_INIT([libpinyin], [libpinyin_version], [https://github.com/libpinyin/libpinyin/issues/new]) +AM_INIT_AUTOMAKE +AC_CONFIG_SRCDIR([config.h.in]) +AC_CONFIG_HEADER([config.h]) +m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])]) + +# Define a string for binary compatibility +m4_define([lt_current], [libpinyin_abi_current]) +m4_define([lt_revision], [libpinyin_abi_revision]) +LT_VERSION_INFO="lt_current:lt_revision" +AC_SUBST(LT_VERSION_INFO) + +LIBPINYIN_BINARY_VERSION="libpinyin_binary_version" +AC_SUBST(LIBPINYIN_BINARY_VERSION) + +# Checks for programs. +AC_PROG_CXX +AC_PROG_CC +AC_PROG_CPP +AC_PROG_INSTALL +AC_PROG_LN_S +AC_PROG_MAKE_SET + +AC_GNU_SOURCE + +# Init libtool +AC_PROG_LIBTOOL +AC_SUBST(LIBTOOL_DEPS) + +# libtool option to control which symbols are exported +# right now, symbols starting with _ are not exported +LIBTOOL_EXPORT_OPTIONS='-export-symbols-regex "^[[^_]].*"' +AC_SUBST(LIBTOOL_EXPORT_OPTIONS) + +# Checks for libraries. +PKG_CHECK_MODULES(GLIB2, [glib-2.0 >= 2.4.0]) + +# Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS([locale.h stdlib.h string.h sys/time.h unistd.h]) + +# Checks for typedefs, structures, and compiler characteristics. +AC_HEADER_STDBOOL +AC_C_CONST +AC_C_INLINE +AC_TYPE_SIZE_T +AC_HEADER_TIME + +# Checks for library functions. 
+AC_FUNC_MALLOC +AC_FUNC_MEMCMP +AC_FUNC_REALLOC +AC_FUNC_STAT +AC_FUNC_MMAP +AC_CHECK_FUNCS([gettimeofday memmove memset setlocale]) + +AC_CHECK_HEADERS([libintl.h string.h]) + +AC_CHECK_HEADER([db.h], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4])) + +AC_SEARCH_LIBS([db_create], [db], [], AC_MSG_ERROR([Cannot find Berkeley DB library version 4])) + + +AC_CONFIG_FILES([libpinyin.pc + libpinyin.spec + Makefile + doc/Makefile + data/Makefile + src/Makefile + src/include/Makefile + src/storage/Makefile + src/lookup/Makefile + tests/Makefile + tests/include/Makefile + tests/storage/Makefile + tests/lookup/Makefile + utils/Makefile + utils/storage/Makefile + utils/segment/Makefile + utils/training/Makefile +]) + +AC_OUTPUT + +AC_MSG_RESULT([ +Build options: + Version $VERSION + Install prefix $prefix +]) diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt new file mode 100644 index 0000000..7301279 --- /dev/null +++ b/data/CMakeLists.txt @@ -0,0 +1,95 @@ +set( + BINARY_MODEL_DATA + gb_char.bin + gbk_char.bin + phrase_index.bin + pinyin_index.bin + bigram.db +) + +set( + BINARY_MODEL_DATA_FILES + ${CMAKE_BINARY_DIR}/data/gb_char.bin + ${CMAKE_BINARY_DIR}/data/gbk_char.bin + ${CMAKE_BINARY_DIR}/data/phrase_index.bin + ${CMAKE_BINARY_DIR}/data/pinyin_index.bin + ${CMAKE_BINARY_DIR}/data/bigram.db +) + +set( + gen_binary_files_BIN + ${CMAKE_BINARY_DIR}/utils/storage/gen_binary_files +) + +set( + import_interpolation_BIN + ${CMAKE_BINARY_DIR}/utils/storage/import_interpolation +) + +set( + gen_unigram_BIN + ${CMAKE_BINARY_DIR}/utils/training/gen_unigram +) + +add_custom_target( + data + ALL + DEPENDS + ${BINARY_MODEL_DATA} +) + +add_custom_command( + OUTPUT + ${CMAKE_SOURCE_DIR}/data/gb_char.table + ${CMAKE_SOURCE_DIR}/data/gbk_char.table + ${CMAKE_SOURCE_DIR}/data/interpolation2.text + COMMENT + "Downloading textual model data..." 
+ COMMAND + wget http://downloads.sourceforge.net/libpinyin/models/model5.text.tar.gz + COMMAND + tar xvf model5.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data +) + +add_custom_command( + OUTPUT + gb_char.bin + gbk_char.bin + phrase_index.bin + pinyin_index.bin + COMMENT + "Building binary model data..." + COMMAND + ${gen_binary_files_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data + DEPENDS + gen_binary_files + ${CMAKE_SOURCE_DIR}/data/gb_char.table + ${CMAKE_SOURCE_DIR}/data/gbk_char.table +) + +add_custom_command( + OUTPUT + bigram.db + COMMENT + "Building binary bigram data..." + COMMAND + ${import_interpolation_BIN} < ${CMAKE_SOURCE_DIR}/data/interpolation2.text + COMMAND + ${gen_unigram_BIN} + DEPENDS + import_interpolation + ${CMAKE_SOURCE_DIR}/data/interpolation2.text +) + +install( + FILES + ${BINARY_MODEL_DATA_FILES} + DESTINATION + ${DIR_SHARE_LIBPINYIN}/data +) + +set_directory_properties( + PROPERTIES + ADDITIONAL_MAKE_CLEAN_FILES + ${BINARY_MODEL_DATA_FILES} +) diff --git a/data/Makefile.am b/data/Makefile.am new file mode 100644 index 0000000..c75fd95 --- /dev/null +++ b/data/Makefile.am @@ -0,0 +1,67 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2011 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + +tablefiles = gb_char.table gbk_char.table \ + merged.table \ + art.table culture.table economy.table \ + geology.table history.table life.table \ + nature.table scitech.table \ + society.table sport.table + +binfiles = ${tablefiles:.table=.bin} + + +textual_model_data = interpolation2.text \ + $(tablefiles) + + +binary_model_data = phrase_index.bin pinyin_index.bin \ + bigram.db \ + $(binfiles) + + +MAINTAINERCLEANFILES = Makefile.in + +EXTRA_DIST = $(textual_model_data) \ + table.conf + +libpinyin_db_DATA = $(binary_model_data) \ + table.conf + +libpinyin_dbdir = $(libdir)/libpinyin/data + +CLEANFILES = $(binary_model_data) + +interpolation2.text: + wget http://downloads.sourceforge.net/libpinyin/models/model6.text.tar.gz + tar xvf model6.text.tar.gz -C $(top_srcdir)/data + + +$(tablefiles): interpolation2.text + +bigram.db: $(textual_model_data) + $(RM) $(binary_model_data) + ../utils/storage/gen_binary_files --table-dir $(top_srcdir)/data + ../utils/storage/import_interpolation --table-dir $(top_srcdir)/data < $(top_srcdir)/data/interpolation2.text + ../utils/training/gen_unigram --table-dir $(top_srcdir)/data + +phrase_index.bin pinyin_index.bin $(binfiles): bigram.db + +modify: + git reset --hard + sed -i -r -e "s'lambda parameter:0\\.[0-9]{3,6}'lambda parameter:$(LAMBDA_PARAMETER)'" table.conf diff --git a/data/table.conf b/data/table.conf new file mode 100644 index 0000000..096907c --- /dev/null +++ b/data/table.conf @@ -0,0 +1,17 @@ +binary format version:3 +model data version:6 +lambda parameter:0.276607 + +4 art.table art.bin art.dbin DICTIONARY +5 culture.table culture.bin culture.dbin DICTIONARY +6 economy.table economy.bin economy.dbin DICTIONARY +7 geology.table geology.bin geology.dbin DICTIONARY +8 history.table history.bin history.dbin DICTIONARY + +9 life.table life.bin life.dbin DICTIONARY +10 nature.table nature.bin nature.dbin DICTIONARY +11 scitech.table scitech.bin scitech.dbin DICTIONARY +12 society.table society.bin society.dbin 
DICTIONARY +13 sport.table sport.bin sport.dbin DICTIONARY + +14 NULL NULL network.bin USER_FILE
\ No newline at end of file diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 0000000..358100e --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,24 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +man_MANS = libpinyin.1 \ + gen_binary_files.1 \ + import_interpolation.1 \ + gen_unigram.1 + +EXTRA_DIST = $(man_MANS) diff --git a/doc/gen_binary_files.1 b/doc/gen_binary_files.1 new file mode 100644 index 0000000..394a953 --- /dev/null +++ b/doc/gen_binary_files.1 @@ -0,0 +1 @@ +.so man1/libpinyin.1 diff --git a/doc/gen_unigram.1 b/doc/gen_unigram.1 new file mode 100644 index 0000000..394a953 --- /dev/null +++ b/doc/gen_unigram.1 @@ -0,0 +1 @@ +.so man1/libpinyin.1 diff --git a/doc/import_interpolation.1 b/doc/import_interpolation.1 new file mode 100644 index 0000000..394a953 --- /dev/null +++ b/doc/import_interpolation.1 @@ -0,0 +1 @@ +.so man1/libpinyin.1 diff --git a/doc/libpinyin.1 b/doc/libpinyin.1 new file mode 100644 index 0000000..419ef90 --- /dev/null +++ b/doc/libpinyin.1 @@ -0,0 +1,38 @@ +.TH LIBPINYIN "1" "Feb 2012" "libpinyin" "User Commands" + +.SH NAME +libpinyin \- Library to deal with pinyin + +.SH DESCRIPTION +The libpinyin project aims to provide the algorithms 
core for intelligent sentence-based Chinese pinyin input methods. + +.SH TOOLS +gen_binary_files \- generate initial binary pinyin libraries +import_interpolation \- import libpinyin textual format model data +gen_unigram \- increase the unigram frequency for all phrases + +.SH USAGE +.HP +gen_binary_files --table-dir <DIRNAME> +.RS +.HP +.B --table-dir +Read textual format files from the <DIRNAME> directory. +.RE +.HP +import_interpolation \< <MODELFILE> +.HP +gen_unigram + +.SH EXAMPLE +Download the model.text.tar.gz, and extract all files into a folder, then run the commands below to generate the binary model data. + +.RS +rm gb_char.bin gbk_char.bin phrase_index.bin pinyin_index.bin bigram.db + +gen_binary_files --table-dir ../data + +import_interpolation < ../data/interpolation.text + +gen_unigram +.RE diff --git a/libpinyin.pc.in b/libpinyin.pc.in new file mode 100644 index 0000000..ea08282 --- /dev/null +++ b/libpinyin.pc.in @@ -0,0 +1,15 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ +pkgdatadir=@libdir@/libpinyin + +libpinyinincludedir=${includedir}/libpinyin-@VERSION@ +libpinyin_binary_version=@LIBPINYIN_BINARY_VERSION@ + +Name: libpinyin +Description: Library to deal with pinyin +Version: @VERSION@ +Requires: glib-2.0 +Libs: -L${libdir} -lpinyin +Cflags: -I${libpinyinincludedir} diff --git a/libpinyin.spec.in b/libpinyin.spec.in new file mode 100644 index 0000000..00be0d0 --- /dev/null +++ b/libpinyin.spec.in @@ -0,0 +1,121 @@ +Name: libpinyin +Version: @VERSION@ +Release: 1%{?dist} +Summary: Library to deal with pinyin + +License: GPLv2+ +URL: https://github.com/libpinyin/libpinyin +Source0: http://downloads.sourceforge.net/libpinyin/libpinyin/%{name}-%{version}.tar.gz + +BuildRequires: db4-devel, glib2-devel +Requires: %{name}-data%{?_isa} = %{version}-%{release} + +%description +The libpinyin project aims to provide the algorithms core +for intelligent sentence-based Chinese pinyin input methods. 
+ + +%package devel +Summary: Development files for %{name} +Requires: %{name} = %{version}-%{release} + +%description devel +The %{name}-devel package contains libraries and header files for +developing applications that use %{name}. + + +%package data +Summary: Data files for %{name} +Requires: %{name} = %{version}-%{release} + +%description data +The %{name}-data package contains data files. + + +%package tools +Summary: Tools for %{name} +Requires: %{name} = %{version}-%{release} + +%description tools +The %{name}-tools package contains tools. + + +%prep +%setup -q + + +%build +%configure --disable-static +make %{?_smp_mflags} + +%install +make install DESTDIR=$RPM_BUILD_ROOT +find $RPM_BUILD_ROOT -name '*.la' -exec rm -f {} ';' + + +%post -p /sbin/ldconfig + +%postun -p /sbin/ldconfig + + +%files +%doc AUTHORS COPYING README +%{_libdir}/*.so.* +%dir %{_datadir}/libpinyin + +%files devel +%doc +%dir %{_includedir}/libpinyin-@VERSION@ +%{_includedir}/libpinyin-@VERSION@/* +%{_libdir}/*.so +%{_libdir}/pkgconfig/libpinyin.pc + +%files data +%doc +%{_datadir}/libpinyin/data + +%files tools +%{_bindir}/gen_binary_files +%{_bindir}/import_interpolation +%{_bindir}/gen_unigram +%{_mandir}/man1/*.1.* + +%changelog +* Thu May 24 2012 Peng Wu <pwu@redhat.com> - 0.6.91-1 +- Update to 0.6.91 + +* Mon Feb 13 2012 Peng Wu <pwu@redhat.com> - 0.5.91-1 +- Update to 0.5.91 + +* Wed Jan 18 2012 Peng Wu <pwu@redhat.com> - 0.5.0-1 +- Update to 0.5.0 + +* Fri Jan 13 2012 Peng Wu <pwu@redhat.com> - 0.4.93-1 +- Update to 0.4.93 + +* Mon Jan 09 2012 Peng Wu <pwu@redhat.com> - 0.4.92-2 +- Split tools sub package + +* Thu Dec 29 2011 Peng Wu <pwu@redhat.com> - 0.4.92-1 +- Update to 0.4.92 + +* Tue Dec 27 2011 Peng Wu <pwu@redhat.com> - 0.4.91-1 +- Update to 0.4.91 + +* Fri Nov 18 2011 Peng Wu <pwu@redhat.com> - 0.3.0-1 +- Update to 0.3.0 + +* Thu Nov 03 2011 Peng Wu <pwu@redhat.com> - 0.2.99.3-1 +- Update to 0.2.99.3 + +* Tue Oct 11 2011 Peng Wu <pwu@redhat.com> - 0.2.99.2-1 +- Update to 
0.2.99.2 + +* Wed Sep 28 2011 Peng Wu <pwu@redhat.com> - 0.2.99.1-1 +- Update to 0.2.99.1 + +* Thu Sep 08 2011 Peng Wu <pwu@redhat.com> - 0.2.99-2 +- Split data sub package + +* Wed Aug 31 2011 Peng Wu <alexepico@gmail.com> - 0.2.99-1 +- Initial version diff --git a/scripts/Makefile.data b/scripts/Makefile.data new file mode 100644 index 0000000..7929e97 --- /dev/null +++ b/scripts/Makefile.data @@ -0,0 +1,15 @@ +all: pinyins.txt + + +pinyins.txt: + python3 genpinyins.py + + +update-header: + python3 genpinyinheader.py > ../src/storage/pinyin_parser_table.h + python3 gendoublepinyinheader.py > ../src/storage/double_pinyin_table.h + python3 genbopomofoheader.py > ../src/storage/chewing_table.h + python3 genchewingkey.py > ../src/storage/chewing_enum.h + + +.PHONY: pinyins.txt diff --git a/scripts/bopomofo.py b/scripts/bopomofo.py new file mode 100644 index 0000000..91a8744 --- /dev/null +++ b/scripts/bopomofo.py @@ -0,0 +1,530 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (c) 2010 BYVoid <byvoid1@gmail.com> +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + +BOPOMOFO_PINYIN_MAP = { + "ㄅ" : "b", + "ㄅㄚ" : "ba", + "ㄅㄛ" : "bo", + "ㄅㄞ" : "bai", + "ㄅㄟ" : "bei", + "ㄅㄠ" : "bao", + "ㄅㄢ" : "ban", + "ㄅㄣ" : "ben", + "ㄅㄤ" : "bang", + "ㄅㄥ" : "beng", + "ㄅㄧ" : "bi", + "ㄅㄧㄝ" : "bie", + "ㄅㄧㄠ" : "biao", + "ㄅㄧㄢ" : "bian", + "ㄅㄧㄣ" : "bin", + "ㄅㄧㄥ" : "bing", + "ㄅㄨ" : "bu", + "ㄆ" : "p", + "ㄆㄚ" : "pa", + "ㄆㄛ" : "po", + "ㄆㄞ" : "pai", + "ㄆㄟ" : "pei", + "ㄆㄠ" : "pao", + "ㄆㄡ" : "pou", + "ㄆㄢ" : "pan", + "ㄆㄣ" : "pen", + "ㄆㄤ" : "pang", + "ㄆㄥ" : "peng", + "ㄆㄧ" : "pi", + "ㄆㄧㄝ" : "pie", + "ㄆㄧㄠ" : "piao", + "ㄆㄧㄢ" : "pian", + "ㄆㄧㄣ" : "pin", + "ㄆㄧㄥ" : "ping", + "ㄆㄨ" : "pu", + "ㄇ" : "m", + "ㄇㄚ" : "ma", + "ㄇㄛ" : "mo", + "ㄇㄜ" : "me", + "ㄇㄞ" : "mai", + "ㄇㄟ" : "mei", + "ㄇㄠ" : "mao", + "ㄇㄡ" : "mou", + "ㄇㄢ" : "man", + "ㄇㄣ" : "men", + "ㄇㄤ" : "mang", + "ㄇㄥ" : "meng", + "ㄇㄧ" : "mi", + "ㄇㄧㄝ" : "mie", + "ㄇㄧㄠ" : "miao", + "ㄇㄧㄡ" : "miu", + "ㄇㄧㄢ" : "mian", + "ㄇㄧㄣ" : "min", + "ㄇㄧㄥ" : "ming", + "ㄇㄨ" : "mu", + "ㄈ" : "f", + "ㄈㄚ" : "fa", + "ㄈㄛ" : "fo", + "ㄈㄜ" : "fe", + "ㄈㄟ" : "fei", + "ㄈㄡ" : "fou", + "ㄈㄢ" : "fan", + "ㄈㄣ" : "fen", + "ㄈㄤ" : "fang", + "ㄈㄥ" : "feng", + "ㄈㄨ" : "fu", + "ㄉ" : "d", + "ㄉㄚ" : "da", + "ㄉㄜ" : "de", + "ㄉㄞ" : "dai", + "ㄉㄟ" : "dei", + "ㄉㄠ" : "dao", + "ㄉㄡ" : "dou", + "ㄉㄢ" : "dan", + "ㄉㄣ" : "den", + "ㄉㄤ" : "dang", + "ㄉㄥ" : "deng", + "ㄉㄧ" : "di", + "ㄉㄧㄚ" : "dia", + "ㄉㄧㄝ" : "die", + "ㄉㄧㄠ" : "diao", + "ㄉㄧㄡ" : "diu", + "ㄉㄧㄢ" : "dian", + "ㄉㄧㄣ" : "din", + "ㄉㄧㄥ" : "ding", + "ㄉㄨ" : "du", + "ㄉㄨㄛ" : "duo", + "ㄉㄨㄟ" : "dui", + "ㄉㄨㄢ" : "duan", + "ㄉㄨㄣ" : "dun", + "ㄉㄨㄥ" : "dong", + "ㄊ" : "t", + "ㄊㄚ" : "ta", + "ㄊㄜ" : "te", + "ㄊㄞ" : "tai", + "ㄊㄠ" : "tao", + "ㄊㄡ" : "tou", + "ㄊㄢ" : "tan", + "ㄊㄤ" : "tang", + "ㄊㄥ" : "teng", + "ㄊㄧ" : "ti", + "ㄊㄧㄝ" : "tie", + "ㄊㄧㄠ" : "tiao", + "ㄊㄧㄢ" : "tian", + "ㄊㄧㄥ" : "ting", + "ㄊㄨ" : "tu", + "ㄊㄨㄛ" : "tuo", + "ㄊㄨㄟ" : "tui", + "ㄊㄨㄢ" : "tuan", + "ㄊㄨㄣ" : "tun", + "ㄊㄨㄥ" : "tong", + "ㄋ" : "n", + "ㄋㄚ" : "na", + "ㄋㄜ" : "ne", + "ㄋㄞ" : "nai", + "ㄋㄟ" : "nei", + "ㄋㄠ" : "nao", + "ㄋㄡ" : "nou", + "ㄋㄢ" : "nan", + "ㄋㄣ" : "nen", + "ㄋㄤ" : "nang", + "ㄋㄥ" : 
"neng", + "ㄋㄧ" : "ni", + "ㄋㄧㄚ" : "nia", + "ㄋㄧㄝ" : "nie", + "ㄋㄧㄠ" : "niao", + "ㄋㄧㄡ" : "niu", + "ㄋㄧㄢ" : "nian", + "ㄋㄧㄣ" : "nin", + "ㄋㄧㄤ" : "niang", + "ㄋㄧㄥ" : "ning", + "ㄋㄨ" : "nu", + "ㄋㄨㄛ" : "nuo", + "ㄋㄨㄢ" : "nuan", + "ㄋㄨㄣ" : "nun", + "ㄋㄨㄥ" : "nong", + "ㄋㄩ" : "nv", + "ㄋㄩㄝ" : "nve", + "ㄌ" : "l", + "ㄌㄚ" : "la", + "ㄌㄛ" : "lo", + "ㄌㄜ" : "le", + "ㄌㄞ" : "lai", + "ㄌㄟ" : "lei", + "ㄌㄠ" : "lao", + "ㄌㄡ" : "lou", + "ㄌㄢ" : "lan", + "ㄌㄣ" : "len", + "ㄌㄤ" : "lang", + "ㄌㄥ" : "leng", + "ㄌㄧ" : "li", + "ㄌㄧㄚ" : "lia", + "ㄌㄧㄝ" : "lie", + "ㄌㄧㄠ" : "liao", + "ㄌㄧㄡ" : "liu", + "ㄌㄧㄢ" : "lian", + "ㄌㄧㄣ" : "lin", + "ㄌㄧㄤ" : "liang", + "ㄌㄧㄥ" : "ling", + "ㄌㄨ" : "lu", + "ㄌㄨㄛ" : "luo", + "ㄌㄨㄢ" : "luan", + "ㄌㄨㄣ" : "lun", + "ㄌㄨㄥ" : "long", + "ㄌㄩ" : "lv", + "ㄌㄩㄝ" : "lve", + "ㄍ" : "g", + "ㄍㄚ" : "ga", + "ㄍㄜ" : "ge", + "ㄍㄞ" : "gai", + "ㄍㄟ" : "gei", + "ㄍㄠ" : "gao", + "ㄍㄡ" : "gou", + "ㄍㄢ" : "gan", + "ㄍㄣ" : "gen", + "ㄍㄤ" : "gang", + "ㄍㄥ" : "geng", + "ㄍㄨ" : "gu", + "ㄍㄨㄚ" : "gua", + "ㄍㄨㄛ" : "guo", + "ㄍㄨㄞ" : "guai", + "ㄍㄨㄟ" : "gui", + "ㄍㄨㄢ" : "guan", + "ㄍㄨㄣ" : "gun", + "ㄍㄨㄤ" : "guang", + "ㄍㄨㄥ" : "gong", + "ㄎ" : "k", + "ㄎㄚ" : "ka", + "ㄎㄜ" : "ke", + "ㄎㄞ" : "kai", + "ㄎㄟ" : "kei", + "ㄎㄠ" : "kao", + "ㄎㄡ" : "kou", + "ㄎㄢ" : "kan", + "ㄎㄣ" : "ken", + "ㄎㄤ" : "kang", + "ㄎㄥ" : "keng", + "ㄎㄨ" : "ku", + "ㄎㄨㄚ" : "kua", + "ㄎㄨㄛ" : "kuo", + "ㄎㄨㄞ" : "kuai", + "ㄎㄨㄟ" : "kui", + "ㄎㄨㄢ" : "kuan", + "ㄎㄨㄣ" : "kun", + "ㄎㄨㄤ" : "kuang", + "ㄎㄨㄥ" : "kong", + "ㄏ" : "h", + "ㄏㄚ" : "ha", + "ㄏㄜ" : "he", + "ㄏㄞ" : "hai", + "ㄏㄟ" : "hei", + "ㄏㄠ" : "hao", + "ㄏㄡ" : "hou", + "ㄏㄢ" : "han", + "ㄏㄣ" : "hen", + "ㄏㄤ" : "hang", + "ㄏㄥ" : "heng", + "ㄏㄨ" : "hu", + "ㄏㄨㄚ" : "hua", + "ㄏㄨㄛ" : "huo", + "ㄏㄨㄞ" : "huai", + "ㄏㄨㄟ" : "hui", + "ㄏㄨㄢ" : "huan", + "ㄏㄨㄣ" : "hun", + "ㄏㄨㄤ" : "huang", + "ㄏㄨㄥ" : "hong", + "ㄐ" : "j", + "ㄐㄧ" : "ji", + "ㄐㄧㄚ" : "jia", + "ㄐㄧㄝ" : "jie", + "ㄐㄧㄠ" : "jiao", + "ㄐㄧㄡ" : "jiu", + "ㄐㄧㄢ" : "jian", + "ㄐㄧㄣ" : "jin", + "ㄐㄧㄤ" : "jiang", + "ㄐㄧㄥ" : "jing", + "ㄐㄩ" : "ju", + "ㄐㄩㄝ" : "jue", + "ㄐㄩㄢ" : "juan", + "ㄐㄩㄣ" : "jun", + "ㄐㄩㄥ" : "jiong", + "ㄑ" : "q", + 
"ㄑㄧ" : "qi", + "ㄑㄧㄚ" : "qia", + "ㄑㄧㄝ" : "qie", + "ㄑㄧㄠ" : "qiao", + "ㄑㄧㄡ" : "qiu", + "ㄑㄧㄢ" : "qian", + "ㄑㄧㄣ" : "qin", + "ㄑㄧㄤ" : "qiang", + "ㄑㄧㄥ" : "qing", + "ㄑㄩ" : "qu", + "ㄑㄩㄝ" : "que", + "ㄑㄩㄢ" : "quan", + "ㄑㄩㄣ" : "qun", + "ㄑㄩㄥ" : "qiong", + "ㄒ" : "x", + "ㄒㄧ" : "xi", + "ㄒㄧㄚ" : "xia", + "ㄒㄧㄝ" : "xie", + "ㄒㄧㄠ" : "xiao", + "ㄒㄧㄡ" : "xiu", + "ㄒㄧㄢ" : "xian", + "ㄒㄧㄣ" : "xin", + "ㄒㄧㄤ" : "xiang", + "ㄒㄧㄥ" : "xing", + "ㄒㄩ" : "xu", + "ㄒㄩㄝ" : "xue", + "ㄒㄩㄢ" : "xuan", + "ㄒㄩㄣ" : "xun", + "ㄒㄩㄥ" : "xiong", + "ㄓ" : "zhi", + "ㄓㄚ" : "zha", + "ㄓㄜ" : "zhe", + "ㄓㄞ" : "zhai", + "ㄓㄟ" : "zhei", + "ㄓㄠ" : "zhao", + "ㄓㄡ" : "zhou", + "ㄓㄢ" : "zhan", + "ㄓㄣ" : "zhen", + "ㄓㄤ" : "zhang", + "ㄓㄥ" : "zheng", + "ㄓㄨ" : "zhu", + "ㄓㄨㄚ" : "zhua", + "ㄓㄨㄛ" : "zhuo", + "ㄓㄨㄞ" : "zhuai", + "ㄓㄨㄟ" : "zhui", + "ㄓㄨㄢ" : "zhuan", + "ㄓㄨㄣ" : "zhun", + "ㄓㄨㄤ" : "zhuang", + "ㄓㄨㄥ" : "zhong", + "ㄔ" : "chi", + "ㄔㄚ" : "cha", + "ㄔㄜ" : "che", + "ㄔㄞ" : "chai", + "ㄔㄠ" : "chao", + "ㄔㄡ" : "chou", + "ㄔㄢ" : "chan", + "ㄔㄣ" : "chen", + "ㄔㄤ" : "chang", + "ㄔㄥ" : "cheng", + "ㄔㄨ" : "chu", + "ㄔㄨㄚ" : "chua", + "ㄔㄨㄛ" : "chuo", + "ㄔㄨㄞ" : "chuai", + "ㄔㄨㄟ" : "chui", + "ㄔㄨㄢ" : "chuan", + "ㄔㄨㄣ" : "chun", + "ㄔㄨㄤ" : "chuang", + "ㄔㄨㄥ" : "chong", + "ㄕ" : "shi", + "ㄕㄚ" : "sha", + "ㄕㄜ" : "she", + "ㄕㄞ" : "shai", + "ㄕㄟ" : "shei", + "ㄕㄠ" : "shao", + "ㄕㄡ" : "shou", + "ㄕㄢ" : "shan", + "ㄕㄣ" : "shen", + "ㄕㄤ" : "shang", + "ㄕㄥ" : "sheng", + "ㄕㄨ" : "shu", + "ㄕㄨㄚ" : "shua", + "ㄕㄨㄛ" : "shuo", + "ㄕㄨㄞ" : "shuai", + "ㄕㄨㄟ" : "shui", + "ㄕㄨㄢ" : "shuan", + "ㄕㄨㄣ" : "shun", + "ㄕㄨㄤ" : "shuang", + "ㄖ" : "ri", + "ㄖㄜ" : "re", + "ㄖㄠ" : "rao", + "ㄖㄡ" : "rou", + "ㄖㄢ" : "ran", + "ㄖㄣ" : "ren", + "ㄖㄤ" : "rang", + "ㄖㄥ" : "reng", + "ㄖㄨ" : "ru", + "ㄖㄨㄚ" : "rua", + "ㄖㄨㄛ" : "ruo", + "ㄖㄨㄟ" : "rui", + "ㄖㄨㄢ" : "ruan", + "ㄖㄨㄣ" : "run", + "ㄖㄨㄥ" : "rong", + "ㄗ" : "zi", + "ㄗㄚ" : "za", + "ㄗㄜ" : "ze", + "ㄗㄞ" : "zai", + "ㄗㄟ" : "zei", + "ㄗㄠ" : "zao", + "ㄗㄡ" : "zou", + "ㄗㄢ" : "zan", + "ㄗㄣ" : "zen", + "ㄗㄤ" : "zang", + "ㄗㄥ" : "zeng", + "ㄗㄨ" : "zu", + "ㄗㄨㄛ" : "zuo", + "ㄗㄨㄟ" : "zui", + "ㄗㄨㄢ" : "zuan", + 
"ㄗㄨㄣ" : "zun", + "ㄗㄨㄥ" : "zong", + "ㄘ" : "ci", + "ㄘㄚ" : "ca", + "ㄘㄜ" : "ce", + "ㄘㄞ" : "cai", + "ㄘㄠ" : "cao", + "ㄘㄡ" : "cou", + "ㄘㄢ" : "can", + "ㄘㄣ" : "cen", + "ㄘㄤ" : "cang", + "ㄘㄥ" : "ceng", + "ㄘㄨ" : "cu", + "ㄘㄨㄛ" : "cuo", + "ㄘㄨㄟ" : "cui", + "ㄘㄨㄢ" : "cuan", + "ㄘㄨㄣ" : "cun", + "ㄘㄨㄥ" : "cong", + "ㄙ" : "si", + "ㄙㄚ" : "sa", + "ㄙㄜ" : "se", + "ㄙㄞ" : "sai", + "ㄙㄠ" : "sao", + "ㄙㄡ" : "sou", + "ㄙㄢ" : "san", + "ㄙㄣ" : "sen", + "ㄙㄤ" : "sang", + "ㄙㄥ" : "seng", + "ㄙㄨ" : "su", + "ㄙㄨㄛ" : "suo", + "ㄙㄨㄟ" : "sui", + "ㄙㄨㄢ" : "suan", + "ㄙㄨㄣ" : "sun", + "ㄙㄨㄥ" : "song", + "ㄚ" : "a", + "ㄛ" : "o", + "ㄜ" : "e", + "ㄞ" : "ai", + "ㄟ" : "ei", + "ㄠ" : "ao", + "ㄡ" : "ou", + "ㄢ" : "an", + "ㄣ" : "en", + "ㄤ" : "ang", + "ㄥ" : "eng", + "ㄦ" : "er", + "ㄧ" : "yi", + "ㄧㄚ" : "ya", + "ㄧㄛ" : "yo", + "ㄧㄝ" : "ye", + "ㄧㄞ" : "yai", + "ㄧㄠ" : "yao", + "ㄧㄡ" : "you", + "ㄧㄢ" : "yan", + "ㄧㄣ" : "yin", + "ㄧㄤ" : "yang", + "ㄧㄥ" : "ying", + "ㄨ" : "wu", + "ㄨㄚ" : "wa", + "ㄨㄛ" : "wo", + "ㄨㄞ" : "wai", + "ㄨㄟ" : "wei", + "ㄨㄢ" : "wan", + "ㄨㄣ" : "wen", + "ㄨㄤ" : "wang", + "ㄨㄥ" : "weng", + "ㄩ" : "yu", + "ㄩㄝ" : "yue", + "ㄩㄢ" : "yuan", + "ㄩㄣ" : "yun", + "ㄩㄥ" : "yong", + "ㄫ" : "ng", +} + +PINYIN_BOPOMOFO_MAP = dict([(v, k) for k, v in BOPOMOFO_PINYIN_MAP.items()]) + +SPECIAL_INITIAL_SET = {'ci', 'chi', 'si', 'shi', 'zi', 'zhi', 'ri'} + +''' +SHENG_YUN_BOPOMOFO_MAP = { + "b" : "ㄅ", + "p" : "ㄆ", + "m" : "ㄇ", + "f" : "ㄈ", + "d" : "ㄉ", + "t" : "ㄊ", + "n" : "ㄋ", + "l" : "ㄌ", + "g" : "ㄍ", + "k" : "ㄎ", + "h" : "ㄏ", + "j" : "ㄐ", + "q" : "ㄑ", + "x" : "ㄒ", + "zh" : "ㄓ", + "ch" : "ㄔ", + "sh" : "ㄕ", + "r" : "ㄖ", + "z" : "ㄗ", + "c" : "ㄘ", + "s" : "ㄙ", + + # 韻母為u,ue,un,uan,ong時ㄧ省略 + "y" : ("ㄧ", (("u", "ue", "un", "uan", "ong"), "")), + "w" : "ㄨ", + "a" : "ㄚ", + "o" : "ㄛ", + "e" : ("ㄜ", ("y", "ㄝ")), # y後面為ㄝ + + # zh ch sh r z c s y後面為空 + "i" : ("ㄧ", (("zh", "ch", "sh", "r", "z", "c", "s", "y"), "")), + + # jqxy後面為ㄩ w後面為空 + "u" : ("ㄨ", ("jqxy", "ㄩ")), + "v" : "ㄩ", + "ai" : "ㄞ", + "ei" : "ㄟ", + "ao" : "ㄠ", + "ou" : "ㄡ", + "an" : "ㄢ", + "en" : "ㄣ", + 
"ang" : "ㄤ", + "eng" : "ㄥ", + "er" : "ㄦ", + "ia" : "ㄧㄚ", + "ie" : "ㄧㄝ", + "iai" : "ㄧㄞ", + "iao" : "ㄧㄠ", + "iu" : "ㄧㄡ", + "ian" : "ㄧㄢ", + "in" : ("ㄧㄣ", ("y", "ㄣ")), #y後面為ㄣ + "iang" : "ㄧㄤ", + "ing" : ("ㄧㄥ", ("y", "ㄥ")), #y後面為ㄥ + "ua" : "ㄨㄚ", + "uo" : "ㄨㄛ", + "ue" : "ㄩㄝ", + # TODO: "ve" is OK? + "ve" : "ㄩㄝ", + "uai" : "ㄨㄞ", + "ui" : "ㄨㄟ", + "uan" : ("ㄨㄢ", ("jqxy", "ㄩㄢ")), # jqxy後面是ㄩㄢ + "un" : ("ㄨㄣ", ("jqxy", "ㄩㄣ")), # jqxy後面是ㄩㄣ + "uang" : ("ㄨㄤ", ("jqxy", "ㄩㄤ")), # jqxy後面是ㄩㄤ + "ong" : ("ㄨㄥ", ("jqxy", "ㄩㄥ")), # y後面為ㄩㄥ + "iong" : "ㄩㄥ", +} +''' diff --git a/scripts/chewing.py b/scripts/chewing.py new file mode 100644 index 0000000..b49c84f --- /dev/null +++ b/scripts/chewing.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + +ASCII_CHEWING_INITIAL_MAP = { + "CHEWING_B" : "ㄅ", + "CHEWING_C" : "ㄘ", + "CHEWING_CH" : "ㄔ", + "CHEWING_D" : "ㄉ", + "CHEWING_F" : "ㄈ", + "CHEWING_H" : "ㄏ", + "CHEWING_G" : "ㄍ", + "CHEWING_K" : "ㄎ", + "CHEWING_J" : "ㄐ", + "CHEWING_M" : "ㄇ", + "CHEWING_N" : "ㄋ", + "CHEWING_L" : "ㄌ", + "CHEWING_R" : "ㄖ", + "CHEWING_P" : "ㄆ", + "CHEWING_Q" : "ㄑ", + "CHEWING_S" : "ㄙ", + "CHEWING_SH" : "ㄕ", + "CHEWING_T" : "ㄊ", + "CHEWING_X" : "ㄒ", + "CHEWING_Z" : "ㄗ", + "CHEWING_ZH" : "ㄓ", +} + +CHEWING_ASCII_INITIAL_MAP = dict([(v, k) for k, v in ASCII_CHEWING_INITIAL_MAP.items()]) + +ASCII_CHEWING_MIDDLE_MAP = { + "CHEWING_I" : "ㄧ", + "CHEWING_U" : "ㄨ", + "CHEWING_V" : "ㄩ", +} + +CHEWING_ASCII_MIDDLE_MAP = dict([(v, k) for k, v in ASCII_CHEWING_MIDDLE_MAP.items()]) + +ASCII_CHEWING_FINAL_MAP = { + "CHEWING_A" : "ㄚ", + "CHEWING_AI" : "ㄞ", + "CHEWING_AN" : "ㄢ", + "CHEWING_ANG" : "ㄤ", + "CHEWING_AO" : "ㄠ", + "CHEWING_E" : "ㄝ", # merge "ㄝ" and "ㄜ" + "CHEWING_EI" : "ㄟ", + "CHEWING_EN" : "ㄣ", + "CHEWING_ENG" : "ㄥ", + "CHEWING_ER" : "ㄦ", + "CHEWING_NG" : "ㄫ", + "CHEWING_O" : "ㄛ", + "CHEWING_OU" : "ㄡ", +} + +CHEWING_ASCII_FINAL_MAP = dict([(v, k) for k, v in ASCII_CHEWING_FINAL_MAP.items()]) diff --git a/scripts/chewing_enum.h.in b/scripts/chewing_enum.h.in new file mode 100644 index 0000000..46072df --- /dev/null +++ b/scripts/chewing_enum.h.in @@ -0,0 +1,45 @@ +#ifndef CHEWING_ENUM_H +#define CHEWING_ENUM_H + +namespace pinyin{ + +/** + * @brief enums of chewing initial element. + */ + +enum ChewingInitial +{ +@CHEWING_INITIAL@ +}; + + +/** + * @brief enums of chewing middle element. + */ + +enum ChewingMiddle +{ +@CHEWING_MIDDLE@ +}; + + +/** + * @brief enums of chewing final element. + */ +enum ChewingFinal +{ +@CHEWING_FINAL@ +}; + + +/** + * @brief enums of chewing tone element. 
+ */ +enum ChewingTone +{ +@CHEWING_TONE@ +}; + +}; + +#endif diff --git a/scripts/chewing_table.h.in b/scripts/chewing_table.h.in new file mode 100644 index 0000000..8780b17 --- /dev/null +++ b/scripts/chewing_table.h.in @@ -0,0 +1,50 @@ +#ifndef CHEWING_TABLE_H +#define CHEWING_TABLE_H + +namespace pinyin{ + +const chewing_symbol_item_t chewing_standard_symbols[] = { +@STANDARD_SYMBOLS@ +}; + +const chewing_tone_item_t chewing_standard_tones[] = { +@STANDARD_TONES@ +}; + + +const chewing_symbol_item_t chewing_ginyieh_symbols[] = { +@GINYIEH_SYMBOLS@ +}; + +const chewing_tone_item_t chewing_ginyieh_tones[] = { +@GINYIEH_TONES@ +}; + +const chewing_symbol_item_t chewing_eten_symbols[] = { +@ETEN_SYMBOLS@ +}; + +const chewing_tone_item_t chewing_eten_tones[] = { +@ETEN_TONES@ +}; + +const chewing_symbol_item_t chewing_ibm_symbols[] = { +@IBM_SYMBOLS@ +}; + +const chewing_tone_item_t chewing_ibm_tones[] = { +@IBM_TONES@ +}; + +const char * chewing_tone_table[CHEWING_NUMBER_OF_TONES] = { +"", +"ˉ", +"ˊ", +"ˇ", +"ˋ", +"˙" +}; + +}; + +#endif diff --git a/scripts/chewingkey.py b/scripts/chewingkey.py new file mode 100644 index 0000000..5f5770f --- /dev/null +++ b/scripts/chewingkey.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +CHEWING_INITIAL_LIST = [ + 'CHEWING_ZERO_INITIAL', #Zero Initial + 'CHEWING_B', #"ㄅ" + 'CHEWING_C', #"ㄘ" + 'CHEWING_CH', #"ㄔ" + 'CHEWING_D', #"ㄉ" + 'CHEWING_F', #"ㄈ" + 'CHEWING_H', #"ㄏ" + 'CHEWING_G', #"ㄍ" + 'CHEWING_K', #"ㄎ" + 'CHEWING_J', #"ㄐ" + 'CHEWING_M', #"ㄇ" + 'CHEWING_N', #"ㄋ" + 'CHEWING_L', #"ㄌ" + 'CHEWING_R', #"ㄖ" + 'CHEWING_P', #"ㄆ" + 'CHEWING_Q', #"ㄑ" + 'CHEWING_S', #"ㄙ" + 'CHEWING_SH', #"ㄕ" + 'CHEWING_T', #"ㄊ" + 'PINYIN_W', #Invalid Chewing + 'CHEWING_X', #"ㄒ" + 'PINYIN_Y', #Invalid Chewing + 'CHEWING_Z', #"ㄗ" + 'CHEWING_ZH' #"ㄓ" +] + + +CHEWING_MIDDLE_LIST = [ + 'CHEWING_ZERO_MIDDLE', #Zero Middle + 'CHEWING_I', #"ㄧ" + 'CHEWING_U', #"ㄨ" + 'CHEWING_V' #"ㄩ" +] + + +CHEWING_FINAL_LIST = [ + 'CHEWING_ZERO_FINAL', #Zero Final + 'CHEWING_A', #"ㄚ" + 'CHEWING_AI', #"ㄞ" + 'CHEWING_AN', #"ㄢ" + 'CHEWING_ANG', #"ㄤ" + 'CHEWING_AO', #"ㄠ" + 'CHEWING_E', #"ㄝ" and "ㄜ" + 'INVALID_EA', #Invalid Pinyin/Chewing + 'CHEWING_EI', #"ㄟ" + 'CHEWING_EN', #"ㄣ" + 'CHEWING_ENG', #"ㄥ" + 'CHEWING_ER', #"ㄦ" + 'CHEWING_NG', #"ㄫ" + 'CHEWING_O', #"ㄛ" + 'PINYIN_ONG', #"ueng" + 'CHEWING_OU', #"ㄡ" + 'PINYIN_IN', #"ien" + 'PINYIN_ING' #"ieng" +] + + +CHEWING_TONE_LIST = [ + 'CHEWING_ZERO_TONE', #Zero Tone + 'CHEWING_1', #" " + 'CHEWING_2', #'ˊ' + 'CHEWING_3', #'ˇ' + 'CHEWING_4', #'ˋ' + 'CHEWING_5' #'˙' +] + + +def gen_entries(items, last_enum, num_enum): + entries = [] + for enum, item in enumerate(items, start=0): + entry = '{0} = {1}'.format(item, enum) + entries.append(entry) + + #last enum + entry = last_enum + ' = ' + items[-1] + entries.append(entry) + + #num enum + entry = num_enum + entries.append(entry) + + return ",\n".join(entries) + + +def gen_initials(): + return gen_entries(CHEWING_INITIAL_LIST, 'CHEWING_LAST_INITIAL', + 'CHEWING_NUMBER_OF_INITIALS = 
CHEWING_LAST_INITIAL + 1') + + +def gen_middles(): + return gen_entries(CHEWING_MIDDLE_LIST, 'CHEWING_LAST_MIDDLE', + 'CHEWING_NUMBER_OF_MIDDLES = CHEWING_LAST_MIDDLE + 1') + + +def gen_finals(): + return gen_entries(CHEWING_FINAL_LIST, 'CHEWING_LAST_FINAL', + 'CHEWING_NUMBER_OF_FINALS = CHEWING_LAST_FINAL + 1') + + +def gen_tones(): + return gen_entries(CHEWING_TONE_LIST, 'CHEWING_LAST_TONE', + 'CHEWING_NUMBER_OF_TONES = CHEWING_LAST_TONE + 1') + + +def gen_table_index(content_table): + entries = [] + for i in range(0, len(CHEWING_INITIAL_LIST)): + initial = CHEWING_INITIAL_LIST[i] + for m in range(0, len(CHEWING_MIDDLE_LIST)): + middle = CHEWING_MIDDLE_LIST[m] + for f in range(0, len(CHEWING_FINAL_LIST)): + final = CHEWING_FINAL_LIST[f] + chewingkey = 'ChewingKey({0}, {1}, {2})'.format(initial, middle, final) + index = -1 + try: + index = [x[2] for x in content_table].index(chewingkey) + except ValueError: + pass + + entry = '{0:<7} /* {1} */'.format(index, chewingkey) + entries.append(entry) + return ",\n".join(entries) + + +### main function ### +if __name__ == "__main__": + print(gen_initials() + gen_middles() + gen_finals() + gen_tones()) diff --git a/scripts/correct.py b/scripts/correct.py new file mode 100644 index 0000000..ffd5998 --- /dev/null +++ b/scripts/correct.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (c) 2007-2008 Peng Huang <shawn.p.huang@gmail.com> +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +auto_correct = [ + # "correct", "wrong" + ("ng", "gn"), + ("ng", "mg"), + ("iu", "iou"), + ("ui", "uei"), + ("un", "uen"), +# ("ue", "ve"), + ("ve", "ue"), + ("ong", "on"), +] + +auto_correct_ext = [ + # "correct", "wrong", flag + ("ju", "jv", "PINYIN_CORRECT_V_U"), + ("qu", "qv", "PINYIN_CORRECT_V_U"), + ("xu", "xv", "PINYIN_CORRECT_V_U"), + ("yu", "yv", "PINYIN_CORRECT_V_U"), + + ("jue", "jve", "PINYIN_CORRECT_V_U"), + ("que", "qve", "PINYIN_CORRECT_V_U"), + ("xue", "xve", "PINYIN_CORRECT_V_U"), + ("yue", "yve", "PINYIN_CORRECT_V_U"), + + ("juan", "jvan", "PINYIN_CORRECT_V_U"), + ("quan", "qvan", "PINYIN_CORRECT_V_U"), + ("xuan", "xvan", "PINYIN_CORRECT_V_U"), + ("yuan", "yvan", "PINYIN_CORRECT_V_U"), + + ("jun", "jvn", "PINYIN_CORRECT_V_U"), + ("qun", "qvn", "PINYIN_CORRECT_V_U"), + ("xun", "xvn", "PINYIN_CORRECT_V_U"), + ("yun", "yvn", "PINYIN_CORRECT_V_U"), + +# ("juang", "jvang", "PINYIN_CORRECT_V_U"), +# ("quang", "qvang", "PINYIN_CORRECT_V_U"), +# ("xuang", "xvang", "PINYIN_CORRECT_V_U"), +# ("yuang", "yvang", "PINYIN_CORRECT_V_U"), + +# ("jun", "jven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("qun", "qven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("xun", "xven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("yun", "yven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +] + + +''' +fuzzy_shengmu = [ + ("c", "ch"), + ("ch", "c"), + ("z", "zh"), + ("zh", "z"), + ("s", "sh"), + ("sh", "s"), + ("l", "n"), + ("n", "l"), + ("f", "h"), + ("h", "f"), + ("l", "r"), + ("r", "l"), + ("k", "g"), + ("g", "k"), +] + +fuzzy_yunmu = [ + ("an", "ang"), + ("ang", "an"), + ("en", "eng"), + ("eng", "en"), + ("in", "ing"), + ("ing", "in"), +] +''' diff --git 
a/scripts/double_pinyin_table.h.in b/scripts/double_pinyin_table.h.in new file mode 100644 index 0000000..15a8ee9 --- /dev/null +++ b/scripts/double_pinyin_table.h.in @@ -0,0 +1,56 @@ +#ifndef DOUBLE_PINYIN_TABLE_H +#define DOUBLE_PINYIN_TABLE_H + +namespace pinyin{ + +const double_pinyin_scheme_shengmu_item_t double_pinyin_mspy_sheng[] = { +@MSPY_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_mspy_yun[] = { +@MSPY_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_zrm_sheng[] = { +@ZRM_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_zrm_yun[] = { +@ZRM_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_abc_sheng[] = { +@ABC_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_abc_yun[] = { +@ABC_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_zgpy_sheng[] = { +@ZGPY_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_zgpy_yun[] = { +@ZGPY_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_pyjj_sheng[] = { +@PYJJ_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_pyjj_yun[] = { +@PYJJ_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_xhe_sheng[] = { +@XHE_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_xhe_yun[] = { +@XHE_YUN@ +}; + +}; + +#endif diff --git a/scripts/genbopomofoheader.py b/scripts/genbopomofoheader.py new file mode 100644 index 0000000..cb0fa86 --- /dev/null +++ b/scripts/genbopomofoheader.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (c) 2010 BYVoid <byvoid1@gmail.com> +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +from operator import itemgetter +from utils import expand_file + +bopomofo = [ + 'ㄅ', 'ㄆ', 'ㄇ', 'ㄈ', 'ㄉ', 'ㄊ', 'ㄋ', 'ㄌ', 'ㄍ', 'ㄎ', + 'ㄏ', 'ㄐ', 'ㄑ', 'ㄒ', 'ㄓ', 'ㄔ', 'ㄕ', 'ㄖ', 'ㄗ', 'ㄘ', 'ㄙ', + + 'ㄧ', 'ㄨ', 'ㄩ', 'ㄚ', 'ㄛ', 'ㄜ', 'ㄝ', 'ㄞ', 'ㄟ', 'ㄠ', 'ㄡ', + 'ㄢ', 'ㄣ', 'ㄤ', 'ㄥ', 'ㄦ', + + 'ˉ', 'ˊ', 'ˇ', 'ˋ', '˙', +] + +#陰平聲不標號, use space key +num_tones = -5 + +bopomofo_keyboards = { + #標準注音鍵盤 + 'STANDARD': + ( + "1","q","a","z","2","w","s","x","e","d","c","r","f","v","5","t","g","b","y","h","n", + "u","j","m","8","i","k",",","9","o","l",".","0","p",";","/","-", + " ","6","3","4","7", + ), + #精業注音鍵盤 + 'GINYIEH': + ( + "2","w","s","x","3","e","d","c","r","f","v","t","g","b","6","y","h","n","u","j","m", + "-","[","'","8","i","k",",","9","o","l",".","0","p",";","/","=", + " ","q","a","z","1", + ), + #倚天注音鍵盤 + 'ETEN': + ( + "b","p","m","f","d","t","n","l","v","k","h","g","7","c",",",".","/","j",";","'","s", + "e","x","u","a","o","r","w","i","q","z","y","8","9","0","-","=", + " ","2","3","4","1", + ), + #IBM注音鍵盤 + 'IBM': + ( + "1","2","3","4","5","6","7","8","9","0","-","q","w","e","r","t","y","u","i","o","p", + "a","s","d","f","g","h","j","k","l",";","z","x","c","v","b","n", + " ","m",",",".","/", + ), +} + + +def escape_char(ch): + if ch == "'" or ch == "\\": + ch = "\\" + ch; + return "'{0}'".format(ch) + + +#generate shengmu and yunmu here +def gen_chewing_symbols(scheme): + keyboard = bopomofo_keyboards[scheme] + keyboard = keyboard[: num_tones] + items = [] + for (i, key) in enumerate(keyboard): + items.append((key, 
bopomofo[i])) + items = sorted(items, key=itemgetter(0)) + entries = [] + for (key, string) in items: + key = escape_char(key) + string = '"{0}"'.format(string) + entry = "{{{0: <5}, {1}}}".format(key, string) + entries.append(entry) + entries.append("{'\\0', NULL}") + return ",\n".join(entries) + + +#generate tones here +def gen_chewing_tones(scheme): + keyboard = bopomofo_keyboards[scheme] + keyboard = keyboard[num_tones:] + items = [] + for (i, key) in enumerate(keyboard, start=1): + items.append((key, i)); + items = sorted(items, key=itemgetter(0)) + entries = [] + for (key, tone) in items: + key = escape_char(key); + entry = "{{{0: <5}, {1}}}".format(key, tone) + entries.append(entry) + entries.append("{'\\0', 0}") + return ",\n".join(entries) + + +def get_table_content(tablename): + (scheme, part) = tablename.split('_', 1) + if part == "SYMBOLS": + return gen_chewing_symbols(scheme); + if part == "TONES": + return gen_chewing_tones(scheme); + + +### main function ### +if __name__ == "__main__": + expand_file("chewing_table.h.in", get_table_content) diff --git a/scripts/genchewingkey.py b/scripts/genchewingkey.py new file mode 100644 index 0000000..4a0bdcd --- /dev/null +++ b/scripts/genchewingkey.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +from utils import expand_file +from chewingkey import gen_initials, gen_middles, gen_finals, gen_tones + + +def get_table_content(tablename): + if tablename == 'CHEWING_INITIAL': + return gen_initials() + if tablename == 'CHEWING_MIDDLE': + return gen_middles() + if tablename == 'CHEWING_FINAL': + return gen_finals() + if tablename == 'CHEWING_TONE': + return gen_tones() + + +### main function ### +if __name__ == "__main__": + expand_file("chewing_enum.h.in", get_table_content) + diff --git a/scripts/gendoublepinyinheader.py b/scripts/gendoublepinyinheader.py new file mode 100644 index 0000000..08dd817 --- /dev/null +++ b/scripts/gendoublepinyinheader.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + +import pinyin +from utils import expand_file + +def gen_shengmu_table(scheme): + entries = [] + #select shengmu mapping + sheng = pinyin.SHUANGPIN_SCHEMAS[scheme][0] + for c in "abcdefghijklmnopqrstuvwxyz;": + sh = sheng.get(c, "NULL") + if sh != "NULL": + sh = '"{0}"'.format(sh) + entry = '{{{0: <5}}} /* {1} */'.format(sh, c.upper()) + entries.append(entry) + return ',\n'.join(entries) + + +def gen_yunmu_table(scheme): + entries = [] + #select yunmu mapping + yun = pinyin.SHUANGPIN_SCHEMAS[scheme][1] + for c in "abcdefghijklmnopqrstuvwxyz;": + y = yun.get(c, ("NULL", "NULL")) + if len(y) == 1: + y1 = y[0] + y2 = "NULL" + else: + y1, y2 = y + if y1 != "NULL": + y1 = '"{0}"'.format(y1) + if y2 != "NULL": + y2 = '"{0}"'.format(y2) + entry = '{{{{{0: <7}, {1: <7}}}}} /* {2} */'.format(y1, y2, c.upper()) + entries.append(entry) + return ',\n'.join(entries) + + +def get_table_content(tablename): + (scheme, part) = tablename.split('_', 1) + if part == "SHENG": + return gen_shengmu_table(scheme) + if part == "YUN": + return gen_yunmu_table(scheme) + + +### main function ### +if __name__ == "__main__": + expand_file("double_pinyin_table.h.in", get_table_content) diff --git a/scripts/genpinyinheader.py b/scripts/genpinyinheader.py new file mode 100644 index 0000000..81e0538 --- /dev/null +++ b/scripts/genpinyinheader.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +from utils import expand_file +from genpinyintable import gen_content_table, \ + gen_pinyin_index, gen_bopomofo_index, \ + gen_chewing_key_table +from genspecialtable import gen_divided_table, gen_resplit_table + +def get_table_content(tablename): + if tablename == 'CONTENT_TABLE': + return gen_content_table() + if tablename == 'PINYIN_INDEX': + return gen_pinyin_index() + if tablename == 'BOPOMOFO_INDEX': + return gen_bopomofo_index() + if tablename == 'DIVIDED_TABLE': + return gen_divided_table() + if tablename == 'RESPLIT_TABLE': + return gen_resplit_table() + if tablename == 'TABLE_INDEX': + return gen_chewing_key_table() + + +### main function ### +if __name__ == "__main__": + expand_file("pinyin_parser_table.h.in", get_table_content) diff --git a/scripts/genpinyins.py b/scripts/genpinyins.py new file mode 100644 index 0000000..fef40cd --- /dev/null +++ b/scripts/genpinyins.py @@ -0,0 +1,57 @@ +#!/usr/bin/python3 +import os +from operator import itemgetter + +pinyin_dict = {} + + +def strip_tone(old_pinyin_str): + oldpinyins = old_pinyin_str.split("'") + newpinyins = [] + + for pinyin in oldpinyins: + if pinyin[-1].isdigit(): + pinyin = pinyin[:-1] + newpinyins.append(pinyin) + + new_pinyin_str = "'".join(newpinyins) + return new_pinyin_str + + +def add_pinyin_dict(pinyin, freq): + if 0 == freq: + return + if not pinyin in pinyin_dict: + pinyin_dict[pinyin] = freq + else: + pinyin_dict[pinyin] += freq + + +def load_phrase(filename): + phrasefile = open(filename, "r") + for line in phrasefile.readlines(): + line = line.rstrip(os.linesep) + (pinyin, word, token, freq) = line.split(None, 3) + pinyin = strip_tone(pinyin) + freq = int(freq) + + if len(word) in [1, 2]: + add_pinyin_dict(pinyin, freq) + + phrasefile.close() + 
+load_phrase("../data/gb_char.table") +load_phrase("../data/gbk_char.table") + + +def save_pinyin(filename): + pinyinfile = open(filename, "w") + for pinyin, freq in pinyin_dict.items(): + freq = str(freq) + line = "\t".join((pinyin, freq)) + pinyinfile.writelines([line, os.linesep]) + pinyinfile.close() + + +if __name__ == "__main__": + save_pinyin("pinyins.txt") diff --git a/scripts/genpinyintable.py b/scripts/genpinyintable.py new file mode 100644 index 0000000..cc60034 --- /dev/null +++ b/scripts/genpinyintable.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +import operator +import bopomofo +from pinyintable import * +from chewingkey import gen_table_index + + +content_table = [] +pinyin_index = [] +bopomofo_index = [] + +#pinyin table +def filter_pinyin_list(): + for (correct, wrong, bopomofo, flags, chewing) in gen_pinyin_list(): + flags = '|'.join(flags) + chewing = "ChewingKey({0})".format(', '.join(chewing)) + #correct = correct.replace("v", "ü") + content_table.append((correct, bopomofo, chewing)) + if "IS_PINYIN" in flags: + pinyin_index.append((wrong, flags, correct)) + if "IS_CHEWING" in flags: + bopomofo_index.append((bopomofo, flags)) + + +def sort_all(): + global content_table, pinyin_index, bopomofo_index + #remove duplicates + content_table = list(set(content_table)) + pinyin_index = list(set(pinyin_index)) + bopomofo_index = list(set(bopomofo_index)) + #define sort function + sortfunc = operator.itemgetter(0) + #begin sort + content_table = sorted(content_table, key=sortfunc) + #prepend zero item to reserve the invalid item + content_table.insert(0, ("", "", "ChewingKey()")) + #sort index + pinyin_index = sorted(pinyin_index, key=sortfunc) + bopomofo_index = sorted(bopomofo_index, key=sortfunc) + +def get_sheng_yun(pinyin): + if pinyin == None: + return None, None + if pinyin == "": + return "", "" + if pinyin == "ng": + return "", "ng" + for i in range(2, 0, -1): + s = pinyin[:i] + if s in shengmu_list: + return s, pinyin[i:] + return "", pinyin + +def gen_content_table(): + entries = [] + for ((correct, bopomofo, chewing)) in content_table: + (shengmu, yunmu) = get_sheng_yun(correct) + entry = '{{"{0}", "{1}", "{2}", "{3}", {4}}}'.format(correct, shengmu, yunmu, bopomofo, chewing) + entries.append(entry) + return ',\n'.join(entries) + + +def gen_pinyin_index(): + entries = [] + for (wrong, flags, correct) in pinyin_index: + index = [x[0] for x in content_table].index(correct) + entry = '{{"{0}", {1}, {2}}}'.format(wrong, flags, index) + entries.append(entry) + return ',\n'.join(entries) + + +def 
gen_bopomofo_index(): + entries = [] + for (bopomofo_str, flags) in bopomofo_index: + pinyin_str = bopomofo.BOPOMOFO_PINYIN_MAP[bopomofo_str] + index = [x[0] for x in content_table].index(pinyin_str) + entry = '{{"{0}", {1}, {2}}}'.format(bopomofo_str, flags, index) + entries.append(entry) + return ',\n'.join(entries) + + +def gen_chewing_key_table(): + return gen_table_index(content_table) + + +#init code +filter_pinyin_list() +sort_all() + + +### main function ### +if __name__ == "__main__": + #s = gen_content_table() + gen_pinyin_index() + gen_bopomofo_index() + s = gen_chewing_key_table() + print(s) diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py new file mode 100644 index 0000000..061f9d1 --- /dev/null +++ b/scripts/genspecialtable.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + +import operator +import pinyin +from pinyintable import get_chewing, get_shengmu_chewing +from specialtable import * + +pinyin_list = sorted(pinyin.PINYIN_LIST) +shengmu_list = sorted(pinyin.SHENGMU_LIST) + +divided_list = [] +resplit_list = [] + + +def sort_all(): + global divided_list, resplit_list + divided_list = sorted(divided_list, key=operator.itemgetter(0)) + resplit_list = sorted(resplit_list, key=operator.itemgetter(0, 1)) + +''' +def get_chewing_string(pinyin): + #handle shengmu + if pinyin not in pinyin_list: + if pinyin in shengmu_list: + chewing_key = get_shengmu_chewing(pinyin) + else: + assert False, "Un-expected pinyin string." + else: + chewing_key = get_chewing(pinyin) + chewing_str = 'ChewingKey({0})'.format(', '.join(chewing_key)) + return chewing_str +''' + +def gen_divided_table(): + entries = [] + for (pinyin_key, orig_freq, first_key, second_key, new_freq) \ + in divided_list: + + if orig_freq >= new_freq: + assert orig_freq > 0, "Expected orig_freq > 0 here." + + entry = '{{"{0}", {1}, {{"{2}", "{3}"}}, {4}}}'.format \ + (pinyin_key, orig_freq, first_key, second_key, new_freq) + entries.append(entry) + return ',\n'.join(entries) + + +def gen_resplit_table(): + entries = [] + for (orig_first_key, orig_second_key, orig_freq, \ + new_first_key, new_second_key, new_freq) in resplit_list: + + if orig_freq >= new_freq: + assert orig_freq > 0, "Expected orig_freq > 0 here." 
+ + entry = '{{{{"{0}", "{1}"}}, {2}, {{"{3}", "{4}"}}, {5}}}'.format \ + (orig_first_key, orig_second_key, orig_freq,\ + new_first_key, new_second_key, new_freq) + entries.append(entry) + return ',\n'.join(entries) + + +#init code, load lists +divided_list = filter_divided() +resplit_list = filter_resplit() +sort_all() + + +### main function ### +if __name__ == "__main__": + s = gen_divided_table() + '\n' + gen_resplit_table() + print(s) + diff --git a/scripts/pinyin.py b/scripts/pinyin.py new file mode 100644 index 0000000..dd0e156 --- /dev/null +++ b/scripts/pinyin.py @@ -0,0 +1,400 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (c) 2007-2008 Peng Huang <shawn.p.huang@gmail.com> +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +N_ = lambda x : x +PINYIN_DICT = { + "a" : 1, "ai" : 2, "an" : 3, "ang" : 4, "ao" : 5, + "ba" : 6, "bai" : 7, "ban" : 8, "bang" : 9, "bao" : 10, + "bei" : 11, "ben" : 12, "beng" : 13, "bi" : 14, "bian" : 15, + "biao" : 16, "bie" : 17, "bin" : 18, "bing" : 19, "bo" : 20, + "bu" : 21, "ca" : 22, "cai" : 23, "can" : 24, "cang" : 25, + "cao" : 26, "ce" : 27, "cen" : 28, "ceng" : 29, "ci" : 30, + "cong" : 31, "cou" : 32, "cu" : 33, "cuan" : 34, "cui" : 35, + "cun" : 36, "cuo" : 37, "cha" : 38, "chai" : 39, "chan" : 40, + "chang" : 41, "chao" : 42, "che" : 43, "chen" : 44, "cheng" : 45, + "chi" : 46, "chong" : 47, "chou" : 48, "chu" : 49, "chuai" : 50, + "chuan" : 51, "chuang" : 52, "chui" : 53, "chun" : 54, "chuo" : 55, + "da" : 56, "dai" : 57, "dan" : 58, "dang" : 59, "dao" : 60, + "de" : 61, "dei" : 62, + # "den" : 63, + "deng" : 64, "di" : 65, + "dia" : 66, "dian" : 67, "diao" : 68, "die" : 69, "ding" : 70, + "diu" : 71, "dong" : 72, "dou" : 73, "du" : 74, "duan" : 75, + "dui" : 76, "dun" : 77, "duo" : 78, "e" : 79, "ei" : 80, + "en" : 81, "er" : 82, "fa" : 83, "fan" : 84, "fang" : 85, + "fei" : 86, "fen" : 87, "feng" : 88, "fo" : 89, "fou" : 90, + "fu" : 91, "ga" : 92, "gai" : 93, "gan" : 94, "gang" : 95, + "gao" : 96, "ge" : 97, "gei" : 98, "gen" : 99, "geng" : 100, + "gong" : 101, "gou" : 102, "gu" : 103, "gua" : 104, "guai" : 105, + "guan" : 106, "guang" : 107, "gui" : 108, "gun" : 109, "guo" : 110, + "ha" : 111, "hai" : 112, "han" : 113, "hang" : 114, "hao" : 115, + "he" : 116, "hei" : 117, "hen" : 118, "heng" : 119, "hong" : 120, + "hou" : 121, "hu" : 122, "hua" : 123, "huai" : 124, "huan" : 125, + "huang" : 126, "hui" : 127, "hun" : 128, "huo" : 129, "ji" : 130, + "jia" : 131, "jian" : 132, "jiang" : 133, "jiao" : 134, "jie" : 135, + "jin" : 136, "jing" : 137, "jiong" : 138, "jiu" : 139, "ju" : 140, + "juan" : 141, "jue" : 142, "jun" : 143, "ka" : 144, "kai" : 145, + "kan" : 146, "kang" : 147, "kao" : 148, "ke" : 149, + # "kei" : 150, + "ken" : 151, "keng" 
: 152, "kong" : 153, "kou" : 154, "ku" : 155, + "kua" : 156, "kuai" : 157, "kuan" : 158, "kuang" : 159, "kui" : 160, + "kun" : 161, "kuo" : 162, "la" : 163, "lai" : 164, "lan" : 165, + "lang" : 166, "lao" : 167, "le" : 168, "lei" : 169, "leng" : 170, + "li" : 171, "lia" : 172, "lian" : 173, "liang" : 174, "liao" : 175, + "lie" : 176, "lin" : 177, "ling" : 178, "liu" : 179, + "lo" : 180, + "long" : 181, "lou" : 182, "lu" : 183, "luan" : 184, + # "lue" : 185, + "lun" : 186, "luo" : 187, "lv" : 188, "lve" : 189, + "ma" : 190, + "mai" : 191, "man" : 192, "mang" : 193, "mao" : 194, "me" : 195, + "mei" : 196, "men" : 197, "meng" : 198, "mi" : 199, "mian" : 200, + "miao" : 201, "mie" : 202, "min" : 203, "ming" : 204, "miu" : 205, + "mo" : 206, "mou" : 207, "mu" : 208, "na" : 209, "nai" : 210, + "nan" : 211, "nang" : 212, "nao" : 213, "ne" : 214, "nei" : 215, + "nen" : 216, "neng" : 217, "ni" : 218, "nian" : 219, "niang" : 220, + "niao" : 221, "nie" : 222, "nin" : 223, "ning" : 224, "niu" : 225, + "ng" : 226, + "nong" : 227, "nou" : 228, "nu" : 229, "nuan" : 230, + # "nue" : 231, + "nuo" : 232, "nv" : 233, "nve" : 234, + "o" : 235, + "ou" : 236, "pa" : 237, "pai" : 238, "pan" : 239, "pang" : 240, + "pao" : 241, "pei" : 242, "pen" : 243, "peng" : 244, "pi" : 245, + "pian" : 246, "piao" : 247, "pie" : 248, "pin" : 249, "ping" : 250, + "po" : 251, "pou" : 252, "pu" : 253, "qi" : 254, "qia" : 255, + "qian" : 256, "qiang" : 257, "qiao" : 258, "qie" : 259, "qin" : 260, + "qing" : 261, "qiong" : 262, "qiu" : 263, "qu" : 264, "quan" : 265, + "que" : 266, "qun" : 267, "ran" : 268, "rang" : 269, "rao" : 270, + "re" : 271, "ren" : 272, "reng" : 273, "ri" : 274, "rong" : 275, + "rou" : 276, "ru" : 277, "ruan" : 278, "rui" : 279, "run" : 280, + "ruo" : 281, "sa" : 282, "sai" : 283, "san" : 284, "sang" : 285, + "sao" : 286, "se" : 287, "sen" : 288, "seng" : 289, "si" : 290, + "song" : 291, "sou" : 292, "su" : 293, "suan" : 294, "sui" : 295, + "sun" : 296, "suo" : 297, "sha" : 298, 
"shai" : 299, "shan" : 300, + "shang" : 301, "shao" : 302, "she" : 303, "shei" : 304, "shen" : 305, + "sheng" : 306, "shi" : 307, "shou" : 308, "shu" : 309, "shua" : 310, + "shuai" : 311, "shuan" : 312, "shuang" : 313, "shui" : 314, "shun" : 315, + "shuo" : 316, "ta" : 317, "tai" : 318, "tan" : 319, "tang" : 320, + "tao" : 321, "te" : 322, + # "tei" : 323, + "teng" : 324, "ti" : 325, + "tian" : 326, "tiao" : 327, "tie" : 328, "ting" : 329, "tong" : 330, + "tou" : 331, "tu" : 332, "tuan" : 333, "tui" : 334, "tun" : 335, + "tuo" : 336, "wa" : 337, "wai" : 338, "wan" : 339, "wang" : 340, + "wei" : 341, "wen" : 342, "weng" : 343, "wo" : 344, "wu" : 345, + "xi" : 346, "xia" : 347, "xian" : 348, "xiang" : 349, "xiao" : 350, + "xie" : 351, "xin" : 352, "xing" : 353, "xiong" : 354, "xiu" : 355, + "xu" : 356, "xuan" : 357, "xue" : 358, "xun" : 359, "ya" : 360, + "yan" : 361, "yang" : 362, "yao" : 363, "ye" : 364, "yi" : 365, + "yin" : 366, "ying" : 367, "yo" : 368, "yong" : 369, "you" : 370, + "yu" : 371, "yuan" : 372, "yue" : 373, "yun" : 374, "za" : 375, + "zai" : 376, "zan" : 377, "zang" : 378, "zao" : 379, "ze" : 380, + "zei" : 381, "zen" : 382, "zeng" : 383, "zi" : 384, "zong" : 385, + "zou" : 386, "zu" : 387, "zuan" : 388, "zui" : 389, "zun" : 390, + "zuo" : 391, "zha" : 392, "zhai" : 393, "zhan" : 394, "zhang" : 395, + "zhao" : 396, "zhe" : 397, "zhen" : 398, "zheng" : 399, "zhi" : 400, + "zhong" : 401, "zhou" : 402, "zhu" : 403, "zhua" : 404, "zhuai" : 405, + "zhuan" : 406, "zhuang" : 407, "zhui" : 408, "zhun" : 409, "zhuo" : 410, + # some weird pinyins + #~ "eng" : 411, "chua" : 412, "fe" : 413, "fiao" : 414, "liong" : 415 +} + +PINYIN_LIST = PINYIN_DICT.keys () + + +SHENGMU_DICT = { + "b" : 1, "p" : 2, "m" : 3, "f" : 4, "d" : 5, + "t" : 6, "n" : 7, "l" : 8, "g" : 9, "k" : 10, "h" : 11, + "j" : 12, "q" : 13, "x" : 14, "zh" : 15, "ch" : 16, "sh" : 17, + "r" : 18, "z" : 19, "c" : 20, "s" : 21, "y" : 22, "w" : 23 +} + +SHENGMU_LIST = SHENGMU_DICT.keys () + + 
+YUNMU_DICT = { + "a" : 1, "ai" : 2, "an" : 3, "ang" : 4, "ao" : 5, + "e" : 6, "ei" : 7, "en" : 8, "eng" : 9, "er" : 10, + "i" : 11, "ia" : 12, "ian" : 13, "iang" : 14, "iao" : 15, + "ie" : 16, "in" : 17, "ing" : 18, "iong" : 19, "iu" : 20, + "o" : 21, "ong" : 22, "ou" : 23, "u" : 24, "ua" : 25, + "uai" : 26, "uan" : 27, "uang" : 28, "ue" : 29, "ui" : 30, + "un" : 31, "uo" : 32, "v" : 33, "ve" : 34 +} + +YUNMU_LIST = YUNMU_DICT.keys () + + +MOHU_SHENGMU = { + "z" : ("z", "zh"), + "zh" : ("z", "zh"), + "c" : ("c", "ch"), + "ch" : ("c", "ch"), + "s" : ("s", "sh"), + "sh" : ("s", "sh"), + "l" : ("l", "n"), + "n" : ("l", "n") +} + +MOHU_YUNMU = { + "an" : ("an", "ang"), + "ang" : ("an", "ang"), + "en" : ("en", "eng"), + "eng" : ("en", "eng"), + "in" : ("in", "ing"), + "ing" : ("in", "ing") +} + +MSPY_SHUANGPIN_SHENGMU_DICT = { + "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "ch","j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "sh","v" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +MSPY_SHUANGPIN_YUNMU_DICT = { + "a" : ("a",), + "b" : ("ou",), + "c" : ("iao",), + "d" : ("uang", "iang"), + "e" : ("e",), + "f" : ("en",), + "g" : ("eng", "ng"), + "h" : ("ang",), + "i" : ("i",), + "j" : ("an",), + "k" : ("ao",), + "l" : ("ai",), + "m" : ("ian",), + "n" : ("in",), + "o" : ("uo", "o"), + "p" : ("un",), + "q" : ("iu",), + "r" : ("uan", "er"), + "s" : ("ong", "iong"), + "t" : ("ue",), + "u" : ("u",), + "v" : ("ui","ue"), + "w" : ("ia","ua"), + "x" : ("ie",), + "y" : ("uai", "v"), + "z" : ("ei",), + ";" : ("ing",) +} + +ZRM_SHUANGPIN_SHENGMU_DICT = { + "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "ch","j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "sh","v" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +ZRM_SHUANGPIN_YUNMU_DICT = { + 
"a" : ("a",), + "b" : ("ou",), + "c" : ("iao",), + "d" : ("uang", "iang"), + "e" : ("e",), + "f" : ("en",), + "g" : ("eng", "ng"), + "h" : ("ang",), + "i" : ("i",), + "j" : ("an",), + "k" : ("ao",), + "l" : ("ai",), + "m" : ("ian",), + "n" : ("in",), + "o" : ("uo", "o"), + "p" : ("un",), + "q" : ("iu",), + "r" : ("uan", "er"), + "s" : ("ong", "iong"), + "t" : ("ue",), + "u" : ("u",), + "v" : ("ui","v"), + "w" : ("ia","ua"), + "x" : ("ie",), + "y" : ("uai", "ing"), + "z" : ("ei",), +} + +ABC_SHUANGPIN_SHENGMU_DICT = { + "a" : "zh", "b" : "b", "c" : "c", "d" : "d", "e":"ch", "f" : "f", "g" : "g", + "h" : "h", "j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "v" : "sh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +ABC_SHUANGPIN_YUNMU_DICT = { + "a" : ("a",), + "b" : ("ou",), + "c" : ("in","uai"), + "d" : ("ia", "ua"), + "e" : ("e",), + "f" : ("en",), + "g" : ("eng", "ng"), + "h" : ("ang",), + "i" : ("i",), + "j" : ("an",), + "k" : ("ao",), + "l" : ("ai",), + "m" : ("ue","ui"), + "n" : ("un",), + "o" : ("uo", "o"), + "p" : ("uan",), + "q" : ("ei",), + "r" : ("er", "iu"), + "s" : ("ong", "iong"), + "t" : ("iang","uang"), + "u" : ("u",), + "v" : ("v","ue"), + "w" : ("ian",), + "x" : ("ie",), + "y" : ("ing",), + "z" : ("iao",), +} + +ZGPY_SHUANGPIN_SHENGMU_DICT = { + "a" : "ch", "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "sh","j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +ZGPY_SHUANGPIN_YUNMU_DICT = { + "a" : ("a", ), + "b" : ("iao", ), + "d" : ("ie", ), + "e" : ("e", ), + "f" : ("ian", ), + "g" : ("iang", "uang"), + "h" : ("ong", "iong"), + "i" : ("i", ), + "j" : ("er", "iu"), + "k" : ("ei", ), + "l" : ("uan", ), + "m" : ("un", ), + "n" : ("ue", "ui"), + "o" : ("uo", "o"), + "p" : ("ai", ), + "q" : ("ao", ), + "r" : 
("an", ), + "s" : ("ang", ), + "t" : ("eng", "ng"), + "u" : ("u", ), + "v" : ("v", ), + "w" : ("en", ), + "x" : ("ia", "ua"), + "y" : ("in", "uai"), + "z" : ("ou" ,), + ";" : ("ing", ) +} + +PYJJ_SHUANGPIN_SHENGMU_DICT = { + "a" : "'", "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "sh","j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "ch","v" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +PYJJ_SHUANGPIN_YUNMU_DICT = { + "a" : ("a",), + "b" : ("ia","ua"), + "c" : ("uan",), + "d" : ("ao", ), + "e" : ("e",), + "f" : ("an",), + "g" : ("ang",), + "h" : ("iang","uang"), + "i" : ("i",), + "j" : ("ian",), + "k" : ("iao",), + "l" : ("in",), + "m" : ("ie",), + "n" : ("iu",), + "o" : ("uo", "o"), + "p" : ("ou",), + "q" : ("er","ing"), + "r" : ("en", ), + "s" : ("ai", ), + "t" : ("eng", "ng"), + "u" : ("u",), + "v" : ("v","ui"), + "w" : ("ei",), + "x" : ("uai","ue"), + "y" : ("ong","iong"), + "z" : ("un",), +} + +XHE_SHUANGPIN_SHENGMU_DICT = { + "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "ch", "j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "sh", "v" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z", + "a" : "'", "e" : "'" +} + +XHE_SHUANGPIN_YUNMU_DICT = { + "a" : ("a",), + "b" : ("in",), + "c" : ("ao",), + "d" : ("ai",), + "e" : ("e",), + "f" : ("en",), + "g" : ("eng", "ng"), + "h" : ("ang",), + "i" : ("i",), + "j" : ("an",), + "k" : ("uai", "ing"), + "l" : ("iang", "uang"), + "m" : ("ian",), + "n" : ("iao",), + "o" : ("uo", "o"), + "p" : ("ie",), + "q" : ("iu",), + "r" : ("uan", "er"), + "s" : ("ong", "iong"), + "t" : ("ue",), + "u" : ("u",), + "v" : ("v", "ui"), + "w" : ("ei",), + "x" : ("ia", "ua"), + "y" : ("un",), + "z" : ("ou",), +} + +SHUANGPIN_SCHEMAS = { + N_("MSPY") : (MSPY_SHUANGPIN_SHENGMU_DICT, MSPY_SHUANGPIN_YUNMU_DICT), + 
N_("ZRM") : (ZRM_SHUANGPIN_SHENGMU_DICT, ZRM_SHUANGPIN_YUNMU_DICT), + N_("ABC") : (ABC_SHUANGPIN_SHENGMU_DICT, ABC_SHUANGPIN_YUNMU_DICT), + N_("ZGPY") : (ZGPY_SHUANGPIN_SHENGMU_DICT, ZGPY_SHUANGPIN_YUNMU_DICT), + N_("PYJJ") : (PYJJ_SHUANGPIN_SHENGMU_DICT, PYJJ_SHUANGPIN_YUNMU_DICT), + N_("XHE") : (XHE_SHUANGPIN_SHENGMU_DICT, XHE_SHUANGPIN_YUNMU_DICT), +} + diff --git a/scripts/pinyin_parser_table.h.in b/scripts/pinyin_parser_table.h.in new file mode 100644 index 0000000..2f98e0e --- /dev/null +++ b/scripts/pinyin_parser_table.h.in @@ -0,0 +1,34 @@ +#ifndef PINYIN_PARSER_TABLE_H +#define PINYIN_PARSER_TABLE_H + +namespace pinyin{ + +const pinyin_index_item_t pinyin_index[] = { +@PINYIN_INDEX@ +}; + +const chewing_index_item_t chewing_index[] = { +@BOPOMOFO_INDEX@ +}; + +const content_table_item_t content_table[] = { +@CONTENT_TABLE@ +}; + +const divided_table_item_t divided_table[] = { +@DIVIDED_TABLE@ +}; + +const resplit_table_item_t resplit_table[] = { +@RESPLIT_TABLE@ +}; + +const gint chewing_key_table[CHEWING_NUMBER_OF_INITIALS * + CHEWING_NUMBER_OF_MIDDLES * + CHEWING_NUMBER_OF_FINALS] = { +@TABLE_INDEX@ +}; + +}; + +#endif diff --git a/scripts/pinyintable.py b/scripts/pinyintable.py new file mode 100644 index 0000000..bddf2dc --- /dev/null +++ b/scripts/pinyintable.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
#
# Distributed under the GNU General Public License, version 2 or later.

import pinyin
import bopomofo
import chewing
import itertools
from correct import *


# All pinyins that have a bopomofo spelling, and all bare initials.
pinyin_list = sorted(bopomofo.PINYIN_BOPOMOFO_MAP.keys())
shengmu_list = sorted(pinyin.SHENGMU_LIST)


def check_pinyin_chewing_map():
    """Warn about pinyins that lack a bopomofo (chewing) mapping."""
    for pinyin_key in pinyin.PINYIN_DICT.keys():
        if pinyin_key not in pinyin_list:
            # bug fix: the key was previously passed as a second argument
            # to print() instead of being formatted into the message
            print("pinyin %s has no chewing mapping" % pinyin_key)


def get_chewing(pinyin_key):
    """Return the (initial, middle, final) chewing triple for pinyin_key."""
    initial, middle, final = \
        'CHEWING_ZERO_INITIAL', 'CHEWING_ZERO_MIDDLE', 'CHEWING_ZERO_FINAL'
    assert pinyin_key is not None
    assert pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP

    # 'w' and 'y' have no bopomofo counterpart; use pseudo initials
    if pinyin_key[0] == 'w':
        initial = 'PINYIN_W'
    if pinyin_key[0] == 'y':
        initial = 'PINYIN_Y'

    # get the bopomofo spelling
    bopomofo_str = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]

    # members of SPECIAL_INITIAL_SET carry an implicit "i" middle
    if pinyin_key in bopomofo.SPECIAL_INITIAL_SET:
        middle = "CHEWING_I"
    # normal process: classify every bopomofo character
    for char in bopomofo_str:
        if char in chewing.CHEWING_ASCII_INITIAL_MAP:
            initial = chewing.CHEWING_ASCII_INITIAL_MAP[char]
        if char in chewing.CHEWING_ASCII_MIDDLE_MAP:
            middle = chewing.CHEWING_ASCII_MIDDLE_MAP[char]
        if char in chewing.CHEWING_ASCII_FINAL_MAP:
            final = chewing.CHEWING_ASCII_FINAL_MAP[char]
        if char == "ㄜ":  # merge "ㄝ" and "ㄜ"
            final = "CHEWING_E"

    # canonicalize middle/final pairs that chewing spells differently
    post_process_rules = {
        # handle "ueng"/"ong"
        ("CHEWING_U", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ONG"),
        # handle "veng"/"iong"
        ("CHEWING_V", "CHEWING_ENG"): ("CHEWING_I", "PINYIN_ONG"),
        # handle "ien"/"in"
        ("CHEWING_I", "CHEWING_EN"): ("CHEWING_ZERO_MIDDLE", "PINYIN_IN"),
        # handle "ieng"/"ing"
        ("CHEWING_I", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ING"),
    }

    if (middle, final) in post_process_rules:
        (middle, final) = post_process_rules[(middle, final)]

    return initial, middle, final


def gen_pinyin_list():
    """Yield every pinyin table entry: plain pinyins, bare shengmu,
    auto-corrected spellings and the u/v correction variants."""
    for p in itertools.chain(gen_pinyins(),
                             gen_shengmu(),
                             gen_corrects(),
                             gen_u_to_v(),
                             ):
        yield p


def gen_pinyins():
    """Yield one entry per pinyin known to the bopomofo map."""
    for pinyin_key in pinyin_list:
        flags = []
        # pinyin_list is built from PINYIN_BOPOMOFO_MAP, so this always
        # holds; kept for symmetry with the other flag checks
        if pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP.keys():
            flags.append("IS_CHEWING")
        if pinyin_key in pinyin.PINYIN_LIST or \
                pinyin_key in pinyin.SHENGMU_LIST:
            flags.append("IS_PINYIN")
        if pinyin_key in shengmu_list:
            flags.append("PINYIN_INCOMPLETE")
        chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
        if chewing_key in chewing.CHEWING_ASCII_INITIAL_MAP and \
                pinyin_key not in bopomofo.SPECIAL_INITIAL_SET:
            flags.append("CHEWING_INCOMPLETE")
        yield pinyin_key, pinyin_key, chewing_key, \
            flags, get_chewing(pinyin_key)


def get_shengmu_chewing(shengmu):
    """Map a bare shengmu to its (initial, middle, final) chewing triple."""
    assert shengmu in shengmu_list, "Expected shengmu here."
    chewing_key = 'CHEWING_{0}'.format(shengmu.upper())
    if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP:
        initial = chewing_key
    else:
        # no chewing equivalent; fall back to a pseudo pinyin initial
        initial = 'PINYIN_{0}'.format(shengmu.upper())
    return initial, "CHEWING_ZERO_MIDDLE", "CHEWING_ZERO_FINAL"


def gen_shengmu():
    """Yield entries for bare shengmu that are not full pinyins themselves."""
    for shengmu in shengmu_list:
        if shengmu in pinyin_list:
            continue
        flags = ["IS_PINYIN", "PINYIN_INCOMPLETE"]
        chewing_key = get_shengmu_chewing(shengmu)
        chewing_initial = chewing_key[0]
        if chewing_initial in chewing.ASCII_CHEWING_INITIAL_MAP:
            chewing_initial = chewing.ASCII_CHEWING_INITIAL_MAP[chewing_initial]
        yield shengmu, shengmu, chewing_initial, \
            flags, chewing_key


def gen_corrects():
    """Yield entries for commonly mis-typed finals (auto corrections)."""
    for correct, wrong in auto_correct:
        flags = ['IS_PINYIN', 'PINYIN_CORRECT_{0}_{1}'.format(wrong.upper(),
                                                              correct.upper())]
        for pinyin_key in pinyin_list:
            # fixes partial pinyin instead of the whole pinyin
            if pinyin_key.endswith(correct) and pinyin_key != correct:
                chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
                # bug fix: rewrite only the suffix; str.replace() would
                # also touch an earlier occurrence of the pattern
                new_pinyin_key = pinyin_key[:-len(correct)] + wrong
                yield pinyin_key, new_pinyin_key, chewing_key, \
                    flags, get_chewing(pinyin_key)


def gen_u_to_v():
    """Yield the "u" typed for "v" correction entries."""
    for correct, wrong, flags in auto_correct_ext:
        # override the supplied flags: these are always plain pinyin
        # corrections of the PINYIN_CORRECT_V_U kind
        flags = ['IS_PINYIN', 'PINYIN_CORRECT_V_U']
        pinyin_key = correct
        chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
        yield correct, wrong, chewing_key, flags, get_chewing(pinyin_key)


### main function ###
if __name__ == "__main__":
    # pre-check here
    check_pinyin_chewing_map()

    # dump
    for p in gen_pinyin_list():
        print(p)
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
#
# Distributed under the GNU General Public License, version 2 or later.


import os
import sys
import math
import pinyin

pinyin_list = sorted(pinyin.PINYIN_LIST)
shengmu_list = sorted(pinyin.SHENGMU_LIST)
yunmu_list = sorted(pinyin.YUNMU_LIST)

# "pinyin" or (first, second) tuple -> frequency, filled by load_phrase()
phrase_dict = {}


def load_phrase(filename):
    """Load "pinyin freq" lines into phrase_dict.

    A key of the form "first'second" is stored under the tuple
    (first, second); zero-frequency entries are skipped.
    """
    # with-statement guarantees the file is closed even on parse errors
    with open(filename, "r") as phrasefile:
        for line in phrasefile:
            line = line.rstrip(os.linesep)
            (pinyin_str, freq) = line.split(None, 1)
            freq = int(freq)
            if 0 == freq:
                continue

            # no duplicate here
            if "'" in pinyin_str:
                (first_key, second_key) = pinyin_str.split("'")
                phrase_dict[(first_key, second_key)] = freq
            else:
                phrase_dict[pinyin_str] = freq


def gen_all_divided():
    """Yield (pinyin, first, second) for every pinyin that can be split
    into two valid pinyins."""
    for pinyin_key in pinyin_list:
        for first_key in pinyin_list:
            if len(pinyin_key) <= len(first_key):
                continue
            if not pinyin_key.startswith(first_key):
                continue
            second_key = pinyin_key[len(first_key):]
            if second_key in pinyin_list:
                yield pinyin_key, first_key, second_key


def filter_divided():
    """Keep only divisions that occur in the loaded phrase data."""
    for (pinyin_key, first_key, second_key) in gen_all_divided():
        if not (first_key, second_key) in phrase_dict:
            continue
        orig_freq = 0
        if pinyin_key in phrase_dict:
            orig_freq = phrase_dict[pinyin_key]
        new_freq = phrase_dict[(first_key, second_key)]
        yield pinyin_key, orig_freq, first_key, second_key, new_freq


def gen_all_resplit():
    """Yield candidate re-splits: move a trailing "n"/"g"/"r" from one
    pinyin onto a following yunmu (e.g. "gan" + "a" vs "ga" + "na")."""
    for pinyin_key in pinyin_list:
        if pinyin_key[-1] in ["n", "g", "r"]:
            # check first new pinyin key (loop invariant, hoisted)
            if pinyin_key[:-1] not in pinyin_list:
                continue
            for yun in yunmu_list:
                if yun not in pinyin_list:
                    continue
                # check second new pinyin key
                new_pinyin_key = pinyin_key[-1] + yun
                if new_pinyin_key in pinyin_list:
                    yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key
        # disabled "er" handling, kept for reference:
        # elif pinyin_key[-1] in ["e"]:
        #     if pinyin_key[:-1] in pinyin_list:
        #         yield pinyin_key, "r", pinyin_key[:-1], "er"


def filter_resplit():
    """Keep only re-splits backed by the phrase data, in libpinyin order."""
    for (orig_first_key, orig_second_key, new_first_key, new_second_key) \
            in gen_all_resplit():
        # do the reverse here, as the libpinyin pinyin parser differs
        # from ibus-pinyin's parser
        (orig_first_key, orig_second_key, new_first_key, new_second_key) = \
            (new_first_key, new_second_key, orig_first_key, orig_second_key)
        if (new_first_key, new_second_key) not in phrase_dict:
            continue
        orig_freq = 0
        new_freq = phrase_dict[(new_first_key, new_second_key)]
        if (orig_first_key, orig_second_key) in phrase_dict:
            orig_freq = phrase_dict[(orig_first_key, orig_second_key)]
        yield orig_first_key, orig_second_key, orig_freq, \
            new_first_key, new_second_key, new_freq


# init code: frequency data both filters rely on
load_phrase("pinyins.txt")
load_phrase("specials.txt")

if __name__ == "__main__":
    for p in filter_divided():
        print(p)
    for p in filter_resplit():
        print(p)
# Banner prepended to every generated file.
header = '''/* This file is generated by python scripts. Don't edit this file directly.
 */
'''

def expand_file(filename, get_table_content):
    """Expand a template file to stdout.

    Prints `header`, then every line of `filename`; a line of the form
    "@NAME@" is replaced by get_table_content("NAME").  Lines shorter
    than three characters are passed through unchanged (they cannot be
    a placeholder).

    :param filename: path of the template file to expand
    :param get_table_content: callable mapping a table name to its text
    """
    # with-statement: close the template file even if the callback raises
    with open(filename, "r") as infile:
        print(header)
        for line in infile:
            line = line.rstrip(os.linesep)
            if len(line) < 3:
                print(line)
                continue
            if line[0] == '@' and line[-1] == '@':
                tablename = line[1:-1]
                print(get_table_content(tablename))
            else:
                print(line)
+## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +AUTOMAKE_OPTIONS = gnu +SUBDIRS = include storage lookup + +EXTRA_DIST = libpinyin.ver + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I $(ac_aux_dir) + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + @GLIB2_CFLAGS@ + +libpinyinincludedir = $(includedir)/libpinyin-@VERSION@ + +libpinyininclude_HEADERS= pinyin.h + +noinst_HEADERS = pinyin_internal.h + +lib_LTLIBRARIES = libpinyin.la + +noinst_LTLIBRARIES = libpinyin_internal.la + +libpinyin_la_SOURCES = pinyin.cpp + +libpinyin_la_LIBADD = storage/libstorage.la lookup/liblookup.la @GLIB2_LIBS@ + +libpinyin_la_LDFLAGS = -Wl,--version-script=$(srcdir)/libpinyin.ver \ + -version-info @LT_VERSION_INFO@ + +libpinyin_internal_la_SOURCES = pinyin_internal.cpp + +libpinyin_internal_la_LIBADD = storage/libstorage.la lookup/liblookup.la + + +## Note: +## As libpinyin internal interface will change, only provides static library +## to catch errors when compiling instead of running. 
diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt new file mode 100644 index 0000000..60d7d4c --- /dev/null +++ b/src/include/CMakeLists.txt @@ -0,0 +1,11 @@ +set( + LIBPINYIN_INCLUDE_HEADERS + novel_types.h +) + +install( + FILES + ${LIBPINYIN_INCLUDE_HEADERS} + DESTINATION + ${DIR_INCLUDE_LIBPINYIN} +) diff --git a/src/include/Makefile.am b/src/include/Makefile.am new file mode 100644 index 0000000..a779d97 --- /dev/null +++ b/src/include/Makefile.am @@ -0,0 +1,25 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +MAINTAINERCLEANFILES = Makefile.in + +libpinyinincludedir = $(includedir)/libpinyin-@VERSION@ + +libpinyininclude_HEADERS= novel_types.h + +noinst_HEADERS = memory_chunk.h \ + stl_lite.h diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h new file mode 100644 index 0000000..7b315af --- /dev/null +++ b/src/include/memory_chunk.h @@ -0,0 +1,413 @@ +/* + * libpinyin + * Library to deal with pinyin. 
/*
 * libpinyin
 * Library to deal with pinyin.
 *
 * Copyright (C) 2006-2007 Peng Wu
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#ifndef MEMORY_CHUNK_H
#define MEMORY_CHUNK_H

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif
#include <algorithm>

namespace pinyin{

/* Ownership of the underlying buffer, encoded in m_free_func:
 *   free   - buffer was malloc-ed/realloc-ed and is owned here
 *   munmap - buffer was mmap-ed and is owned here
 *   NULL   - buffer belongs to someone else (e.g. a sub chunk view)
 * Any other function pointer is not supported and trips an assertion.
 */

/**
 * MemoryChunk:
 *
 * A resizable byte buffer that can either own its storage (malloc or
 * mmap) or act as a non-owning view into storage owned elsewhere.
 *
 * NOTE(review): copying a MemoryChunk uses the implicit copy members
 * and would double-free an owned buffer; do not copy owning chunks.
 */
class MemoryChunk{
    typedef void (* free_func_t)(...);

private:
    char * m_data_begin;     /* first byte of the content */
    char * m_data_end;       /* one past the last content byte */
    char * m_allocated;      /* one past the last allocated byte */
    free_func_t m_free_func; /* see ownership table above */

private:
    /* Release the buffer according to its ownership. */
    void freemem(){
        if ((free_func_t)free == m_free_func)
            free(m_data_begin);
#ifdef HAVE_MMAP
        else if ((free_func_t)munmap == m_free_func)
            munmap(m_data_begin, capacity());
#endif
        else
            /* arbitrary deleters are not supported; original used the
               glib FALSE macro here without including glib */
            assert(false);
    }

    /* Return to the pristine empty state, releasing owned memory. */
    void reset(){
        if (m_free_func)
            freemem();

        m_data_begin = NULL;
        m_data_end = NULL;
        m_allocated = NULL;
        m_free_func = NULL;
    }

    /* Make sure the content area can hold new_size bytes. */
    void ensure_has_space(size_t new_size){
        /* ptrdiff_t instead of int: a plain int would overflow for
           chunks larger than 2GB */
        ptrdiff_t delta_size = (ptrdiff_t) new_size - (ptrdiff_t) size();
        if ( delta_size <= 0 ) return;
        ensure_has_more_space((size_t) delta_size);
    }

    /* Enlarge so that at least extra_size more bytes can be appended. */
    void ensure_has_more_space(size_t extra_size){
        if ( 0 == extra_size ) return;
        size_t newsize;
        size_t cursize = size();
        if ( m_free_func != (free_func_t)free ) {
            /* foreign or mmap-ed memory: copy into a fresh malloc-ed
               buffer (copy on resize) */
            newsize = cursize + extra_size;
            char * tmp = (char *) malloc(newsize);
            assert(tmp);
            memset(tmp, 0, newsize);
            if (cursize) /* guard: memmove from NULL is undefined */
                memmove(tmp, m_data_begin, cursize);
            /* free the origin memory */
            if (m_free_func)
                freemem();
            /* change variables */
            m_data_begin = tmp;
            m_data_end = m_data_begin + cursize;
            m_allocated = m_data_begin + newsize;
            m_free_func = (free_func_t)free;
            return;
        }
        /* the memory area is managed by this memory chunk:
           grow in place, doubling to amortize reallocations */
        if ( extra_size <= (size_t) (m_allocated - m_data_end))
            return;
        newsize = std::max(capacity()<<1, cursize + extra_size);
        m_data_begin = (char *) realloc(m_data_begin, newsize);
        assert(m_data_begin);
        memset(m_data_begin + cursize, 0, newsize - cursize);
        m_data_end = m_data_begin + cursize;
        m_allocated = m_data_begin + newsize;
        return;
    }

public:
    /**
     * MemoryChunk::MemoryChunk:
     *
     * The constructor of the MemoryChunk; starts empty and non-owning.
     */
    MemoryChunk(){
        m_data_begin = NULL;
        m_data_end = NULL;
        m_allocated = NULL;
        m_free_func = NULL;
    }

    /**
     * MemoryChunk::~MemoryChunk:
     *
     * The destructor; releases owned memory.
     */
    ~MemoryChunk(){
        reset();
    }

    /**
     * MemoryChunk::begin:
     *
     * Read access method, to get the begin of the MemoryChunk.
     */
    void* begin() const{
        return m_data_begin;
    }

    /**
     * MemoryChunk::end:
     *
     * Read access method, to get the end of the MemoryChunk.
     */
    void* end() const{
        return m_data_end;
    }

    /**
     * MemoryChunk::size:
     *
     * Get the size of the content in the MemoryChunk.
     */
    size_t size() const{
        return m_data_end - m_data_begin;
    }

    /**
     * MemoryChunk::set_size:
     *
     * Set the size of the content, growing the allocation if needed.
     */
    void set_size(size_t newsize){
        ensure_has_space(newsize);
        m_data_end = m_data_begin + newsize;
    }

    /**
     * MemoryChunk::capacity:
     *
     * Get the capacity of the MemoryChunk.
     */
    size_t capacity() const{
        return m_allocated - m_data_begin;
    }

    /**
     * MemoryChunk::set_chunk:
     * @begin: the begin of the data
     * @length: the length of the data
     * @free_func: the function to free the data (see ownership table)
     *
     * Transfer management of a memory chunk allocated by other parts of
     * the system to this memory chunk.
     */
    void set_chunk(void* begin, size_t length, free_func_t free_func){
        if (m_free_func)
            freemem();

        m_data_begin = (char *) begin;
        m_data_end = m_data_begin + length;
        m_allocated = m_data_begin + length;
        m_free_func = free_func;
    }

    /**
     * MemoryChunk::get_sub_chunk:
     * @offset: the offset in this MemoryChunk.
     * @length: the data length to be retrieved.
     * @returns: a newly allocated non-owning view; caller must delete it
     *           and must not outlive this chunk's buffer.
     */
    MemoryChunk * get_sub_chunk(size_t offset, size_t length){
        MemoryChunk * retval = new MemoryChunk();
        char * begin_pos = m_data_begin + offset;
        retval->set_chunk(begin_pos, length, NULL);
        return retval;
    }

    /**
     * MemoryChunk::set_content:
     * @offset: the offset in this MemoryChunk.
     * @data: the begin of the data to be copied.
     * @len: the length of the data to be copied.
     * @returns: whether the data is copied successfully.
     *
     * Data are written directly to the memory area, growing the content
     * size if the write extends past the current end.
     */
    bool set_content(size_t offset, const void * data, size_t len){
        size_t cursize = std::max(size(), offset + len);
        ensure_has_space(offset + len);
        memmove(m_data_begin + offset, data, len);
        m_data_end = m_data_begin + cursize;
        return true;
    }

    /**
     * MemoryChunk::append_content:
     * @data: the begin of the data to be copied.
     * @len: the length of the data to be copied.
     * @returns: whether the data is appended successfully.
     */
    bool append_content(const void * data, size_t len){
        return set_content(size(), data, len);
    }

    /**
     * MemoryChunk::insert_content:
     * @offset: the offset in this MemoryChunk, starting from zero.
     * @data: the begin of the data to be copied.
     * @length: the length of the data to be copied.
     * @returns: whether the data is inserted successfully.
     *
     * The original content from offset onwards is moved towards the rear.
     */
    bool insert_content(size_t offset, const void * data, size_t length){
        ensure_has_more_space(length);
        size_t move_size = size() - offset;
        memmove(m_data_begin + offset + length, m_data_begin + offset, move_size);
        memmove(m_data_begin + offset, data, length);
        m_data_end += length;
        return true;
    }

    /**
     * MemoryChunk::remove_content:
     * @offset: the offset in this MemoryChunk.
     * @length: the length of the removed content.
     * @returns: whether the content is removed successfully.
     *
     * The following content is moved towards the front.
     */
    bool remove_content(size_t offset, size_t length){
        size_t move_size = size() - offset - length;
        memmove(m_data_begin + offset, m_data_begin + offset + length, move_size);
        m_data_end -= length;
        return true;
    }

    /**
     * MemoryChunk::get_content:
     * @offset: the offset in this MemoryChunk.
     * @buffer: the buffer to retrieve the content.
     * @length: the length of content to be retrieved.
     * @returns: false when the requested range exceeds the content.
     */
    bool get_content(size_t offset, void * buffer, size_t length) const{
        if ( size() < offset + length )
            return false;
        memcpy(buffer, m_data_begin + offset, length);
        return true;
    }

    /**
     * MemoryChunk::compact_memory:
     *
     * Compact memory, shrinking the allocation to the content size.
     * Only meaningful for malloc-owned buffers.
     */
    void compact_memory(){
        if ( m_free_func != (free_func_t)free )
            return;
        size_t newsize = size();
        if ( 0 == newsize ) /* realloc(p, 0) may return NULL */
            return;
        m_data_begin = (char *) realloc(m_data_begin, newsize);
        m_allocated = m_data_begin + newsize;
    }

    /**
     * MemoryChunk::load:
     * @filename: load the MemoryChunk from the filename.
     * @returns: whether the load is successful.
     *
     * Any previous content is discarded first.  An empty file yields an
     * empty chunk (mmap of length zero would fail).
     */
    bool load(const char * filename){
        /* free old data */
        reset();

        int fd = open(filename, O_RDONLY);
        if (-1 == fd)
            return false;

        off_t file_size = lseek(fd, 0, SEEK_END);
        if ((off_t)-1 == file_size) {
            close(fd);
            return false;
        }
        lseek(fd, 0, SEEK_SET);

        size_t data_len = (size_t) file_size;
        if (0 == data_len) {
            close(fd);
            return true;
        }

#ifdef HAVE_MMAP
        /* MAP_PRIVATE copy-on-write mapping: in-place edits stay local */
        void* data = mmap(NULL, data_len, PROT_READ|PROT_WRITE, MAP_PRIVATE,
                          fd, 0);

        if (MAP_FAILED == data) {
            close(fd);
            return false;
        }

        set_chunk(data, data_len, (free_func_t)munmap);
#else
        void* data = malloc(data_len);
        if ( !data ){
            close(fd);
            return false;
        }

        ssize_t read_len = read(fd, data, data_len);
        if (-1 == read_len) {
            free(data);
            close(fd);
            return false;
        }
        set_chunk(data, (size_t) read_len, (free_func_t)free);
#endif

        close(fd);
        return true;
    }

    /**
     * MemoryChunk::save:
     * @filename: save this MemoryChunk to the filename.
     * @returns: whether the save is successful.
     */
    bool save(const char * filename){
        int fd = open(filename, O_CREAT|O_WRONLY|O_TRUNC, 0644);
        if ( -1 == fd )
            return false;

        /* ssize_t: write() returns -1 on error, which a size_t would
           silently wrap to a huge positive value */
        ssize_t data_len = write(fd, begin(), size());
        if ( -1 == data_len || (size_t) data_len != size()){
            close(fd);
            return false;
        }

        fsync(fd);
        close(fd);
        return true;
    }
};

};

#endif
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +/* + * This header file contains novel types designed for pinyin processing. + */ + + +#ifndef NOVEL_TYPES_H +#define NOVEL_TYPES_H + +#include <glib.h> + +G_BEGIN_DECLS + +typedef guint32 phrase_token_t; +typedef gunichar ucs4_t; + +/* + * Phrase Index Library Definition + * Reserve 4-bits for future usage. + */ + +#define PHRASE_MASK 0x00FFFFFF +#define PHRASE_INDEX_LIBRARY_MASK 0x0F000000 +#define PHRASE_INDEX_LIBRARY_COUNT (1<<4) +#define PHRASE_INDEX_LIBRARY_INDEX(token) ((token&PHRASE_INDEX_LIBRARY_MASK)>>24) +#define PHRASE_INDEX_MAKE_TOKEN(phrase_index, token) \ + ( ( (phrase_index<<24) & PHRASE_INDEX_LIBRARY_MASK)|(token & PHRASE_MASK)) + + +/* + * PhraseIndexRanges definitions + */ + +struct PhraseIndexRange{ + phrase_token_t m_range_begin; + phrase_token_t m_range_end; /* pass the last item like stl */ +}; + +/* Array of PhraseIndexRange */ +typedef GArray * PhraseIndexRanges[PHRASE_INDEX_LIBRARY_COUNT]; +/* Array of Token */ +typedef GArray * PhraseTokens[PHRASE_INDEX_LIBRARY_COUNT]; + + +/* + * PinYin Table Definition + */ + + +/* For both PinYin Table and Phrase Table */ +enum SearchResult{ + SEARCH_NONE = 0x00, /* found nothing */ + SEARCH_OK = 0x01 , /* found items */ + SEARCH_CONTINUED = 0x02 /* has longer word in the storage to search */ +}; + +/* For Phrase Index */ +enum ErrorResult{ + ERROR_OK = 0, /* operate ok */ + ERROR_INSERT_ITEM_EXISTS, /* item already exists */ + ERROR_REMOVE_ITEM_DONOT_EXISTS, /* item don't exists */ + ERROR_PHRASE_TOO_LONG, /* the phrase is too long */ + ERROR_NO_SUB_PHRASE_INDEX, /* sub phrase index is not loaded */ + ERROR_NO_ITEM, /* item has a null slot */ + ERROR_OUT_OF_RANGE, /* beyond the end of the sub phrase index */ + ERROR_FILE_CORRUPTION, /* file is corrupted */ + 
ERROR_INTEGER_OVERFLOW, /* integer is overflowed */ + ERROR_ALREADY_EXISTS, /* the sub phrase already exists. */ + ERROR_NO_USER_TABLE /* the user table is not loaded. */ +}; + +/* For N-gram */ +enum ATTACH_FLAG{ + ATTACH_READONLY = 1, + ATTACH_READWRITE = 0x1 << 1, + ATTACH_CREATE = 0x1 << 2, +}; + +/* + * n-gram Definition + * no B parameter(there are duplicated items in uni-gram and bi-gram) + * used in system n-gram and user n-gram. + * using delta technique. + */ + +struct BigramPhraseItem{ + phrase_token_t m_token; + gfloat m_freq; /* P(W2|W1) */ +}; + +struct BigramPhraseItemWithCount{ + phrase_token_t m_token; + guint32 m_count; + gfloat m_freq; /* P(W2|W1) */ +}; + +typedef GArray * BigramPhraseArray; /* Array of BigramPhraseItem */ +typedef GArray * BigramPhraseWithCountArray; /* Array of BigramPhraseItemWithCount */ + +#define MAX_PHRASE_LENGTH 16 + +const phrase_token_t null_token = 0; +const phrase_token_t sentence_start = 1; +const phrase_token_t token_min = 0; +const phrase_token_t token_max = UINT_MAX; + +const char c_separate = '#'; +typedef guint32 table_offset_t; + +typedef double parameter_t; + +/* Array of ChewingKey/ChewingKeyRest */ +typedef GArray * ChewingKeyVector; +typedef GArray * ChewingKeyRestVector; + +/* Array of phrase_token_t */ +typedef GArray * TokenVector; +typedef TokenVector MatchResults; + +/* Array of lookup_constraint_t */ +typedef GArray * CandidateConstraints; + +typedef guint32 pinyin_option_t; + +typedef enum { + RESERVED = 0, + GB_DICTIONARY = 1, + GBK_DICTIONARY = 2, + MERGED_DICTIONARY = 3, + USER_DICTIONARY = 15 +} PHRASE_INDEX_LIBRARIES; + +G_END_DECLS + +#endif diff --git a/src/include/stl_lite.h b/src/include/stl_lite.h new file mode 100644 index 0000000..5ad977d --- /dev/null +++ b/src/include/stl_lite.h @@ -0,0 +1,45 @@ +#ifndef STL_LITE_H +#define STL_LITE_H + +#include <ctype.h> +#include <stdlib.h> +#include <string.h> +#include <algorithm> + +namespace std_lite{ + + /** + * To restrict the usage of STL 
functions in libpinyin, + * all needed functions should be imported here. + */ + + + using std::min; + + + using std::max; + + + using std::pair; + + + using std::make_pair; + + + using std::lower_bound; + + + using std::upper_bound; + + + using std::equal_range; + + + using std::make_heap; + + + using std::pop_heap; + + +} +#endif diff --git a/src/libpinyin.ver b/src/libpinyin.ver new file mode 100644 index 0000000..1b6cc4b --- /dev/null +++ b/src/libpinyin.ver @@ -0,0 +1,58 @@ +LIBPINYIN { + global: + pinyin_init; + pinyin_save; + pinyin_set_double_pinyin_scheme; + pinyin_set_chewing_scheme; + pinyin_load_phrase_library; + pinyin_unload_phrase_library; + pinyin_begin_add_phrases; + pinyin_iterator_add_phrase; + pinyin_end_add_phrases; + pinyin_fini; + pinyin_mask_out; + pinyin_set_options; + pinyin_alloc_instance; + pinyin_free_instance; + pinyin_guess_sentence; + pinyin_guess_sentence_with_prefix; + pinyin_phrase_segment; + pinyin_get_sentence; + pinyin_parse_full_pinyin; + pinyin_parse_more_full_pinyins; + pinyin_parse_double_pinyin; + pinyin_parse_more_double_pinyins; + pinyin_parse_chewing; + pinyin_parse_more_chewings; + pinyin_in_chewing_keyboard; + pinyin_guess_candidates; + pinyin_guess_full_pinyin_candidates; + pinyin_choose_candidate; + pinyin_clear_constraint; + pinyin_lookup_tokens; + pinyin_train; + pinyin_reset; + pinyin_get_chewing_string; + pinyin_get_pinyin_string; + pinyin_get_pinyin_strings; + pinyin_token_get_phrase; + pinyin_token_get_n_pronunciation; + pinyin_token_get_nth_pronunciation; + pinyin_token_get_unigram_frequency; + pinyin_token_add_unigram_frequency; + pinyin_get_n_candidate; + pinyin_get_candidate; + pinyin_get_candidate_type; + pinyin_get_candidate_string; + pinyin_get_n_pinyin; + pinyin_get_pinyin_key; + pinyin_get_pinyin_key_rest; + pinyin_get_pinyin_key_rest_positions; + pinyin_get_pinyin_key_rest_length; + pinyin_get_raw_full_pinyin; + pinyin_get_n_phrase; + pinyin_get_phrase_token; + + local: + *; +}; diff --git 
set(
    CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC"
)

set(
    LIBLOOKUP_SOURCES
    pinyin_lookup2.cpp
    phrase_lookup.cpp
    lookup.cpp
)

# NOTE(review): LIBLOOKUP_HEADERS was referenced by the install() rule below
# but never defined, which silently turned that rule into a no-op.  Define it
# with this directory's headers (the same list as noinst_HEADERS in
# Makefile.am).  Confirm against the autotools build, which deliberately marks
# these headers as not-installed.
set(
    LIBLOOKUP_HEADERS
    lookup.h
    pinyin_lookup2.h
    phrase_lookup.h
)

add_library(
    lookup
    STATIC
    ${LIBLOOKUP_SOURCES}
)

install(
    FILES
    ${LIBLOOKUP_HEADERS}
    DESTINATION
    ${DIR_INCLUDE_LIBPINYIN}
)

## Makefile.am -- Process this file with automake to produce Makefile.in
## Copyright (C) 2007 Peng Wu
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2, or (at your option)
## any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

MAINTAINERCLEANFILES = Makefile.in

INCLUDES = -I$(top_srcdir)/src/include \
           -I$(top_srcdir)/src/storage \
           @GLIB2_CFLAGS@

noinst_HEADERS = lookup.h \
                 pinyin_lookup2.h \
                 phrase_lookup.h

noinst_LTLIBRARIES = liblookup.la

liblookup_la_CXXFLAGS = "-fPIC"

liblookup_la_LDFLAGS = -static

liblookup_la_SOURCES = pinyin_lookup2.cpp \
                       phrase_lookup.cpp \
                       lookup.cpp

/*
 * libpinyin
 * Library to deal with pinyin.
 *
 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */ + + +#include "lookup.h" +#include "phrase_index.h" + +namespace pinyin{ + +bool convert_to_utf8(FacadePhraseIndex * phrase_index, + MatchResults match_results, + /* in */ const char * delimiter, + /* in */ bool show_tokens, + /* out */ char * & result_string){ + //init variables + if ( NULL == delimiter ) + delimiter = ""; + result_string = NULL; + + PhraseItem item; + + for ( size_t i = 0; i < match_results->len; ++i ){ + phrase_token_t token = g_array_index + (match_results, phrase_token_t, i); + if ( null_token == token ) + continue; + + phrase_index->get_phrase_item(token, item); + ucs4_t buffer[MAX_PHRASE_LENGTH]; + item.get_phrase_string(buffer); + + guint8 length = item.get_phrase_length(); + gchar * phrase = NULL; + char * tmp = NULL; + + if (show_tokens) { + tmp = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + phrase = g_strdup_printf("%d %s", token, tmp); + g_free(tmp); + } else { + phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + } + + tmp = result_string; + if ( NULL == result_string ) + result_string = g_strdup(phrase); + else + result_string = g_strconcat(result_string, delimiter, phrase, NULL); + g_free(phrase); + g_free(tmp); + } + return true; +} + +}; diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h new file mode 100644 index 0000000..8dc1a89 --- /dev/null +++ b/src/lookup/lookup.h @@ -0,0 +1,79 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef LOOKUP_H +#define LOOKUP_H + + +/** @file lookup.h + * @brief the definitions of common lookup related classes and structs. + */ + +#include "novel_types.h" +#include <limits.h> + +namespace pinyin{ + +typedef phrase_token_t lookup_key_t; + +struct lookup_value_t{ + /* previous and current tokens of the node */ + phrase_token_t m_handles[2]; + /* maximum possibility of current node */ + gfloat m_poss; + /* trace back information for final step */ + gint32 m_last_step; + + lookup_value_t(gfloat poss = FLT_MAX){ + m_handles[0] = null_token; m_handles[1] = null_token; + m_poss = poss; + m_last_step = -1; + } +}; + + +class FacadePhraseIndex; + + +/* Note: + * LookupStepIndex: + * the main purpose of lookup step index is served for an index + * for lookup step content, which can quickly merge the same node + * with different possibilities, + * then only keep the highest value of the node. + * LookupStepContent: + * the place to store the lookup values of current step, + * and indexed by lookup step index. + * See also comments on lookup_value_t. + */ + +typedef GHashTable * LookupStepIndex; +/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */ +typedef GArray * LookupStepContent; /* array of lookup_value_t */ + +bool convert_to_utf8(FacadePhraseIndex * phrase_index, + MatchResults match_results, + /* in */ const char * delimiter, + /* in */ bool show_tokens, + /* out */ char * & result_string); + +}; +#endif diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp new file mode 100644 index 0000000..f7da0b7 --- /dev/null +++ b/src/lookup/phrase_lookup.cpp @@ -0,0 +1,434 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <math.h> +#include "stl_lite.h" +#include "novel_types.h" +#include "phrase_index.h" +#include "facade_phrase_table2.h" +#include "ngram.h" +#include "phrase_lookup.h" + +using namespace pinyin; + + +/* +const gfloat PhraseLookup::bigram_lambda = lambda; +const gfloat PhraseLookup::unigram_lambda = 1 - lambda; +*/ + +static bool populate_prefixes(GPtrArray * steps_index, + GPtrArray * steps_content) { + + lookup_key_t initial_key = sentence_start; + lookup_value_t initial_value(log(1)); + initial_value.m_handles[1] = sentence_start; + + LookupStepContent initial_step_content = (LookupStepContent) + g_ptr_array_index(steps_content, 0); + g_array_append_val(initial_step_content, initial_value); + + LookupStepIndex initial_step_index = (LookupStepIndex) + g_ptr_array_index(steps_index, 0); + g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), + GUINT_TO_POINTER(initial_step_content->len - 1)); + + return true; +} + +static bool init_steps(GPtrArray * steps_index, + GPtrArray * steps_content, + int nstep) { + + /* add null start step */ + g_ptr_array_set_size(steps_index, nstep); + g_ptr_array_set_size(steps_content, nstep); + + for ( int i = 0; i < nstep; ++i ){ + /* initialize 
steps_index */ + g_ptr_array_index(steps_index, i) = g_hash_table_new + (g_direct_hash, g_direct_equal); + /* initialize steps_content */ + g_ptr_array_index(steps_content, i) = g_array_new + (FALSE, FALSE, sizeof(lookup_value_t)); + } + + return true; +} + +static void clear_steps(GPtrArray * steps_index, + GPtrArray * steps_content){ + /* clear steps_index */ + for ( size_t i = 0; i < steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(steps_index, i) = NULL; + } + + /* free steps_content */ + for ( size_t i = 0; i < steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(steps_content, i) = NULL; + } +} + +PhraseLookup::PhraseLookup(const gfloat lambda, + FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram) + : bigram_lambda(lambda), + unigram_lambda(1. - lambda) +{ + m_phrase_table = phrase_table; + m_phrase_index = phrase_index; + m_system_bigram = system_bigram; + m_user_bigram = user_bigram; + + m_steps_index = g_ptr_array_new(); + m_steps_content = g_ptr_array_new(); + + /* the member variables below are saved in get_best_match call. 
*/ + m_sentence = NULL; + m_sentence_length = 0; +} + +PhraseLookup::~PhraseLookup(){ + clear_steps(m_steps_index, m_steps_content); + g_ptr_array_free(m_steps_index, TRUE); + g_ptr_array_free(m_steps_content, TRUE); +} + +bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[], + MatchResults & results){ + m_sentence_length = sentence_length; + m_sentence = sentence; + int nstep = m_sentence_length + 1; + + clear_steps(m_steps_index, m_steps_content); + + init_steps(m_steps_index, m_steps_content, nstep); + + populate_prefixes(m_steps_index, m_steps_content); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + m_phrase_index->prepare_tokens(tokens); + + for ( int i = 0; i < nstep - 1; ++i ){ + for ( int m = i + 1; m < nstep; ++m ){ + + /* do one phrase table search. */ + int result = m_phrase_table->search(m - i, sentence + i, tokens); + + /* found next phrase */ + if ( result & SEARCH_OK ) { + search_bigram2(i, tokens), + search_unigram2(i, tokens); + } + + /* no longer phrase */ + if (!(result & SEARCH_CONTINUED)) + break; + } + } + + m_phrase_index->destroy_tokens(tokens); + + return final_step(results); +} + +#if 0 + +bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){ + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return false; + + lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0); + /* find the maximum node */ + for ( size_t i = 1; i < lookup_content->len; ++i ){ + lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + return unigram_gen_next_step(nstep, max_value, token); +} + +bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == 
lookup_content->len ) + return false; + + for ( size_t i = 0; i < lookup_content->len; ++i ){ + lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i); + phrase_token_t index_token = cur_value->m_handles[1]; + SingleGram * system, * user; + m_system_bigram->load(index_token, system); + m_user_bigram->load(index_token, user); + + if ( !merge_single_gram(&m_merged_single_gram, system, user) ) + continue; + + guint32 freq; + if ( m_merged_single_gram.get_freq(token, freq) ){ + guint32 total_freq; + m_merged_single_gram.get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; + } + + if (system) + delete system; + if (user) + delete user; + } + + return found; +} + +#endif + +bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return found; + + /* find the maximum node */ + lookup_value_t * max_value = &g_array_index + (lookup_content, lookup_value_t, 0); + + for (size_t i = 1; i < lookup_content->len; ++i) { + lookup_value_t * cur_value = &g_array_index + (lookup_content, lookup_value_t, i); + if (cur_value->m_poss > max_value->m_poss) + max_value = cur_value; + } + + /* iterate over tokens */ + for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) { + GArray * array = tokens[n]; + if (NULL == array) + continue; + + /* just skip the loop when the length is zero. 
*/ + for (size_t k = 0; k < array->len; ++k) { + phrase_token_t token = + g_array_index(array, phrase_token_t, k); + + found = unigram_gen_next_step + (nstep, max_value, token) || found; + } + } + + return found; +} + +bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if (0 == lookup_content->len) + return found; + + for (size_t i = 0; i < lookup_content->len; ++i) { + lookup_value_t * cur_value = &g_array_index + (lookup_content, lookup_value_t, i); + phrase_token_t index_token = cur_value->m_handles[1]; + + SingleGram * system = NULL, * user = NULL; + m_system_bigram->load(index_token, system); + m_user_bigram->load(index_token, user); + + if (!merge_single_gram + (&m_merged_single_gram, system, user)) + continue; + + /* iterate over tokens */ + for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) { + GArray * array = tokens[n]; + if (NULL == array) + continue; + + /* just skip the loop when the length is zero. 
*/ + for (size_t k = 0; k < array->len; ++k) { + phrase_token_t token = + g_array_index(array, phrase_token_t, k); + + guint32 freq = 0; + if (m_merged_single_gram.get_freq(token, freq)) { + guint32 total_freq = 0; + m_merged_single_gram.get_total_freq(total_freq); + + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; + } + } + } + + if (system) + delete system; + if (user) + delete user; + } + + return found; +} + +bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value, +phrase_token_t token){ + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) + m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < DBL_EPSILON ) + return false; + + lookup_value_t next_value; + next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token; + next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda); + next_value.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_value, &next_value); +} + +bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){ + + if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / + (gdouble) m_phrase_index->get_phrase_index_total_freq(); + + if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) + return false; + + lookup_value_t next_value; + next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token; + next_value.m_poss = cur_value->m_poss + + log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss ); + next_value.m_last_step = nstep; + + return 
save_next_step(nstep + phrase_length, cur_value, &next_value); +} + +bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){ + + LookupStepIndex next_lookup_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, next_step_pos); + LookupStepContent next_lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, next_step_pos); + + lookup_key_t next_key = next_value->m_handles[1]; + + gpointer key = NULL, value = NULL; + gboolean lookup_result = g_hash_table_lookup_extended + (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); + + if (!lookup_result){ + g_array_append_val(next_lookup_content, *next_value); + g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), + GUINT_TO_POINTER(next_lookup_content->len - 1)); + return true; + }else{ + size_t step_index = GPOINTER_TO_UINT(value); + lookup_value_t * orig_next_value = &g_array_index + (next_lookup_content, lookup_value_t, step_index); + + if ( orig_next_value->m_poss < next_value->m_poss ){ + orig_next_value->m_handles[0] = next_value->m_handles[0]; + assert(orig_next_value->m_handles[1] == next_value->m_handles[1]); + orig_next_value->m_poss = next_value->m_poss; + orig_next_value->m_last_step = next_value->m_last_step; + return true; + } + return false; + } +} + +bool PhraseLookup::final_step(MatchResults & results ){ + + /* reset results */ + g_array_set_size(results, m_steps_content->len - 1); + for ( size_t i = 0; i < results->len; ++i ){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + *token = null_token; + } + + /* find max element */ + size_t last_step_pos = m_steps_content->len - 1; + LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index + (m_steps_content, last_step_pos); + if ( last_step_content->len == 0 ) + return false; + + lookup_value_t * max_value = &g_array_index + (last_step_content, lookup_value_t, 0); + for ( size_t i = 1; i < last_step_content->len; 
++i ){ + lookup_value_t * cur_value = &g_array_index + (last_step_content, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + /* backtracing */ + while( true ){ + int cur_step_pos = max_value->m_last_step; + if ( -1 == cur_step_pos ) + break; + + phrase_token_t * token = &g_array_index + (results, phrase_token_t, cur_step_pos); + *token = max_value->m_handles[1]; + + phrase_token_t last_token = max_value->m_handles[0]; + LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos); + + gpointer key = NULL, value = NULL; + gboolean result = g_hash_table_lookup_extended + (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); + if ( !result ) + return false; + + LookupStepContent lookup_step_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, cur_step_pos); + max_value = &g_array_index + (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); + } + + /* no need to reverse the result */ + return true; +} diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h new file mode 100644 index 0000000..cf65692 --- /dev/null +++ b/src/lookup/phrase_lookup.h @@ -0,0 +1,142 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#ifndef PHRASE_LOOKUP_H
#define PHRASE_LOOKUP_H

#include "novel_types.h"
#include "ngram.h"
#include "lookup.h"

/**
 * phrase_lookup.h
 *
 * The definitions of phrase lookup related classes and structs.
 *
 */

namespace pinyin{

/**
 * PhraseLookup:
 *
 * The phrase lookup class to convert the sentence to phrase tokens,
 * via a viterbi search over an interpolated unigram/bigram model.
 *
 */
class PhraseLookup{
private:
    /* interpolation weights: bigram_lambda + unigram_lambda == 1. */
    const gfloat bigram_lambda;
    const gfloat unigram_lambda;

    /* scratch objects reused across searches to avoid re-allocation. */
    PhraseItem m_cache_phrase_item;
    SingleGram m_merged_single_gram;
protected:
    /* saved variables (borrowed, not owned by this class) */
    FacadePhraseTable2 * m_phrase_table;
    FacadePhraseIndex * m_phrase_index;
    Bigram * m_system_bigram;
    Bigram * m_user_bigram;

    /* internal step data structure of the viterbi lattice */
    GPtrArray * m_steps_index;
    /* Array of LookupStepIndex */
    GPtrArray * m_steps_content;
    /* Array of LookupStepContent */

    /* Saved sentence of the last get_best_match call */
    int m_sentence_length;
    ucs4_t * m_sentence;

protected:
    /* Explicitly search the next phrase,
     * to avoid double phrase lookup as the next token has only one.
     */
    bool search_unigram2(int nstep, PhraseTokens tokens);
    bool search_bigram2(int nstep, PhraseTokens tokens);

    /* extend one lattice node by token; see the .cpp for details. */
    bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
    bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);

    /* insert-or-improve a node in the target step. */
    bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step);

    /* backtrace the lattice into the result token array. */
    bool final_step(MatchResults & results);
public:
    /**
     * PhraseLookup::PhraseLookup:
     * @lambda: the lambda parameter for interpolation model.
     * @phrase_table: the phrase table.
     * @phrase_index: the phrase index.
     * @system_bigram: the system bi-gram.
     * @user_bigram: the user bi-gram.
     *
     * The constructor of the PhraseLookup.
     *
     */
    PhraseLookup(const gfloat lambda,
                 FacadePhraseTable2 * phrase_table,
                 FacadePhraseIndex * phrase_index,
                 Bigram * system_bigram,
                 Bigram * user_bigram);

    /**
     * PhraseLookup::~PhraseLookup:
     *
     * The destructor of the PhraseLookup.
     *
     */
    ~PhraseLookup();

    /**
     * PhraseLookup::get_best_match:
     * @sentence_length: the length of the sentence in ucs4 characters.
     * @sentence: the ucs4 characters of the sentence.
     * @results: the segmented sentence in the form of phrase tokens.
     * @returns: whether the segment operation is successful.
     *
     * Segment the sentence into phrase tokens.
     *
     * Note: this method only accepts the characters in phrase large table.
     *
     */
    bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results);

    /**
     * PhraseLookup::convert_to_utf8:
     * @results: the guessed sentence in the form of phrase tokens.
     * @result_string: the converted sentence in utf8 string.
     * @returns: whether the convert operation is successful.
     *
     * Convert the sentence from phrase tokens to the utf8 string,
     * one "token phrase" pair per line (delimiter "\n", tokens shown).
     *
     * Note: free the result_string by g_free.
     *
     */
    bool convert_to_utf8(MatchResults results,
                         /* out */ char * & result_string)
    {
        return pinyin::convert_to_utf8(m_phrase_index, results,
                                       "\n", true, result_string);
    }
};

};

#endif

/*
 * libpinyin
 * Library to deal with pinyin.
 *
 * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <math.h> +#include "facade_chewing_table.h" +#include "pinyin_lookup2.h" +#include "stl_lite.h" + +using namespace pinyin; + +/* +const gfloat PinyinLookup2::bigram_lambda = lambda; +const gfloat PinyinLookup2::unigram_lambda = 1 - lambda; +*/ + +/* internal definition */ +static const size_t nbeam = 32; + +static bool dump_max_value(GPtrArray * values){ + if (0 == values->len) + return false; + + const lookup_value_t * max = + (const lookup_value_t *) g_ptr_array_index(values, 0); + + for (size_t i = 1; i < values->len; ++i) { + const lookup_value_t * cur = + (const lookup_value_t *) g_ptr_array_index(values, i); + + if (cur->m_poss > max->m_poss) + max = cur; + } + + printf("max value: %f\n", max->m_poss); + + return true; +} + +static bool dump_all_values(GPtrArray * values) { + if (0 == values->len) + return false; + + printf("values:"); + for (size_t i = 0; i < values->len; ++i) { + const lookup_value_t * cur = + (const lookup_value_t *) g_ptr_array_index(values, i); + + printf("%f\t", cur->m_poss); + } + printf("\n"); + + return true; +} + +/* populate the candidates. 
*/ +static bool populate_candidates(/* out */ GPtrArray * candidates, + /* in */ LookupStepContent step) { + g_ptr_array_set_size(candidates, 0); + + if (0 == step->len) + return false; + + for (size_t i = 0; i < step->len; ++i) { + lookup_value_t * value = &g_array_index + (step, lookup_value_t, i); + + g_ptr_array_add(candidates, value); + } + + /* dump_max_value(candidates); */ + + return true; +} + +static bool lookup_value_less_than(lookup_value_t * lhs, lookup_value_t * rhs){ + return lhs->m_poss < rhs->m_poss; +} + +/* use maximum heap to get the topest results. */ +static bool get_top_results(/* out */ GPtrArray * topresults, + /* in */ GPtrArray * candidates) { + g_ptr_array_set_size(topresults, 0); + + if (0 == candidates->len) + return false; + + lookup_value_t ** begin = + (lookup_value_t **) &g_ptr_array_index(candidates, 0); + lookup_value_t ** end = + (lookup_value_t **) &g_ptr_array_index(candidates, candidates->len); + + std_lite::make_heap(begin, end, lookup_value_less_than); + + while (end != begin) { + lookup_value_t * one = *begin; + g_ptr_array_add(topresults, one); + + std_lite::pop_heap(begin, end, lookup_value_less_than); + --end; + + if (topresults->len >= nbeam) + break; + } + + /* dump_all_values(topresults); */ + + return true; +} + +static bool populate_prefixes(GPtrArray * steps_index, + GPtrArray * steps_content, + TokenVector prefixes) { + assert(prefixes->len > 0); + + for (size_t i = 0; i < prefixes->len; ++i) { + phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); + lookup_key_t initial_key = token; + lookup_value_t initial_value(log(1)); + initial_value.m_handles[1] = token; + + LookupStepContent initial_step_content = (LookupStepContent) + g_ptr_array_index(steps_content, 0); + initial_step_content = g_array_append_val + (initial_step_content, initial_value); + + LookupStepIndex initial_step_index = (LookupStepIndex) + g_ptr_array_index(steps_index, 0); + g_hash_table_insert(initial_step_index, + 
GUINT_TO_POINTER(initial_key), + GUINT_TO_POINTER(initial_step_content->len - 1)); + } + + return true; +} + +static bool init_steps(GPtrArray * steps_index, + GPtrArray * steps_content, + int nstep){ + /* add null start step */ + g_ptr_array_set_size(steps_index, nstep); + g_ptr_array_set_size(steps_content, nstep); + + for (int i = 0; i < nstep; ++i) { + /* initialize steps_index */ + g_ptr_array_index(steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal); + /* initialize steps_content */ + g_ptr_array_index(steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t)); + } + + return true; +} + +static void clear_steps(GPtrArray * steps_index, GPtrArray * steps_content){ + /* clear steps_index */ + for ( size_t i = 0; i < steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(steps_index, i) = NULL; + } + + /* clear steps_content */ + for ( size_t i = 0; i < steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(steps_content, i) = NULL; + } +} + + +PinyinLookup2::PinyinLookup2(const gfloat lambda, + pinyin_option_t options, + FacadeChewingTable * pinyin_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram) + : bigram_lambda(lambda), + unigram_lambda(1. - lambda) +{ + m_options = options; + m_pinyin_table = pinyin_table; + m_phrase_index = phrase_index; + m_system_bigram = system_bigram; + m_user_bigram = user_bigram; + + m_steps_index = g_ptr_array_new(); + m_steps_content = g_ptr_array_new(); + + /* the member variables below are saved in get_best_match call. 
*/ + m_keys = NULL; + m_constraints = NULL; +} + +PinyinLookup2::~PinyinLookup2(){ + clear_steps(m_steps_index, m_steps_content); + g_ptr_array_free(m_steps_index, TRUE); + g_ptr_array_free(m_steps_content, TRUE); +} + + +bool PinyinLookup2::get_best_match(TokenVector prefixes, + ChewingKeyVector keys, + CandidateConstraints constraints, + MatchResults & results){ + m_constraints = constraints; + m_keys = keys; + int nstep = keys->len + 1; + + clear_steps(m_steps_index, m_steps_content); + + init_steps(m_steps_index, m_steps_content, nstep); + + populate_prefixes(m_steps_index, m_steps_content, prefixes); + + PhraseIndexRanges ranges; + memset(ranges, 0, sizeof(PhraseIndexRanges)); + m_phrase_index->prepare_ranges(ranges); + + GPtrArray * candidates = g_ptr_array_new(); + GPtrArray * topresults = g_ptr_array_new(); + + /* begin the viterbi beam search. */ + for ( int i = 0; i < nstep - 1; ++i ){ + lookup_constraint_t * cur_constraint = &g_array_index + (m_constraints, lookup_constraint_t, i); + + if (CONSTRAINT_NOSEARCH == cur_constraint->m_type) + continue; + + LookupStepContent step = (LookupStepContent) + g_ptr_array_index(m_steps_content, i); + + populate_candidates(candidates, step); + get_top_results(topresults, candidates); + + if (0 == topresults->len) + continue; + + for ( int m = i + 1; m < nstep; ++m ){ + const int len = m - i; + if (len > MAX_PHRASE_LENGTH) + break; + + lookup_constraint_t * next_constraint = &g_array_index + (m_constraints, lookup_constraint_t, m - 1); + + if (CONSTRAINT_NOSEARCH == next_constraint->m_type) + break; + + ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data; + /* do one pinyin table search. */ + int result = m_pinyin_table->search(len, pinyin_keys + i, ranges); + + if (result & SEARCH_OK) { + /* assume topresults always contains items. */ + search_bigram2(topresults, i, ranges), + search_unigram2(topresults, i, ranges); + } + + /* poke the next constraint. 
*/ + ++ next_constraint; + if (CONSTRAINT_ONESTEP == next_constraint->m_type) + break; + + /* no longer pinyin */ + if (!(result & SEARCH_CONTINUED)) + break; + } + } + + m_phrase_index->destroy_ranges(ranges); + + g_ptr_array_free(candidates, TRUE); + g_ptr_array_free(topresults, TRUE); + + return final_step(results); +} + +bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep, + PhraseIndexRanges ranges) { + + if (0 == topresults->len) + return false; + + lookup_value_t * max = (lookup_value_t *) + g_ptr_array_index(topresults, 0); + + lookup_constraint_t * constraint = + &g_array_index(m_constraints, lookup_constraint_t, nstep); + + if (CONSTRAINT_ONESTEP == constraint->m_type) { + return unigram_gen_next_step(nstep, max, constraint->m_token); + } + + bool found = false; + + if (NO_CONSTRAINT == constraint->m_type) { + for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ + GArray * array = ranges[m]; + if ( !array ) continue; + + for ( size_t n = 0; n < array->len; ++n){ + PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n); + for ( phrase_token_t token = range->m_range_begin; + token != range->m_range_end; ++token){ + found = unigram_gen_next_step(nstep, max, token)|| found; + } + } + } + } + + return found; +} + +bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep, + PhraseIndexRanges ranges) { + + lookup_constraint_t * constraint = + &g_array_index(m_constraints, lookup_constraint_t, nstep); + + bool found = false; + BigramPhraseArray bigram_phrase_items = g_array_new + (FALSE, FALSE, sizeof(BigramPhraseItem)); + + for (size_t i = 0; i < topresults->len; ++i) { + lookup_value_t * value = (lookup_value_t *) + g_ptr_array_index(topresults, i); + + phrase_token_t index_token = value->m_handles[1]; + + SingleGram * system = NULL, * user = NULL; + m_system_bigram->load(index_token, system); + m_user_bigram->load(index_token, user); + + if ( !merge_single_gram(&m_merged_single_gram, system, user) ) + continue; 
+ + if ( CONSTRAINT_ONESTEP == constraint->m_type ){ + phrase_token_t token = constraint->m_token; + + guint32 freq; + if( m_merged_single_gram.get_freq(token, freq) ){ + guint32 total_freq; + m_merged_single_gram.get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found; + } + } + + if (NO_CONSTRAINT == constraint->m_type) { + for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ + GArray * array = ranges[m]; + if ( !array ) continue; + + for ( size_t n = 0; n < array->len; ++n){ + PhraseIndexRange * range = + &g_array_index(array, PhraseIndexRange, n); + + g_array_set_size(bigram_phrase_items, 0); + m_merged_single_gram.search(range, bigram_phrase_items); + for( size_t k = 0; k < bigram_phrase_items->len; ++k) { + BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k); + found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found; + } + } + } + } + if (system) + delete system; + if (user) + delete user; + } + + g_array_free(bigram_phrase_items, TRUE); + return found; +} + + +bool PinyinLookup2::unigram_gen_next_step(int nstep, + lookup_value_t * cur_step, + phrase_token_t token) { + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) + m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < DBL_EPSILON ) + return false; + + ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep; + gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys); + if (pinyin_poss < FLT_EPSILON ) + return false; + + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda); + 
next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup2::bigram_gen_next_step(int nstep, + lookup_value_t * cur_step, + phrase_token_t token, + gfloat bigram_poss) { + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / + (gdouble) m_phrase_index->get_phrase_index_total_freq(); + if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) + return false; + + ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep; + gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys); + if ( pinyin_poss < FLT_EPSILON ) + return false; + + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + + log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup2::save_next_step(int next_step_pos, + lookup_value_t * cur_step, + lookup_value_t * next_step){ + + lookup_key_t next_key = next_step->m_handles[1]; + LookupStepIndex next_lookup_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, next_step_pos); + LookupStepContent next_lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, next_step_pos); + + gpointer key = NULL, value = NULL; + gboolean lookup_result = g_hash_table_lookup_extended + (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); + + if ( !lookup_result ){ + g_array_append_val(next_lookup_content, *next_step); + g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1)); + return true; + }else{ + size_t step_index = GPOINTER_TO_UINT(value); + 
lookup_value_t * orig_next_value = &g_array_index + (next_lookup_content, lookup_value_t, step_index); + + if ( orig_next_value->m_poss < next_step->m_poss) { + /* found better result. */ + orig_next_value->m_handles[0] = next_step->m_handles[0]; + assert(orig_next_value->m_handles[1] == next_step->m_handles[1]); + orig_next_value->m_poss = next_step->m_poss; + orig_next_value->m_last_step = next_step->m_last_step; + return true; + } + + return false; + } +} + +bool PinyinLookup2::final_step(MatchResults & results){ + + /* reset results */ + g_array_set_size(results, m_steps_content->len - 1); + for (size_t i = 0; i < results->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + *token = null_token; + } + + /* find max element */ + size_t last_step_pos = m_steps_content->len - 1; + GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos); + if ( last_step_array->len == 0 ) + return false; + + lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0); + for ( size_t i = 1; i < last_step_array->len; ++i){ + lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + /* backtracing */ + while( true ){ + int cur_step_pos = max_value->m_last_step; + if ( -1 == cur_step_pos ) + break; + + phrase_token_t * token = &g_array_index + (results, phrase_token_t, cur_step_pos); + *token = max_value->m_handles[1]; + + phrase_token_t last_token = max_value->m_handles[0]; + LookupStepIndex lookup_step_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, cur_step_pos); + + gpointer key = NULL, value = NULL; + gboolean result = g_hash_table_lookup_extended + (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); + if (!result) + return false; + + LookupStepContent lookup_step_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, cur_step_pos); + max_value = &g_array_index 
+ (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); + } + + /* no need to reverse the result */ + return true; +} + + +bool PinyinLookup2::train_result2(ChewingKeyVector keys, + CandidateConstraints constraints, + MatchResults results) { + const guint32 initial_seed = 23 * 3; + const guint32 expand_factor = 2; + const guint32 unigram_factor = 7; + const guint32 pinyin_factor = 1; + const guint32 ceiling_seed = 23 * 15 * 64; + + /* begin training based on constraints and results. */ + bool train_next = false; + ChewingKey * pinyin_keys = (ChewingKey *) keys->data; + + phrase_token_t last_token = sentence_start; + /* constraints->len + 1 == results->len */ + for (size_t i = 0; i < constraints->len; ++i) { + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if (null_token == *token) + continue; + + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, i); + if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) { + if (CONSTRAINT_ONESTEP == constraint->m_type) { + assert(*token == constraint->m_token); + train_next = true; + } else { + train_next = false; + } + + guint32 seed = initial_seed; + /* train bi-gram first, and get train seed. */ + if (last_token) { + SingleGram * user = NULL; + m_user_bigram->load(last_token, user); + + guint32 total_freq = 0; + if (!user) { + user = new SingleGram; + } + assert(user->get_total_freq(total_freq)); + + guint32 freq = 0; + /* compute train factor */ + if (!user->get_freq(*token, freq)) { + assert(user->insert_freq(*token, 0)); + seed = initial_seed; + } else { + seed = std_lite::max(freq, initial_seed); + seed *= expand_factor; + seed = std_lite::min(seed, ceiling_seed); + } + + /* protect against total_freq overflow */ + if (seed > 0 && total_freq > total_freq + seed) + goto next; + + assert(user->set_total_freq(total_freq + seed)); + /* if total_freq is not overflow, then freq won't overflow. 
*/ + assert(user->set_freq(*token, freq + seed)); + assert(m_user_bigram->store(last_token, user)); + next: + assert(NULL != user); + if (user) + delete user; + } + + /* train uni-gram */ + m_phrase_index->get_phrase_item(*token, m_cache_phrase_item); + m_cache_phrase_item.increase_pronunciation_possibility + (m_options, pinyin_keys + i, seed * pinyin_factor); + m_phrase_index->add_unigram_frequency + (*token, seed * unigram_factor); + } + last_token = *token; + } + return true; +} + + +int PinyinLookup2::add_constraint(CandidateConstraints constraints, + size_t index, + phrase_token_t token) { + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return 0; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + if ( index + phrase_length > constraints->len ) + return 0; + + for (size_t i = index; i < index + phrase_length; ++i){ + clear_constraint(constraints, i); + } + + /* store one step constraint */ + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, index); + constraint->m_type = CONSTRAINT_ONESTEP; + constraint->m_token = token; + + /* propagate no search constraint */ + for (size_t i = 1; i < phrase_length; ++i){ + constraint = &g_array_index(constraints, lookup_constraint_t, index + i); + constraint->m_type = CONSTRAINT_NOSEARCH; + constraint->m_constraint_step = index; + } + + return phrase_length; +} + +bool PinyinLookup2::clear_constraint(CandidateConstraints constraints, + int index) { + if (index < 0 || index >= constraints->len) + return false; + + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, index); + + if (NO_CONSTRAINT == constraint->m_type) + return false; + + if (CONSTRAINT_NOSEARCH == constraint->m_type){ + index = constraint->m_constraint_step; + constraint = &g_array_index(constraints, lookup_constraint_t, index); + } + + /* now var constraint points to the one step constraint. 
*/ + assert(constraint->m_type == CONSTRAINT_ONESTEP); + + phrase_token_t token = constraint->m_token; + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + for ( size_t i = 0; i < phrase_length; ++i){ + if (index + i >= constraints->len) + continue; + + constraint = &g_array_index + (constraints, lookup_constraint_t, index + i); + constraint->m_type = NO_CONSTRAINT; + } + + return true; +} + +bool PinyinLookup2::validate_constraint(CandidateConstraints constraints, + ChewingKeyVector keys) { + /* resize constraints array first */ + size_t constraints_length = constraints->len; + + if ( keys->len > constraints_length ){ + g_array_set_size(constraints, keys->len); + + /* initialize new element */ + for( size_t i = constraints_length; i < keys->len; ++i){ + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + }else if (keys->len < constraints_length ){ + /* just shrink it */ + g_array_set_size(constraints, keys->len); + } + + for ( size_t i = 0; i < constraints->len; ++i){ + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, i); + + /* handle one step constraint */ + if ( constraint->m_type == CONSTRAINT_ONESTEP ){ + + phrase_token_t token = constraint->m_token; + m_phrase_index->get_phrase_item(token, m_cache_phrase_item); + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + + /* clear too long constraint */ + if (i + phrase_length > constraints->len){ + clear_constraint(constraints, i); + continue; + } + + ChewingKey * pinyin_keys = (ChewingKey *)keys->data; + /* clear invalid pinyin */ + gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys + i); + if (pinyin_poss < FLT_EPSILON) + clear_constraint(constraints, i); + } + } + return true; +} diff --git a/src/lookup/pinyin_lookup2.h 
b/src/lookup/pinyin_lookup2.h new file mode 100644 index 0000000..dbe15c9 --- /dev/null +++ b/src/lookup/pinyin_lookup2.h @@ -0,0 +1,240 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef PINYIN_LOOKUP2_H +#define PINYIN_LOOKUP2_H + + +#include <float.h> +#include <glib.h> +#include "novel_types.h" +#include "chewing_key.h" +#include "phrase_index.h" +#include "ngram.h" +#include "lookup.h" + + +namespace pinyin{ + +/** + * pinyin_lookup2.h + * + * The definitions of pinyin lookup related classes and structs. + * + */ + + + +enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH }; + +struct lookup_constraint_t{ + /* current type of the step */ + constraint_type m_type; + + /* Note: + * value of m_type: + * NO_CONSTRAINT: + * no values in the below union. + * search all possible next words. + * CONSTRAINT_ONESTEP: + * m_token contains the next word. + * only one word can be used to search for the next step, + * use case for user selected candidates. + * CONSTRAINT_NOSEARCH: + * m_constraint_step contains the value + * which points back to the CONSTRAINT_ONESTEP step. + * no search is allowed for the current step. 
+ */ + + union{ + phrase_token_t m_token; + guint32 m_constraint_step; /* index of m_token */ + }; +}; + + +/** + * PinyinLookup2: + * + * The pinyin lookup class to convert pinyin keys to guessed sentence. + * + */ +class PinyinLookup2{ +private: + const gfloat bigram_lambda; + const gfloat unigram_lambda; + + PhraseItem m_cache_phrase_item; + SingleGram m_merged_single_gram; + +protected: + /* saved varibles */ + CandidateConstraints m_constraints; + ChewingKeyVector m_keys; + + pinyin_option_t m_options; + FacadeChewingTable * m_pinyin_table; + FacadePhraseIndex * m_phrase_index; + Bigram * m_system_bigram; + Bigram * m_user_bigram; + + /* internal step data structure */ + GPtrArray * m_steps_index; + /* Array of LookupStepIndex */ + GPtrArray * m_steps_content; + /* Array of LookupStepContent */ + + + bool search_unigram2(GPtrArray * topresults, int nstep, + PhraseIndexRanges ranges); + bool search_bigram2(GPtrArray * topresults, int nstep, + PhraseIndexRanges ranges); + + bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token); + bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss); + + bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step); + + bool final_step(MatchResults & results); + +public: + /** + * PinyinLookup2::PinyinLookup2: + * @lambda: the lambda parameter for interpolation model. + * @options: the pinyin options. + * @pinyin_table: the pinyin table. + * @phrase_index: the phrase index. + * @system_bigram: the system bi-gram. + * @user_bigram: the user bi-gram. + * + * The constructor of the PinyinLookup2. + * + */ + PinyinLookup2(const gfloat lambda, + pinyin_option_t options, + FacadeChewingTable * pinyin_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram); + + /** + * PinyinLookup2::~PinyinLookup2: + * + * The destructor of the PinyinLookup2. 
+ * + */ + ~PinyinLookup2(); + + /** + * PinyinLookup2::set_options: + * @options: the pinyin options. + * @returns: whether the set operation is successful. + * + * Set the pinyin options. + * + */ + bool set_options(pinyin_option_t options) { + m_options = options; + return true; + } + + /** + * PinyinLookup2::get_best_match: + * @prefixes: the phrase tokens before the guessed sentence. + * @keys: the pinyin keys of the guessed sentence. + * @constraints: the constraints on the guessed sentence. + * @results: the guessed sentence in the form of the phrase tokens. + * @returns: whether the guess operation is successful. + * + * Guess the best sentence according to user inputs. + * + */ + bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results); + + /** + * PinyinLookup2::train_result2: + * @keys: the pinyin keys of the guessed sentence. + * @constraints: the constraints on the guessed sentence. + * @results: the guessed sentence in the form of the phrase tokens. + * @returns: whether the train operation is successful. + * + * Self learning the guessed sentence based on the constraints. + * + */ + bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results); + + /** + * PinyinLookup2::convert_to_utf8: + * @results: the guessed sentence in the form of the phrase tokens. + * @result_string: the guessed sentence in the utf8 encoding. + * @returns: whether the convert operation is successful. + * + * Convert the guessed sentence from the phrase tokens to the utf8 string. + * + */ + bool convert_to_utf8(MatchResults results, + /* out */ char * & result_string) + { + return pinyin::convert_to_utf8(m_phrase_index, results, + NULL, false, result_string); + } + + + /** + * PinyinLookup2::add_constraint: + * @constraints: the constraints on the guessed sentence. + * @index: the character offset in the guessed sentence. 
+ * @token: the phrase token in the candidate list chosen by user. + * @returns: the number of the characters in the chosen token. + * + * Add one constraint to the constraints on the guessed sentence. + * + */ + int add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token); + + /** + * PinyinLookup2::clear_constraint: + * @constraints: the constraints on the guessed sentence. + * @index: the character offset in the guessed sentence. + * @returns: whether the clear operation is successful. + * + * Clear one constraint in the constraints on the guessed sentence. + * + */ + bool clear_constraint(CandidateConstraints constraints, int index); + + /** + * PinyinLookup2::validate_constraint: + * @constraints: the constraints on the guessed sentence. + * @keys: the pinyin keys of the guessed sentence. + * @returns: whether the validate operation is successful. + * + * Validate the old constraints with the new pinyin keys. + * + */ + bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys); + +}; + +}; + +#endif diff --git a/src/pinyin.cpp b/src/pinyin.cpp new file mode 100644 index 0000000..95215ae --- /dev/null +++ b/src/pinyin.cpp @@ -0,0 +1,2096 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "pinyin.h" +#include <stdio.h> +#include <unistd.h> +#include <glib/gstdio.h> +#include "pinyin_internal.h" + + +using namespace pinyin; + +/* a glue layer for input method integration. */ + +typedef GArray * CandidateVector; /* GArray of lookup_candidate_t */ + +struct _pinyin_context_t{ + pinyin_option_t m_options; + + FullPinyinParser2 * m_full_pinyin_parser; + DoublePinyinParser2 * m_double_pinyin_parser; + ChewingParser2 * m_chewing_parser; + + FacadeChewingTable * m_pinyin_table; + FacadePhraseTable2 * m_phrase_table; + FacadePhraseIndex * m_phrase_index; + Bigram * m_system_bigram; + Bigram * m_user_bigram; + + PinyinLookup2 * m_pinyin_lookup; + PhraseLookup * m_phrase_lookup; + + char * m_system_dir; + char * m_user_dir; + bool m_modified; + + SystemTableInfo m_system_table_info; +}; + +struct _pinyin_instance_t{ + pinyin_context_t * m_context; + gchar * m_raw_full_pinyin; + TokenVector m_prefixes; + ChewingKeyVector m_pinyin_keys; + ChewingKeyRestVector m_pinyin_key_rests; + CandidateConstraints m_constraints; + MatchResults m_match_results; + CandidateVector m_candidates; +}; + +struct _lookup_candidate_t{ + lookup_candidate_type_t m_candidate_type; + gchar * m_phrase_string; + phrase_token_t m_token; + ChewingKeyRest m_orig_rest; + gchar * m_new_pinyins; + guint32 m_freq; /* the amplifed gfloat numerical value. 
*/ +public: + _lookup_candidate_t() { + m_candidate_type = NORMAL_CANDIDATE; + m_phrase_string = NULL; + m_token = null_token; + m_new_pinyins = NULL; + m_freq = 0; + } +}; + +struct _import_iterator_t{ + pinyin_context_t * m_context; + guint8 m_phrase_index; +}; + + +static bool check_format(pinyin_context_t * context){ + const char * userdir = context->m_user_dir; + + UserTableInfo user_table_info; + gchar * filename = g_build_filename + (userdir, USER_TABLE_INFO, NULL); + user_table_info.load(filename); + g_free(filename); + + bool exists = user_table_info.is_conform + (&context->m_system_table_info); + + if (exists) + return exists; + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + /* clean up files, if version mis-matches. */ + for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (NOT_USED == table_info->m_file_type) + continue; + + if (NULL == table_info->m_user_filename) + continue; + + const char * userfilename = table_info->m_user_filename; + + /* remove dbin file. 
*/ + filename = g_build_filename(userdir, userfilename, NULL); + unlink(filename); + g_free(filename); + } + + filename = g_build_filename + (userdir, USER_PINYIN_INDEX, NULL); + unlink(filename); + g_free(filename); + + filename = g_build_filename + (userdir, USER_PHRASE_INDEX, NULL); + unlink(filename); + g_free(filename); + + filename = g_build_filename + (userdir, USER_BIGRAM, NULL); + unlink(filename); + g_free(filename); + + return exists; +} + +static bool mark_version(pinyin_context_t * context){ + const char * userdir = context->m_user_dir; + + UserTableInfo user_table_info; + user_table_info.make_conform(&context->m_system_table_info); + + gchar * filename = g_build_filename + (userdir, USER_TABLE_INFO, NULL); + bool retval = user_table_info.save(filename); + g_free(filename); + + return retval; +} + +pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ + pinyin_context_t * context = new pinyin_context_t; + + context->m_options = USE_TONE; + + context->m_system_dir = g_strdup(systemdir); + context->m_user_dir = g_strdup(userdir); + context->m_modified = false; + + gchar * filename = g_build_filename + (context->m_system_dir, SYSTEM_TABLE_INFO, NULL); + if (!context->m_system_table_info.load(filename)) { + fprintf(stderr, "load %s failed!\n", filename); + return NULL; + } + g_free(filename); + + + check_format(context); + + context->m_full_pinyin_parser = new FullPinyinParser2; + context->m_double_pinyin_parser = new DoublePinyinParser2; + context->m_chewing_parser = new ChewingParser2; + + /* load chewing table. */ + context->m_pinyin_table = new FacadeChewingTable; + + /* load system chewing table. 
*/ + MemoryChunk * chunk = new MemoryChunk; + filename = g_build_filename + (context->m_system_dir, SYSTEM_PINYIN_INDEX, NULL); + if (!chunk->load(filename)) { + fprintf(stderr, "open %s failed!\n", filename); + return NULL; + } + g_free(filename); + + /* load user chewing table */ + MemoryChunk * userchunk = new MemoryChunk; + filename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX, NULL); + if (!userchunk->load(filename)) { + /* hack here: use local Chewing Table to create empty memory chunk. */ + ChewingLargeTable table(context->m_options); + table.store(userchunk); + } + g_free(filename); + + context->m_pinyin_table->load(context->m_options, chunk, userchunk); + + /* load phrase table */ + context->m_phrase_table = new FacadePhraseTable2; + + /* load system phrase table */ + chunk = new MemoryChunk; + filename = g_build_filename + (context->m_system_dir, SYSTEM_PHRASE_INDEX, NULL); + if (!chunk->load(filename)) { + fprintf(stderr, "open %s failed!\n", filename); + return NULL; + } + g_free(filename); + + /* load user phrase table */ + userchunk = new MemoryChunk; + filename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX, NULL); + if (!userchunk->load(filename)) { + /* hack here: use local Phrase Table to create empty memory chunk. */ + PhraseLargeTable2 table; + table.store(userchunk); + } + g_free(filename); + + context->m_phrase_table->load(chunk, userchunk); + + context->m_phrase_index = new FacadePhraseIndex; + + /* hack here: directly call load phrase library. 
*/ + pinyin_load_phrase_library(context, GB_DICTIONARY); + pinyin_load_phrase_library(context, MERGED_DICTIONARY); + + context->m_system_bigram = new Bigram; + filename = g_build_filename(context->m_system_dir, SYSTEM_BIGRAM, NULL); + context->m_system_bigram->attach(filename, ATTACH_READONLY); + g_free(filename); + + context->m_user_bigram = new Bigram; + filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); + context->m_user_bigram->load_db(filename); + g_free(filename); + + gfloat lambda = context->m_system_table_info.get_lambda(); + + context->m_pinyin_lookup = new PinyinLookup2 + ( lambda, context->m_options, + context->m_pinyin_table, context->m_phrase_index, + context->m_system_bigram, context->m_user_bigram); + + context->m_phrase_lookup = new PhraseLookup + (lambda, + context->m_phrase_table, context->m_phrase_index, + context->m_system_bigram, context->m_user_bigram); + + return context; +} + +bool pinyin_load_phrase_library(pinyin_context_t * context, + guint8 index){ + if (!(index < PHRASE_INDEX_LIBRARY_COUNT)) + return false; + + /* check whether the sub phrase index is already loaded. */ + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(index, range); + if (ERROR_OK == retval) + return false; + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + const pinyin_table_info_t * table_info = phrase_files + index; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + /* system phrase library */ + MemoryChunk * chunk = new MemoryChunk; + + const char * systemfilename = table_info->m_system_filename; + /* check bin file in system dir. 
*/ + gchar * chunkfilename = g_build_filename(context->m_system_dir, + systemfilename, NULL); + chunk->load(chunkfilename); + g_free(chunkfilename); + + context->m_phrase_index->load(index, chunk); + + const char * userfilename = table_info->m_user_filename; + + chunkfilename = g_build_filename(context->m_user_dir, + userfilename, NULL); + + MemoryChunk * log = new MemoryChunk; + log->load(chunkfilename); + g_free(chunkfilename); + + /* merge the chunk log. */ + context->m_phrase_index->merge(index, log); + return true; + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + MemoryChunk * chunk = new MemoryChunk; + const char * userfilename = table_info->m_user_filename; + + gchar * chunkfilename = g_build_filename(context->m_user_dir, + userfilename, NULL); + + /* check bin file exists. if not, create a new one. */ + if (chunk->load(chunkfilename)) { + context->m_phrase_index->load(index, chunk); + } else { + delete chunk; + context->m_phrase_index->create_sub_phrase(index); + } + + g_free(chunkfilename); + return true; + } + + return false; +} + +bool pinyin_unload_phrase_library(pinyin_context_t * context, + guint8 index){ + /* gb_char.bin and merged.bin can't be unloaded. */ + if (GB_DICTIONARY == index || MERGED_DICTIONARY == index) + return false; + + assert(index < PHRASE_INDEX_LIBRARY_COUNT); + + context->m_phrase_index->unload(index); + return true; +} + +import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, + guint8 index){ + import_iterator_t * iter = new import_iterator_t; + iter->m_context = context; + iter->m_phrase_index = index; + return iter; +} + +bool pinyin_iterator_add_phrase(import_iterator_t * iter, + const char * phrase, + const char * pinyin, + gint count){ + /* if -1 == count, use the default value. 
*/ + const gint default_count = 5; + const guint32 unigram_factor = 3; + if (-1 == count) + count = default_count; + + pinyin_context_t * & context = iter->m_context; + FacadePhraseTable2 * & phrase_table = context->m_phrase_table; + FacadeChewingTable * & pinyin_table = context->m_pinyin_table; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + bool result = false; + + if (NULL == phrase || NULL == pinyin) + return result; + + /* check whether the phrase exists in phrase table */ + glong len_phrase = 0; + ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL); + + pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; + FullPinyinParser2 parser; + ChewingKeyVector keys = + g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + /* parse the pinyin. */ + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + + if (len_phrase != keys->len) + return result; + + if (0 == len_phrase || len_phrase >= MAX_PHRASE_LENGTH) + return result; + + phrase_token_t token = null_token; + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + /* do phrase table search. */ + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + int retval = phrase_table->search(len_phrase, ucs4_phrase, tokens); + int num = reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + /* find the best token candidate. */ + for (size_t i = 0; i < tokenarray->len; ++i) { + phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i); + if (null_token == token) { + token = candidate; + continue; + } + + if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == iter->m_phrase_index) { + /* only one phrase string per sub phrase index. 
*/ + assert(PHRASE_INDEX_LIBRARY_INDEX(token) != iter->m_phrase_index); + token = candidate; + continue; + } + } + g_array_free(tokenarray, TRUE); + + PhraseItem item; + /* check whether it exists in the same sub phrase index; */ + if (null_token != token && + PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) { + /* if so, remove the phrase, add the pinyin for the phrase item, + then add it back;*/ + phrase_index->get_phrase_item(token, item); + assert(len_phrase == item.get_phrase_length()); + ucs4_t tmp_phrase[MAX_PHRASE_LENGTH]; + item.get_phrase_string(tmp_phrase); + assert(0 == memcmp + (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase)); + + PhraseItem * removed_item = NULL; + retval = phrase_index->remove_phrase_item(token, removed_item); + if (ERROR_OK == retval) { + /* maybe check whether there are duplicated pronunciations here. */ + removed_item->add_pronunciation((ChewingKey *)keys->data, + count); + phrase_index->add_phrase_item(token, removed_item); + delete removed_item; + result = true; + } + } else { + /* if not exists in the same sub phrase index, + get the maximum token, + then add it directly with maximum token + 1; */ + PhraseIndexRange range; + retval = phrase_index->get_range(iter->m_phrase_index, range); + + if (ERROR_OK == retval) { + token = range.m_range_end; + if (0x00000000 == (token & PHRASE_MASK)) + token++; + + if (len_phrase == keys->len) { /* valid pinyin */ + phrase_table->add_index(len_phrase, ucs4_phrase, token); + pinyin_table->add_index + (keys->len, (ChewingKey *)(keys->data), token); + + item.set_phrase_string(len_phrase, ucs4_phrase); + item.add_pronunciation((ChewingKey *)(keys->data), count); + phrase_index->add_phrase_item(token, &item); + phrase_index->add_unigram_frequency(token, + count * unigram_factor); + result = true; + } + } + } + + g_array_free(key_rests, TRUE); + g_array_free(keys, TRUE); + g_free(ucs4_phrase); + return result; +} + +void pinyin_end_add_phrases(import_iterator_t * iter){ + /* 
compact the content memory chunk of phrase index. */ + iter->m_context->m_phrase_index->compact(); + iter->m_context->m_modified = true; + delete iter; +} + +bool pinyin_save(pinyin_context_t * context){ + if (!context->m_user_dir) + return false; + + if (!context->m_modified) + return false; + + context->m_phrase_index->compact(); + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + /* skip the reserved zero phrase library. */ + for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(i, range); + + if (ERROR_NO_SUB_PHRASE_INDEX == retval) + continue; + + const pinyin_table_info_t * table_info = phrase_files + i; + + if (NOT_USED == table_info->m_file_type) + continue; + + const char * userfilename = table_info->m_user_filename; + + if (NULL == userfilename) + continue; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + /* system phrase library */ + MemoryChunk * chunk = new MemoryChunk; + MemoryChunk * log = new MemoryChunk; + const char * systemfilename = table_info->m_system_filename; + + /* check bin file in system dir. 
*/ + gchar * chunkfilename = g_build_filename(context->m_system_dir, + systemfilename, NULL); + chunk->load(chunkfilename); + g_free(chunkfilename); + context->m_phrase_index->diff(i, chunk, log); + + const char * userfilename = table_info->m_user_filename; + gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); + + gchar * tmppathname = g_build_filename(context->m_user_dir, + tmpfilename, NULL); + g_free(tmpfilename); + + gchar * chunkpathname = g_build_filename(context->m_user_dir, + userfilename, NULL); + log->save(tmppathname); + + int result = rename(tmppathname, chunkpathname); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmppathname, chunkpathname); + + g_free(chunkpathname); + g_free(tmppathname); + delete log; + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + MemoryChunk * chunk = new MemoryChunk; + context->m_phrase_index->store(i, chunk); + + const char * userfilename = table_info->m_user_filename; + gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); + gchar * tmppathname = g_build_filename(context->m_user_dir, + tmpfilename, NULL); + g_free(tmpfilename); + + gchar * chunkpathname = g_build_filename(context->m_user_dir, + userfilename, NULL); + + chunk->save(tmppathname); + + int result = rename(tmppathname, chunkpathname); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmppathname, chunkpathname); + + g_free(chunkpathname); + g_free(tmppathname); + delete chunk; + } + } + + /* save user pinyin table */ + gchar * tmpfilename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL); + unlink(tmpfilename); + gchar * filename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX, NULL); + + MemoryChunk * chunk = new MemoryChunk; + context->m_pinyin_table->store(chunk); + chunk->save(tmpfilename); + delete chunk; + + int result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + 
tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + /* save user phrase table */ + tmpfilename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL); + unlink(tmpfilename); + filename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX, NULL); + + chunk = new MemoryChunk; + context->m_phrase_table->store(chunk); + chunk->save(tmpfilename); + delete chunk; + + result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + /* save user bi-gram */ + tmpfilename = g_build_filename + (context->m_user_dir, USER_BIGRAM ".tmp", NULL); + unlink(tmpfilename); + filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); + context->m_user_bigram->save_db(tmpfilename); + + result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + mark_version(context); + + context->m_modified = false; + return true; +} + +bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context, + DoublePinyinScheme scheme){ + context->m_double_pinyin_parser->set_scheme(scheme); + return true; +} + +bool pinyin_set_chewing_scheme(pinyin_context_t * context, + ChewingScheme scheme){ + context->m_chewing_parser->set_scheme(scheme); + return true; +} + +void pinyin_fini(pinyin_context_t * context){ + delete context->m_full_pinyin_parser; + delete context->m_double_pinyin_parser; + delete context->m_chewing_parser; + delete context->m_pinyin_table; + delete context->m_phrase_table; + delete context->m_phrase_index; + delete context->m_system_bigram; + delete context->m_user_bigram; + delete context->m_pinyin_lookup; + delete context->m_phrase_lookup; + + g_free(context->m_system_dir); + g_free(context->m_user_dir); + context->m_modified = false; + + delete context; +} + +bool pinyin_mask_out(pinyin_context_t * 
context,
                     phrase_token_t mask,
                     phrase_token_t value) {

    /* drop every entry whose (token & mask) == value from the
       lookup tables and the user bigram. */
    context->m_pinyin_table->mask_out(mask, value);
    context->m_phrase_table->mask_out(mask, value);
    context->m_user_bigram->mask_out(mask, value);

    const pinyin_table_info_t * phrase_files =
        context->m_system_table_info.get_table_info();

    /* mask out the phrase index. */
    /* index 0 is the reserved zero phrase library, start at 1. */
    for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
        PhraseIndexRange range;
        int retval = context->m_phrase_index->get_range(index, range);

        if (ERROR_NO_SUB_PHRASE_INDEX == retval)
            continue;

        const pinyin_table_info_t * table_info = phrase_files + index;

        if (NOT_USED == table_info->m_file_type)
            continue;

        const char * userfilename = table_info->m_user_filename;

        /* libraries without a user-side file have nothing to mask. */
        if (NULL == userfilename)
            continue;

        if (SYSTEM_FILE == table_info->m_file_type ||
            DICTIONARY == table_info->m_file_type) {
            /* system phrase library: reload the pristine system chunk,
               then re-apply the user delta filtered by the mask. */
            MemoryChunk * chunk = new MemoryChunk;

            const char * systemfilename = table_info->m_system_filename;
            /* check bin file in system dir. */
            gchar * chunkfilename = g_build_filename(context->m_system_dir,
                                                     systemfilename, NULL);
            chunk->load(chunkfilename);
            g_free(chunkfilename);

            /* NOTE(review): "chunk" is not deleted here — presumably
               load() takes ownership of it; confirm in
               FacadePhraseIndex::load. */
            context->m_phrase_index->load(index, chunk);

            const char * userfilename = table_info->m_user_filename;

            chunkfilename = g_build_filename(context->m_user_dir,
                                             userfilename, NULL);

            MemoryChunk * log = new MemoryChunk;
            log->load(chunkfilename);
            g_free(chunkfilename);

            /* merge the chunk log with mask. */
            /* NOTE(review): "log" is likewise not deleted — verify
               merge_with_mask() takes ownership, else it leaks. */
            context->m_phrase_index->merge_with_mask(index, log, mask, value);
        }

        if (USER_FILE == table_info->m_file_type) {
            /* user phrase library: mask the in-memory index directly. */
            context->m_phrase_index->mask_out(index, mask, value);
        }
    }

    /* reclaim the holes left by the removed phrase items. */
    context->m_phrase_index->compact();
    return true;
}

/* copy from options to context->m_options.
 */
bool pinyin_set_options(pinyin_context_t * context,
                        pinyin_option_t options){
    /* propagate the new option bits to the components that cache them. */
    context->m_options = options;
    context->m_pinyin_table->set_options(context->m_options);
    context->m_pinyin_lookup->set_options(context->m_options);
    return true;
}


/* Allocate a per-input-session instance bound to the shared context.
 * Free it with pinyin_free_instance(). */
pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
    pinyin_instance_t * instance = new pinyin_instance_t;
    instance->m_context = context;

    instance->m_raw_full_pinyin = NULL;

    /* per-session working arrays; m_constraints is zero-terminated
       (first g_array_new argument TRUE), the others are not. */
    instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
    instance->m_pinyin_key_rests =
        g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
    instance->m_constraints = g_array_new
        (TRUE, FALSE, sizeof(lookup_constraint_t));
    instance->m_match_results =
        g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    instance->m_candidates =
        g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));

    return instance;
}

/* Release an instance and all arrays owned by it. */
void pinyin_free_instance(pinyin_instance_t * instance){
    g_free(instance->m_raw_full_pinyin);
    g_array_free(instance->m_prefixes, TRUE);
    g_array_free(instance->m_pinyin_keys, TRUE);
    g_array_free(instance->m_pinyin_key_rests, TRUE);
    g_array_free(instance->m_constraints, TRUE);
    g_array_free(instance->m_match_results, TRUE);
    g_array_free(instance->m_candidates, TRUE);

    delete instance;
}


/* Grow the constraint array to match the parsed pinyin keys,
 * initializing any newly exposed slots to NO_CONSTRAINT, then let the
 * lookup re-validate the existing constraints. */
static bool pinyin_update_constraints(pinyin_instance_t * instance){
    pinyin_context_t * & context = instance->m_context;
    ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
    CandidateConstraints & constraints = instance->m_constraints;

    size_t key_len = constraints->len;
    g_array_set_size(constraints, pinyin_keys->len);
    /* only the slots beyond the old length need initialization;
       g_array_set_size does not clear them here. */
    for (size_t i = key_len; i < pinyin_keys->len; ++i ) {
        lookup_constraint_t * constraint =
            &g_array_index(constraints, lookup_constraint_t, i);
        constraint->m_type = NO_CONSTRAINT;
    }

    context->m_pinyin_lookup->validate_constraint
        (constraints, pinyin_keys);
+ return true; +} + + +bool pinyin_guess_sentence(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; + + g_array_set_size(instance->m_prefixes, 0); + g_array_append_val(instance->m_prefixes, sentence_start); + + pinyin_update_constraints(instance); + bool retval = context->m_pinyin_lookup->get_best_match + (instance->m_prefixes, + instance->m_pinyin_keys, + instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, + const char * prefix){ + pinyin_context_t * & context = instance->m_context; + + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + g_array_set_size(instance->m_prefixes, 0); + g_array_append_val(instance->m_prefixes, sentence_start); + + glong len_str = 0; + ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL); + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + if (ucs4_str && len_str) { + /* add prefixes. 
*/ + for (ssize_t i = 1; i <= len_str; ++i) { + if (i > MAX_PHRASE_LENGTH) + break; + + ucs4_t * start = ucs4_str + len_str - i; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(tokens)); + phrase_index->prepare_tokens(tokens); + int result = context->m_phrase_table->search(i, start, tokens); + int num = reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + if (result & SEARCH_OK) + g_array_append_vals(instance->m_prefixes, + tokenarray->data, tokenarray->len); + } + } + g_array_free(tokenarray, TRUE); + g_free(ucs4_str); + + pinyin_update_constraints(instance); + bool retval = context->m_pinyin_lookup->get_best_match + (instance->m_prefixes, + instance->m_pinyin_keys, + instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_phrase_segment(pinyin_instance_t * instance, + const char * sentence){ + pinyin_context_t * & context = instance->m_context; + + const glong num_of_chars = g_utf8_strlen(sentence, -1); + glong ucs4_len = 0; + ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL); + + g_return_val_if_fail(num_of_chars == ucs4_len, FALSE); + + bool retval = context->m_phrase_lookup->get_best_match + (ucs4_len, ucs4_str, instance->m_match_results); + + g_free(ucs4_str); + return retval; +} + +/* the returned sentence should be freed by g_free(). 
*/ +bool pinyin_get_sentence(pinyin_instance_t * instance, + char ** sentence){ + pinyin_context_t * & context = instance->m_context; + + bool retval = pinyin::convert_to_utf8 + (context->m_phrase_index, instance->m_match_results, + NULL, false, *sentence); + + return retval; +} + +bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, + const char * onepinyin, + ChewingKey * onekey){ + pinyin_context_t * & context = instance->m_context; + + int pinyin_len = strlen(onepinyin); + bool retval = context->m_full_pinyin_parser->parse_one_key + ( context->m_options, *onekey, onepinyin, pinyin_len); + return retval; +} + +size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, + const char * pinyins){ + pinyin_context_t * & context = instance->m_context; + + g_free(instance->m_raw_full_pinyin); + instance->m_raw_full_pinyin = g_strdup(pinyins); + int pinyin_len = strlen(pinyins); + + int parse_len = context->m_full_pinyin_parser->parse + ( context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, pinyins, pinyin_len); + + return parse_len; +} + +bool pinyin_parse_double_pinyin(pinyin_instance_t * instance, + const char * onepinyin, + ChewingKey * onekey){ + pinyin_context_t * & context = instance->m_context; + + int pinyin_len = strlen(onepinyin); + bool retval = context->m_double_pinyin_parser->parse_one_key + ( context->m_options, *onekey, onepinyin, pinyin_len); + return retval; +} + +size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance, + const char * pinyins){ + pinyin_context_t * & context = instance->m_context; + int pinyin_len = strlen(pinyins); + + int parse_len = context->m_double_pinyin_parser->parse + ( context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, pinyins, pinyin_len); + + return parse_len; +} + +bool pinyin_parse_chewing(pinyin_instance_t * instance, + const char * onechewing, + ChewingKey * onekey){ + pinyin_context_t * & context = instance->m_context; + + int chewing_len = 
strlen(onechewing); + bool retval = context->m_chewing_parser->parse_one_key + ( context->m_options, *onekey, onechewing, chewing_len ); + return retval; +} + +size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, + const char * chewings){ + pinyin_context_t * & context = instance->m_context; + int chewing_len = strlen(chewings); + + int parse_len = context->m_chewing_parser->parse + ( context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, chewings, chewing_len); + + return parse_len; +} + +bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, + const char key, const char ** symbol) { + pinyin_context_t * & context = instance->m_context; + return context->m_chewing_parser->in_chewing_scheme + (context->m_options, key, symbol); +} + +#if 0 +static gint compare_item_with_token(gconstpointer lhs, + gconstpointer rhs) { + lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; + lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; + + phrase_token_t token_lhs = item_lhs->m_token; + phrase_token_t token_rhs = item_rhs->m_token; + + return (token_lhs - token_rhs); +} +#endif + +static gint compare_item_with_frequency(gconstpointer lhs, + gconstpointer rhs) { + lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; + lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; + + guint32 freq_lhs = item_lhs->m_freq; + guint32 freq_rhs = item_rhs->m_freq; + + return -(freq_lhs - freq_rhs); /* in descendant order */ +} + +static phrase_token_t _get_previous_token(pinyin_instance_t * instance, + size_t offset) { + phrase_token_t prev_token = null_token; + ssize_t i; + + if (0 == offset) { + /* get previous token from prefixes. 
*/ + prev_token = sentence_start; + size_t prev_token_len = 0; + + pinyin_context_t * context = instance->m_context; + TokenVector prefixes = instance->m_prefixes; + PhraseItem item; + + for (size_t i = 0; i < prefixes->len; ++i) { + phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); + if (sentence_start == token) + continue; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK == retval) { + size_t token_len = item.get_phrase_length(); + if (token_len > prev_token_len) { + /* found longer match, and save it. */ + prev_token = token; + prev_token_len = token_len; + } + } + } + } else { + /* get previous token from match results. */ + assert (0 < offset); + + phrase_token_t cur_token = g_array_index + (instance->m_match_results, phrase_token_t, offset); + if (null_token != cur_token) { + for (i = offset - 1; i >= 0; --i) { + cur_token = g_array_index + (instance->m_match_results, phrase_token_t, i); + if (null_token != cur_token) { + prev_token = cur_token; + break; + } + } + } + } + + return prev_token; +} + +static void _append_items(pinyin_context_t * context, + PhraseIndexRanges ranges, + lookup_candidate_t * template_item, + CandidateVector items) { + /* reduce and append to a single GArray. */ + for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) { + if (NULL == ranges[m]) + continue; + + for (size_t n = 0; n < ranges[m]->len; ++n) { + PhraseIndexRange * range = + &g_array_index(ranges[m], PhraseIndexRange, n); + for (size_t k = range->m_range_begin; + k < range->m_range_end; ++k) { + lookup_candidate_t item; + item.m_candidate_type = template_item->m_candidate_type; + item.m_token = k; + item.m_orig_rest = template_item->m_orig_rest; + item.m_new_pinyins = g_strdup(template_item->m_new_pinyins); + item.m_freq = template_item->m_freq; + g_array_append_val(items, item); + } + } + } +} + +#if 0 +static void _remove_duplicated_items(CandidateVector items) { + /* remove the duplicated items. 
*/ + phrase_token_t last_token = null_token, saved_token; + for (size_t n = 0; n < items->len; ++n) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, n); + + saved_token = item->m_token; + if (last_token == saved_token) { + g_array_remove_index(items, n); + n--; + } + last_token = saved_token; + } +} +#endif + +static void _compute_frequency_of_items(pinyin_context_t * context, + phrase_token_t prev_token, + SingleGram * merged_gram, + CandidateVector items) { + pinyin_option_t & options = context->m_options; + ssize_t i; + + PhraseItem cached_item; + /* compute all freqs. */ + for (i = 0; i < items->len; ++i) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, i); + phrase_token_t & token = item->m_token; + + gfloat bigram_poss = 0; guint32 total_freq = 0; + if (options & DYNAMIC_ADJUST) { + if (null_token != prev_token) { + guint32 bigram_freq = 0; + merged_gram->get_total_freq(total_freq); + merged_gram->get_freq(token, bigram_freq); + if (0 != total_freq) + bigram_poss = bigram_freq / (gfloat)total_freq; + } + } + + /* compute the m_freq. */ + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + phrase_index->get_phrase_item(token, cached_item); + total_freq = phrase_index->get_phrase_index_total_freq(); + assert (0 < total_freq); + + gfloat lambda = context->m_system_table_info.get_lambda(); + + /* Note: possibility value <= 1.0. */ + guint32 freq = (lambda * bigram_poss + + (1 - lambda) * + cached_item.get_unigram_frequency() / + (gfloat) total_freq) * 256 * 256 * 256; + item->m_freq = freq; + } +} + +static bool _prepend_sentence_candidate(pinyin_instance_t * instance, + CandidateVector candidates) { + /* check whether the best match candidate exists. */ + gchar * sentence = NULL; + pinyin_get_sentence(instance, &sentence); + if (NULL == sentence) + return false; + g_free(sentence); + + /* prepend best match candidate to candidates. 
*/ + lookup_candidate_t candidate; + candidate.m_candidate_type = BEST_MATCH_CANDIDATE; + g_array_prepend_val(candidates, candidate); + + return true; +} + +static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance, + size_t offset, + CandidateVector candidates) { + /* populate m_phrase_string in lookup_candidate_t. */ + + for(size_t i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + + switch(candidate->m_candidate_type) { + case BEST_MATCH_CANDIDATE: { + gchar * sentence = NULL; + pinyin_get_sentence(instance, &sentence); + candidate->m_phrase_string = g_strdup + (g_utf8_offset_to_pointer(sentence, offset)); + g_free(sentence); + break; + } + case NORMAL_CANDIDATE: + case DIVIDED_CANDIDATE: + case RESPLIT_CANDIDATE: + pinyin_token_get_phrase + (instance, candidate->m_token, NULL, + &(candidate->m_phrase_string)); + break; + case ZOMBIE_CANDIDATE: + break; + } + } + + return true; +} + +static gint compare_indexed_item_with_phrase_string(gconstpointer lhs, + gconstpointer rhs, + gpointer userdata) { + size_t index_lhs = *((size_t *) lhs); + size_t index_rhs = *((size_t *) rhs); + CandidateVector candidates = (CandidateVector) userdata; + + lookup_candidate_t * candidate_lhs = + &g_array_index(candidates, lookup_candidate_t, index_lhs); + lookup_candidate_t * candidate_rhs = + &g_array_index(candidates, lookup_candidate_t, index_rhs); + + return -strcmp(candidate_lhs->m_phrase_string, + candidate_rhs->m_phrase_string); /* in descendant order */ +} + + +static bool _remove_duplicated_items_by_phrase_string +(pinyin_instance_t * instance, + CandidateVector candidates) { + size_t i; + /* create the GArray of indexed item */ + GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t)); + for (i = 0; i < candidates->len; ++i) + g_array_append_val(indices, i); + + /* sort the indices array by phrase array */ + g_array_sort_with_data + (indices, 
compare_indexed_item_with_phrase_string, candidates); + + /* mark duplicated items as zombie candidate */ + lookup_candidate_t * cur_item, * saved_item = NULL; + for (i = 0; i < indices->len; ++i) { + size_t cur_index = g_array_index(indices, size_t, i); + cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index); + + /* handle the first candidate */ + if (NULL == saved_item) { + saved_item = cur_item; + continue; + } + + if (0 == strcmp(saved_item->m_phrase_string, + cur_item->m_phrase_string)) { + /* found duplicated candidates */ + + /* keep best match candidate */ + if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + continue; + } + + if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + continue; + } + + /* keep the higher possiblity one + to quickly move the word forward in the candidate list */ + if (cur_item->m_freq > saved_item->m_freq) { + /* find better candidate */ + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + continue; + } else { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + continue; + } + } else { + /* keep the current candidate */ + saved_item = cur_item; + } + } + + g_array_free(indices, TRUE); + + /* remove zombie candidate from the returned candidates */ + for (i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + + if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) { + g_free(candidate->m_phrase_string); + g_free(candidate->m_new_pinyins); + g_array_remove_index(candidates, i); + i--; + } + } + + return true; +} + +static bool _free_candidates(CandidateVector candidates) { + /* free candidates */ + for (size_t i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + g_free(candidate->m_phrase_string); + 
g_free(candidate->m_new_pinyins); + } + g_array_set_size(candidates, 0); + + return true; +} + +bool pinyin_guess_candidates(pinyin_instance_t * instance, + size_t offset) { + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + + _free_candidates(instance->m_candidates); + + size_t pinyin_len = pinyin_keys->len - offset; + ssize_t i; + + /* lookup the previous token here. */ + phrase_token_t prev_token = null_token; + + if (options & DYNAMIC_ADJUST) { + prev_token = _get_previous_token(instance, offset); + } + + SingleGram merged_gram; + SingleGram * system_gram = NULL, * user_gram = NULL; + + if (options & DYNAMIC_ADJUST) { + if (null_token != prev_token) { + context->m_system_bigram->load(prev_token, system_gram); + context->m_user_bigram->load(prev_token, user_gram); + merge_single_gram(&merged_gram, system_gram, user_gram); + } + } + + PhraseIndexRanges ranges; + memset(ranges, 0, sizeof(ranges)); + context->m_phrase_index->prepare_ranges(ranges); + + GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); + + for (i = pinyin_len; i >= 1; --i) { + g_array_set_size(items, 0); + + ChewingKey * keys = &g_array_index + (pinyin_keys, ChewingKey, offset); + + /* do pinyin search. */ + int retval = context->m_pinyin_table->search + (i, keys, ranges); + + if ( !(retval & SEARCH_OK) ) + continue; + + lookup_candidate_t template_item; + _append_items(context, ranges, &template_item, items); + +#if 0 + g_array_sort(items, compare_item_with_token); + + _remove_duplicated_items(items); +#endif + + _compute_frequency_of_items(context, prev_token, &merged_gram, items); + + /* sort the candidates of the same length by frequency. 
*/ + g_array_sort(items, compare_item_with_frequency); + + /* transfer back items to tokens, and save it into candidates */ + for (size_t k = 0; k < items->len; ++k) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, k); + g_array_append_val(instance->m_candidates, *item); + } + +#if 0 + if (!(retval & SEARCH_CONTINUED)) + break; +#endif + } + + g_array_free(items, TRUE); + context->m_phrase_index->destroy_ranges(ranges); + if (system_gram) + delete system_gram; + if (user_gram) + delete user_gram; + + /* post process to remove duplicated candidates */ + + _prepend_sentence_candidate(instance, instance->m_candidates); + + _compute_phrase_strings_of_items(instance, offset, instance->m_candidates); + + _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); + + return true; +} + + +static bool _try_divided_table(pinyin_instance_t * instance, + PhraseIndexRanges ranges, + size_t offset, + CandidateVector items){ + bool found = false; + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; + + assert(pinyin_keys->len == pinyin_key_rests->len); + guint num_keys = pinyin_keys->len; + assert(offset < num_keys); + + /* handle "^xian$" -> "xi'an" here */ + ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset); + ChewingKeyRest * rest = &g_array_index(pinyin_key_rests, + ChewingKeyRest, offset); + ChewingKeyRest orig_rest = *rest; + guint16 tone = CHEWING_ZERO_TONE; + + const divided_table_item_t * item = NULL; + + /* back up tone */ + if (options & USE_TONE) { + tone = key->m_tone; + if (CHEWING_ZERO_TONE != tone) { + key->m_tone = CHEWING_ZERO_TONE; + rest->m_raw_end --; + } + } + + item = context->m_full_pinyin_parser->retrieve_divided_item + (options, key, rest, instance->m_raw_full_pinyin, + strlen(instance->m_raw_full_pinyin)); + 
+ if (item) { + /* no ops */ + assert(item->m_new_freq > 0); + + ChewingKey divided_keys[2]; + const char * pinyin = item->m_new_keys[0]; + assert(context->m_full_pinyin_parser-> + parse_one_key(options, divided_keys[0], + pinyin, strlen(pinyin))); + pinyin = item->m_new_keys[1]; + assert(context->m_full_pinyin_parser-> + parse_one_key(options, divided_keys[1], + pinyin, strlen(pinyin))); + + gchar * new_pinyins = g_strdup_printf + ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]); + + /* propagate the tone */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != tone) { + assert(0 < tone && tone <= 5); + divided_keys[1].m_tone = tone; + + gchar * tmp_str = g_strdup_printf + ("%s%d", new_pinyins, tone); + g_free(new_pinyins); + new_pinyins = tmp_str; + } + } + + /* do pinyin search. */ + int retval = context->m_pinyin_table->search + (2, divided_keys, ranges); + + if (retval & SEARCH_OK) { + lookup_candidate_t template_item; + template_item.m_candidate_type = DIVIDED_CANDIDATE; + template_item.m_orig_rest = orig_rest; + template_item.m_new_pinyins = new_pinyins; + + _append_items(context, ranges, &template_item, items); + found = true; + } + g_free(new_pinyins); + } + + /* restore tones */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != tone) { + key->m_tone = tone; + rest->m_raw_end ++; + } + } + + return found; +} + +static bool _try_resplit_table(pinyin_instance_t * instance, + PhraseIndexRanges ranges, + size_t offset, + CandidateVector items){ + bool found = false; + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; + + assert(pinyin_keys->len == pinyin_key_rests->len); + guint num_keys = pinyin_keys->len; + assert(offset + 1 < num_keys); + + guint16 next_tone = CHEWING_ZERO_TONE; + + /* handle "^fa'nan$" -> "fan'an" here */ + ChewingKeyRest * cur_rest = 
&g_array_index(pinyin_key_rests, + ChewingKeyRest, offset); + ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests, + ChewingKeyRest, offset + 1); + /* some "'" here */ + if (cur_rest->m_raw_end != next_rest->m_raw_begin) + return found; + + ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset); + ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey, + offset + 1); + + /* some tone here */ + if (CHEWING_ZERO_TONE != cur_key->m_tone) + return found; + + ChewingKeyRest orig_rest; + orig_rest.m_raw_begin = cur_rest->m_raw_begin; + orig_rest.m_raw_end = next_rest->m_raw_end; + + /* backup tone */ + if (options & USE_TONE) { + next_tone = next_key->m_tone; + if (CHEWING_ZERO_TONE != next_tone) { + next_key->m_tone = CHEWING_ZERO_TONE; + next_rest->m_raw_end --; + } + } + + /* lookup re-split table */ + const char * str = instance->m_raw_full_pinyin; + const resplit_table_item_t * item_by_orig = + context->m_full_pinyin_parser-> + retrieve_resplit_item_by_original_pinyins + (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str)); + + const resplit_table_item_t * item_by_new = + context->m_full_pinyin_parser-> + retrieve_resplit_item_by_resplit_pinyins + (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str)); + + /* there are no same couple of pinyins in re-split table. 
*/ + assert(!(item_by_orig && item_by_new)); + + ChewingKey resplit_keys[2]; + const char * pinyins[2]; + + bool tosearch = false; + if (item_by_orig && item_by_orig->m_new_freq) { + pinyins[0] = item_by_orig->m_new_keys[0]; + pinyins[1] = item_by_orig->m_new_keys[1]; + + assert(context->m_full_pinyin_parser-> + parse_one_key(options, resplit_keys[0], + pinyins[0], strlen(pinyins[0]))); + + assert(context->m_full_pinyin_parser-> + parse_one_key(options, resplit_keys[1], + pinyins[1], strlen(pinyins[1]))); + tosearch = true; + } + + if (item_by_new && item_by_new->m_orig_freq) { + pinyins[0] = item_by_new->m_orig_keys[0]; + pinyins[1] = item_by_new->m_orig_keys[1]; + + assert(context->m_full_pinyin_parser-> + parse_one_key(options, resplit_keys[0], + pinyins[0], strlen(pinyins[0]))); + + assert(context->m_full_pinyin_parser-> + parse_one_key(options, resplit_keys[1], + pinyins[1], strlen(pinyins[1]))); + tosearch = true; + } + + if (tosearch) { + gchar * new_pinyins = g_strdup_printf + ("%s'%s", pinyins[0], pinyins[1]); + + /* propagate the tone */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != next_tone) { + assert(0 < next_tone && next_tone <= 5); + resplit_keys[1].m_tone = next_tone; + + gchar * tmp_str = g_strdup_printf + ("%s%d", new_pinyins, next_tone); + g_free(new_pinyins); + new_pinyins = tmp_str; + } + } + + /* do pinyin search. 
*/ + int retval = context->m_pinyin_table->search + (2, resplit_keys, ranges); + + if (retval & SEARCH_OK) { + lookup_candidate_t template_item; + template_item.m_candidate_type = RESPLIT_CANDIDATE; + template_item.m_orig_rest = orig_rest; + template_item.m_new_pinyins = new_pinyins; + + _append_items(context, ranges, &template_item, items); + found = true; + } + g_free(new_pinyins); + } + + /* restore tones */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != next_tone) { + next_key->m_tone = next_tone; + next_rest->m_raw_end ++; + } + } + + return found; +} + +bool pinyin_guess_full_pinyin_candidates(pinyin_instance_t * instance, + size_t offset){ + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + + _free_candidates(instance->m_candidates); + + size_t pinyin_len = pinyin_keys->len - offset; + pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len); + ssize_t i; + + /* lookup the previous token here. */ + phrase_token_t prev_token = null_token; + + if (options & DYNAMIC_ADJUST) { + prev_token = _get_previous_token(instance, offset); + } + + SingleGram merged_gram; + SingleGram * system_gram = NULL, * user_gram = NULL; + + if (options & DYNAMIC_ADJUST) { + if (null_token != prev_token) { + context->m_system_bigram->load(prev_token, system_gram); + context->m_user_bigram->load(prev_token, user_gram); + merge_single_gram(&merged_gram, system_gram, user_gram); + } + } + + PhraseIndexRanges ranges; + memset(ranges, 0, sizeof(ranges)); + context->m_phrase_index->prepare_ranges(ranges); + + GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); + + if (1 == pinyin_len) { + /* because there is only one pinyin left, + * the following for-loop will not produce 2 character candidates. + * the if-branch will fill the candidate list with + * 2 character candidates. 
+ */ + + if (options & USE_DIVIDED_TABLE) { + g_array_set_size(items, 0); + + if (_try_divided_table(instance, ranges, offset, items)) { + +#if 0 + g_array_sort(items, compare_item_with_token); + + _remove_duplicated_items(items); +#endif + + _compute_frequency_of_items(context, prev_token, + &merged_gram, items); + + /* sort the candidates of the same length by frequency. */ + g_array_sort(items, compare_item_with_frequency); + + /* transfer back items to tokens, and save it into candidates */ + for (i = 0; i < items->len; ++i) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, i); + g_array_append_val(instance->m_candidates, *item); + } + } + } + } + + for (i = pinyin_len; i >= 1; --i) { + bool found = false; + g_array_set_size(items, 0); + + if (2 == i) { + /* handle fuzzy pinyin segment here. */ + if (options & USE_DIVIDED_TABLE) { + found = _try_divided_table(instance, ranges, offset, items) || + found; + } + if (options & USE_RESPLIT_TABLE) { + found = _try_resplit_table(instance, ranges, offset, items) || + found; + } + } + + ChewingKey * keys = &g_array_index + (pinyin_keys, ChewingKey, offset); + + /* do pinyin search. 
*/ + int retval = context->m_pinyin_table->search + (i, keys, ranges); + + found = (retval & SEARCH_OK) || found; + + if ( !found ) + continue; + + lookup_candidate_t template_item; + _append_items(context, ranges, &template_item, items); + +#if 0 + g_array_sort(items, compare_item_with_token); + + _remove_duplicated_items(items); +#endif + + _compute_frequency_of_items(context, prev_token, &merged_gram, items); + + g_array_sort(items, compare_item_with_frequency); + + for (size_t k = 0; k < items->len; ++k) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, k); + g_array_append_val(instance->m_candidates, *item); + } + +#if 0 + if (!(retval & SEARCH_CONTINUED)) + break; +#endif + } + + g_array_free(items, TRUE); + context->m_phrase_index->destroy_ranges(ranges); + if (system_gram) + delete system_gram; + if (user_gram) + delete user_gram; + + /* post process to remove duplicated candidates */ + + _prepend_sentence_candidate(instance, instance->m_candidates); + + _compute_phrase_strings_of_items(instance, offset, instance->m_candidates); + + _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); + + return true; +} + + +int pinyin_choose_candidate(pinyin_instance_t * instance, + size_t offset, + lookup_candidate_t * candidate){ + pinyin_context_t * & context = instance->m_context; + + if (DIVIDED_CANDIDATE == candidate->m_candidate_type || + RESPLIT_CANDIDATE == candidate->m_candidate_type) { + /* update full pinyin. */ + gchar * oldpinyins = instance->m_raw_full_pinyin; + const ChewingKeyRest rest = candidate->m_orig_rest; + oldpinyins[rest.m_raw_begin] = '\0'; + const gchar * left_part = oldpinyins; + const gchar * right_part = oldpinyins + rest.m_raw_end; + gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins, + right_part, NULL); + g_free(oldpinyins); + instance->m_raw_full_pinyin = newpinyins; + + /* re-parse the full pinyin. 
*/ + const gchar * pinyins = instance->m_raw_full_pinyin; + int pinyin_len = strlen(pinyins); + int parse_len = context->m_full_pinyin_parser->parse + (context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, pinyins, pinyin_len); + + /* Note: there may be some un-parsable input here. */ + } + + /* sync m_constraints to the length of m_pinyin_keys. */ + bool retval = context->m_pinyin_lookup->validate_constraint + (instance->m_constraints, instance->m_pinyin_keys); + + phrase_token_t token = candidate->m_token; + guint8 len = context->m_pinyin_lookup->add_constraint + (instance->m_constraints, offset, token); + + /* safe guard: validate the m_constraints again. */ + retval = context->m_pinyin_lookup->validate_constraint + (instance->m_constraints, instance->m_pinyin_keys) && len; + + return offset + len; +} + +bool pinyin_clear_constraint(pinyin_instance_t * instance, + size_t offset){ + pinyin_context_t * & context = instance->m_context; + + bool retval = context->m_pinyin_lookup->clear_constraint + (instance->m_constraints, offset); + + return retval; +} + +bool pinyin_lookup_tokens(pinyin_instance_t * instance, + const char * phrase, GArray * tokenarray){ + pinyin_context_t * & context = instance->m_context; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + glong ucs4_len = 0; + ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens); + int num = reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + return SEARCH_OK & retval; +} + +bool pinyin_train(pinyin_instance_t * instance){ + if (!instance->m_context->m_user_dir) + return false; + + pinyin_context_t * & context = instance->m_context; + context->m_modified = true; + + bool retval = context->m_pinyin_lookup->train_result2 + 
(instance->m_pinyin_keys, instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_reset(pinyin_instance_t * instance){ + g_free(instance->m_raw_full_pinyin); + instance->m_raw_full_pinyin = NULL; + + g_array_set_size(instance->m_prefixes, 0); + g_array_set_size(instance->m_pinyin_keys, 0); + g_array_set_size(instance->m_pinyin_key_rests, 0); + g_array_set_size(instance->m_constraints, 0); + g_array_set_size(instance->m_match_results, 0); + _free_candidates(instance->m_candidates); + + return true; +} + +bool pinyin_get_chewing_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str) { + *utf8_str = NULL; + if (0 == key->get_table_index()) + return false; + + *utf8_str = key->get_chewing_string(); + return true; +} + +bool pinyin_get_pinyin_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str) { + *utf8_str = NULL; + if (0 == key->get_table_index()) + return false; + + *utf8_str = key->get_pinyin_string(); + return true; +} + +bool pinyin_get_pinyin_strings(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** shengmu, + gchar ** yunmu) { + if (0 == key->get_table_index()) + return false; + + if (shengmu) + *shengmu = key->get_shengmu_string(); + if (yunmu) + *yunmu = key->get_yunmu_string(); + return true; +} + +bool pinyin_token_get_phrase(pinyin_instance_t * instance, + phrase_token_t token, + guint * len, + gchar ** utf8_str) { + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + ucs4_t buffer[MAX_PHRASE_LENGTH]; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + item.get_phrase_string(buffer); + guint length = item.get_phrase_length(); + if (len) + *len = length; + if (utf8_str) + *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + return true; +} + +bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint * num){ + *num = 0; + 
pinyin_context_t * & context = instance->m_context; + PhraseItem item; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + *num = item.get_n_pronunciation(); + return true; +} + +bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint nth, + ChewingKeyVector keys){ + g_array_set_size(keys, 0); + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + ChewingKey buffer[MAX_PHRASE_LENGTH]; + guint32 freq = 0; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + item.get_nth_pronunciation(nth, buffer, freq); + guint8 len = item.get_phrase_length(); + g_array_append_vals(keys, buffer, len); + return true; +} + +bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint * freq) { + *freq = 0; + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + *freq = item.get_unigram_frequency(); + return true; +} + +bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint delta){ + pinyin_context_t * & context = instance->m_context; + int retval = context->m_phrase_index->add_unigram_frequency + (token, delta); + return ERROR_OK == retval; +} + +bool pinyin_get_n_candidate(pinyin_instance_t * instance, + guint * num) { + *num = instance->m_candidates->len; + return true; +} + +bool pinyin_get_candidate(pinyin_instance_t * instance, + guint index, + lookup_candidate_t ** candidate) { + CandidateVector & candidates = instance->m_candidates; + + *candidate = NULL; + + if (index >= candidates->len) + return false; + + *candidate = &g_array_index(candidates, lookup_candidate_t, index); + + return true; +} + +bool pinyin_get_candidate_type(pinyin_instance_t * instance, + 
lookup_candidate_t * candidate, + lookup_candidate_type_t * type) { + *type = candidate->m_candidate_type; + return true; +} + +bool pinyin_get_candidate_string(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + const gchar ** utf8_str) { + *utf8_str = candidate->m_phrase_string; + return true; +} + +bool pinyin_get_n_pinyin(pinyin_instance_t * instance, + guint * num) { + *num = 0; + + if (instance->m_pinyin_keys->len != + instance->m_pinyin_key_rests->len) + return false; + + *num = instance->m_pinyin_keys->len; + return true; +} + +bool pinyin_get_pinyin_key(pinyin_instance_t * instance, + guint index, + ChewingKey ** key) { + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + + *key = NULL; + + if (index >= pinyin_keys->len) + return false; + + *key = &g_array_index(pinyin_keys, ChewingKey, index); + + return true; +} + +bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance, + guint index, + ChewingKeyRest ** key_rest) { + ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; + + *key_rest = NULL; + + if (index >= pinyin_key_rests->len) + return false; + + *key_rest = &g_array_index(pinyin_key_rests, ChewingKeyRest, index); + + return true; +} + +bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * begin, guint16 * end) { + if (begin) + *begin = key_rest->m_raw_begin; + + if (end) + *end = key_rest->m_raw_end; + + return true; +} + +bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * length) { + *length = key_rest->length(); + return true; +} + +bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance, + const gchar ** utf8_str) { + *utf8_str = instance->m_raw_full_pinyin; + return true; +} + +bool pinyin_get_n_phrase(pinyin_instance_t * instance, + guint * num) { + *num = instance->m_match_results->len; + return true; +} + +bool pinyin_get_phrase_token(pinyin_instance_t * instance, + guint 
index, + phrase_token_t * token){ + MatchResults & match_results = instance->m_match_results; + + *token = null_token; + + if (index >= match_results->len) + return false; + + *token = g_array_index(match_results, phrase_token_t, index); + + return true; +} + + +/** + * Note: prefix is the text before the pre-edit string. + */ diff --git a/src/pinyin.h b/src/pinyin.h new file mode 100644 index 0000000..8c39c3d --- /dev/null +++ b/src/pinyin.h @@ -0,0 +1,719 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#ifndef PINYIN_H +#define PINYIN_H + + +#include "novel_types.h" +#include "pinyin_custom2.h" + + +G_BEGIN_DECLS + +typedef struct _ChewingKey ChewingKey; +typedef struct _ChewingKeyRest ChewingKeyRest; + +typedef struct _pinyin_context_t pinyin_context_t; +typedef struct _pinyin_instance_t pinyin_instance_t; +typedef struct _lookup_candidate_t lookup_candidate_t; + +typedef struct _import_iterator_t import_iterator_t; + +typedef enum _lookup_candidate_type_t{ + BEST_MATCH_CANDIDATE = 1, + NORMAL_CANDIDATE, + DIVIDED_CANDIDATE, + RESPLIT_CANDIDATE, + ZOMBIE_CANDIDATE +} lookup_candidate_type_t; + +/** + * pinyin_init: + * @systemdir: the system wide language model data directory. + * @userdir: the user's language model data directory. + * @returns: the newly created pinyin context, NULL if failed. + * + * Create a new pinyin context. + * + */ +pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir); + +/** + * pinyin_load_phrase_library: + * @context: the pinyin context. + * @index: the phrase index to be loaded. + * @returns: whether the load succeeded. + * + * Load the sub phrase library of the index. + * + */ +bool pinyin_load_phrase_library(pinyin_context_t * context, + guint8 index); + +/** + * pinyin_unload_phrase_library: + * @context: the pinyin context. + * @index: the phrase index to be unloaded. + * @returns: whether the unload succeeded. + * + * Unload the sub phrase library of the index. + * + */ +bool pinyin_unload_phrase_library(pinyin_context_t * context, + guint8 index); + +/** + * pinyin_begin_add_phrases: + * @context: the pinyin context. + * @index: the phrase index to be imported. + * @returns: the import iterator. + * + * Begin to add phrases. + * + */ +import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, + guint8 index); + +/** + * pinyin_iterator_add_phrase: + * @iter: the import iterator. + * @phrase: the phrase string. + * @pinyin: the pinyin string. 
+ * @count: the count of the phrase/pinyin pair, -1 to use the default value. + * @returns: whether the add operation succeeded. + * + * Add a pair of phrase and pinyin with count. + * + */ +bool pinyin_iterator_add_phrase(import_iterator_t * iter, + const char * phrase, + const char * pinyin, + gint count); + +/** + * pinyin_end_add_phrases: + * @iter: the import iterator. + * + * End adding phrases. + * + */ +void pinyin_end_add_phrases(import_iterator_t * iter); + +/** + * pinyin_save: + * @context: the pinyin context to be saved into user directory. + * @returns: whether the save succeeded. + * + * Save the user's self-learning information of the pinyin context. + * + */ +bool pinyin_save(pinyin_context_t * context); + +/** + * pinyin_set_double_pinyin_scheme: + * @context: the pinyin context. + * @scheme: the double pinyin scheme. + * @returns: whether the set double pinyin scheme succeeded. + * + * Change the double pinyin scheme of the pinyin context. + * + */ +bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context, + DoublePinyinScheme scheme); + +/** + * pinyin_set_chewing_scheme: + * @context: the pinyin context. + * @scheme: the chewing scheme. + * @returns: whether the set chewing scheme succeeded. + * + * Change the chewing scheme of the pinyin context. + * + */ +bool pinyin_set_chewing_scheme(pinyin_context_t * context, + ChewingScheme scheme); + +/** + * pinyin_fini: + * @context: the pinyin context. + * + * Finalize the pinyin context. + * + */ +void pinyin_fini(pinyin_context_t * context); + + +/** + * pinyin_mask_out: + * @context: the pinyin context. + * @mask: the mask. + * @value: the value. + * @returns: whether the mask out operation is successful. + * + * Mask out the matched phrase tokens. + * + */ +bool pinyin_mask_out(pinyin_context_t * context, + phrase_token_t mask, + phrase_token_t value); + + +/** + * pinyin_set_options: + * @context: the pinyin context. + * @options: the pinyin options of the pinyin context. 
+ * @returns: whether the set options scheme succeeded. + * + * Set the options of the pinyin context. + * + */ +bool pinyin_set_options(pinyin_context_t * context, + pinyin_option_t options); + +/** + * pinyin_alloc_instance: + * @context: the pinyin context. + * @returns: the newly allocated pinyin instance, NULL if failed. + * + * Allocate a new pinyin instance from the context. + * + */ +pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context); + +/** + * pinyin_free_instance: + * @instance: the pinyin instance. + * + * Free the pinyin instance. + * + */ +void pinyin_free_instance(pinyin_instance_t * instance); + + +/** + * pinyin_guess_sentence: + * @instance: the pinyin instance. + * @returns: whether the sentence are guessed successfully. + * + * Guess a sentence from the saved pinyin keys in the instance. + * + */ +bool pinyin_guess_sentence(pinyin_instance_t * instance); + +/** + * pinyin_guess_sentence_with_prefix: + * @instance: the pinyin instance. + * @prefix: the prefix before the sentence. + * @returns: whether the sentence are guessed successfully. + * + * Guess a sentence from the saved pinyin keys with a prefix. + * + */ +bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, + const char * prefix); + +/** + * pinyin_phrase_segment: + * @instance: the pinyin instance. + * @sentence: the utf-8 sentence to be segmented. + * @returns: whether the sentence are segmented successfully. + * + * Segment a sentence and saved the result in the instance. + * + */ +bool pinyin_phrase_segment(pinyin_instance_t * instance, + const char * sentence); + +/** + * pinyin_get_sentence: + * @instance: the pinyin instance. + * @sentence: the saved sentence in the instance. + * @returns: whether the sentence is already saved in the instance. + * + * Get the sentence from the instance. + * + * Note: the returned sentence should be freed by g_free(). 
+ * + */ +bool pinyin_get_sentence(pinyin_instance_t * instance, + char ** sentence); + +/** + * pinyin_parse_full_pinyin: + * @instance: the pinyin instance. + * @onepinyin: a single full pinyin to be parsed. + * @onekey: the parsed key. + * @returns: whether the parse is successfully. + * + * Parse a single full pinyin. + * + */ +bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, + const char * onepinyin, + ChewingKey * onekey); + +/** + * pinyin_parse_more_full_pinyins: + * @instance: the pinyin instance. + * @pinyins: the full pinyins to be parsed. + * @returns: the parsed length of the full pinyins. + * + * Parse multiple full pinyins and save it in the instance. + * + */ +size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, + const char * pinyins); + +/** + * pinyin_parse_double_pinyin: + * @instance: the pinyin instance. + * @onepinyin: the single double pinyin to be parsed. + * @onekey: the parsed key. + * @returns: whether the parse is successfully. + * + * Parse a single double pinyin. + * + */ +bool pinyin_parse_double_pinyin(pinyin_instance_t * instance, + const char * onepinyin, + ChewingKey * onekey); + +/** + * pinyin_parse_more_double_pinyins: + * @instance: the pinyin instance. + * @pinyins: the double pinyins to be parsed. + * @returns: the parsed length of the double pinyins. + * + * Parse multiple double pinyins and save it in the instance. + * + */ +size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance, + const char * pinyins); + +/** + * pinyin_parse_chewing: + * @instance: the pinyin instance. + * @onechewing: the single chewing to be parsed. + * @onekey: the parsed key. + * @returns: whether the parse is successfully. + * + * Parse a single chewing. + * + */ +bool pinyin_parse_chewing(pinyin_instance_t * instance, + const char * onechewing, + ChewingKey * onekey); + +/** + * pinyin_parse_more_chewings: + * @instance: the pinyin instance. + * @chewings: the chewings to be parsed. 
+ * @returns: the parsed length of the chewings. + * + * Parse multiple chewings and save it in the instance. + * + */ +size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, + const char * chewings); + +/** + * pinyin_in_chewing_keyboard: + * @instance: the pinyin instance. + * @key: the input key. + * @symbol: the chewing symbol. + * @returns: whether the key is in current chewing scheme. + * + * Check whether the input key is in current chewing scheme. + * + */ +bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, + const char key, const char ** symbol); +/** + * pinyin_guess_candidates: + * @instance: the pinyin instance. + * @offset: the offset in the pinyin keys. + * @returns: whether a list of tokens are gotten. + * + * Guess the candidates at the offset. + * + */ +bool pinyin_guess_candidates(pinyin_instance_t * instance, + size_t offset); + +/** + * pinyin_guess_full_pinyin_candidates: + * @instance: the pinyin instance. + * @offset: the offset in the pinyin keys. + * @returns: whether a list of lookup_candidate_t candidates are gotten. + * + * Guess the full pinyin candidates at the offset. + * + */ +bool pinyin_guess_full_pinyin_candidates(pinyin_instance_t * instance, + size_t offset); + +/** + * pinyin_choose_candidate: + * @instance: the pinyin instance. + * @offset: the offset in the pinyin keys. + * @candidate: the selected candidate. + * @returns: the cursor after the chosen candidate. + * + * Choose a full pinyin candidate at the offset. + * + */ +int pinyin_choose_candidate(pinyin_instance_t * instance, + size_t offset, + lookup_candidate_t * candidate); + +/** +* pinyin_clear_constraint: +* @instance: the pinyin instance. +* @offset: the offset in the pinyin keys. +* @returns: whether the constraint is cleared. +* +* Clear the previous chosen candidate. +* +*/ +bool pinyin_clear_constraint(pinyin_instance_t * instance, + size_t offset); + +/** + * pinyin_lookup_tokens: + * @instance: the pinyin instance. 
+ * @phrase: the phrase to be looked up. + * @tokenarray: the returned GArray of tokens. + * @returns: whether the lookup operation is successful. + * + * Lookup the tokens for the phrase utf8 string. + * + */ +bool pinyin_lookup_tokens(pinyin_instance_t * instance, + const char * phrase, GArray * tokenarray); + +/** + * pinyin_train: + * @instance: the pinyin instance. + * @returns: whether the sentence is trained. + * + * Train the current user input sentence. + * + */ +bool pinyin_train(pinyin_instance_t * instance); + +/** + * pinyin_reset: + * @instance: the pinyin instance. + * @returns: whether the pinyin instance is resetted. + * + * Reset the pinyin instance. + * + */ +bool pinyin_reset(pinyin_instance_t * instance); + +/** + * pinyin_get_chewing_string: + * @instance: the pinyin instance. + * @key: the chewing key. + * @utf8_str: the chewing string. + * @returns: whether the get operation is successful. + * + * Get the chewing string of the key. + * + */ +bool pinyin_get_chewing_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str); + +/** + * pinyin_get_pinyin_string: + * @instance: the pinyin instance. + * @key: the pinyin key. + * @utf8_str: the pinyin string. + * @returns: whether the get operation is successful. + * + * Get the pinyin string of the key. + * + */ +bool pinyin_get_pinyin_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str); + +/** + * pinyin_get_pinyin_strings: + * @instance: the pinyin instance. + * @key: the pinyin key. + * @shengmu: the shengmu string. + * @yunmu: the yunmu string. + * @returns: whether the get operation is successful. + * + * Get the shengmu and yunmu strings of the key. + * + */ +bool pinyin_get_pinyin_strings(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** shengmu, + gchar ** yunmu); + +/** + * pinyin_token_get_phrase: + * @instance: the pinyin instance. + * @token: the phrase token. + * @len: the phrase length. + * @utf8_str: the phrase string. 
+ * @returns: whether the get operation is successful. + * + * Get the phrase length and utf8 string. + * + */ +bool pinyin_token_get_phrase(pinyin_instance_t * instance, + phrase_token_t token, + guint * len, + gchar ** utf8_str); + +/** + * pinyin_token_get_n_pronunciation: + * @instance: the pinyin instance. + * @token: the phrase token. + * @num: the number of pinyins. + * @returns: whether the get operation is successful. + * + * Get the number of the pinyins. + * + */ +bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint * num); + +/** + * pinyin_token_get_nth_pronunciation: + * @instance: the pinyin instance. + * @token: the phrase token. + * @nth: the index of the pinyin. + * @keys: the GArray of chewing key. + * @returns: whether the get operation is successful. + * + * Get the nth pinyin from the phrase. + * + */ +bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint nth, + ChewingKeyVector keys); + +/** + * pinyin_token_get_unigram_frequency: + * @instance: the pinyin instance. + * @token: the phrase token. + * @freq: the unigram frequency of the phrase. + * @returns: whether the get operation is successful. + * + * Get the unigram frequency of the phrase. + * + */ +bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint * freq); + +/** + * pinyin_token_add_unigram_frequency: + * @instance: the pinyin instance. + * @token: the phrase token. + * @delta: the delta of the unigram frequency. + * @returns: whether the add operation is successful. + * + * Add delta to the unigram frequency of the phrase token. + * + */ +bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint delta); + +/** + * pinyin_get_n_candidate: + * @instance: the pinyin instance. + * @num: the number of the candidates. + * @returns: whether the get operation is successful. 
+ * + * Get the number of the candidates. + * + */ +bool pinyin_get_n_candidate(pinyin_instance_t * instance, + guint * num); + +/** + * pinyin_get_candidate: + * @instance: the pinyin instance. + * @index: the index of the candidate. + * @candidate: the retrieved candidate. + * + * Get the candidate of the index from the candidates. + * + */ +bool pinyin_get_candidate(pinyin_instance_t * instance, + guint index, + lookup_candidate_t ** candidate); + +/** + * pinyin_get_candidate_type: + * @instance: the pinyin instance. + * @candidate: the lookup candidate. + * @type: the type of the candidate. + * @returns: whether the get operation is successful. + * + * Get the type of the lookup candidate. + * + */ +bool pinyin_get_candidate_type(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + lookup_candidate_type_t * type); + +/** + * pinyin_get_candidate_string: + * @instance: the pinyin instance. + * @candidate: the lookup candidate. + * @utf8_str: the string of the candidate. + * @returns: whether the get operation is successful. + * + * Get the string of the candidate. + * + */ +bool pinyin_get_candidate_string(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + const gchar ** utf8_str); + +/** + * pinyin_get_n_pinyin: + * @instance: the pinyin instance. + * @num: the number of the pinyins. + * @returns: whether the get operation is successful. + * + * Get the number of the pinyins. + * + */ +bool pinyin_get_n_pinyin(pinyin_instance_t * instance, + guint * num); + +/** + * pinyin_get_pinyin_key: + * @instance: the pinyin instance. + * @index: the index of the pinyin key. + * @key: the retrieved pinyin key. + * @returns: whether the get operation is successful. + * + * Get the pinyin key of the index from the pinyin keys. + * + */ +bool pinyin_get_pinyin_key(pinyin_instance_t * instance, + guint index, + ChewingKey ** key); + +/** + * pinyin_get_pinyin_key_rest: + * @instance: the pinyin index. + * @index: the index of the pinyin key rest. 
+ * @key_rest: the retrieved pinyin key rest. + * @returns: whether the get operation is successful. + * + * Get the pinyin key rest of the index from the pinyin key rests. + * + */ +bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance, + guint index, + ChewingKeyRest ** key_rest); + +/** + * pinyin_get_pinyin_key_rest_positions: + * @instance: the pinyin instance. + * @key_rest: the pinyin key rest. + * @begin: the begin position of the corresponding pinyin key. + * @end: the end position of the corresponding pinyin key. + * @returns: whether the get operation is successful. + * + * Get the positions of the pinyin key rest. + * + */ +bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * begin, guint16 * end); + +/** + * pinyin_get_pinyin_key_rest_length: + * @instance: the pinyin instance. + * @key_rest: the pinyin key rest. + * @length: the length of the corresponding pinyin key. + * @returns: whether the get operation is successful. + * + * Get the length of the corresponding pinyin key. + * + */ +bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * length); + +/** + * pinyin_get_raw_full_pinyin: + * @instance: the pinyin instance. + * @utf8_str: the modified raw full pinyin after choose candidate. + * @returns: whether the get operation is successful. + * + * Get the modified raw full pinyin after choose candidate. + * + */ +bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance, + const gchar ** utf8_str); + +/** + * pinyin_get_n_phrase: + * @instance: the pinyin instance. + * @num: the number of the phrase tokens. + * @returns: whether the get operation is successful. + * + * Get the number of the phrase tokens. + * + */ +bool pinyin_get_n_phrase(pinyin_instance_t * instance, + guint * num); + +/** + * pinyin_get_phrase_token: + * @instance: the pinyin instance. + * @index: the index of the phrase token. 
+ * @token: the retrieved phrase token. + * @returns: whether the get operation is successful. + * + * Get the phrase token of the index from the phrase tokens. + * + */ +bool pinyin_get_phrase_token(pinyin_instance_t * instance, + guint index, + phrase_token_t * token); + +/* hack here. */ +typedef ChewingKey PinyinKey; +typedef ChewingKeyRest PinyinKeyPos; + + +G_END_DECLS + +#endif diff --git a/src/pinyin_internal.cpp b/src/pinyin_internal.cpp new file mode 100644 index 0000000..79fb688 --- /dev/null +++ b/src/pinyin_internal.cpp @@ -0,0 +1,4 @@ +#include "pinyin_internal.h" + + +/* Place holder for pinyin internal library. */ diff --git a/src/pinyin_internal.h b/src/pinyin_internal.h new file mode 100644 index 0000000..3f97efa --- /dev/null +++ b/src/pinyin_internal.h @@ -0,0 +1,73 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#ifndef PINYIN_INTERNAL_H +#define PINYIN_INTERNAL_H + +#include <stdio.h> +#include "novel_types.h" +#include "memory_chunk.h" +#include "pinyin_custom2.h" +#include "chewing_key.h" +#include "pinyin_parser2.h" +#include "pinyin_phrase2.h" +#include "chewing_large_table.h" +#include "phrase_large_table2.h" +#include "facade_chewing_table.h" +#include "facade_phrase_table2.h" +#include "phrase_index.h" +#include "phrase_index_logger.h" +#include "ngram.h" +#include "lookup.h" +#include "pinyin_lookup2.h" +#include "phrase_lookup.h" +#include "tag_utility.h" +#include "table_info.h" + + +/* training module */ +#include "flexible_ngram.h" + + +/* define filenames */ +#define SYSTEM_TABLE_INFO "table.conf" +#define USER_TABLE_INFO "user.conf" +#define SYSTEM_BIGRAM "bigram.db" +#define USER_BIGRAM "user_bigram.db" +#define DELETED_BIGRAM "deleted_bigram.db" +#define SYSTEM_PINYIN_INDEX "pinyin_index.bin" +#define USER_PINYIN_INDEX "user_pinyin_index.bin" +#define SYSTEM_PHRASE_INDEX "phrase_index.bin" +#define USER_PHRASE_INDEX "user_phrase_index.bin" + + +using namespace pinyin; + + +/* the following fixes build on Debian GNU/kFreeBSD */ +#include <errno.h> +#ifndef ENODATA +#define ENODATA ENOENT +#endif + + +#endif diff --git a/src/storage/CMakeLists.txt b/src/storage/CMakeLists.txt new file mode 100644 index 0000000..e33e213 --- /dev/null +++ b/src/storage/CMakeLists.txt @@ -0,0 +1,38 @@ +set( + CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" +) + +set( + LIBSTORAGE_HEADERS + chewing_key.h + pinyin_custom2.h +) + +set( + LIBSTORAGE_SOURCES + phrase_index.cpp + phrase_large_table2.cpp + ngram.cpp + tag_utility.cpp + pinyin_parser2.cpp + chewing_large_table.cpp +) + +add_library( + storage + STATIC + ${LIBSTORAGE_SOURCES} +) + +target_link_libraries( + storage + ${GLIB2_LIBRARIES} + ${BERKELEY_DB_LIBRARIES} +) + +install( + FILES + ${LIBSTORAGE_HEADERS} + DESTINATION + ${DIR_INCLUDE_LIBPINYIN} +) diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am 
new file mode 100644 index 0000000..d805f18 --- /dev/null +++ b/src/storage/Makefile.am @@ -0,0 +1,59 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CFLAGS@ + +libpinyinincludedir = $(includedir)/libpinyin-@VERSION@ + +libpinyininclude_HEADERS= pinyin_custom2.h + + +noinst_HEADERS = chewing_enum.h \ + chewing_key.h \ + pinyin_parser2.h \ + phrase_index.h \ + phrase_index_logger.h \ + phrase_large_table2.h \ + ngram.h \ + flexible_ngram.h \ + tag_utility.h \ + pinyin_parser_table.h \ + double_pinyin_table.h \ + chewing_table.h \ + pinyin_phrase2.h \ + chewing_large_table.h \ + facade_chewing_table.h \ + facade_phrase_table2.h \ + table_info.h + + +noinst_LTLIBRARIES = libstorage.la + +libstorage_la_CXXFLAGS = "-fPIC" + +libstorage_la_LDFLAGS = -static + +libstorage_la_SOURCES = phrase_index.cpp \ + phrase_large_table2.cpp \ + ngram.cpp \ + tag_utility.cpp \ + pinyin_parser2.cpp \ + chewing_large_table.cpp \ + table_info.cpp + diff --git a/src/storage/chewing_enum.h b/src/storage/chewing_enum.h new file mode 100644 index 0000000..e6d212d --- /dev/null +++ b/src/storage/chewing_enum.h @@ -0,0 +1,104 @@ +/* This file is 
   generated by python scripts. Don't edit this file directly.
 */

#ifndef CHEWING_ENUM_H
#define CHEWING_ENUM_H

namespace pinyin{

/**
 * @brief enums of chewing initial element.
 *
 * Note: the numeric values are used directly as array indexes into the
 * bitmap lookup tables (see chewing_large_table.h) and are written into
 * the binary table files, so they must not be renumbered or reordered.
 * PINYIN_W and PINYIN_Y are pinyin-only pseudo-initials (zhuyin has no
 * standalone symbol for them), hence the different prefix.
 */

enum ChewingInitial
{
CHEWING_ZERO_INITIAL = 0,  /* syllable with no initial */
CHEWING_B = 1,
CHEWING_C = 2,
CHEWING_CH = 3,
CHEWING_D = 4,
CHEWING_F = 5,
CHEWING_H = 6,
CHEWING_G = 7,
CHEWING_K = 8,
CHEWING_J = 9,
CHEWING_M = 10,
CHEWING_N = 11,
CHEWING_L = 12,
CHEWING_R = 13,
CHEWING_P = 14,
CHEWING_Q = 15,
CHEWING_S = 16,
CHEWING_SH = 17,
CHEWING_T = 18,
PINYIN_W = 19,   /* pinyin-only initial "w" */
CHEWING_X = 20,
PINYIN_Y = 21,   /* pinyin-only initial "y" */
CHEWING_Z = 22,
CHEWING_ZH = 23,
CHEWING_LAST_INITIAL = CHEWING_ZH,
CHEWING_NUMBER_OF_INITIALS = CHEWING_LAST_INITIAL + 1
};


/**
 * @brief enums of chewing middle element (the medial vowel).
 */

enum ChewingMiddle
{
CHEWING_ZERO_MIDDLE = 0,  /* no medial */
CHEWING_I = 1,
CHEWING_U = 2,
CHEWING_V = 3,  /* presumably the u-umlaut medial -- generator-defined */
CHEWING_LAST_MIDDLE = CHEWING_V,
CHEWING_NUMBER_OF_MIDDLES = CHEWING_LAST_MIDDLE + 1
};


/**
 * @brief enums of chewing final element.
 *
 * INVALID_EA keeps slot 7 reserved; entries with the PINYIN_ prefix
 * exist only on the pinyin side of the mapping.
 */
enum ChewingFinal
{
CHEWING_ZERO_FINAL = 0,  /* syllable with no final */
CHEWING_A = 1,
CHEWING_AI = 2,
CHEWING_AN = 3,
CHEWING_ANG = 4,
CHEWING_AO = 5,
CHEWING_E = 6,
INVALID_EA = 7,  /* reserved/invalid slot -- generator artifact */
CHEWING_EI = 8,
CHEWING_EN = 9,
CHEWING_ENG = 10,
CHEWING_ER = 11,
CHEWING_NG = 12,
CHEWING_O = 13,
PINYIN_ONG = 14,
CHEWING_OU = 15,
PINYIN_IN = 16,
PINYIN_ING = 17,
CHEWING_LAST_FINAL = PINYIN_ING,
CHEWING_NUMBER_OF_FINALS = CHEWING_LAST_FINAL + 1
};


/**
 * @brief enums of chewing tone element.
 *
 * CHEWING_ZERO_TONE means "tone not specified"; the search code treats
 * it as a wildcard over all stored tones (see
 * ChewingBitmapIndexLevel::tone_level_search).
 */
enum ChewingTone
{
CHEWING_ZERO_TONE = 0,
CHEWING_1 = 1,
CHEWING_2 = 2,
CHEWING_3 = 3,
CHEWING_4 = 4,
CHEWING_5 = 5,
CHEWING_LAST_TONE = CHEWING_5,
CHEWING_NUMBER_OF_TONES = CHEWING_LAST_TONE + 1
};

};

#endif
 *
 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#ifndef CHEWING_KEY_H
#define CHEWING_KEY_H

#include <glib.h>
#include "chewing_enum.h"

using namespace pinyin;

G_BEGIN_DECLS

/** @file chewing_key.h
 *  @brief the definitions of chewing key related classes and structs.
 */


/** Note: The parsed pinyins are stored in the following two
 *  GArrays to speed up chewing table lookup.
 *  As the chewing large table only contains information of struct ChewingKey.
 */

/* One parsed syllable, packed into a single 16-bit value.  The bit
 * widths are sized to the enum counts in chewing_enum.h:
 * 24 initials -> 5 bits, 4 middles -> 2 bits, 18 finals -> 5 bits,
 * 6 tones -> 3 bits.  Do not reorder or resize the fields: the packed
 * layout is what the large-table binary data stores. */
struct _ChewingKey
{
    guint16 m_initial : 5;  /* ChewingInitial */
    guint16 m_middle  : 2;  /* ChewingMiddle */
    guint16 m_final   : 5;  /* ChewingFinal */
    guint16 m_tone    : 3;  /* ChewingTone */

    /* default: the all-zero (empty) key. */
    _ChewingKey() {
        m_initial = CHEWING_ZERO_INITIAL;
        m_middle = CHEWING_ZERO_MIDDLE;
        m_final = CHEWING_ZERO_FINAL;
        m_tone = CHEWING_ZERO_TONE;
    }

    /* construct a toneless key from its phonetic components. */
    _ChewingKey(ChewingInitial initial, ChewingMiddle middle,
                ChewingFinal final) {
        m_initial = initial;
        m_middle = middle;
        m_final = final;
        m_tone = CHEWING_ZERO_TONE;
    }

public:
    gint get_table_index();

    /* Note: the return value should be freed by g_free. */
    gchar * get_pinyin_string();
    gchar * get_shengmu_string();
    gchar * get_yunmu_string();
    gchar * get_chewing_string();
};

typedef struct _ChewingKey ChewingKey;

/* field-wise equality; by-value parameters are fine for a 16-bit POD. */
static inline bool operator == (ChewingKey lhs, ChewingKey rhs) {
    if (lhs.m_initial != rhs.m_initial)
        return false;
    if (lhs.m_middle != rhs.m_middle)
        return false;
    if (lhs.m_final != rhs.m_final)
        return false;
    if (lhs.m_tone != rhs.m_tone)
        return false;
    return true;
}

/* The half-open byte span [m_raw_begin, m_raw_end) of the raw user
 * input that produced the corresponding ChewingKey. */
struct _ChewingKeyRest
{
    /* Note: the table index is removed,
     * Please use get_table_index in ChewingKey.
     */
    guint16 m_raw_begin;  /* the begin of the raw input. */
    guint16 m_raw_end;    /* the end of the raw input. */

    _ChewingKeyRest() {
        /* the 0th item in pinyin parser table is reserved for invalid. */
        m_raw_begin = 0;
        m_raw_end = 0;
    }

    /* length in bytes of the raw input span. */
    guint16 length() {
        return m_raw_end - m_raw_begin;
    }
};

typedef struct _ChewingKeyRest ChewingKeyRest;

G_END_DECLS

#endif
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "chewing_large_table.h" +#include <assert.h> +#include "pinyin_phrase2.h" +#include "pinyin_parser2.h" + + +/* internal class definition */ + +namespace pinyin{ +class ChewingLengthIndexLevel{ + +protected: + GArray * m_chewing_array_indexes; + +public: + /* constructor/destructor */ + ChewingLengthIndexLevel(); + ~ChewingLengthIndexLevel(); + + /* load/store method */ + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, + table_offset_t & end); + + /* search method */ + int search(pinyin_option_t options, int phrase_length, + /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const; + + /* add/remove index method */ + int add_index(int phrase_length, /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token); + int remove_index(int phrase_length, /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token); + + /* get length method */ + int get_length() const; + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + + +template<size_t phrase_length> +class ChewingArrayIndexLevel{ +protected: + typedef PinyinIndexItem2<phrase_length> IndexItem; + +protected: + MemoryChunk m_chunk; + + /* compress consecutive tokens */ + int convert(pinyin_option_t options, + const ChewingKey keys[], + IndexItem * begin, + IndexItem * end, + PhraseIndexRanges ranges) const; + +public: + /* load/store method */ + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, + table_offset_t & end); + + /* search method */ + int search(pinyin_option_t options, /* in */const ChewingKey keys[], + /* out */ PhraseIndexRanges 
ranges) const; + + /* add/remove index method */ + int add_index(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token); + int remove_index(/* in */ const ChewingKey keys[], + /* in */ phrase_token_t token); + + /* get length method */ + int get_length() const; + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + +}; + + +using namespace pinyin; + +/* class implementation */ + +ChewingBitmapIndexLevel::ChewingBitmapIndexLevel(pinyin_option_t options) + : m_options(options) { + memset(m_chewing_length_indexes, 0, sizeof(m_chewing_length_indexes)); +} + +void ChewingBitmapIndexLevel::reset() { + for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k) + for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l) + for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m) + for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES; + ++n) { + ChewingLengthIndexLevel * & length_array = + m_chewing_length_indexes[k][l][m][n]; + if (length_array) + delete length_array; + length_array = NULL; + } +} + + +/* search method */ + +int ChewingBitmapIndexLevel::search(int phrase_length, + /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + assert(phrase_length > 0); + return initial_level_search(phrase_length, keys, ranges); +} + +int ChewingBitmapIndexLevel::initial_level_search (int phrase_length, + /* in */ const ChewingKey keys[], /* out */ PhraseIndexRanges ranges) const { + +/* macros */ +#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \ + { \ + result |= middle_and_final_level_search(ORIGIN, phrase_length, \ + keys, ranges); \ + if (m_options & AMBIGUITY) { \ + result |= middle_and_final_level_search(ANOTHER, \ + phrase_length, \ + keys, ranges); \ + } \ + return result; \ + } + + /* deal with ambiguities */ + int result = SEARCH_NONE; + const ChewingKey & first_key = keys[0]; + + switch(first_key.m_initial) { + MATCH(PINYIN_AMB_C_CH, CHEWING_C, 
CHEWING_CH); + MATCH(PINYIN_AMB_C_CH, CHEWING_CH, CHEWING_C); + MATCH(PINYIN_AMB_Z_ZH, CHEWING_Z, CHEWING_ZH); + MATCH(PINYIN_AMB_Z_ZH, CHEWING_ZH, CHEWING_Z); + MATCH(PINYIN_AMB_S_SH, CHEWING_S, CHEWING_SH); + MATCH(PINYIN_AMB_S_SH, CHEWING_SH, CHEWING_S); + MATCH(PINYIN_AMB_L_R, CHEWING_R, CHEWING_L); + MATCH(PINYIN_AMB_L_N, CHEWING_N, CHEWING_L); + MATCH(PINYIN_AMB_F_H, CHEWING_F, CHEWING_H); + MATCH(PINYIN_AMB_F_H, CHEWING_H, CHEWING_F); + MATCH(PINYIN_AMB_G_K, CHEWING_G, CHEWING_K); + MATCH(PINYIN_AMB_G_K, CHEWING_K, CHEWING_G); + + case CHEWING_L: + { + result |= middle_and_final_level_search + (CHEWING_L, phrase_length, keys, ranges); + + if (m_options & PINYIN_AMB_L_N) + result |= middle_and_final_level_search + (CHEWING_N, phrase_length, keys,ranges); + + if (m_options & PINYIN_AMB_L_R) + result |= middle_and_final_level_search + (CHEWING_R, phrase_length, keys, ranges); + return result; + } + default: + { + result |= middle_and_final_level_search + ((ChewingInitial) first_key.m_initial, + phrase_length, keys, ranges); + return result; + } + } +#undef MATCH + return result; +} + + +int ChewingBitmapIndexLevel::middle_and_final_level_search +(ChewingInitial initial, int phrase_length, /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + +/* macros */ +#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \ + { \ + result = tone_level_search \ + (initial, middle, \ + ORIGIN, phrase_length, keys, ranges); \ + if (m_options & AMBIGUITY) { \ + result |= tone_level_search \ + (initial, middle, \ + ANOTHER, phrase_length, keys, ranges); \ + } \ + return result; \ + } + + int result = SEARCH_NONE; + const ChewingKey & first_key = keys[0]; + const ChewingMiddle middle = (ChewingMiddle)first_key.m_middle; + + switch(first_key.m_final) { + case CHEWING_ZERO_FINAL: + { + if (middle == CHEWING_ZERO_MIDDLE) { /* in-complete pinyin */ + if (!(m_options & PINYIN_INCOMPLETE)) + return result; + for (int m = CHEWING_ZERO_MIDDLE; + m < 
CHEWING_NUMBER_OF_MIDDLES; ++m) + for (int n = CHEWING_ZERO_FINAL; + n < CHEWING_NUMBER_OF_FINALS; ++n) { + + if (CHEWING_ZERO_MIDDLE == m && + CHEWING_ZERO_FINAL == n) + continue; + + result |= tone_level_search + (initial, (ChewingMiddle) m, (ChewingFinal) n, + phrase_length, keys, ranges); + } + return result; + } else { /* normal pinyin */ + result |= tone_level_search + (initial, middle, CHEWING_ZERO_FINAL, + phrase_length, keys, ranges); + return result; + } + } + + MATCH(PINYIN_AMB_AN_ANG, CHEWING_AN, CHEWING_ANG); + MATCH(PINYIN_AMB_AN_ANG, CHEWING_ANG, CHEWING_AN); + MATCH(PINYIN_AMB_EN_ENG, CHEWING_EN, CHEWING_ENG); + MATCH(PINYIN_AMB_EN_ENG, CHEWING_ENG, CHEWING_EN); + MATCH(PINYIN_AMB_IN_ING, PINYIN_IN, PINYIN_ING); + MATCH(PINYIN_AMB_IN_ING, PINYIN_ING, PINYIN_IN); + + default: + { + result |= tone_level_search + (initial, middle, (ChewingFinal) first_key.m_final, + phrase_length, keys, ranges); + return result; + } + } +#undef MATCH + return result; +} + + +int ChewingBitmapIndexLevel::tone_level_search +(ChewingInitial initial, ChewingMiddle middle, ChewingFinal final, + int phrase_length, /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + + int result = SEARCH_NONE; + const ChewingKey & first_key = keys[0]; + + switch (first_key.m_tone) { + case CHEWING_ZERO_TONE: + { + /* deal with zero tone in chewing large table. 
*/ + for (int i = CHEWING_ZERO_TONE; i < CHEWING_NUMBER_OF_TONES; ++i) { + ChewingLengthIndexLevel * phrases = + m_chewing_length_indexes + [initial][middle][final][(ChewingTone)i]; + if (phrases) + result |= phrases->search + (m_options, phrase_length - 1, keys + 1, ranges); + } + return result; + } + default: + { + ChewingLengthIndexLevel * phrases = + m_chewing_length_indexes + [initial][middle][final][CHEWING_ZERO_TONE]; + if (phrases) + result |= phrases->search + (m_options, phrase_length - 1, keys + 1, ranges); + + phrases = m_chewing_length_indexes + [initial][middle][final][(ChewingTone) first_key.m_tone]; + if (phrases) + result |= phrases->search + (m_options, phrase_length - 1, keys + 1, ranges); + return result; + } + } + return result; +} + + +ChewingLengthIndexLevel::ChewingLengthIndexLevel() { + m_chewing_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *)); +} + +ChewingLengthIndexLevel::~ChewingLengthIndexLevel() { +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \ + if (array) \ + delete array; \ + array = NULL; \ + break; \ + } + + for (guint i = 0; i < m_chewing_array_indexes->len; ++i) { + switch (i){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + } +#undef CASE + g_array_free(m_chewing_array_indexes, TRUE); +} + + +int ChewingLengthIndexLevel::search(pinyin_option_t options, int phrase_length, + /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + int result = SEARCH_NONE; + if (m_chewing_array_indexes->len < phrase_length + 1) + return result; + if (m_chewing_array_indexes->len > phrase_length + 1) + result |= SEARCH_CONTINUED; + +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + 
(m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \ + if (!array) \ + return result; \ + result |= array->search(options, keys, ranges); \ + return result; \ + } + + switch (phrase_length) { + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + +#undef CASE +} + + +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::search +(pinyin_option_t options, /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + IndexItem * chunk_begin = NULL, * chunk_end = NULL; + chunk_begin = (IndexItem *) m_chunk.begin(); + chunk_end = (IndexItem *) m_chunk.end(); + + /* do the search */ + ChewingKey left_keys[phrase_length], right_keys[phrase_length]; + compute_lower_value2(options, keys, left_keys, phrase_length); + compute_upper_value2(options, keys, right_keys, phrase_length); + + IndexItem left(left_keys, -1), right(right_keys, -1); + + IndexItem * begin = std_lite::lower_bound + (chunk_begin, chunk_end, left, + phrase_exact_less_than2<phrase_length>); + IndexItem * end = std_lite::upper_bound + (chunk_begin, chunk_end, right, + phrase_exact_less_than2<phrase_length>); + + return convert(options, keys, begin, end, ranges); +} + +/* compress consecutive tokens */ +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::convert +(pinyin_option_t options, const ChewingKey keys[], + IndexItem * begin, IndexItem * end, + PhraseIndexRanges ranges) const { + IndexItem * iter = NULL; + PhraseIndexRange cursor; + GArray * head, * cursor_head = NULL; + + int result = SEARCH_NONE; + /* TODO: check the below code */ + cursor.m_range_begin = null_token; cursor.m_range_end = null_token; + for (iter = begin; iter != end; ++iter) { + if (0 != pinyin_compare_with_ambiguities2 + (options, keys, iter->m_keys, phrase_length)) + continue; + + phrase_token_t token 
= iter->m_token; + head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)]; + if (NULL == head) + continue; + + result |= SEARCH_OK; + + if (null_token == cursor.m_range_begin) { + cursor.m_range_begin = token; + cursor.m_range_end = token + 1; + cursor_head = head; + } else if (cursor.m_range_end == token && + PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_begin) == + PHRASE_INDEX_LIBRARY_INDEX(token)) { + ++cursor.m_range_end; + } else { + g_array_append_val(cursor_head, cursor); + cursor.m_range_begin = token; cursor.m_range_end = token + 1; + cursor_head = head; + } + } + + if (null_token == cursor.m_range_begin) + return result; + + g_array_append_val(cursor_head, cursor); + return result; +} + + +/* add/remove index method */ + +int ChewingBitmapIndexLevel::add_index(int phrase_length, + /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + const ChewingKey first_key = keys[0]; + ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes + [first_key.m_initial][first_key.m_middle] + [first_key.m_final][first_key.m_tone]; + + if (NULL == length_array) { + length_array = new ChewingLengthIndexLevel(); + } + + return length_array->add_index(phrase_length - 1, keys + 1, token); +} + +int ChewingBitmapIndexLevel::remove_index(int phrase_length, + /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + const ChewingKey first_key = keys[0]; + ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes + [first_key.m_initial][first_key.m_middle] + [first_key.m_final][first_key.m_tone]; + + if (NULL == length_array) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + + int retval = length_array->remove_index(phrase_length - 1, keys + 1, token); + + /* remove empty array. 
*/ + if (0 == length_array->get_length()) { + delete length_array; + length_array = NULL; + } + + return retval; +} + +int ChewingLengthIndexLevel::add_index(int phrase_length, + /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + if (!(phrase_length + 1 < MAX_PHRASE_LENGTH)) + return ERROR_PHRASE_TOO_LONG; + + if (m_chewing_array_indexes->len <= phrase_length) + g_array_set_size(m_chewing_array_indexes, phrase_length + 1); + +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, \ + ChewingArrayIndexLevel<len> *, len); \ + if (NULL == array) \ + array = new ChewingArrayIndexLevel<len>; \ + return array->add_index(keys, token); \ + } + + switch(phrase_length) { + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + +#undef CASE +} + +int ChewingLengthIndexLevel::remove_index(int phrase_length, + /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + if (!(phrase_length + 1 < MAX_PHRASE_LENGTH)) + return ERROR_PHRASE_TOO_LONG; + + if (m_chewing_array_indexes->len <= phrase_length) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, \ + ChewingArrayIndexLevel<len> *, len); \ + if (NULL == array) \ + return ERROR_REMOVE_ITEM_DONOT_EXISTS; \ + int retval = array->remove_index(keys, token); \ + \ + /* remove empty array. */ \ + if (0 == array->get_length()) { \ + delete array; \ + array = NULL; \ + \ + /* shrink self array. 
*/ \ + g_array_set_size(m_chewing_array_indexes, \ + get_length()); \ + } \ + return retval; \ + } + + switch (phrase_length) { + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + +#undef CASE +} + +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::add_index +(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) { + IndexItem * begin, * end; + + IndexItem add_elem(keys, token); + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + std_lite::pair<IndexItem *, IndexItem *> range; + range = std_lite::equal_range + (begin, end, add_elem, phrase_exact_less_than2<phrase_length>); + + IndexItem * cur_elem; + for (cur_elem = range.first; + cur_elem != range.second; ++cur_elem) { + if (cur_elem->m_token == token) + return ERROR_INSERT_ITEM_EXISTS; + if (cur_elem->m_token > token) + break; + } + + int offset = (cur_elem - begin) * sizeof(IndexItem); + m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem)); + return ERROR_OK; +} + +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::remove_index +(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) { + IndexItem * begin, * end; + + IndexItem remove_elem(keys, token); + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + std_lite::pair<IndexItem *, IndexItem *> range; + range = std_lite::equal_range + (begin, end, remove_elem, phrase_exact_less_than2<phrase_length>); + + IndexItem * cur_elem; + for (cur_elem = range.first; + cur_elem != range.second; ++cur_elem) { + if (cur_elem->m_token == token) + break; + } + + if (cur_elem == range.second) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + + int offset = (cur_elem - begin) * sizeof(IndexItem); + m_chunk.remove_content(offset, sizeof(IndexItem)); + return ERROR_OK; +} + + +/* load text 
method */ +bool ChewingLargeTable::load_text(FILE * infile) { + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + + while (!feof(infile)) { + int num = fscanf(infile, "%s %s %u %ld", + pinyin, phrase, &token, &freq); + + if (4 != num) + continue; + + if(feof(infile)) + break; + + glong len = g_utf8_strlen(phrase, -1); + + FullPinyinParser2 parser; + ChewingKeyVector keys; + ChewingKeyRestVector key_rests; + + keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + pinyin_option_t options = USE_TONE; + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + + if (len != keys->len) { + fprintf(stderr, "ChewingLargeTable::load_text:%s\t%s\t%u\t%ld\n", + pinyin, phrase, token, freq); + continue; + } + + add_index(keys->len, (ChewingKey *)keys->data, token); + + g_array_free(keys, TRUE); + g_array_free(key_rests, TRUE); + } + + return true; +} + + +/* load/store method */ + +bool ChewingBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, + table_offset_t end) { + reset(); + char * begin = (char *) chunk->begin(); + table_offset_t phrase_begin, phrase_end; + table_offset_t * index = (table_offset_t *) (begin + offset); + phrase_end = *index; + + for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k) + for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l) + for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m) + for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) { + phrase_begin = phrase_end; + index++; + phrase_end = *index; + + if (phrase_begin == phrase_end) /* null pointer */ + continue; + + /* after reset() all phrases are null pointer. 
*/ + ChewingLengthIndexLevel * phrases = new ChewingLengthIndexLevel; + m_chewing_length_indexes[k][l][m][n] = phrases; + + phrases->load(chunk, phrase_begin, phrase_end - 1); + assert(phrase_end <= end); + assert(*(begin + phrase_end - 1) == c_separate); + } + + offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t); + assert(c_separate == *(begin + offset)); + return true; +} + +bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & end) { + table_offset_t phrase_end; + table_offset_t index = offset; + offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t); + + /* add '#' */ + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + + for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k) + for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l) + for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m) + for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) { + ChewingLengthIndexLevel * phrases = + m_chewing_length_indexes[k][l][m][n]; + + if (NULL == phrases) { /* null pointer */ + new_chunk->set_content(index, &offset, + sizeof(table_offset_t)); + index += sizeof(table_offset_t); + continue; + } + + /* has a end '#' */ + phrases->store(new_chunk, offset, phrase_end); + offset = phrase_end; + + /* add '#' */ + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, + sizeof(table_offset_t)); + index += sizeof(table_offset_t); + } + + end = offset; + return true; +} + +bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, + table_offset_t end) { + char * begin = (char *) chunk->begin(); + guint32 nindex = *((guint32 
*)(begin + offset)); /* number of index */ + table_offset_t * index = (table_offset_t *) + (begin + offset + sizeof(guint32)); + + table_offset_t phrase_begin, phrase_end = *index; + g_array_set_size(m_chewing_array_indexes, 0); + for (guint32 i = 0; i < nindex; ++i) { + phrase_begin = phrase_end; + index++; + phrase_end = *index; + + if (phrase_begin == phrase_end) { + void * null = NULL; + g_array_append_val(m_chewing_array_indexes, null); + continue; + } + +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * phrase = \ + new ChewingArrayIndexLevel<len>; \ + phrase->load(chunk, phrase_begin, phrase_end - 1); \ + assert(*(begin + phrase_end - 1) == c_separate); \ + assert(phrase_end <= end); \ + g_array_append_val(m_chewing_array_indexes, phrase); \ + break; \ + } + + switch ( i ){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + +#undef CASE + } + + /* check '#' */ + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + assert(c_separate == *(begin + offset)); + return true; +} + +bool ChewingLengthIndexLevel::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & end) { + guint32 nindex = m_chewing_array_indexes->len; /* number of index */ + new_chunk->set_content(offset, &nindex, sizeof(guint32)); + table_offset_t index = offset + sizeof(guint32); + + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + + table_offset_t phrase_end; + for (guint32 i = 0; i < nindex; ++i) { +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * phrase = g_array_index \ + (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \ + if (NULL == 
phrase) { \ + new_chunk->set_content \ + (index, &offset, sizeof(table_offset_t)); \ + index += sizeof(table_offset_t); \ + continue; \ + } \ + phrase->store(new_chunk, offset, phrase_end); \ + offset = phrase_end; \ + break; \ + } + + switch ( i ){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } +#undef CASE + + /* add '#' */ + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + } + + end = offset; + return true; +} + +template<size_t phrase_length> +bool ChewingArrayIndexLevel<phrase_length>:: +load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end) { + char * begin = (char *) chunk->begin(); + m_chunk.set_chunk(begin + offset, end - offset, NULL); + return true; +} + +template<size_t phrase_length> +bool ChewingArrayIndexLevel<phrase_length>:: +store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) { + new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size()); + end = offset + m_chunk.size(); + return true; +} + + +/* get length method */ + +int ChewingLengthIndexLevel::get_length() const { + int length = m_chewing_array_indexes->len; + + /* trim trailing zero. 
*/ + for (int i = length - 1; i >= 0; --i) { + void * array = g_array_index(m_chewing_array_indexes, void *, i); + + if (NULL != array) + break; + + --length; + } + + return length; +} + +template<size_t phrase_length> +int ChewingArrayIndexLevel<phrase_length>::get_length() const { + IndexItem * chunk_begin = NULL, * chunk_end = NULL; + chunk_begin = (IndexItem *) m_chunk.begin(); + chunk_end = (IndexItem *) m_chunk.end(); + + return chunk_end - chunk_begin; +} + + +/* mask out method */ + +bool ChewingBitmapIndexLevel::mask_out(phrase_token_t mask, + phrase_token_t value) { + for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k) + for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l) + for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m) + for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES; + ++n) { + ChewingLengthIndexLevel * & length_array = + m_chewing_length_indexes[k][l][m][n]; + + if (NULL == length_array) + continue; + + length_array->mask_out(mask, value); + + if (0 == length_array->get_length()) { + delete length_array; + length_array = NULL; + } + } + return true; +} + +bool ChewingLengthIndexLevel::mask_out(phrase_token_t mask, + phrase_token_t value) { +#define CASE(len) case len: \ + { \ + ChewingArrayIndexLevel<len> * & array = g_array_index \ + (m_chewing_array_indexes, \ + ChewingArrayIndexLevel<len> *, len); \ + \ + if (NULL == array) \ + continue; \ + \ + array->mask_out(mask, value); \ + \ + if (0 == array->get_length()) { \ + delete array; \ + array = NULL; \ + } \ + break; \ + } + + for (guint i = 0; i < m_chewing_array_indexes->len; ++i) { + switch (i){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + } +#undef CASE + g_array_set_size(m_chewing_array_indexes, get_length()); + return true; +} + +template<size_t 
 phrase_length>
/* Remove every index item whose token matches `value` under `mask`.
 * The chunk is edited in place; after each removal the end pointer is
 * refreshed and `cur` stepped back so the element shifted into the
 * current slot is re-examined.
 * NOTE(review): `int offset` presumably never exceeds INT_MAX here —
 * confirm chunk sizes stay small. */
template<size_t phrase_length>
bool ChewingArrayIndexLevel<phrase_length>::mask_out
(phrase_token_t mask, phrase_token_t value) {
    IndexItem * begin = NULL, * end = NULL;
    begin = (IndexItem *) m_chunk.begin();
    end = (IndexItem *) m_chunk.end();

    for (IndexItem * cur = begin; cur != end; ++cur) {
        if ((cur->m_token & mask) != value)
            continue;

        int offset = (cur - begin) * sizeof(IndexItem);
        m_chunk.remove_content(offset, sizeof(IndexItem));

        /* update chunk end. */
        end = (IndexItem *) m_chunk.end();
        --cur;
    }

    return true;
}
diff --git a/src/storage/chewing_large_table.h b/src/storage/chewing_large_table.h
new file mode 100644
index 0000000..30ae9aa
--- /dev/null
+++ b/src/storage/chewing_large_table.h
@@ -0,0 +1,154 @@
/*
 * libpinyin
 * Library to deal with pinyin.
 *
 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#ifndef CHEWING_LARGE_TABLE_H
#define CHEWING_LARGE_TABLE_H


#include <stdio.h>
#include "novel_types.h"
#include "memory_chunk.h"
#include "chewing_key.h"

namespace pinyin{

class ChewingLengthIndexLevel;

/* First level of the chewing large table: a 4-D array indexed by
 * (initial, middle, final, tone) whose cells point to per-phrase-length
 * index levels. */
class ChewingBitmapIndexLevel{

protected:
    /* Pinyin/chewing matching options; changed via set_options(). */
    pinyin_option_t m_options;

protected:
    /* One ChewingLengthIndexLevel pointer per
     * (initial, middle, final, tone) combination; NULL when empty. */
    ChewingLengthIndexLevel * m_chewing_length_indexes
    [CHEWING_NUMBER_OF_INITIALS][CHEWING_NUMBER_OF_MIDDLES]
    [CHEWING_NUMBER_OF_FINALS][CHEWING_NUMBER_OF_TONES];

    /* search functions */
    /* Each level narrows the candidate set by one key component and
     * appends matches to `ranges`. */
    int initial_level_search(int phrase_length,
                             /* in */ const ChewingKey keys[],
                             /* out */ PhraseIndexRanges ranges) const;

    int middle_and_final_level_search(ChewingInitial initial,
                                      int phrase_length,
                                      /* in */ const ChewingKey keys[],
                                      /* out */ PhraseIndexRanges ranges) const;
    int tone_level_search(ChewingInitial initial, ChewingMiddle middle,
                          ChewingFinal final, int phrase_length,
                          /* in */ const ChewingKey keys[],
                          /* out */ PhraseIndexRanges ranges) const;

    /* Free all owned length levels; also used by the destructor. */
    void reset();

public:
    /* constructor/destructor */
    ChewingBitmapIndexLevel(pinyin_option_t options);
    ~ChewingBitmapIndexLevel() { reset(); }

    /* set options method */
    bool set_options(pinyin_option_t options) {
        m_options = options;
        return true;
    }

    /* load/store method */
    /* Serialize from/to a window [offset, end) of a memory chunk. */
    bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
    bool store(MemoryChunk * new_chunk, table_offset_t offset,
               table_offset_t & end);

    /* search method */
    int search(int phrase_length, /* in */ const ChewingKey keys[],
               /* out */ PhraseIndexRanges ranges) const;

    /* add/remove index method */
    int add_index(int phrase_length, /* in */ const ChewingKey keys[],
                  /* in */ phrase_token_t token);
    int remove_index(int phrase_length, /* in */ const ChewingKey keys[],
                     /* in */ phrase_token_t token);

    /* mask out method */
    /* Drop every index entry whose token matches `value` under `mask`. */
    bool mask_out(phrase_token_t mask, phrase_token_t value);
};


/* Facade over the bitmap index level plus the backing memory chunk. */
class ChewingLargeTable{
protected:
    ChewingBitmapIndexLevel
 m_bitmap_table;
    /* Backing storage for the loaded table; owned by this object. */
    MemoryChunk * m_chunk;

    /* Release the chunk installed by load(). */
    void reset(){
        if (m_chunk) {
            delete m_chunk; m_chunk = NULL;
        }
    }

public:
    /* constructor/destructor */
    ChewingLargeTable(pinyin_option_t options):
        m_bitmap_table(options), m_chunk(NULL) {}

    ~ChewingLargeTable() { reset(); }

    /* set options method */
    bool set_options(pinyin_option_t options) {
        return m_bitmap_table.set_options(options);
    }

    /* load/store method */
    /* Takes ownership of `chunk`: it must stay alive while the table
     * references it and is freed by reset()/the destructor. */
    bool load(MemoryChunk * chunk) {
        reset();
        m_chunk = chunk;
        return m_bitmap_table.load(chunk, 0, chunk->size());
    }

    /* Serialize the whole table into `new_chunk`; the resulting end
     * offset is computed but intentionally discarded. */
    bool store(MemoryChunk * new_chunk) {
        table_offset_t end;
        return m_bitmap_table.store(new_chunk, 0, end);
    }

    /* Build the table from a text-format file (defined elsewhere). */
    bool load_text(FILE * file);

    /* search method */
    int search(int phrase_length, /* in */ const ChewingKey keys[],
               /* out */ PhraseIndexRanges ranges) const {
        return m_bitmap_table.search(phrase_length, keys, ranges);
    }

    /* add/remove index method */
    int add_index(int phrase_length, /* in */ const ChewingKey keys[],
                  /* in */ phrase_token_t token) {
        return m_bitmap_table.add_index(phrase_length, keys, token);
    }

    int remove_index(int phrase_length, /* in */ const ChewingKey keys[],
                     /* in */ phrase_token_t token) {
        return m_bitmap_table.remove_index(phrase_length, keys, token);
    }

    /* mask out method */
    bool mask_out(phrase_token_t mask, phrase_token_t value) {
        return m_bitmap_table.mask_out(mask, value);
    }
};

};

#endif
diff --git a/src/storage/chewing_table.h b/src/storage/chewing_table.h
new file mode 100644
index 0000000..56ceba0
--- /dev/null
+++ b/src/storage/chewing_table.h
@@ -0,0 +1,221 @@
/* This file is generated by python scripts. Don't edit this file directly.
 */

#ifndef CHEWING_TABLE_H
#define CHEWING_TABLE_H

namespace pinyin{

const chewing_symbol_item_t chewing_standard_symbols[] = {
{',' , "ㄝ"},
{'-' , "ㄦ"},
{'.'
, "ㄡ"}, +{'/' , "ㄥ"}, +{'0' , "ㄢ"}, +{'1' , "ㄅ"}, +{'2' , "ㄉ"}, +{'5' , "ㄓ"}, +{'8' , "ㄚ"}, +{'9' , "ㄞ"}, +{';' , "ㄤ"}, +{'a' , "ㄇ"}, +{'b' , "ㄖ"}, +{'c' , "ㄏ"}, +{'d' , "ㄎ"}, +{'e' , "ㄍ"}, +{'f' , "ㄑ"}, +{'g' , "ㄕ"}, +{'h' , "ㄘ"}, +{'i' , "ㄛ"}, +{'j' , "ㄨ"}, +{'k' , "ㄜ"}, +{'l' , "ㄠ"}, +{'m' , "ㄩ"}, +{'n' , "ㄙ"}, +{'o' , "ㄟ"}, +{'p' , "ㄣ"}, +{'q' , "ㄆ"}, +{'r' , "ㄐ"}, +{'s' , "ㄋ"}, +{'t' , "ㄔ"}, +{'u' , "ㄧ"}, +{'v' , "ㄒ"}, +{'w' , "ㄊ"}, +{'x' , "ㄌ"}, +{'y' , "ㄗ"}, +{'z' , "ㄈ"}, +{'\0', NULL} +}; + +const chewing_tone_item_t chewing_standard_tones[] = { +{' ' , 1}, +{'3' , 3}, +{'4' , 4}, +{'6' , 2}, +{'7' , 5}, +{'\0', 0} +}; + + +const chewing_symbol_item_t chewing_ginyieh_symbols[] = { +{'\'' , "ㄩ"}, +{',' , "ㄝ"}, +{'-' , "ㄧ"}, +{'.' , "ㄡ"}, +{'/' , "ㄥ"}, +{'0' , "ㄢ"}, +{'2' , "ㄅ"}, +{'3' , "ㄉ"}, +{'6' , "ㄓ"}, +{'8' , "ㄚ"}, +{'9' , "ㄞ"}, +{';' , "ㄤ"}, +{'=' , "ㄦ"}, +{'[' , "ㄨ"}, +{'b' , "ㄒ"}, +{'c' , "ㄌ"}, +{'d' , "ㄋ"}, +{'e' , "ㄊ"}, +{'f' , "ㄎ"}, +{'g' , "ㄑ"}, +{'h' , "ㄕ"}, +{'i' , "ㄛ"}, +{'j' , "ㄘ"}, +{'k' , "ㄜ"}, +{'l' , "ㄠ"}, +{'m' , "ㄙ"}, +{'n' , "ㄖ"}, +{'o' , "ㄟ"}, +{'p' , "ㄣ"}, +{'r' , "ㄍ"}, +{'s' , "ㄇ"}, +{'t' , "ㄐ"}, +{'u' , "ㄗ"}, +{'v' , "ㄏ"}, +{'w' , "ㄆ"}, +{'x' , "ㄈ"}, +{'y' , "ㄔ"}, +{'\0', NULL} +}; + +const chewing_tone_item_t chewing_ginyieh_tones[] = { +{' ' , 1}, +{'1' , 5}, +{'a' , 3}, +{'q' , 2}, +{'z' , 4}, +{'\0', 0} +}; + +const chewing_symbol_item_t chewing_eten_symbols[] = { +{'\'' , "ㄘ"}, +{',' , "ㄓ"}, +{'-' , "ㄥ"}, +{'.' 
, "ㄔ"}, +{'/' , "ㄕ"}, +{'0' , "ㄤ"}, +{'7' , "ㄑ"}, +{'8' , "ㄢ"}, +{'9' , "ㄣ"}, +{';' , "ㄗ"}, +{'=' , "ㄦ"}, +{'a' , "ㄚ"}, +{'b' , "ㄅ"}, +{'c' , "ㄒ"}, +{'d' , "ㄉ"}, +{'e' , "ㄧ"}, +{'f' , "ㄈ"}, +{'g' , "ㄐ"}, +{'h' , "ㄏ"}, +{'i' , "ㄞ"}, +{'j' , "ㄖ"}, +{'k' , "ㄎ"}, +{'l' , "ㄌ"}, +{'m' , "ㄇ"}, +{'n' , "ㄋ"}, +{'o' , "ㄛ"}, +{'p' , "ㄆ"}, +{'q' , "ㄟ"}, +{'r' , "ㄜ"}, +{'s' , "ㄙ"}, +{'t' , "ㄊ"}, +{'u' , "ㄩ"}, +{'v' , "ㄍ"}, +{'w' , "ㄝ"}, +{'x' , "ㄨ"}, +{'y' , "ㄡ"}, +{'z' , "ㄠ"}, +{'\0', NULL} +}; + +const chewing_tone_item_t chewing_eten_tones[] = { +{' ' , 1}, +{'1' , 5}, +{'2' , 2}, +{'3' , 3}, +{'4' , 4}, +{'\0', 0} +}; + +const chewing_symbol_item_t chewing_ibm_symbols[] = { +{'-' , "ㄏ"}, +{'0' , "ㄎ"}, +{'1' , "ㄅ"}, +{'2' , "ㄆ"}, +{'3' , "ㄇ"}, +{'4' , "ㄈ"}, +{'5' , "ㄉ"}, +{'6' , "ㄊ"}, +{'7' , "ㄋ"}, +{'8' , "ㄌ"}, +{'9' , "ㄍ"}, +{';' , "ㄠ"}, +{'a' , "ㄧ"}, +{'b' , "ㄥ"}, +{'c' , "ㄣ"}, +{'d' , "ㄩ"}, +{'e' , "ㄒ"}, +{'f' , "ㄚ"}, +{'g' , "ㄛ"}, +{'h' , "ㄜ"}, +{'i' , "ㄗ"}, +{'j' , "ㄝ"}, +{'k' , "ㄞ"}, +{'l' , "ㄟ"}, +{'n' , "ㄦ"}, +{'o' , "ㄘ"}, +{'p' , "ㄙ"}, +{'q' , "ㄐ"}, +{'r' , "ㄓ"}, +{'s' , "ㄨ"}, +{'t' , "ㄔ"}, +{'u' , "ㄖ"}, +{'v' , "ㄤ"}, +{'w' , "ㄑ"}, +{'x' , "ㄢ"}, +{'y' , "ㄕ"}, +{'z' , "ㄡ"}, +{'\0', NULL} +}; + +const chewing_tone_item_t chewing_ibm_tones[] = { +{' ' , 1}, +{',' , 3}, +{'.' , 4}, +{'/' , 5}, +{'m' , 2}, +{'\0', 0} +}; + +const char * chewing_tone_table[CHEWING_NUMBER_OF_TONES] = { +"", +"ˉ", +"ˊ", +"ˇ", +"ˋ", +"˙" +}; + +}; + +#endif diff --git a/src/storage/double_pinyin_table.h b/src/storage/double_pinyin_table.h new file mode 100644 index 0000000..52af618 --- /dev/null +++ b/src/storage/double_pinyin_table.h @@ -0,0 +1,371 @@ +/* This file is generated by python scripts. Don't edit this file directly. 
+ */ + +#ifndef DOUBLE_PINYIN_TABLE_H +#define DOUBLE_PINYIN_TABLE_H + +namespace pinyin{ + +const double_pinyin_scheme_shengmu_item_t double_pinyin_mspy_sheng[] = { +{NULL } /* A */, +{"b" } /* B */, +{"c" } /* C */, +{"d" } /* D */, +{NULL } /* E */, +{"f" } /* F */, +{"g" } /* G */, +{"h" } /* H */, +{"ch" } /* I */, +{"j" } /* J */, +{"k" } /* K */, +{"l" } /* L */, +{"m" } /* M */, +{"n" } /* N */, +{"'" } /* O */, +{"p" } /* P */, +{"q" } /* Q */, +{"r" } /* R */, +{"s" } /* S */, +{"t" } /* T */, +{"sh" } /* U */, +{"zh" } /* V */, +{"w" } /* W */, +{"x" } /* X */, +{"y" } /* Y */, +{"z" } /* Z */, +{NULL } /* ; */ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_mspy_yun[] = { +{{"a" , NULL }} /* A */, +{{"ou" , NULL }} /* B */, +{{"iao" , NULL }} /* C */, +{{"uang" , "iang" }} /* D */, +{{"e" , NULL }} /* E */, +{{"en" , NULL }} /* F */, +{{"eng" , "ng" }} /* G */, +{{"ang" , NULL }} /* H */, +{{"i" , NULL }} /* I */, +{{"an" , NULL }} /* J */, +{{"ao" , NULL }} /* K */, +{{"ai" , NULL }} /* L */, +{{"ian" , NULL }} /* M */, +{{"in" , NULL }} /* N */, +{{"uo" , "o" }} /* O */, +{{"un" , NULL }} /* P */, +{{"iu" , NULL }} /* Q */, +{{"uan" , "er" }} /* R */, +{{"ong" , "iong" }} /* S */, +{{"ue" , NULL }} /* T */, +{{"u" , NULL }} /* U */, +{{"ui" , "ue" }} /* V */, +{{"ia" , "ua" }} /* W */, +{{"ie" , NULL }} /* X */, +{{"uai" , "v" }} /* Y */, +{{"ei" , NULL }} /* Z */, +{{"ing" , NULL }} /* ; */ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_zrm_sheng[] = { +{NULL } /* A */, +{"b" } /* B */, +{"c" } /* C */, +{"d" } /* D */, +{NULL } /* E */, +{"f" } /* F */, +{"g" } /* G */, +{"h" } /* H */, +{"ch" } /* I */, +{"j" } /* J */, +{"k" } /* K */, +{"l" } /* L */, +{"m" } /* M */, +{"n" } /* N */, +{"'" } /* O */, +{"p" } /* P */, +{"q" } /* Q */, +{"r" } /* R */, +{"s" } /* S */, +{"t" } /* T */, +{"sh" } /* U */, +{"zh" } /* V */, +{"w" } /* W */, +{"x" } /* X */, +{"y" } /* Y */, +{"z" } /* Z */, +{NULL } /* ; */ +}; + +const 
double_pinyin_scheme_yunmu_item_t double_pinyin_zrm_yun[] = { +{{"a" , NULL }} /* A */, +{{"ou" , NULL }} /* B */, +{{"iao" , NULL }} /* C */, +{{"uang" , "iang" }} /* D */, +{{"e" , NULL }} /* E */, +{{"en" , NULL }} /* F */, +{{"eng" , "ng" }} /* G */, +{{"ang" , NULL }} /* H */, +{{"i" , NULL }} /* I */, +{{"an" , NULL }} /* J */, +{{"ao" , NULL }} /* K */, +{{"ai" , NULL }} /* L */, +{{"ian" , NULL }} /* M */, +{{"in" , NULL }} /* N */, +{{"uo" , "o" }} /* O */, +{{"un" , NULL }} /* P */, +{{"iu" , NULL }} /* Q */, +{{"uan" , "er" }} /* R */, +{{"ong" , "iong" }} /* S */, +{{"ue" , NULL }} /* T */, +{{"u" , NULL }} /* U */, +{{"ui" , "v" }} /* V */, +{{"ia" , "ua" }} /* W */, +{{"ie" , NULL }} /* X */, +{{"uai" , "ing" }} /* Y */, +{{"ei" , NULL }} /* Z */, +{{NULL , NULL }} /* ; */ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_abc_sheng[] = { +{"zh" } /* A */, +{"b" } /* B */, +{"c" } /* C */, +{"d" } /* D */, +{"ch" } /* E */, +{"f" } /* F */, +{"g" } /* G */, +{"h" } /* H */, +{NULL } /* I */, +{"j" } /* J */, +{"k" } /* K */, +{"l" } /* L */, +{"m" } /* M */, +{"n" } /* N */, +{"'" } /* O */, +{"p" } /* P */, +{"q" } /* Q */, +{"r" } /* R */, +{"s" } /* S */, +{"t" } /* T */, +{NULL } /* U */, +{"sh" } /* V */, +{"w" } /* W */, +{"x" } /* X */, +{"y" } /* Y */, +{"z" } /* Z */, +{NULL } /* ; */ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_abc_yun[] = { +{{"a" , NULL }} /* A */, +{{"ou" , NULL }} /* B */, +{{"in" , "uai" }} /* C */, +{{"ia" , "ua" }} /* D */, +{{"e" , NULL }} /* E */, +{{"en" , NULL }} /* F */, +{{"eng" , "ng" }} /* G */, +{{"ang" , NULL }} /* H */, +{{"i" , NULL }} /* I */, +{{"an" , NULL }} /* J */, +{{"ao" , NULL }} /* K */, +{{"ai" , NULL }} /* L */, +{{"ue" , "ui" }} /* M */, +{{"un" , NULL }} /* N */, +{{"uo" , "o" }} /* O */, +{{"uan" , NULL }} /* P */, +{{"ei" , NULL }} /* Q */, +{{"er" , "iu" }} /* R */, +{{"ong" , "iong" }} /* S */, +{{"iang" , "uang" }} /* T */, +{{"u" , NULL }} /* U */, +{{"v" , 
"ue" }} /* V */, +{{"ian" , NULL }} /* W */, +{{"ie" , NULL }} /* X */, +{{"ing" , NULL }} /* Y */, +{{"iao" , NULL }} /* Z */, +{{NULL , NULL }} /* ; */ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_zgpy_sheng[] = { +{"ch" } /* A */, +{"b" } /* B */, +{"c" } /* C */, +{"d" } /* D */, +{NULL } /* E */, +{"f" } /* F */, +{"g" } /* G */, +{"h" } /* H */, +{"sh" } /* I */, +{"j" } /* J */, +{"k" } /* K */, +{"l" } /* L */, +{"m" } /* M */, +{"n" } /* N */, +{"'" } /* O */, +{"p" } /* P */, +{"q" } /* Q */, +{"r" } /* R */, +{"s" } /* S */, +{"t" } /* T */, +{"zh" } /* U */, +{NULL } /* V */, +{"w" } /* W */, +{"x" } /* X */, +{"y" } /* Y */, +{"z" } /* Z */, +{NULL } /* ; */ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_zgpy_yun[] = { +{{"a" , NULL }} /* A */, +{{"iao" , NULL }} /* B */, +{{NULL , NULL }} /* C */, +{{"ie" , NULL }} /* D */, +{{"e" , NULL }} /* E */, +{{"ian" , NULL }} /* F */, +{{"iang" , "uang" }} /* G */, +{{"ong" , "iong" }} /* H */, +{{"i" , NULL }} /* I */, +{{"er" , "iu" }} /* J */, +{{"ei" , NULL }} /* K */, +{{"uan" , NULL }} /* L */, +{{"un" , NULL }} /* M */, +{{"ue" , "ui" }} /* N */, +{{"uo" , "o" }} /* O */, +{{"ai" , NULL }} /* P */, +{{"ao" , NULL }} /* Q */, +{{"an" , NULL }} /* R */, +{{"ang" , NULL }} /* S */, +{{"eng" , "ng" }} /* T */, +{{"u" , NULL }} /* U */, +{{"v" , NULL }} /* V */, +{{"en" , NULL }} /* W */, +{{"ia" , "ua" }} /* X */, +{{"in" , "uai" }} /* Y */, +{{"ou" , NULL }} /* Z */, +{{"ing" , NULL }} /* ; */ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_pyjj_sheng[] = { +{"'" } /* A */, +{"b" } /* B */, +{"c" } /* C */, +{"d" } /* D */, +{NULL } /* E */, +{"f" } /* F */, +{"g" } /* G */, +{"h" } /* H */, +{"sh" } /* I */, +{"j" } /* J */, +{"k" } /* K */, +{"l" } /* L */, +{"m" } /* M */, +{"n" } /* N */, +{"'" } /* O */, +{"p" } /* P */, +{"q" } /* Q */, +{"r" } /* R */, +{"s" } /* S */, +{"t" } /* T */, +{"ch" } /* U */, +{"zh" } /* V */, +{"w" } /* W */, +{"x" } /* X 
*/, +{"y" } /* Y */, +{"z" } /* Z */, +{NULL } /* ; */ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_pyjj_yun[] = { +{{"a" , NULL }} /* A */, +{{"ia" , "ua" }} /* B */, +{{"uan" , NULL }} /* C */, +{{"ao" , NULL }} /* D */, +{{"e" , NULL }} /* E */, +{{"an" , NULL }} /* F */, +{{"ang" , NULL }} /* G */, +{{"iang" , "uang" }} /* H */, +{{"i" , NULL }} /* I */, +{{"ian" , NULL }} /* J */, +{{"iao" , NULL }} /* K */, +{{"in" , NULL }} /* L */, +{{"ie" , NULL }} /* M */, +{{"iu" , NULL }} /* N */, +{{"uo" , "o" }} /* O */, +{{"ou" , NULL }} /* P */, +{{"er" , "ing" }} /* Q */, +{{"en" , NULL }} /* R */, +{{"ai" , NULL }} /* S */, +{{"eng" , "ng" }} /* T */, +{{"u" , NULL }} /* U */, +{{"v" , "ui" }} /* V */, +{{"ei" , NULL }} /* W */, +{{"uai" , "ue" }} /* X */, +{{"ong" , "iong" }} /* Y */, +{{"un" , NULL }} /* Z */, +{{NULL , NULL }} /* ; */ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_xhe_sheng[] = { +{"'" } /* A */, +{"b" } /* B */, +{"c" } /* C */, +{"d" } /* D */, +{"'" } /* E */, +{"f" } /* F */, +{"g" } /* G */, +{"h" } /* H */, +{"ch" } /* I */, +{"j" } /* J */, +{"k" } /* K */, +{"l" } /* L */, +{"m" } /* M */, +{"n" } /* N */, +{"'" } /* O */, +{"p" } /* P */, +{"q" } /* Q */, +{"r" } /* R */, +{"s" } /* S */, +{"t" } /* T */, +{"sh" } /* U */, +{"zh" } /* V */, +{"w" } /* W */, +{"x" } /* X */, +{"y" } /* Y */, +{"z" } /* Z */, +{NULL } /* ; */ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_xhe_yun[] = { +{{"a" , NULL }} /* A */, +{{"in" , NULL }} /* B */, +{{"ao" , NULL }} /* C */, +{{"ai" , NULL }} /* D */, +{{"e" , NULL }} /* E */, +{{"en" , NULL }} /* F */, +{{"eng" , "ng" }} /* G */, +{{"ang" , NULL }} /* H */, +{{"i" , NULL }} /* I */, +{{"an" , NULL }} /* J */, +{{"uai" , "ing" }} /* K */, +{{"iang" , "uang" }} /* L */, +{{"ian" , NULL }} /* M */, +{{"iao" , NULL }} /* N */, +{{"uo" , "o" }} /* O */, +{{"ie" , NULL }} /* P */, +{{"iu" , NULL }} /* Q */, +{{"uan" , "er" }} /* R */, +{{"ong" , "iong" }} /* 
S */, +{{"ue" , NULL }} /* T */, +{{"u" , NULL }} /* U */, +{{"v" , "ui" }} /* V */, +{{"ei" , NULL }} /* W */, +{{"ia" , "ua" }} /* X */, +{{"un" , NULL }} /* Y */, +{{"ou" , NULL }} /* Z */, +{{NULL , NULL }} /* ; */ +}; + +}; + +#endif diff --git a/src/storage/facade_chewing_table.h b/src/storage/facade_chewing_table.h new file mode 100644 index 0000000..474311c --- /dev/null +++ b/src/storage/facade_chewing_table.h @@ -0,0 +1,216 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef FACADE_CHEWING_TABLE_H +#define FACADE_CHEWING_TABLE_H + +#include "novel_types.h" +#include "chewing_large_table.h" + +namespace pinyin{ + +/** + * FacadeChewingTable: + * + * The facade class of chewing large table. + * + */ + +class FacadeChewingTable{ +private: + ChewingLargeTable * m_system_chewing_table; + ChewingLargeTable * m_user_chewing_table; + + void reset() { + if (m_system_chewing_table) { + delete m_system_chewing_table; + m_system_chewing_table = NULL; + } + + if (m_user_chewing_table) { + delete m_user_chewing_table; + m_user_chewing_table = NULL; + } + } +public: + /** + * FacadeChewingTable::FacadeChewingTable: + * + * The constructor of the FacadeChewingTable. 
+ * + */ + FacadeChewingTable() { + m_system_chewing_table = NULL; + m_user_chewing_table = NULL; + } + + /** + * FacadeChewingTable::~FacadeChewingTable: + * + * The destructor of the FacadeChewingTable. + * + */ + ~FacadeChewingTable() { + reset(); + } + + /** + * FacadeChewingTable::set_options: + * @options: the pinyin options. + * @returns: whether the setting options is successful. + * + * Set the options of the system and user chewing table. + * + */ + bool set_options(pinyin_option_t options) { + bool result = false; + if (m_system_chewing_table) + result = m_system_chewing_table->set_options(options) || result; + if (m_user_chewing_table) + result = m_user_chewing_table->set_options(options) || result; + return result; + } + + /** + * FacadeChewingTable::load: + * @options: the pinyin options. + * @system: the memory chunk of the system chewing table. + * @user: the memory chunk of the user chewing table. + * @returns: whether the load operation is successful. + * + * Load the system or user chewing table from the memory chunks. + * + */ + bool load(pinyin_option_t options, MemoryChunk * system, + MemoryChunk * user){ + reset(); + + bool result = false; + if (system) { + m_system_chewing_table = new ChewingLargeTable(options); + result = m_system_chewing_table->load(system) || result; + } + if (user) { + m_user_chewing_table = new ChewingLargeTable(options); + result = m_user_chewing_table->load(user) || result; + } + return result; + } + + /** + * FacadeChewingTable::store: + * @new_user: the memory chunk to store the user chewing table. + * @returns: whether the store operation is successful. + * + * Store the user chewing table to the memory chunk. + * + */ + bool store(MemoryChunk * new_user) { + if (NULL == m_user_chewing_table) + return false; + return m_user_chewing_table->store(new_user); + } + + /** + * FacadeChewingTable::search: + * @phrase_length: the length of the phrase to be searched. + * @keys: the pinyin key of the phrase to be searched. 
+ * @ranges: the array of GArrays to store the matched phrase token. + * @returns: the search result of enum SearchResult. + * + * Search the phrase tokens according to the pinyin keys. + * + */ + int search(int phrase_length, /* in */ const ChewingKey keys[], + /* out */ PhraseIndexRanges ranges) const { + + /* clear ranges. */ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + if (ranges[i]) + g_array_set_size(ranges[i], 0); + } + + int result = SEARCH_NONE; + + if (NULL != m_system_chewing_table) + result |= m_system_chewing_table->search + (phrase_length, keys, ranges); + + if (NULL != m_user_chewing_table) + result |= m_user_chewing_table->search + (phrase_length, keys, ranges); + + return result; + } + + /** + * FacadeChewingTable::add_index: + * @phrase_length: the length of the phrase to be added. + * @keys: the pinyin keys of the phrase to be added. + * @token: the token of the phrase to be added. + * @returns: the add result of enum ErrorResult. + * + * Add the phrase token to the user chewing table. + * + */ + int add_index(int phrase_length, /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + if (NULL == m_user_chewing_table) + return ERROR_NO_USER_TABLE; + return m_user_chewing_table->add_index(phrase_length, keys, token); + } + + /** + * FacadeChewingTable::remove_index: + * @phrase_length: the length of the phrase to be removed. + * @keys: the pinyin keys of the phrase to be removed. + * @token: the token of the phrase to be removed. + * @returns: the remove result of enum ErrorResult. + * + * Remove the phrase token from the user chewing table. + * + */ + int remove_index(int phrase_length, /* in */ const ChewingKey keys[], + /* in */ phrase_token_t token) { + if (NULL == m_user_chewing_table) + return ERROR_NO_USER_TABLE; + return m_user_chewing_table->remove_index(phrase_length, keys, token); + } + + /** + * FacadeChewingTable::mask_out: + * @mask: the mask. + * @value: the value. 
+ * @returns: whether the mask out operation is successful. + * + * Mask out the matched chewing index. + * + */ + bool mask_out(phrase_token_t mask, phrase_token_t value) { + if (NULL == m_user_chewing_table) + return false; + return m_user_chewing_table->mask_out(mask, value); + } +}; + +}; + +#endif diff --git a/src/storage/facade_phrase_table2.h b/src/storage/facade_phrase_table2.h new file mode 100644 index 0000000..3ef1c37 --- /dev/null +++ b/src/storage/facade_phrase_table2.h @@ -0,0 +1,203 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef FACADE_PHRASE_TABLE2_H +#define FACADE_PHRASE_TABLE2_H + +#include "phrase_large_table2.h" + +namespace pinyin{ + +/** + * FacadePhraseTable2: + * + * The facade class of phrase large table2. 
+ * + */ + +class FacadePhraseTable2{ +private: + PhraseLargeTable2 * m_system_phrase_table; + PhraseLargeTable2 * m_user_phrase_table; + + void reset(){ + if (m_system_phrase_table) { + delete m_system_phrase_table; + m_system_phrase_table = NULL; + } + + if (m_user_phrase_table) { + delete m_user_phrase_table; + m_user_phrase_table = NULL; + } + } + +public: + /** + * FacadePhraseTable2::FacadePhraseTable2: + * + * The constructor of the FacadePhraseTable2. + * + */ + FacadePhraseTable2() { + m_system_phrase_table = NULL; + m_user_phrase_table = NULL; + } + + /** + * FacadePhraseTable2::~FacadePhraseTable2: + * + * The destructor of the FacadePhraseTable2. + * + */ + ~FacadePhraseTable2() { + reset(); + } + + /** + * FacadePhraseTable2::load: + * @system: the memory chunk of the system phrase table. + * @user: the memory chunk of the user phrase table. + * @returns: whether the load operation is successful. + * + * Load the system or user phrase table from the memory chunks. + * + */ + bool load(MemoryChunk * system, MemoryChunk * user) { + reset(); + + bool result = false; + if (system) { + m_system_phrase_table = new PhraseLargeTable2; + result = m_system_phrase_table->load(system) || result; + } + if (user) { + m_user_phrase_table = new PhraseLargeTable2; + result = m_user_phrase_table->load(user) || result; + } + return result; + } + + /** + * FacadePhraseTable2::store: + * @new_user: the memory chunk to store the user phrase table. + * @returns: whether the store operation is successful. + * + * Store the user phrase table to the memory chunk. + * + */ + bool store(MemoryChunk * new_user) { + if (NULL == m_user_phrase_table) + return false; + return m_user_phrase_table->store(new_user); + } + + /** + * FacadePhraseTable2::search: + * @phrase_length: the length of the phrase to be searched. + * @phrase: the ucs4 characters of the phrase to be searched. + * @tokens: the GArray of tokens to store the matched phrases. 
+ * @returns: the search result of enum SearchResult. + * + * Search the phrase tokens according to the ucs4 characters. + * + */ + int search(int phrase_length, /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + /* clear tokens. */ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + if (tokens[i]) + g_array_set_size(tokens[i], 0); + } + + int result = SEARCH_NONE; + + if (NULL != m_system_phrase_table) + result |= m_system_phrase_table->search + (phrase_length, phrase, tokens); + + if (NULL != m_user_phrase_table) + result |= m_user_phrase_table->search + (phrase_length, phrase, tokens); + + return result; + } + + /** + * FacadePhraseTable2::add_index: + * @phrase_length: the length of the phrase to be added. + * @phrase: the ucs4 characters of the phrase to be added. + * @token: the token of the phrase to be added. + * @returns: the add result of enum ErrorResult. + * + * Add the phrase token to the user phrase table. + * + */ + int add_index(int phrase_length, /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token) { + if (NULL == m_user_phrase_table) + return ERROR_NO_USER_TABLE; + + return m_user_phrase_table->add_index + (phrase_length, phrase, token); + } + + /** + * FacadePhraseTable2::remove_index: + * @phrase_length: the length of the phrase to be removed. + * @phrase: the ucs4 characters of the phrase to be removed. + * @token: the token of the phrase to be removed. + * @returns: the remove result of enum ErrorResult. + * + * Remove the phrase token from the user phrase table. + * + */ + int remove_index(int phrase_length, /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token) { + if (NULL == m_user_phrase_table) + return ERROR_NO_USER_TABLE; + + return m_user_phrase_table->remove_index + (phrase_length, phrase, token); + } + + /** + * FacadePhraseTable2::mask_out: + * @mask: the mask. + * @value: the value. + * @returns: whether the mask out operation is successful. 
+ * + * Mask out the matched phrase index. + * + */ + bool mask_out(phrase_token_t mask, phrase_token_t value) { + if (NULL == m_user_phrase_table) + return false; + + return m_user_phrase_table->mask_out + (mask, value); + } +}; + +}; + + +#endif diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h new file mode 100644 index 0000000..6cff7ff --- /dev/null +++ b/src/storage/flexible_ngram.h @@ -0,0 +1,719 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + + +#ifndef FLEXIBLE_NGRAM_H +#define FLEXIBLE_NGRAM_H + +#include <db.h> +#include <errno.h> + +/* Note: the signature of the template parameters. + * struct MagicHeader, ArrayHeader, ArrayItem. + */ + +namespace pinyin{ + +typedef GArray * FlexibleBigramPhraseArray; + +/** + * FlexibleSingleGram: + * @ArrayHeader: the struct ArrayHeader. + * @ArrayItem: the struct ArrayItem. + * + * The flexible single gram is mainly used for training purpose. 
+ * + */ + +template<typename ArrayHeader, typename ArrayItem> +class FlexibleSingleGram{ + template<typename MH, typename AH, + typename AI> + friend class FlexibleBigram; +private: + MemoryChunk m_chunk; + FlexibleSingleGram(void * buffer, size_t length){ + m_chunk.set_chunk(buffer, length, NULL); + } +public: + /** + * ArrayItemWithToken: + * + * Define the struct ArrayItemWithToken type. + * + */ + typedef struct{ + phrase_token_t m_token; + ArrayItem m_item; + } ArrayItemWithToken; + +private: + static bool token_less_than(const ArrayItemWithToken & lhs, + const ArrayItemWithToken & rhs){ + return lhs.m_token < rhs.m_token; + } + +public: + /** + * FlexibleSingleGram::FlexibleSingleGram: + * + * The constructor of the FlexibleSingleGram. + * + */ + FlexibleSingleGram(){ + m_chunk.set_size(sizeof(ArrayHeader)); + memset(m_chunk.begin(), 0, sizeof(ArrayHeader)); + } + + /** + * FlexibleSingleGram::retrieve_all: + * @array: the array to store all items in this single gram. + * @returns: whether the retrieve operation is successful. + * + * Retrieve all items in this single gram. + * + */ + bool retrieve_all(/* out */ FlexibleBigramPhraseArray array){ + const ArrayItemWithToken * begin = (const ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + const ArrayItemWithToken * end = (const ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken item; + for ( const ArrayItemWithToken * cur_item = begin; + cur_item != end; + ++cur_item){ + /* Note: optimize this with g_array_append_vals? */ + item.m_token = cur_item->m_token; + item.m_item = cur_item->m_item; + g_array_append_val(array, item); + } + + return true; + } + + /** + * FlexibleSingleGram::search: + * @range: the token range. + * @array: the array to store the array items with token in the range. + * @returns: whether the search operation is successful. + * + * Search the array items with token in the range. + * + * Note: The array result may contain many items. 
+ * + */ + bool search(/* in */ PhraseIndexRange * range, + /* out */ FlexibleBigramPhraseArray array){ + const ArrayItemWithToken * begin = (const ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + const ArrayItemWithToken * end = (const ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = range->m_range_begin; + const ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + ArrayItemWithToken item; + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token >= range->m_range_end ) + break; + item.m_token = cur_item->m_token; + item.m_item = cur_item->m_item; + g_array_append_val(array, item); + } + + return true; + } + + /** + * FlexibleSingleGram::insert_array_item: + * @token: the phrase token to be inserted. + * @item: the array item of this token. + * @returns: whether the insert operation is successful. + * + * Insert the array item of the token. + * + */ + bool insert_array_item(/* in */ phrase_token_t token, + /* in */ const ArrayItem & item){ + ArrayItemWithToken * begin = (ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + ArrayItemWithToken * end = (ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = token; + ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + ArrayItemWithToken insert_item; + insert_item.m_token = token; + insert_item.m_item = item; + + for ( ; cur_item != end; ++cur_item ){ + if ( cur_item->m_token > token ){ + size_t offset = sizeof(ArrayHeader) + + sizeof(ArrayItemWithToken) * (cur_item - begin); + m_chunk.insert_content(offset, &insert_item, + sizeof(ArrayItemWithToken)); + return true; + } + if ( cur_item->m_token == token ){ + return false; + } + } + m_chunk.insert_content(m_chunk.size(), &insert_item, + sizeof(ArrayItemWithToken)); + return true; + } + + /** + * 
FlexibleSingleGram::remove_array_item:
     * @token: the phrase token to be removed.
     * @item: the content of the removed array item.
     * @returns: whether the remove operation is successful.
     *
     * Remove the array item of the token.
     *
     */
    bool remove_array_item(/* in */ phrase_token_t token,
                           /* out */ ArrayItem & item)
    {
        /* clear retval */
        memset(&item, 0, sizeof(ArrayItem));

        const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
            ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
        const ArrayItemWithToken * end = (const ArrayItemWithToken *)
            m_chunk.end();

        ArrayItemWithToken key;
        key.m_token = token;
        const ArrayItemWithToken * pos = std_lite::lower_bound
            (begin, end, key, token_less_than);

        /* lower_bound yields the first item with token >= @token;
           anything but an exact match means the token is absent. */
        if (pos == end || pos->m_token != token)
            return false;

        memcpy(&item, &(pos->m_item), sizeof(ArrayItem));
        size_t offset = sizeof(ArrayHeader) +
            sizeof(ArrayItemWithToken) * (pos - begin);
        m_chunk.remove_content(offset, sizeof(ArrayItemWithToken));
        return true;
    }

    /**
     * FlexibleSingleGram::get_array_item:
     * @token: the phrase token.
     * @item: the array item of the token.
     * @returns: whether the get operation is successful.
     *
     * Get the array item of the token.
+ * + */ + bool get_array_item(/* in */ phrase_token_t token, + /* out */ ArrayItem & item) + { + /* clear retval */ + memset(&item, 0, sizeof(ArrayItem)); + + const ArrayItemWithToken * begin = (const ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + const ArrayItemWithToken * end = (const ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = token; + const ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem)); + return true; + } + } + return false; + } + + /** + * FlexibleSingleGram::set_array_item: + * @token: the phrase token. + * @item: the array item of the token. + * @returns: whether the set operation is successful. + * + * Set the array item of the token. + * + */ + bool set_array_item(/* in */ phrase_token_t token, + /* in */ const ArrayItem & item){ + ArrayItemWithToken * begin = (ArrayItemWithToken *) + ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader)); + ArrayItemWithToken * end = (ArrayItemWithToken *) + m_chunk.end(); + + ArrayItemWithToken compare_item; + compare_item.m_token = token; + ArrayItemWithToken * cur_item = std_lite::lower_bound + (begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item ){ + if ( cur_item->m_token > token ){ + return false; + } + if ( cur_item->m_token == token ){ + memcpy(&(cur_item->m_item), &item, sizeof(ArrayItem)); + return true; + } + } + return false; + } + + /** + * FlexibleSingleGram::get_array_header: + * @header: the array header of this single gram. + * @returns: whether the get operation is successful. + * + * Get the array header of this single gram. 
+ * + */ + bool get_array_header(/* out */ ArrayHeader & header){ + /* clear retval */ + memset(&header, 0, sizeof(ArrayHeader)); + char * buf_begin = (char *)m_chunk.begin(); + memcpy(&header, buf_begin, sizeof(ArrayHeader)); + return true; + } + + /** + * FlexibleSingleGram::set_array_header: + * @header: the array header of this single gram. + * @returns: whether the set operation is successful. + * + * Set the array header of this single gram. + * + */ + bool set_array_header(/* in */ const ArrayHeader & header){ + char * buf_begin = (char *)m_chunk.begin(); + memcpy(buf_begin, &header, sizeof(ArrayHeader)); + return true; + } +}; + +/** + * FlexibleBigram: + * @MagicHeader: the struct type of the magic header. + * @ArrayHeader: the struct type of the array header. + * @ArrayItem: the struct type of the array item. + * + * The flexible bi-gram is mainly used for training purpose. + * + */ +template<typename MagicHeader, typename ArrayHeader, + typename ArrayItem> +class FlexibleBigram{ + /* Note: some flexible bi-gram file format check should be here. */ +private: + DB * m_db; + + phrase_token_t m_magic_header_index[2]; + + char m_magic_number[4]; + + void reset(){ + if ( m_db ){ + m_db->sync(m_db, 0); + m_db->close(m_db, 0); + m_db = NULL; + } + } + +public: + /** + * FlexibleBigram::FlexibleBigram: + * @magic_number: the 4 bytes magic number of the flexible bi-gram. + * + * The constructor of the FlexibleBigram. + * + */ + FlexibleBigram(const char * magic_number){ + m_db = NULL; + m_magic_header_index[0] = null_token; + m_magic_header_index[1] = null_token; + + memcpy(m_magic_number, magic_number, sizeof(m_magic_number)); + } + + /** + * FlexibleBigram::~FlexibleBigram: + * + * The destructor of the FlexibleBigram. + * + */ + ~FlexibleBigram(){ + reset(); + } + + /** + * FlexibleBigram::attach: + * @dbfile: the path name of the flexible bi-gram. + * @flags: the attach flags for the Berkeley DB. + * @returns: whether the attach operation is successful. 
+ * + * Attach Berkeley DB on filesystem for training purpose. + * + */ + bool attach(const char * dbfile, guint32 flags){ + reset(); + u_int32_t db_flags = 0; + + if ( flags & ATTACH_READONLY ) + db_flags |= DB_RDONLY; + if ( flags & ATTACH_READWRITE ) + assert( !(flags & ATTACH_READONLY ) ); + + if ( !dbfile ) + return false; + int ret = db_create(&m_db, NULL, 0); + if ( ret != 0 ) + assert(false); + + ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644); + if ( ret != 0 && (flags & ATTACH_CREATE) ) { + db_flags |= DB_CREATE; + /* Create database file here, and write the signature. */ + ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644); + if ( ret != 0 ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = m_magic_header_index; + db_key.size = sizeof(m_magic_header_index); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = m_magic_number; + db_data.size = sizeof(m_magic_number); + db_data.flags = DB_DBT_PARTIAL; + db_data.doff = 0; + db_data.dlen = sizeof(m_magic_number); + + ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); + return ret == 0; + } + + /* check the signature. */ + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = m_magic_header_index; + db_key.size = sizeof(m_magic_header_index); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.flags = DB_DBT_PARTIAL; + db_data.doff = 0; + db_data.dlen = sizeof(m_magic_number); + ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); + if ( ret != 0 ) + return false; + if ( sizeof(m_magic_number) != db_data.size ) + return false; + if ( memcmp(db_data.data, m_magic_number, + sizeof(m_magic_number)) == 0 ) + return true; + return false; + } + + /** + * FlexibleBigram::load: + * @index: the previous token in the flexible bi-gram. + * @single_gram: the single gram of the previous token. + * @returns: whether the load operation is successful. + * + * Load the single gram of the previous token. 
+ * + */ + bool load(phrase_token_t index, + FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + single_gram = NULL; + + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); + if ( ret != 0) + return false; + + single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem> + (db_data.data, db_data.size); + + return true; + } + + /** + * FlexibleBigram::store: + * @index: the previous token in the flexible bi-gram. + * @single_gram: the single gram of the previous token. + * @returns: whether the store operation is successful. + * + * Store the single gram of the previous token. + * + */ + bool store(phrase_token_t index, + FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = single_gram->m_chunk.begin(); + db_data.size = single_gram->m_chunk.size(); + + int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); + return ret == 0; + } + + /** + * FlexibleBigram::remove: + * @index: the previous token in the flexible bi-gram. + * @returns: whether the remove operation is successful. + * + * Remove the single gram of the previous token. + * + */ + bool remove(phrase_token_t index){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + int ret = m_db->del(m_db, NULL, &db_key, 0); + return ret == 0; + } + + /** + * FlexibleBigram::get_all_items: + * @items: the GArray to store all previous tokens. + * @returns: whether the get operation is successful. + * + * Get the array of all previous tokens for parameter estimation. 
+ * + */ + bool get_all_items(GArray * items){ + g_array_set_size(items, 0); + + if ( !m_db ) + return false; + + DBC * cursorp; + DBT key, data; + int ret; + + /* Get a cursor */ + m_db->cursor(m_db, NULL, &cursorp, 0); + + if (NULL == cursorp) + return false; + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){ + if (key.size != sizeof(phrase_token_t)){ + /* skip magic header. */ + continue; + } + phrase_token_t * token = (phrase_token_t *) key.data; + g_array_append_val(items, *token); + } + + if ( ret != DB_NOTFOUND ){ + fprintf(stderr, "training db error, exit!"); + + if (cursorp != NULL) + cursorp->c_close(cursorp); + + exit(EIO); + } + + /* Cursors must be closed */ + if (cursorp != NULL) + cursorp->c_close(cursorp); + return true; + } + + /** + * FlexibleBigram::get_magic_header: + * @header: the magic header. + * @returns: whether the get operation is successful. + * + * Get the magic header of the flexible bi-gram. + * + */ + bool get_magic_header(MagicHeader & header){ + /* clear retval */ + memset(&header, 0, sizeof(MagicHeader)); + + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = m_magic_header_index; + db_key.size = sizeof(m_magic_header_index); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.flags = DB_DBT_PARTIAL; + db_data.doff = sizeof(m_magic_number); + db_data.dlen = sizeof(MagicHeader); + + int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); + if ( ret != 0 ) + return false; + + if ( sizeof(MagicHeader) != db_data.size ) + return false; + + memcpy(&header, db_data.data, sizeof(MagicHeader)); + return true; + } + + /** + * FlexibleBigram::set_magic_header: + * @header: the magic header. + * @returns: whether the set operation is successful. + * + * Set the magic header of the flexible bi-gram. 
+ * + */ + bool set_magic_header(const MagicHeader & header){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = m_magic_header_index; + db_key.size = sizeof(m_magic_header_index); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = (void *) &header; + db_data.size = sizeof(MagicHeader); + db_data.flags = DB_DBT_PARTIAL; + db_data.doff = sizeof(m_magic_number); + db_data.dlen = sizeof(MagicHeader); + + int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); + return ret == 0; + } + + /** + * FlexibleBigram::get_array_header: + * @index: the previous token in the flexible bi-gram. + * @header: the array header in the single gram of the previous token. + * @returns: whether the get operation is successful. + * + * Get the array header in the single gram of the previous token. + * + */ + bool get_array_header(phrase_token_t index, ArrayHeader & header){ + /* clear retval */ + memset(&header, 0, sizeof(ArrayHeader)); + + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.flags = DB_DBT_PARTIAL; + db_data.doff = 0; + db_data.dlen = sizeof(ArrayHeader); + int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); + if ( ret != 0 ) + return false; + + assert(db_data.size == sizeof(ArrayHeader)); + memcpy(&header, db_data.data, sizeof(ArrayHeader)); + return true; + } + + /** + * FlexibleBigram::set_array_header: + * @index: the previous token of the flexible bi-gram. + * @header: the array header in the single gram of the previous token. + * @returns: whether the set operation is successful. + * + * Set the array header in the single gram of the previous token. 
+ * + */ + bool set_array_header(phrase_token_t index, const ArrayHeader & header){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = (void *)&header; + db_data.size = sizeof(ArrayHeader); + db_data.flags = DB_DBT_PARTIAL; + db_data.doff = 0; + db_data.dlen = sizeof(ArrayHeader); + + int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); + return ret == 0; + } + +}; + +}; + +#endif diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp new file mode 100644 index 0000000..3964388 --- /dev/null +++ b/src/storage/ngram.cpp @@ -0,0 +1,602 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <errno.h> +#include <glib.h> +#include <glib/gstdio.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "ngram.h" + +using namespace pinyin; + +struct SingleGramItem{ + phrase_token_t m_token; + guint32 m_freq; +}; + +SingleGram::SingleGram(){ + m_chunk.set_size(sizeof(guint32)); + memset(m_chunk.begin(), 0, sizeof(guint32)); +} + +SingleGram::SingleGram(void * buffer, size_t length){ + m_chunk.set_chunk(buffer, length, NULL); +} + +bool SingleGram::get_total_freq(guint32 & total) const{ + char * buf_begin = (char *)m_chunk.begin(); + total = *((guint32 *)buf_begin); + return true; +} + +bool SingleGram::set_total_freq(guint32 total){ + char * buf_begin = (char *)m_chunk.begin(); + *((guint32 *)buf_begin) = total; + return true; +} + +guint32 SingleGram::get_length(){ + /* get the number of items. */ + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); + + const guint32 length = end - begin; + + if (0 == length) { + /* no items here, total freq should be zero. */ + guint32 total_freq = 0; + assert(get_total_freq(total_freq)); + assert(0 == total_freq); + } + + return length; +} + +guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){ + guint32 removed_items = 0; + + guint32 total_freq = 0; + assert(get_total_freq(total_freq)); + + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); + + for (const SingleGramItem * cur = begin; cur != end; ++cur) { + if ((cur->m_token & mask) != value) + continue; + + total_freq -= cur->m_freq; + size_t offset = sizeof(guint32) + + sizeof(SingleGramItem) * (cur - begin); + m_chunk.remove_content(offset, sizeof(SingleGramItem)); + + /* update chunk end. 
*/ + end = (const SingleGramItem *) m_chunk.end(); + ++removed_items; + --cur; + } + + assert(set_total_freq(total_freq)); + return removed_items; +} + +bool SingleGram::prune(){ + assert(false); +#if 0 + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *)m_chunk.end(); + + size_t nitem = 0; + for ( SingleGramItem * cur = begin; cur != end; ++cur){ + cur->m_freq--; + nitem++; + if ( cur->m_freq == 0 ){ + size_t offset = sizeof(guint32) + (cur - begin) + * sizeof(SingleGramItem) ; + m_chunk.remove_content(offset, sizeof(SingleGramItem)); + } + } + guint32 total_freq; + assert(get_total_freq(total_freq)); + assert(set_total_freq(total_freq - nitem)); +#endif + return true; +} + +static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){ + return lhs.m_token < rhs.m_token; +} + +bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array) + const { + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); + + guint32 total_freq; + BigramPhraseItemWithCount bigram_item_with_count; + assert(get_total_freq(total_freq)); + + for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){ + bigram_item_with_count.m_token = cur_item->m_token; + bigram_item_with_count.m_count = cur_item->m_freq; + bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq; + g_array_append_val(array, bigram_item_with_count); + } + + return true; +} + +bool SingleGram::search(/* in */ PhraseIndexRange * range, + /* out */ BigramPhraseArray array) const { + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + + SingleGramItem compare_item; + compare_item.m_token = range->m_range_begin; + 
const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + guint32 total_freq; + BigramPhraseItem bigram_item; + assert(get_total_freq(total_freq)); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token >= range->m_range_end ) + break; + bigram_item.m_token = cur_item->m_token; + bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq; + g_array_append_val(array, bigram_item); + } + + return true; +} + +bool SingleGram::insert_freq( /* in */ phrase_token_t token, + /* in */ guint32 freq){ + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *) m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + SingleGramItem insert_item; + insert_item.m_token = token; + insert_item.m_freq = freq; + for ( ; cur_item != end; ++cur_item ){ + if ( cur_item->m_token > token ){ + size_t offset = sizeof(guint32) + + sizeof(SingleGramItem) * (cur_item - begin); + m_chunk.insert_content(offset, &insert_item, + sizeof(SingleGramItem)); + return true; + } + if ( cur_item->m_token == token ){ + return false; + } + } + m_chunk.insert_content(m_chunk.size(), &insert_item, + sizeof(SingleGramItem)); + return true; +} + +bool SingleGram::remove_freq( /* in */ phrase_token_t token, + /* out */ guint32 & freq){ + freq = 0; + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item ){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + freq = cur_item -> m_freq; + 
size_t offset = sizeof(guint32) + + sizeof(SingleGramItem) * (cur_item - begin); + m_chunk.remove_content(offset, sizeof(SingleGramItem)); + return true; + } + } + return false; +} + +bool SingleGram::get_freq(/* in */ phrase_token_t token, + /* out */ guint32 & freq) const { + freq = 0; + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + freq = cur_item -> m_freq; + return true; + } + } + return false; +} + +bool SingleGram::set_freq( /* in */ phrase_token_t token, + /* in */ guint32 freq){ + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + for ( ;cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ){ + return false; + } + if ( cur_item->m_token == token ){ + cur_item -> m_freq = freq; + return true; + } + } + return false; +} + +bool Bigram::load_db(const char * dbfile){ + reset(); + + /* create in memory db. */ + int ret = db_create(&m_db, NULL, 0); + assert(ret == 0); + + ret = m_db->open(m_db, NULL, NULL, NULL, + DB_HASH, DB_CREATE, 0600); + if ( ret != 0 ) + return false; + + /* load db into memory. 
*/ + DB * tmp_db = NULL; + ret = db_create(&tmp_db, NULL, 0); + assert(ret == 0); + + if (NULL == tmp_db) + return false; + + ret = tmp_db->open(tmp_db, NULL, dbfile, NULL, + DB_HASH, DB_RDONLY, 0600); + if ( ret != 0 ) + return false; + + DBC * cursorp = NULL; + DBT key, data; + + /* Get a cursor */ + tmp_db->cursor(tmp_db, NULL, &cursorp, 0); + + if (NULL == cursorp) + return false; + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + int ret = m_db->put(m_db, NULL, &key, &data, 0); + assert(ret == 0); + } + assert (ret == DB_NOTFOUND); + + /* Cursors must be closed */ + if ( cursorp != NULL ) + cursorp->c_close(cursorp); + + if ( tmp_db != NULL ) + tmp_db->close(tmp_db, 0); + + return true; +} + +bool Bigram::save_db(const char * dbfile){ + DB * tmp_db = NULL; + + int ret = unlink(dbfile); + if ( ret != 0 && errno != ENOENT) + return false; + + ret = db_create(&tmp_db, NULL, 0); + assert(ret == 0); + + if (NULL == tmp_db) + return false; + + ret = tmp_db->open(tmp_db, NULL, dbfile, NULL, + DB_HASH, DB_CREATE, 0600); + if ( ret != 0 ) + return false; + + DBC * cursorp = NULL; + DBT key, data; + /* Get a cursor */ + m_db->cursor(m_db, NULL, &cursorp, 0); + + if (NULL == cursorp) + return false; + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. 
*/ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0); + assert(ret == 0); + } + assert (ret == DB_NOTFOUND); + + /* Cursors must be closed */ + if ( cursorp != NULL ) + cursorp->c_close(cursorp); + + if ( tmp_db != NULL ) + tmp_db->close(tmp_db, 0); + + return true; +} + +bool Bigram::attach(const char * dbfile, guint32 flags){ + reset(); + u_int32_t db_flags = 0; + + if ( flags & ATTACH_READONLY ) + db_flags |= DB_RDONLY; + if ( flags & ATTACH_READWRITE ) + assert( !( flags & ATTACH_READONLY ) ); + if ( flags & ATTACH_CREATE ) + db_flags |= DB_CREATE; + + if ( !dbfile ) + return false; + int ret = db_create(&m_db, NULL, 0); + if ( ret != 0 ) + assert(false); + + ret = m_db->open(m_db, NULL, dbfile, NULL, + DB_HASH, db_flags, 0644); + if ( ret != 0) + return false; + + return true; +} + +bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){ + single_gram = NULL; + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); + if ( ret != 0 ) + return false; + + single_gram = new SingleGram(db_data.data, db_data.size); + return true; +} + +bool Bigram::store(phrase_token_t index, SingleGram * single_gram){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = single_gram->m_chunk.begin(); + db_data.size = single_gram->m_chunk.size(); + + int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); + return ret == 0; +} + +bool Bigram::remove(/* in */ phrase_token_t index){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + int ret = 
m_db->del(m_db, NULL, &db_key, 0); + return 0 == ret; +} + +bool Bigram::get_all_items(GArray * items){ + g_array_set_size(items, 0); + + if ( !m_db ) + return false; + + DBC * cursorp = NULL; + DBT key, data; + int ret; + /* Get a cursor */ + m_db->cursor(m_db, NULL, &cursorp, 0); + + if (NULL == cursorp) + return false; + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + assert(key.size == sizeof(phrase_token_t)); + phrase_token_t * token = (phrase_token_t *)key.data; + g_array_append_val(items, *token); + } + + assert (ret == DB_NOTFOUND); + + /* Cursors must be closed */ + if (cursorp != NULL) + cursorp->c_close(cursorp); + + return true; +} + +bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){ + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + if (!get_all_items(items)) { + g_array_free(items, TRUE); + return false; + } + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t index = g_array_index(items, phrase_token_t, i); + + if ((index & mask) == value) { + assert(remove(index)); + continue; + } + + SingleGram * gram = NULL; + assert(load(index, gram)); + + int num = gram->mask_out(mask, value); + if (0 == num) { + delete gram; + continue; + } + + if (0 == gram->get_length()) { + assert(remove(index)); + } else { + assert(store(index, gram)); + } + + delete gram; + } + + g_array_free(items, TRUE); + return true; +} + + +namespace pinyin{ + +/* merge origin system info and delta user info */ +bool merge_single_gram(SingleGram * merged, const SingleGram * system, + const SingleGram * user){ + if (NULL == system && NULL == user) + return false; + + MemoryChunk & merged_chunk = merged->m_chunk; + + if (NULL == system) { + merged_chunk.set_chunk(user->m_chunk.begin(), + user->m_chunk.size(), NULL); + return true; + } + + if (NULL == 
user) { + merged_chunk.set_chunk(system->m_chunk.begin(), + system->m_chunk.size(), NULL); + return true; + } + + /* clear merged. */ + merged_chunk.set_size(sizeof(guint32)); + + /* merge the origin info and delta info */ + guint32 system_total, user_total; + assert(system->get_total_freq(system_total)); + assert(user->get_total_freq(user_total)); + const guint32 merged_total = system_total + user_total; + merged_chunk.set_content(0, &merged_total, sizeof(guint32)); + + const SingleGramItem * cur_system = (const SingleGramItem *) + (((const char *)(system->m_chunk.begin())) + sizeof(guint32)); + const SingleGramItem * system_end = (const SingleGramItem *) + system->m_chunk.end(); + + const SingleGramItem * cur_user = (const SingleGramItem *) + (((const char *)(user->m_chunk.begin())) + sizeof(guint32)); + const SingleGramItem * user_end = (const SingleGramItem *) + user->m_chunk.end(); + + while (cur_system < system_end && cur_user < user_end) { + + if (cur_system->m_token < cur_user->m_token) { + /* do append operation here */ + merged_chunk.append_content(cur_system, sizeof(SingleGramItem)); + cur_system++; + } else if (cur_system->m_token > cur_user->m_token) { + /* do append operation here */ + merged_chunk.append_content(cur_user, sizeof(SingleGramItem)); + cur_user++; + } else { + assert(cur_system->m_token == cur_user->m_token); + + SingleGramItem merged_item; + merged_item.m_token = cur_system->m_token; + merged_item.m_freq = cur_system->m_freq + cur_user->m_freq; + + merged_chunk.append_content(&merged_item, sizeof(SingleGramItem)); + cur_system++; cur_user++; + } + } + + /* add remained items. 
*/ + while (cur_system < system_end) { + merged_chunk.append_content(cur_system, sizeof(SingleGramItem)); + cur_system++; + } + + while (cur_user < user_end) { + merged_chunk.append_content(cur_user, sizeof(SingleGramItem)); + cur_user++; + } + + return true; +} + +}; diff --git a/src/storage/ngram.h b/src/storage/ngram.h new file mode 100644 index 0000000..e4045a9 --- /dev/null +++ b/src/storage/ngram.h @@ -0,0 +1,329 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef NGRAM_H +#define NGRAM_H + +#include <db.h> + +namespace pinyin{ + +class Bigram; + +/** Note: + * The system single gram contains the trained freqs. + * The user single gram contains the delta freqs. + * During the Viterbi beam search, use merge_single_gram to merge the system + * single gram and the user single gram. + */ + + +/** + * SingleGram: + * + * The single gram in the bi-gram. + * + */ +class SingleGram{ + friend class Bigram; + friend bool merge_single_gram(SingleGram * merged, + const SingleGram * system, + const SingleGram * user); + +private: + MemoryChunk m_chunk; + SingleGram(void * buffer, size_t length); +public: + /** + * SingleGram::SingleGram: + * + * The constructor of the SingleGram. 
+ * + */ + SingleGram(); + /** + * SingleGram::retrieve_all: + * @array: the GArray to store the retrieved bi-gram phrase item. + * @returns: whether the retrieve operation is successful. + * + * Retrieve all bi-gram phrase items in this single gram. + * + */ + bool retrieve_all(/* out */ BigramPhraseWithCountArray array) const; + + /** + * SingleGram::search: + * @range: the token range. + * @array: the GArray to store the matched bi-gram phrase item. + * @returns: whether the search operation is successful. + * + * Search the bi-gram phrase items according to the token range. + * + * Note: the array result may contain many items. + * + */ + bool search(/* in */ PhraseIndexRange * range, + /* out */ BigramPhraseArray array) const; + + /** + * SingleGram::insert_freq: + * @token: the phrase token. + * @freq: the freq of this token. + * @returns: whether the insert operation is successful. + * + * Insert the token with the freq. + * + */ + bool insert_freq(/* in */ phrase_token_t token, + /* in */ guint32 freq); + + /** + * SingleGram::remove_freq: + * @token: the phrase token. + * @freq: the freq of the removed token. + * @returns: whether the remove operation is successful. + * + * Remove the token. + * + */ + bool remove_freq(/* in */ phrase_token_t token, + /* out */ guint32 & freq); + + /** + * SingleGram::get_freq: + * @token: the phrase token. + * @freq: the freq of the token. + * @returns: whether the get operation is successful. + * + * Get the freq of the token. + * + */ + bool get_freq(/* in */ phrase_token_t token, + /* out */ guint32 & freq) const; + + /** + * SingleGram::set_freq: + * @token: the phrase token. + * @freq: the freq of the token. + * @returns: whether the set operation is successful. + * + * Set the freq of the token. + * + */ + bool set_freq(/* in */ phrase_token_t token, + /* in */ guint32 freq); + + /** + * SingleGram::get_total_freq: + * @total: the total freq of this single gram. + * @returns: whether the get operation is successful. 
/**
 * Bigram:
 *
 * The Bi-gram class.
 *
 */
class Bigram{
private:
    /* Berkeley DB handle; NULL when no database is attached.
       (grounded by DB->sync/DB->close usage below and the
       "Berkeley DB" wording in the method docs.) */
    DB * m_db;

    /* Flush pending writes and close the database, if one is open. */
    void reset(){
        if ( m_db ){
            m_db->sync(m_db, 0);
            m_db->close(m_db, 0);
            m_db = NULL;
        }
    }

public:
    /**
     * Bigram::Bigram:
     *
     * The constructor of the Bigram.
     *
     */
    Bigram(){
        m_db = NULL;
    }

    /**
     * Bigram::~Bigram:
     *
     * The destructor of the Bigram.
     *
     */
    ~Bigram(){
        reset();
    }

    /**
     * Bigram::load_db:
     * @dbfile: the Berkeley DB file name.
     * @returns: whether the load operation is successful.
     *
     * Load the Berkeley DB into memory.
     *
     */
    bool load_db(const char * dbfile);

    /**
     * Bigram::save_db:
     * @dbfile: the Berkeley DB file name.
     * @returns: whether the save operation is successful.
     *
     * Save the in-memory Berkeley DB into disk.
     *
     */
    bool save_db(const char * dbfile);

    /**
     * Bigram::attach:
     * @dbfile: the Berkeley DB file name.
     * @flags: the flags of enum ATTACH_FLAG.
     * @returns: whether the attach operation is successful.
     *
     * Attach this Bigram with the Berkeley DB.
     *
     */
    bool attach(const char * dbfile, guint32 flags);

    /**
     * Bigram::load:
     * @index: the previous token in the bi-gram.
     * @single_gram: the single gram of the previous token.
     * @returns: whether the load operation is successful.
     *
     * Load the single gram of the previous token.
     *
     */
    bool load(/* in */ phrase_token_t index,
              /* out */ SingleGram * & single_gram);

    /**
     * Bigram::store:
     * @index: the previous token in the bi-gram.
     * @single_gram: the single gram of the previous token.
     * @returns: whether the store operation is successful.
     *
     * Store the single gram of the previous token.
     *
     */
    bool store(/* in */ phrase_token_t index,
               /* in */ SingleGram * single_gram);

    /**
     * Bigram::remove:
     * @index: the previous token in the bi-gram.
     * @returns: whether the remove operation is successful.
     *
     * Remove the single gram of the previous token.
     *
     */
    bool remove(/* in */ phrase_token_t index);

    /**
     * Bigram::get_all_items:
     * @items: the GArray to store all previous tokens.
     * @returns: whether the get operation is successful.
     *
     * Get the array of all previous tokens for parameter estimation.
     *
     */
    bool get_all_items(/* out */ GArray * items);

    /**
     * Bigram::mask_out:
     * @mask: the mask.
     * @value: the value.
     * @returns: whether the mask out operation is successful.
     *
     * Mask out the matched items.
     *
     */
    bool mask_out(phrase_token_t mask, phrase_token_t value);
};
+ * + */ +bool merge_single_gram(SingleGram * merged, const SingleGram * system, + const SingleGram * user); + +}; + +#endif diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp new file mode 100644 index 0000000..5fe61c2 --- /dev/null +++ b/src/storage/phrase_index.cpp @@ -0,0 +1,860 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include "phrase_index.h" +#include "pinyin_custom2.h" + +using namespace pinyin; + +bool PhraseItem::set_n_pronunciation(guint8 n_prouns){ + m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8)); + return true; +} + +bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys, + guint32 & freq){ + guint8 phrase_length = get_phrase_length(); + table_offset_t offset = phrase_item_header + phrase_length * sizeof( ucs4_t) + index * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32)); + + bool retval = m_chunk.get_content + (offset, keys, phrase_length * sizeof(ChewingKey)); + if ( !retval ) + return retval; + return m_chunk.get_content + (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32)); +} + +#if 0 +void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){ + guint8 phrase_length = get_phrase_length(); + set_n_pronunciation(get_n_pronunciation() + 1); + m_chunk.set_content(m_chunk.size(), keys, + phrase_length * sizeof(ChewingKey)); + m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32)); +} +#endif + +bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){ + guint8 phrase_length = get_phrase_length(); + guint8 npron = get_n_pronunciation(); + size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t); + char * buf_begin = (char *) m_chunk.begin(); + guint32 total_freq = 0; + + for (int i = 0; i < npron; ++i) { + char * chewing_begin = buf_begin + offset + + i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); + guint32 * freq = (guint32 *)(chewing_begin + + phrase_length * sizeof(ChewingKey)); + + total_freq += *freq; + + if (0 == pinyin_exact_compare2 + (keys, (ChewingKey *)chewing_begin, phrase_length)) { + /* found the exact match pinyin keys. */ + + /* protect against total_freq overflow. 
*/ + if (delta > 0 && total_freq > total_freq + delta) + return false; + + *freq += delta; + total_freq += delta; + return true; + } + } + + set_n_pronunciation(npron + 1); + m_chunk.set_content(m_chunk.size(), keys, + phrase_length * sizeof(ChewingKey)); + m_chunk.set_content(m_chunk.size(), &delta, sizeof(guint32)); + return true; +} + +void PhraseItem::remove_nth_pronunciation(size_t index){ + guint8 phrase_length = get_phrase_length(); + set_n_pronunciation(get_n_pronunciation() - 1); + size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t) + + index * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); + m_chunk.remove_content(offset, phrase_length * sizeof(ChewingKey) + sizeof(guint32)); +} + +bool PhraseItem::get_phrase_string(ucs4_t * phrase){ + guint8 phrase_length = get_phrase_length(); + return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t)); +} + +bool PhraseItem::set_phrase_string(guint8 phrase_length, ucs4_t * phrase){ + m_chunk.set_content(0, &phrase_length, sizeof(guint8)); + m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(ucs4_t)); + return true; +} + +void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options, + ChewingKey * keys, + gint32 delta){ + guint8 phrase_length = get_phrase_length(); + guint8 npron = get_n_pronunciation(); + size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t); + char * buf_begin = (char *) m_chunk.begin(); + guint32 total_freq = 0; + + for (int i = 0; i < npron; ++i) { + char * chewing_begin = buf_begin + offset + + i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); + guint32 * freq = (guint32 *)(chewing_begin + + phrase_length * sizeof(ChewingKey)); + total_freq += *freq; + + if (0 == pinyin_compare_with_ambiguities2 + (options, keys, + (ChewingKey *)chewing_begin, phrase_length)) { + + /* protect against total_freq overflow. 
/* Return the cached sum of every item's unigram frequency
 * in this sub phrase index. */
guint32 SubPhraseIndex::get_phrase_index_total_freq(){
    return m_total_freq;
}

/* Add @delta to the unigram frequency of @token, and keep the
 * cached total in sync.
 * Returns an ERROR_* code from novel_types. */
int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
    table_offset_t offset;
    guint32 freq;
    /* the indirect index maps (token & PHRASE_MASK) to the item's
       offset inside m_phrase_content; 0 marks a removed/absent item. */
    bool result = m_phrase_index.get_content
        ((token & PHRASE_MASK)
         * sizeof(table_offset_t), &offset, sizeof(table_offset_t));

    if ( !result )
        return ERROR_OUT_OF_RANGE;

    if ( 0 == offset )
        return ERROR_NO_ITEM;

    /* the unigram frequency lives after the length and npron bytes
       of the item header. */
    result = m_phrase_content.get_content
        (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));

    if ( !result )
        return ERROR_FILE_CORRUPTION;

    //protect total_freq overflow
    if ( delta > 0 && m_total_freq > m_total_freq + delta )
        return ERROR_INTEGER_OVERFLOW;

    freq += delta;
    m_total_freq += delta;
    m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32));

    return ERROR_OK;
}

/* Look up the item for @token.
 * NOTE: @item is made to ALIAS the live storage (set_chunk with a
 * NULL free-func), so writes through item mutate this index in
 * place — SubPhraseIndex::merge relies on exactly that. */
int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
    table_offset_t offset;
    guint8 phrase_length;
    guint8 n_prons;

    bool result = m_phrase_index.get_content
        ((token & PHRASE_MASK)
         * sizeof(table_offset_t), &offset, sizeof(table_offset_t));

    if ( !result )
        return ERROR_OUT_OF_RANGE;

    if ( 0 == offset )
        return ERROR_NO_ITEM;

    result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8));
    if ( !result )
        return ERROR_FILE_CORRUPTION;

    result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8));
    if ( !result )
        return ERROR_FILE_CORRUPTION;

    /* header + UCS4 string + n pronunciations (keys + freq each). */
    size_t length = phrase_item_header + phrase_length * sizeof ( ucs4_t ) + n_prons * ( phrase_length * sizeof (ChewingKey) + sizeof(guint32) );
    item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
    return ERROR_OK;
}
table_offset_t offset = m_phrase_content.size(); + if ( 0 == offset ) + offset = 8; + m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size()); + m_phrase_index.set_content((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + m_total_freq += item->get_unigram_frequency(); + return ERROR_OK; +} + +int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){ + PhraseItem old_item; + + int result = get_phrase_item(token, old_item); + if (result != ERROR_OK) + return result; + + item = new PhraseItem; + //implictly copy data from m_chunk_content. + item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size()); + + const table_offset_t zero_const = 0; + m_phrase_index.set_content((token & PHRASE_MASK) + * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t)); + m_total_freq -= item->get_unigram_frequency(); + return ERROR_OK; +} + +bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ){ + sub_phrases = new SubPhraseIndex; + } + + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); + bool retval = sub_phrases->load(chunk, 0, chunk->size()); + if ( !retval ) + return retval; + m_total_freq += sub_phrases->get_phrase_index_total_freq(); + return retval; +} + +bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){ + table_offset_t end; + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + + sub_phrases->store(new_chunk, 0, end); + return true; +} + +bool FacadePhraseIndex::unload(guint8 phrase_index){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); + delete sub_phrases; + sub_phrases = NULL; + return true; +} + +bool FacadePhraseIndex::diff(guint8 
phrase_index, MemoryChunk * oldchunk, + MemoryChunk * newlog){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + + SubPhraseIndex old_sub_phrases; + old_sub_phrases.load(oldchunk, 0, oldchunk->size()); + PhraseIndexLogger logger; + + bool retval = sub_phrases->diff(&old_sub_phrases, &logger); + logger.store(newlog); + return retval; +} + +bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); + PhraseIndexLogger logger; + logger.load(log); + + bool retval = sub_phrases->merge(&logger); + m_total_freq += sub_phrases->get_phrase_index_total_freq(); + + return retval; +} + +bool FacadePhraseIndex::merge_with_mask(guint8 phrase_index, + MemoryChunk * log, + phrase_token_t mask, + phrase_token_t value){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + + /* check mask and value. */ + phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask); + phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value); + if ((phrase_index & index_mask) != index_value) + return false; + + /* unload old sub phrase index */ + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); + + /* calculate the sub phrase index mask and value. */ + mask &= PHRASE_MASK; value &= PHRASE_MASK; + + /* prepare the new logger. 
/* Attach this sub phrase index to the on-disk image in @chunk.
 * Takes ownership of @chunk (any previously held chunk is freed).
 * Image layout: total_freq, three table offsets, then the indirect
 * index and the phrase content, each terminated by c_separate.
 * @end is the image's upper bound used for a sanity check. */
bool SubPhraseIndex::load(MemoryChunk * chunk,
                          table_offset_t offset, table_offset_t end){
    //save the memory chunk
    if ( m_chunk ){
        delete m_chunk;
        m_chunk = NULL;
    }
    m_chunk = chunk;

    char * buf_begin = (char *)chunk->begin();
    chunk->get_content(offset, &m_total_freq, sizeof(guint32));
    offset += sizeof(guint32);
    /* index_one/two/three: begin of the indirect index, begin of the
       phrase content, and end of the phrase content respectively. */
    table_offset_t index_one, index_two, index_three;
    chunk->get_content(offset, &index_one, sizeof(table_offset_t));
    offset += sizeof(table_offset_t);
    chunk->get_content(offset, &index_two, sizeof(table_offset_t));
    offset += sizeof(table_offset_t);
    chunk->get_content(offset, &index_three, sizeof(table_offset_t));
    offset += sizeof(table_offset_t);
    /* each section must be preceded/followed by the separator byte,
       otherwise the image is corrupt. */
    g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
    g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
    g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
    /* the sub chunks alias @chunk's buffer (NULL free-func). */
    m_phrase_index.set_chunk(buf_begin + index_one,
                             index_two - 1 - index_one, NULL);
    m_phrase_content.set_chunk(buf_begin + index_two,
                               index_three - 1 - index_two, NULL);
    g_return_val_if_fail( index_three <= end, FALSE);
    return true;
}
/* Record the differences between @oldone and this index into
 * @logger: one LOG_MODIFY_HEADER record for the total frequency,
 * then one ADD/REMOVE/MODIFY record per changed token. */
bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
    /* diff the header */
    MemoryChunk oldheader, newheader;
    guint32 total_freq = oldone->get_phrase_index_total_freq();
    oldheader.set_content(0, &total_freq, sizeof(guint32));
    total_freq = get_phrase_index_total_freq();
    newheader.set_content(0, &total_freq, sizeof(guint32));
    logger->append_record(LOG_MODIFY_HEADER, null_token,
                          &oldheader, &newheader);

    /* diff phrase items */
    /* walk the union of both token ranges so that items present in
       only one of the two indices are still visited. */
    PhraseIndexRange oldrange, currange, range;
    oldone->get_range(oldrange); get_range(currange);
    range.m_range_begin = std_lite::min(oldrange.m_range_begin,
                                        currange.m_range_begin);
    range.m_range_end = std_lite::max(oldrange.m_range_end,
                                      currange.m_range_end);
    PhraseItem olditem, newitem;

    for (phrase_token_t token = range.m_range_begin;
         token < range.m_range_end; ++token ){
        bool oldretval = ERROR_OK == oldone->get_phrase_item(token, olditem);
        bool newretval = ERROR_OK == get_phrase_item(token, newitem);

        if ( oldretval ){
            if ( newretval ) { /* compare phrase item. */
                if ( olditem == newitem )
                    continue;
                logger->append_record(LOG_MODIFY_RECORD, token,
                                      &(olditem.m_chunk), &(newitem.m_chunk));
            } else { /* remove phrase item. */
                logger->append_record(LOG_REMOVE_RECORD, token,
                                      &(olditem.m_chunk), NULL);
            }
        } else {
            if ( newretval ){ /* add phrase item. */
                logger->append_record(LOG_ADD_RECORD, token,
                                      NULL, &(newitem.m_chunk));
            } else { /* both empty. */
                /* do nothing. */
            }
        }
    }

    return true;
}
+ */ + memmove(item.m_chunk.begin(), newchunk.begin(), + newchunk.size()); + } + break; + } + case LOG_MODIFY_HEADER:{ + guint32 total_freq = get_phrase_index_total_freq(); + guint32 tmp_freq = 0; + assert(null_token == token); + assert(oldchunk.size() == newchunk.size()); + oldchunk.get_content(0, &tmp_freq, sizeof(guint32)); + if (total_freq != tmp_freq) + return false; + newchunk.get_content(0, &tmp_freq, sizeof(guint32)); + m_total_freq = tmp_freq; + break; + } + default: + assert(false); + } + } + return true; +} + +bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ){ + sub_phrases = new SubPhraseIndex; + } + + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + + PhraseItem * item_ptr = new PhraseItem; + phrase_token_t cur_token = 0; + + while (!feof(infile)){ + int num = fscanf(infile, "%s %s %u %ld", + pinyin, phrase, &token, &freq); + + if (4 != num) + continue; + + if (feof(infile)) + break; + + assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index ); + + glong written; + ucs4_t * phrase_ucs4 = g_utf8_to_ucs4(phrase, -1, NULL, + &written, NULL); + + if ( 0 == cur_token ){ + cur_token = token; + item_ptr->set_phrase_string(written, phrase_ucs4); + } + + if ( cur_token != token ){ + add_phrase_item( cur_token, item_ptr); + delete item_ptr; + item_ptr = new PhraseItem; + cur_token = token; + item_ptr->set_phrase_string(written, phrase_ucs4); + } + + pinyin_option_t options = USE_TONE; + FullPinyinParser2 parser; + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + + if (item_ptr->get_phrase_length() == keys->len) { + item_ptr->add_pronunciation((ChewingKey *)keys->data, freq); + } else { + fprintf(stderr, 
"FacadePhraseIndex::load_text:%s\t%s\n", + pinyin, phrase); + } + + g_array_free(keys, TRUE); + g_array_free(key_rests, TRUE); + g_free(phrase_ucs4); + } + + add_phrase_item( cur_token, item_ptr); + delete item_ptr; +#if 0 + m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq(); +#endif + return true; +} + +int FacadePhraseIndex::get_sub_phrase_range(guint8 & min_index, + guint8 & max_index){ + min_index = PHRASE_INDEX_LIBRARY_COUNT; max_index = 0; + for ( guint8 i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){ + if ( m_sub_phrase_indices[i] ) { + min_index = std_lite::min(min_index, i); + max_index = std_lite::max(max_index, i); + } + } + return ERROR_OK; +} + +int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){ + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrase ) + return ERROR_NO_SUB_PHRASE_INDEX; + + int result = sub_phrase->get_range(range); + if ( result ) + return result; + + range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin); + range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end); + return ERROR_OK; +} + +int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){ + const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin(); + const table_offset_t * end = (const table_offset_t *)m_phrase_index.end(); + + if (begin == end) { + /* skip empty sub phrase index. */ + range.m_range_begin = 1; + range.m_range_end = 1; + return ERROR_OK; + } + + /* remove trailing zeros. */ + const table_offset_t * poffset = 0; + for (poffset = end - 1; poffset >= begin + 1; --poffset) { + if (0 != *poffset) + break; + } + + range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */ + range.m_range_end = poffset + 1 - begin; /* removed zeros. 
*/ + + return ERROR_OK; +} + +bool FacadePhraseIndex::compact(){ + for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) { + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ) + continue; + + PhraseIndexRange range; + int result = sub_phrase->get_range(range); + if ( result != ERROR_OK ) + continue; + + SubPhraseIndex * new_sub_phrase = new SubPhraseIndex; + + PhraseItem item; + for ( phrase_token_t token = range.m_range_begin; + token < range.m_range_end; + ++token ) { + result = sub_phrase->get_phrase_item(token, item); + if ( result != ERROR_OK ) + continue; + new_sub_phrase->add_phrase_item(token, &item); + } + + delete sub_phrase; + m_sub_phrase_indices[index] = new_sub_phrase; + } + return true; +} + +bool SubPhraseIndex::mask_out(phrase_token_t mask, phrase_token_t value){ + PhraseIndexRange range; + if (ERROR_OK != get_range(range)) + return false; + + /* calculate mask and value for sub phrase index. */ + mask &= PHRASE_MASK; value &= PHRASE_MASK; + + for (phrase_token_t token = range.m_range_begin; + token < range.m_range_end; ++token) { + if ((token & mask) != value) + continue; + + PhraseItem * item = NULL; + remove_phrase_item(token, item); + if (item) + delete item; + } + + return true; +} + +bool FacadePhraseIndex::mask_out(guint8 phrase_index, + phrase_token_t mask, + phrase_token_t value){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if (!sub_phrases) + return false; + + /* check mask and value. 
*/ + phrase_token_t index_mask = PHRASE_INDEX_LIBRARY_INDEX(mask); + phrase_token_t index_value = PHRASE_INDEX_LIBRARY_INDEX(value); + + if ((phrase_index & index_mask ) != index_value) + return false; + + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); + bool retval = sub_phrases->mask_out(mask, value); + m_total_freq += sub_phrases->get_phrase_index_total_freq(); + + return retval; +} + +namespace pinyin{ + + +static bool _peek_header(PhraseIndexLogger * logger, + guint32 & old_total_freq){ + old_total_freq = 0; + + size_t header_count = 0; + LOG_TYPE log_type; phrase_token_t token; + MemoryChunk oldchunk, newchunk; + + while (logger->has_next_record()) { + bool retval = logger->next_record + (log_type, token, &oldchunk, &newchunk); + + if (!retval) + break; + + if (LOG_MODIFY_HEADER != log_type) + continue; + + ++header_count; + + oldchunk.get_content(0, &old_total_freq, sizeof(guint32)); + } + + /* 1 for normal case, 0 for corrupted file. */ + assert(1 >= header_count); + + return 1 == header_count? 
true : false; +} + +bool _compute_new_header(PhraseIndexLogger * logger, + phrase_token_t mask, + phrase_token_t value, + guint32 & new_total_freq) { + + LOG_TYPE log_type; phrase_token_t token; + MemoryChunk oldchunk, newchunk; + PhraseItem olditem, newitem; + + while(logger->has_next_record()) { + bool retval = logger->next_record + (log_type, token, &oldchunk, &newchunk); + + if (!retval) + break; + + if (LOG_MODIFY_HEADER == log_type) + continue; + + if ((token & mask) == value) + continue; + + switch(log_type) { + case LOG_ADD_RECORD:{ + assert( 0 == oldchunk.size() ); + newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(), + NULL); + new_total_freq += newitem.get_unigram_frequency(); + break; + } + case LOG_REMOVE_RECORD:{ + assert( 0 == newchunk.size() ); + olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(), + NULL); + new_total_freq -= olditem.get_unigram_frequency(); + break; + } + case LOG_MODIFY_RECORD:{ + olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(), + NULL); + new_total_freq -= olditem.get_unigram_frequency(); + + newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(), + NULL); + new_total_freq += newitem.get_unigram_frequency(); + break; + } + default: + assert(false); + } + } + + return true; +} + +static bool _write_header(PhraseIndexLogger * logger, + guint32 & old_total_freq, + guint32 & new_total_freq) { + MemoryChunk oldheader, newheader; + oldheader.set_content(0, &old_total_freq, sizeof(guint32)); + newheader.set_content(0, &new_total_freq, sizeof(guint32)); + logger->append_record(LOG_MODIFY_HEADER, null_token, + &oldheader, &newheader); + return true; +} + +static bool _mask_out_records(PhraseIndexLogger * oldlogger, + phrase_token_t mask, + phrase_token_t value, + PhraseIndexLogger * newlogger) { + LOG_TYPE log_type; phrase_token_t token; + MemoryChunk oldchunk, newchunk; + + while(oldlogger->has_next_record()) { + bool retval = oldlogger->next_record + (log_type, token, &oldchunk, &newchunk); + + if 
(!retval) + break; + + if (LOG_MODIFY_HEADER == log_type) + continue; + + if ((token & mask) == value) + continue; + + newlogger->append_record(log_type, token, &oldchunk, &newchunk); + } + + return true; +} + +PhraseIndexLogger * mask_out_phrase_index_logger +(PhraseIndexLogger * oldlogger, phrase_token_t mask, + phrase_token_t value) { + PhraseIndexLogger * newlogger = new PhraseIndexLogger; + guint32 old_total_freq = 0, new_total_freq = 0; + + /* peek the header value. */ + if (!_peek_header(oldlogger, old_total_freq)) + return newlogger; + + new_total_freq = old_total_freq; + + /* compute the new header based on add/modify/remove records. */ + oldlogger->rewind(); + if (!_compute_new_header(oldlogger, mask, value, new_total_freq)) + return newlogger; + + /* write out the modify header record. */ + _write_header(newlogger, old_total_freq, new_total_freq); + + /* mask out the matched records. */ + oldlogger->rewind(); + _mask_out_records(oldlogger, mask, value, newlogger); + + return newlogger; +} + +}; diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h new file mode 100644 index 0000000..e1dad0b --- /dev/null +++ b/src/storage/phrase_index.h @@ -0,0 +1,839 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
/**
 * PhraseItem:
 *
 * The PhraseItem to access the items in phrase index.
 *
 * Layout of m_chunk (see the file-format comment above):
 * <guint8 length><guint8 npron><guint32 unigram freq>
 * <length ucs4 chars><npron x (length ChewingKeys + guint32 freq)>.
 *
 */
class PhraseItem{
    friend class SubPhraseIndex;
    friend bool _compute_new_header(PhraseIndexLogger * logger,
                                    phrase_token_t mask,
                                    phrase_token_t value,
                                    guint32 & new_total_freq);

private:
    /* backing storage for this item's serialized form; may own its
       buffer or alias a SubPhraseIndex chunk (set via set_chunk). */
    MemoryChunk m_chunk;
    bool set_n_pronunciation(guint8 n_prouns);
public:
    /**
     * PhraseItem::PhraseItem:
     *
     * The constructor of the PhraseItem.
     *
     */
    PhraseItem(){
        m_chunk.set_size(phrase_item_header);
        memset(m_chunk.begin(), 0, m_chunk.size());
    }

#if 0
    PhraseItem(MemoryChunk & chunk){
        m_chunk.set_content(0, chunk->begin(), chunk->size());
        assert ( m_chunk.size() >= phrase_item_header);
    }
#endif

    /**
     * PhraseItem::get_phrase_length:
     * @returns: the length of this phrase item.
     *
     * Get the length of this phrase item.
     *
     */
    guint8 get_phrase_length(){
        char * buf_begin = (char *)m_chunk.begin();
        return (*(guint8 *)buf_begin);
    }

    /**
     * PhraseItem::get_n_pronunciation:
     * @returns: the number of the pronunciations.
     *
     * Get the number of the pronunciations.
     *
     */
    guint8 get_n_pronunciation(){
        char * buf_begin = ( char *) m_chunk.begin();
        return (*(guint8 *)(buf_begin + sizeof(guint8)));
    }

    /**
     * PhraseItem::get_unigram_frequency:
     * @returns: the uni-gram frequency of this phrase item.
     *
     * Get the uni-gram frequency of this phrase item.
     *
     */
    guint32 get_unigram_frequency(){
        char * buf_begin = (char *)m_chunk.begin();
        return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
    }

    /**
     * PhraseItem::get_pronunciation_possibility:
     * @options: the pinyin options.
     * @keys: the pronunciation keys.
     * @returns: the possibility of this phrase item pronounces the pinyin.
     *
     * Get the possibility of this phrase item pronounces the pinyin.
     *
     * Computed as (sum of freqs of matching pronunciations) /
     * (sum of freqs of all pronunciations).
     *
     */
    gfloat get_pronunciation_possibility(pinyin_option_t options,
                                         ChewingKey * keys){
        guint8 phrase_length = get_phrase_length();
        guint8 npron = get_n_pronunciation();
        size_t offset = phrase_item_header + phrase_length * sizeof (ucs4_t);
        char * buf_begin = (char *)m_chunk.begin();
        guint32 matched = 0, total_freq =0;
        for ( int i = 0 ; i < npron ; ++i){
            char * chewing_begin = buf_begin + offset +
                i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
            guint32 * freq = (guint32 *)(chewing_begin +
                                         phrase_length * sizeof(ChewingKey));
            total_freq += *freq;
            if ( 0 == pinyin_compare_with_ambiguities2
                 (options, keys,
                  (ChewingKey *)chewing_begin,phrase_length) ){
                matched += *freq;
            }
        }

#if 1
        /* an additional safe guard for chewing. */
        if ( 0 == total_freq )
            return 0;
#endif

        /* used preprocessor to avoid zero freq, in gen_chewing_table. */
        gfloat retval = matched / (gfloat) total_freq;
        return retval;
    }

    /**
     * PhraseItem::increase_pronunciation_possibility:
     * @options: the pinyin options.
     * @keys: the pronunciation keys.
     * @delta: the delta to be added to the pronunciation keys.
     *
     * Add the delta to the pronunciation of the pronunciation keys.
     *
     */
    void increase_pronunciation_possibility(pinyin_option_t options,
                                            ChewingKey * keys,
                                            gint32 delta);

    /**
     * PhraseItem::get_phrase_string:
     * @phrase: the ucs4 character buffer.
     * @returns: whether the get operation is successful.
     *
     * Get the ucs4 characters of this phrase item.
     *
     * Note: @phrase must have room for get_phrase_length() characters.
     *
     */
    bool get_phrase_string(ucs4_t * phrase);

    /**
     * PhraseItem::set_phrase_string:
     * @phrase_length: the ucs4 character length of this phrase item.
     * @phrase: the ucs4 character buffer.
     * @returns: whether the set operation is successful.
     *
     * Set the length and ucs4 characters of this phrase item.
     *
     */
    bool set_phrase_string(guint8 phrase_length, ucs4_t * phrase);

    /**
     * PhraseItem::get_nth_pronunciation:
     * @index: the pronunciation index.
     * @keys: the pronunciation keys.
     * @freq: the frequency of the pronunciation.
     * @returns: whether the get operation is successful.
     *
     * Get the nth pronunciation of this phrase item.
     *
     */
    bool get_nth_pronunciation(size_t index,
                               /* out */ ChewingKey * keys,
                               /* out */ guint32 & freq);

    /**
     * PhraseItem::add_pronunciation:
     * @keys: the pronunciation keys.
     * @delta: the delta of the frequency of the pronunciation.
     * @returns: whether the add operation is successful.
     *
     * Add one pronunciation.
     *
     */
    bool add_pronunciation(ChewingKey * keys, guint32 delta);

    /**
     * PhraseItem::remove_nth_pronunciation:
     * @index: the pronunciation index.
     *
     * Remove the nth pronunciation.
     *
     * Note: Normally don't change the first pronunciation,
     * which decides the token number.
     *
     */
    void remove_nth_pronunciation(size_t index);

    /* byte-wise equality of the serialized item. */
    bool operator == (const PhraseItem & rhs) const{
        if (m_chunk.size() != rhs.m_chunk.size())
            return false;
        return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
                      m_chunk.size()) == 0;
    }

    bool operator != (const PhraseItem & rhs) const{
        return ! (*this == rhs);
    }
};
+ * + */ + SubPhraseIndex():m_total_freq(0){ + m_chunk = NULL; + } + + /** + * SubPhraseIndex::~SubPhraseIndex: + * + * The destructor of the SubPhraseIndex. + * + */ + ~SubPhraseIndex(){ + reset(); + } + + /** + * SubPhraseIndex::load: + * @chunk: the memory chunk of the binary sub phrase index. + * @offset: the begin of binary data in the memory chunk. + * @end: the end of binary data in the memory chunk. + * @returns: whether the load operation is successful. + * + * Load the sub phrase index from the memory chunk. + * + */ + bool load(MemoryChunk * chunk, + table_offset_t offset, table_offset_t end); + + /** + * SubPhraseIndex::store: + * @new_chunk: the new memory chunk to store this sub phrase index. + * @offset: the begin of binary data in the memory chunk. + * @end: the end of stored binary data in the memory chunk. + * @returns: whether the store operation is successful. + * + * Store the sub phrase index to the new memory chunk. + * + */ + bool store(MemoryChunk * new_chunk, + table_offset_t offset, table_offset_t & end); + + /** + * SubPhraseIndex::diff: + * @oldone: the original content of sub phrase index. + * @logger: the delta information of user self-learning data. + * @returns: whether the diff operation is successful. + * + * Compare this sub phrase index with the original content of the system + * sub phrase index to generate the logger of difference. + * + * Note: Switch to logger format to reduce user space storage. + * + */ + bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger); + + /** + * SubPhraseIndex::merge: + * @logger: the logger of difference in user home directory. + * @returns: whether the merge operation is successful. + * + * Merge the user logger of difference with this sub phrase index. + * + */ + bool merge(PhraseIndexLogger * logger); + + /** + * SubPhraseIndex::get_range: + * @range: the token range. + * @returns: whether the get operation is successful. + * + * Get the token range in this sub phrase index. 
+ * + */ + int get_range(/* out */ PhraseIndexRange & range); + + /** + * SubPhraseIndex::get_phrase_index_total_freq: + * @returns: the total frequency of this sub phrase index. + * + * Get the total frequency of this sub phrase index. + * + * Note: maybe call it "Zero-gram". + * + */ + guint32 get_phrase_index_total_freq(); + + /** + * SubPhraseIndex::add_unigram_frequency: + * @token: the phrase token. + * @delta: the delta value of the phrase token. + * @returns: the status of the add operation. + * + * Add delta value to the phrase of the token. + * + * Note: this method is a fast path to add delta value. + * Maybe use the get_phrase_item method instead in future. + * + */ + int add_unigram_frequency(phrase_token_t token, guint32 delta); + + /** + * SubPhraseIndex::get_phrase_item: + * @token: the phrase token. + * @item: the phrase item of the token. + * @returns: the status of the get operation. + * + * Get the phrase item from this sub phrase index. + * + * Note:get_phrase_item function can't modify the phrase item size, + * but can increment the freq of the special pronunciation, + * or change the content without size increasing. + * + */ + int get_phrase_item(phrase_token_t token, PhraseItem & item); + + /** + * SubPhraseIndex::add_phrase_item: + * @token: the phrase token. + * @item: the phrase item of the token. + * @returns: the status of the add operation. + * + * Add the phrase item to this sub phrase index. + * + */ + int add_phrase_item(phrase_token_t token, PhraseItem * item); + + /** + * SubPhraseIndex::remove_phrase_item: + * @token: the phrase token. + * @item: the removed phrase item of the token. + * @returns: the status of the remove operation. + * + * Remove the phrase item of the token. + * + * Note: this remove_phrase_item method will substract the unigram + * frequency of the removed item from m_total_freq. 
+ * + */ + int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item); + + /** + * SubPhraseIndex::mask_out: + * @mask: the mask. + * @value: the value. + * @returns: whether the mask out operation is successful. + * + * Mask out the matched phrase items. + * + */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + +/** + * FacadePhraseIndex: + * + * The facade class of phrase index. + * + */ +class FacadePhraseIndex{ +private: + guint32 m_total_freq; + SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT]; +public: + /** + * FacadePhraseIndex::FacadePhraseIndex: + * + * The constructor of the FacadePhraseIndex. + * + */ + FacadePhraseIndex(){ + m_total_freq = 0; + memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices)); + } + + /** + * FacadePhraseIndex::~FacadePhraseIndex: + * + * The destructor of the FacadePhraseIndex. + * + */ + ~FacadePhraseIndex(){ + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){ + if ( m_sub_phrase_indices[i] ){ + delete m_sub_phrase_indices[i]; + m_sub_phrase_indices[i] = NULL; + } + } + } + + /** + * FacadePhraseIndex::load_text: + * @phrase_index: the index of sub phrase index to be loaded. + * @infile: the textual format file of the phrase table. + * @returns: whether the load operation is successful. + * + * Load one sub phrase index from the textual format file. + * Note: load sub phrase index according to the config in future. + * + */ + bool load_text(guint8 phrase_index, FILE * infile); + + /** + * FacadePhraseIndex::load: + * @phrase_index: the index of sub phrase index to be loaded. + * @chunk: the memory chunk of sub phrase index to be loaded. + * @returns: whether the load operation is successful. + * + * Load one sub phrase index from the memory chunk. + * + */ + bool load(guint8 phrase_index, MemoryChunk * chunk); + + /** + * FacadePhraseIndex::store: + * @phrase_index: the index of sub phrase index to be stored. 
+ * @new_chunk: the memory chunk of sub phrase index to be stored. + * @returns: whether the store operation is successful. + * + * Store one sub phrase index to the memory chunk. + * + */ + bool store(guint8 phrase_index, MemoryChunk * new_chunk); + + /** + * FacadePhraseIndex::unload: + * @phrase_index: the index of sub phrase index to be unloaded. + * @returns: whether the unload operation is successful. + * + * Unload one sub phrase index. + * + */ + bool unload(guint8 phrase_index); + + + /** + * FacadePhraseIndex::diff: + * @phrase_index: the index of sub phrase index to be differed. + * @oldchunk: the original content of sub phrase index. + * @newlog: the delta information of user self-learning data. + * @returns: whether the diff operation is successful. + * + * Store user delta information in the logger format. + * + * Note: the ownership of oldchunk is transfered here. + * + */ + bool diff(guint8 phrase_index, MemoryChunk * oldchunk, + MemoryChunk * newlog); + + /** + * FacadePhraseIndex::merge: + * @phrase_index: the index of sub phrase index to be merged. + * @log: the logger of difference in user home directory. + * @returns: whether the merge operation is successful. + * + * Merge the user logger of difference with the sub phrase index. + * + * Note: the ownership of log is transfered here. + * + */ + bool merge(guint8 phrase_index, MemoryChunk * log); + + /** + * FacadePhraseIndex::merge_with_mask: + * @phrase_index: the index of sub phrase index to be merged. + * @log: the logger of difference in user home directory. + * @mask: the mask. + * @value: the value. + * @returns: whether the merge operation is successful. + * + * Merge the user logger of difference with mask operation. + * + * Note: the ownership of log is transfered here. + * + */ + bool merge_with_mask(guint8 phrase_index, MemoryChunk * log, + phrase_token_t mask, phrase_token_t value); + + /** + * FacadePhraseIndex::compact: + * @returns: whether the compact operation is successful. 
+ * + * Compat all sub phrase index memory usage. + * + */ + bool compact(); + + /** + * FacadePhraseIndex::mask_out: + * @phrase_index: the index of sub phrase index. + * @mask: the mask. + * @value: the value. + * @returns: whether the mask out operation is successful. + * + * Mask out the matched phrase items. + * + * Note: should call compact() after the mask out operation. + * + */ + bool mask_out(guint8 phrase_index, + phrase_token_t mask, phrase_token_t value); + + /** + * FacadePhraseIndex::get_sub_phrase_range: + * @min_index: the minimal sub phrase index. + * @max_index: the maximal sub phrase index. + * @returns: the status of the get operation. + * + * Get the minimum and maximum of the sub phrase index. + * + */ + int get_sub_phrase_range(guint8 & min_index, guint8 & max_index); + + /** + * FacadePhraseIndex::get_range: + * @phrase_index: the index of sub phrase index. + * @range: the token range of the sub phrase index. + * @returns: the status of the get operation. + * + * Get the token range of the sub phrase index. + * + */ + int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range); + + /** + * FacadePhraseIndex::get_phrase_index_total_freq: + * @returns: the total freq of the facade phrase index. + * + * Get the total freq of the facade phrase index. + * + * Note: maybe call it "Zero-gram". + * + */ + guint32 get_phrase_index_total_freq(){ + return m_total_freq; + } + + /** + * FacadePhraseIndex::add_unigram_frequency: + * @token: the phrase token. + * @delta: the delta value of the phrase token. + * @returns: the status of the add operation. + * + * Add delta value to the phrase of the token. 
+ * + */ + int add_unigram_frequency(phrase_token_t token, guint32 delta){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ) + return ERROR_NO_SUB_PHRASE_INDEX; + m_total_freq += delta; + return sub_phrase->add_unigram_frequency(token, delta); + } + + /** + * FacadePhraseIndex::get_phrase_item: + * @token: the phrase token. + * @item: the phrase item of the token. + * @returns: the status of the get operation. + * + * Get the phrase item from the facade phrase index. + * + */ + int get_phrase_item(phrase_token_t token, PhraseItem & item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ) + return ERROR_NO_SUB_PHRASE_INDEX; + return sub_phrase->get_phrase_item(token, item); + } + + /** + * FacadePhraseIndex::add_phrase_item: + * @token: the phrase token. + * @item: the phrase item of the token. + * @returns: the status of the add operation. + * + * Add the phrase item to the facade phrase index. + * + */ + int add_phrase_item(phrase_token_t token, PhraseItem * item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ){ + sub_phrase = new SubPhraseIndex; + } + m_total_freq += item->get_unigram_frequency(); + return sub_phrase->add_phrase_item(token, item); + } + + /** + * FacadePhraseIndex::remove_phrase_item: + * @token: the phrase token. + * @item: the removed phrase item of the token. + * @returns: the status of the remove operation. + * + * Remove the phrase item of the token. 
+ * + */ + int remove_phrase_item(phrase_token_t token, PhraseItem * & item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ){ + return ERROR_NO_SUB_PHRASE_INDEX; + } + int result = sub_phrase->remove_phrase_item(token, item); + if ( result ) + return result; + m_total_freq -= item->get_unigram_frequency(); + return result; + } + + /** + * FacadePhraseIndex::prepare_ranges: + * @ranges: the ranges to be prepared. + * @returns: whether the prepare operation is successful. + * + * Prepare the ranges. + * + */ + bool prepare_ranges(PhraseIndexRanges ranges) { + /* assume memset(ranges, 0, sizeof(ranges)); */ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & range = ranges[i]; + assert(NULL == range); + + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i]; + if (sub_phrase) { + range = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange)); + } + } + return true; + } + + /** + * FacadePhraseIndex::clear_ranges: + * @ranges: the ranges to be cleared. + * @returns: whether the clear operation is successful. + * + * Clear the ranges. + * + */ + bool clear_ranges(PhraseIndexRanges ranges) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * range = ranges[i]; + if (range) { + g_array_set_size(range, 0); + } + } + return true; + } + + /** + * FacadePhraseIndex::destroy_ranges: + * @ranges: the ranges to be destroyed. + * @returns: whether the destroy operation is successful. + * + * Destroy the ranges. + * + */ + bool destroy_ranges(PhraseIndexRanges ranges) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & range = ranges[i]; + if (range) { + g_array_free(range, TRUE); + range = NULL; + } + } + return true; + } + + /** + * FacadePhraseIndex::prepare_tokens: + * @tokens: the tokens to be prepared. + * @returns: whether the prepare operation is successful. + * + * Prepare the tokens. 
+ * + */ + bool prepare_tokens(PhraseTokens tokens) { + /* assume memset(tokens, 0, sizeof(tokens)); */ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & token = tokens[i]; + assert(NULL == token); + + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i]; + if (sub_phrase) { + token = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + } + } + return true; + } + + /** + * FacadePhraseIndex::clear_tokens: + * @tokens: the tokens to be cleared. + * @return: whether the clear operation is successful. + * + * Clear the tokens. + * + */ + bool clear_tokens(PhraseTokens tokens) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * token = tokens[i]; + if (token) { + g_array_set_size(token, 0); + } + } + return true; + } + + /** + * FacadePhraseIndex::destroy_tokens: + * @tokens: the tokens to be destroyed. + * @returns: whether the destroy operation is successful. + * + * Destroy the tokens. + * + */ + bool destroy_tokens(PhraseTokens tokens) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & token = tokens[i]; + if (token) { + g_array_free(token, TRUE); + token = NULL; + } + } + return true; + } + + /** + * FacadePhraseIndex::create_sub_phrase: + * @index: the phrase index to be created. + * @returns: the result of the create operation. + * + * Create the sub phrase index. + * + */ + int create_sub_phrase(guint8 index) { + SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index]; + if (sub_phrase) { + return ERROR_ALREADY_EXISTS; + } + + sub_phrase = new SubPhraseIndex; + + return ERROR_OK; + } +}; + +PhraseIndexLogger * mask_out_phrase_index_logger +(PhraseIndexLogger * oldlogger, phrase_token_t mask, phrase_token_t value); + +}; + +#endif diff --git a/src/storage/phrase_index_logger.h b/src/storage/phrase_index_logger.h new file mode 100644 index 0000000..06f933e --- /dev/null +++ b/src/storage/phrase_index_logger.h @@ -0,0 +1,305 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef PHRASE_LOGGER_H +#define PHRASE_LOGGER_H + +#include <assert.h> +#include "novel_types.h" +#include "memory_chunk.h" + +/** + * File Format + * Logger Record type: add/remove/modify + * + * Modify Header: header/null token/len/old data chunk/new data chunk + * + * Add Record: add/token/len/data chunk + * Remove Record: remove/token/len/data chunk + * Modify Record: modify/token/old len/new len/old data chunk/new data chunk + * + */ + +namespace pinyin{ + +enum LOG_TYPE{ + LOG_ADD_RECORD = 1, + LOG_REMOVE_RECORD, + LOG_MODIFY_RECORD, + LOG_MODIFY_HEADER +}; + + +/** + * PhraseIndexLogger: + * + * The logger of phrase index changes. + * + */ +class PhraseIndexLogger{ +protected: + MemoryChunk * m_chunk; + size_t m_offset; + bool m_error; + + void reset(){ + if ( m_chunk ){ + delete m_chunk; + m_chunk = NULL; + } + m_offset = 0; + m_error = false; + } +public: + /** + * PhraseIndexLogger::PhraseIndexLogger: + * + * The constructor of the PhraseIndexLogger. + * + */ + PhraseIndexLogger():m_offset(0), m_error(false){ + m_chunk = new MemoryChunk; + } + + /** + * PhraseIndexLogger::~PhraseIndexLogger: + * + * The destructor of the PhraseIndexLogger. 
+ * + */ + ~PhraseIndexLogger(){ + reset(); + } + + /** + * PhraseIndexLogger::load: + * @chunk: the memory chunk of the logs. + * @returns: whether the load operation is successful. + * + * Load the logs from the memory chunk. + * + */ + bool load(MemoryChunk * chunk) { + reset(); + m_chunk = chunk; + return true; + } + + /** + * PhraseIndexLogger::store: + * @new_chunk: the new memory chunk to store the logs. + * @returns: whether the store operation is successful. + * + * Store the logs to the new memory chunk. + * + */ + bool store(MemoryChunk * new_chunk){ + new_chunk->set_content(0, m_chunk->begin(), m_chunk->size()); + return true; + } + + /** + * PhraseIndexLogger::has_next_record: + * @returns: whether this logger has next record. + * + * Whether this logger has next record. + * + */ + bool has_next_record(){ + if (m_error) + return false; + + return m_offset < m_chunk->size(); + } + + /** + * PhraseIndexLogger::rewind: + * @returns: whether the rewind operation is successful. + * + * Rewind this logger to the begin of logs. + * + */ + bool rewind(){ + m_offset = 0; + return true; + } + + /** + * PhraseIndexLogger::next_record: + * @log_type: the type of this log record. + * @token: the token of this log record. + * @oldone: the original content of the phrase item. + * @newone: the new content of the phrase item. + * + * Read the next log record. + * + * Prolog: has_next_record() returned true. 
+ * + */ + bool next_record(LOG_TYPE & log_type, phrase_token_t & token, + MemoryChunk * oldone, MemoryChunk * newone){ + size_t offset = m_offset; + m_chunk->get_content(offset, &log_type, sizeof(LOG_TYPE)); + offset += sizeof(LOG_TYPE); + m_chunk->get_content(offset, &token, sizeof(phrase_token_t)); + offset += sizeof(phrase_token_t); + + oldone->set_size(0); newone->set_size(0); + + switch(log_type){ + case LOG_ADD_RECORD:{ + guint16 len = 0; + m_chunk->get_content(offset, &len, sizeof(guint16)); + offset += sizeof(guint16); + newone->set_content(0, ((char *)m_chunk->begin()) + offset, len); + offset += len; + break; + } + case LOG_REMOVE_RECORD:{ + guint16 len = 0; + m_chunk->get_content(offset, &len, sizeof(guint16)); + offset += sizeof(guint16); + oldone->set_content(0, ((char *)m_chunk->begin()) + offset, len); + offset += len; + break; + } + case LOG_MODIFY_RECORD:{ + guint16 oldlen = 0, newlen = 0; + m_chunk->get_content(offset, &oldlen, sizeof(guint16)); + offset += sizeof(guint16); + m_chunk->get_content(offset, &newlen, sizeof(guint16)); + offset += sizeof(guint16); + oldone->set_content(0, ((char *)m_chunk->begin()) + offset, + oldlen); + offset += oldlen; + newone->set_content(0, ((char *)m_chunk->begin()) + offset, newlen); + offset += newlen; + break; + } + case LOG_MODIFY_HEADER:{ + assert(token == null_token); + guint16 len = 0; + m_chunk->get_content(offset, &len, sizeof(guint16)); + offset += sizeof(guint16); + oldone->set_content(0, ((char *)m_chunk->begin()) + offset, + len); + offset += len; + newone->set_content(0, ((char *)m_chunk->begin()) + offset, + len); + offset += len; + break; + } + default: + m_error = true; + return false; + } + + m_offset = offset; + return true; + } + + /** + * PhraseIndexLogger::append_record: + * @log_type: the type of this log record. + * @token: the token of this log record. + * @oldone: the original content of the phrase item. + * @newone: the new content of the phrase item. 
+ * + * Append one log record to the logger. + * + */ + bool append_record(LOG_TYPE log_type, phrase_token_t token, + MemoryChunk * oldone, MemoryChunk * newone){ + + MemoryChunk chunk; + size_t offset = 0; + chunk.set_content(offset, &log_type, sizeof(LOG_TYPE)); + offset += sizeof(LOG_TYPE); + chunk.set_content(offset, &token, sizeof(phrase_token_t)); + offset += sizeof(phrase_token_t); + + switch(log_type){ + case LOG_ADD_RECORD:{ + assert( NULL == oldone ); + assert( NULL != newone ); + /* use newone chunk */ + guint16 len = newone->size(); + chunk.set_content(offset, &len, sizeof(guint16)); + offset += sizeof(guint16); + chunk.set_content(offset, newone->begin(), newone->size()); + offset += newone->size(); + break; + } + case LOG_REMOVE_RECORD:{ + assert(NULL != oldone); + assert(NULL == newone); + /* use oldone chunk */ + guint16 len = oldone->size(); + chunk.set_content(offset, &len, sizeof(guint16)); + offset += sizeof(guint16); + chunk.set_content(offset, oldone->begin(), oldone->size()); + offset += oldone->size(); + break; + } + case LOG_MODIFY_RECORD:{ + assert(NULL != oldone); + assert(NULL != newone); + guint16 oldlen = oldone->size(); + guint16 newlen = newone->size(); + chunk.set_content(offset, &oldlen, sizeof(guint16)); + offset += sizeof(guint16); + chunk.set_content(offset, &newlen, sizeof(guint16)); + offset += sizeof(guint16); + chunk.set_content(offset, oldone->begin(), oldone->size()); + offset += oldlen; + chunk.set_content(offset, newone->begin(), newone->size()); + offset += newlen; + break; + } + case LOG_MODIFY_HEADER:{ + assert(NULL != oldone); + assert(NULL != newone); + assert(null_token == token); + guint16 oldlen = oldone->size(); + guint16 newlen = newone->size(); + assert(oldlen == newlen); + chunk.set_content(offset, &oldlen, sizeof(guint16)); + offset += sizeof(guint16); + chunk.set_content(offset, oldone->begin(), oldone->size()); + offset += oldlen; + chunk.set_content(offset, newone->begin(), newone->size()); + offset += 
newlen; + break; + } + default: + assert(false); + } + + /* store log record. */ + m_chunk->set_content(m_chunk->size(), chunk.begin(), chunk.size()); + return true; + } +}; + +}; + +#endif diff --git a/src/storage/phrase_large_table2.cpp b/src/storage/phrase_large_table2.cpp new file mode 100644 index 0000000..f7d8ae2 --- /dev/null +++ b/src/storage/phrase_large_table2.cpp @@ -0,0 +1,809 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <assert.h> +#include <string.h> +#include "phrase_large_table2.h" + + +/* class definition */ + +namespace pinyin{ + +class PhraseLengthIndexLevel2{ +protected: + GArray * m_phrase_array_indexes; +public: + PhraseLengthIndexLevel2(); + ~PhraseLengthIndexLevel2(); + + /* load/store method */ + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end); + + /* search method */ + int search(int phrase_length, /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const; + + /* add_index/remove_index method */ + int add_index(int phrase_length, /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token); + int remove_index(int phrase_length, /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token); + + /* get length method */ + int get_length() const; + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + + +template<size_t phrase_length> +struct PhraseIndexItem2{ + phrase_token_t m_token; + ucs4_t m_phrase[phrase_length]; +public: + PhraseIndexItem2<phrase_length>(const ucs4_t phrase[], phrase_token_t token){ + memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length); + m_token = token; + } +}; + + +template<size_t phrase_length> +class PhraseArrayIndexLevel2{ +protected: + typedef PhraseIndexItem2<phrase_length> IndexItem; + +protected: + MemoryChunk m_chunk; +public: + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end); + + /* search method */ + int search(/* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const; + + /* add_index/remove_index method */ + int add_index(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token); + int remove_index(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token); + + /* get length method */ + int get_length() const; + + /* 
mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + +}; + +using namespace pinyin; + +/* class implementation */ + +template<size_t phrase_length> +static int phrase_compare2(const PhraseIndexItem2<phrase_length> &lhs, + const PhraseIndexItem2<phrase_length> &rhs){ + ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase; + ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase; + + return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length); +} + +template<size_t phrase_length> +static bool phrase_less_than2(const PhraseIndexItem2<phrase_length> & lhs, + const PhraseIndexItem2<phrase_length> & rhs){ + return 0 > phrase_compare2(lhs, rhs); +} + +PhraseBitmapIndexLevel2::PhraseBitmapIndexLevel2(){ + memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes)); +} + +void PhraseBitmapIndexLevel2::reset(){ + for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){ + PhraseLengthIndexLevel2 * & length_array = + m_phrase_length_indexes[i]; + if ( length_array ) + delete length_array; + length_array = NULL; + } +} + + +/* search method */ + +int PhraseBitmapIndexLevel2::search(int phrase_length, + /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + assert(phrase_length > 0); + + int result = SEARCH_NONE; + /* use the first 8-bit of the lower 16-bit for bitmap index, + * as most the higher 16-bit are zero. 
+ */ + guint8 first_key = (phrase[0] & 0xFF00) >> 8; + + PhraseLengthIndexLevel2 * phrase_array = m_phrase_length_indexes[first_key]; + if ( phrase_array ) + return phrase_array->search(phrase_length, phrase, tokens); + return result; +} + +PhraseLengthIndexLevel2::PhraseLengthIndexLevel2(){ + m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *)); +} + +PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){ +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2<len> * & array = g_array_index \ + (m_phrase_array_indexes, \ + PhraseArrayIndexLevel2<len> *, len - 1); \ + if ( array ) { \ + delete array; \ + array = NULL; \ + } \ + break; \ + } + + for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i){ + switch (i){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } + } + g_array_free(m_phrase_array_indexes, TRUE); +#undef CASE +} + +int PhraseLengthIndexLevel2::search(int phrase_length, + /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + if(m_phrase_array_indexes->len < phrase_length) + return result; + if (m_phrase_array_indexes->len > phrase_length) + result |= SEARCH_CONTINUED; + +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2<len> * array = g_array_index \ + (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \ + if ( !array ) \ + return result; \ + result |= array->search(phrase, tokens); \ + return result; \ + } + + switch ( phrase_length ){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } +#undef CASE +} + +template<size_t phrase_length> +int PhraseArrayIndexLevel2<phrase_length>::search +(/* in */ const ucs4_t phrase[], /* out */ 
PhraseTokens tokens) const { + int result = SEARCH_NONE; + + IndexItem * chunk_begin = NULL, * chunk_end = NULL; + chunk_begin = (IndexItem *) m_chunk.begin(); + chunk_end = (IndexItem *) m_chunk.end(); + + /* do the search */ + IndexItem search_elem(phrase, -1); + std_lite::pair<IndexItem *, IndexItem *> range; + range = std_lite::equal_range + (chunk_begin, chunk_end, search_elem, + phrase_less_than2<phrase_length>); + + const IndexItem * const begin = range.first; + const IndexItem * const end = range.second; + if (begin == end) + return result; + + const IndexItem * iter = NULL; + GArray * array = NULL; + + for (iter = begin; iter != end; ++iter) { + phrase_token_t token = iter->m_token; + + /* filter out disabled sub phrase indices. */ + array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)]; + if (NULL == array) + continue; + + result |= SEARCH_OK; + + g_array_append_val(array, token); + } + + return result; +} + + +/* add/remove index method */ + +int PhraseBitmapIndexLevel2::add_index(int phrase_length, + /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token){ + guint8 first_key = (phrase[0] & 0xFF00) >> 8; + + PhraseLengthIndexLevel2 * & length_array = + m_phrase_length_indexes[first_key]; + + if ( !length_array ){ + length_array = new PhraseLengthIndexLevel2(); + } + return length_array->add_index(phrase_length, phrase, token); +} + +int PhraseBitmapIndexLevel2::remove_index(int phrase_length, + /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token){ + guint8 first_key = (phrase[0] & 0xFF00) >> 8; + + PhraseLengthIndexLevel2 * & length_array = + m_phrase_length_indexes[first_key]; + + if (NULL == length_array) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + + int retval = length_array->remove_index(phrase_length, phrase, token); + + /* remove empty array. 
*/ + if (0 == length_array->get_length()) { + delete length_array; + length_array = NULL; + } + + return retval; +} + +int PhraseLengthIndexLevel2::add_index(int phrase_length, + /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token) { + if (phrase_length >= MAX_PHRASE_LENGTH) + return ERROR_PHRASE_TOO_LONG; + + if (m_phrase_array_indexes->len < phrase_length) + g_array_set_size(m_phrase_array_indexes, phrase_length); + +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2<len> * & array = g_array_index \ + (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \ + if ( !array ) \ + array = new PhraseArrayIndexLevel2<len>; \ + return array->add_index(phrase, token); \ + } + + switch(phrase_length){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } + +#undef CASE +} + +int PhraseLengthIndexLevel2::remove_index(int phrase_length, + /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token) { + if (phrase_length >= MAX_PHRASE_LENGTH) + return ERROR_PHRASE_TOO_LONG; + + if (m_phrase_array_indexes->len < phrase_length) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2<len> * & array = g_array_index \ + (m_phrase_array_indexes, \ + PhraseArrayIndexLevel2<len> *, len - 1); \ + if (NULL == array) \ + return ERROR_REMOVE_ITEM_DONOT_EXISTS; \ + int retval = array->remove_index(phrase, token); \ + \ + /* remove empty array. */ \ + if (0 == array->get_length()) { \ + delete array; \ + array = NULL; \ + \ + /* shrink self array. 
*/ \ + g_array_set_size(m_phrase_array_indexes, \ + get_length()); \ + } \ + return retval; \ + } + + switch(phrase_length){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } +#undef CASE +} + +template<size_t phrase_length> +int PhraseArrayIndexLevel2<phrase_length>::add_index +(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token){ + IndexItem * begin, * end; + + IndexItem add_elem(phrase, token); + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + std_lite::pair<IndexItem *, IndexItem *> range; + range = std_lite::equal_range + (begin, end, add_elem, phrase_less_than2<phrase_length>); + + IndexItem * cur_elem; + for (cur_elem = range.first; + cur_elem != range.second; ++cur_elem) { + if (cur_elem->m_token == token) + return ERROR_INSERT_ITEM_EXISTS; + if (cur_elem->m_token > token) + break; + } + + int offset = (cur_elem - begin) * sizeof(IndexItem); + m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem)); + return ERROR_OK; +} + +template<size_t phrase_length> +int PhraseArrayIndexLevel2<phrase_length>::remove_index +(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token) { + IndexItem * begin, * end; + + IndexItem remove_elem(phrase, token); + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + std_lite::pair<IndexItem *, IndexItem *> range; + range = std_lite::equal_range + (begin, end, remove_elem, phrase_less_than2<phrase_length>); + + IndexItem * cur_elem; + for (cur_elem = range.first; + cur_elem != range.second; ++cur_elem) { + if (cur_elem->m_token == token) + break; + } + + if (cur_elem == range.second) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + + int offset = (cur_elem - begin) * sizeof(IndexItem); + m_chunk.remove_content(offset, sizeof(IndexItem)); + return ERROR_OK; +} + + +/* load text method */ + +bool 
PhraseLargeTable2::load_text(FILE * infile){ + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + + while (!feof(infile)) { + int num = fscanf(infile, "%s %s %u %ld", + pinyin, phrase, &token, &freq); + + if (4 != num) + continue; + + if (feof(infile)) + break; + + glong phrase_len = g_utf8_strlen(phrase, -1); + ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); + add_index(phrase_len, new_phrase, token); + + g_free(new_phrase); + } + return true; +} + + +/* load/store method */ + +bool PhraseBitmapIndexLevel2::load(MemoryChunk * chunk, + table_offset_t offset, + table_offset_t end){ + reset(); + char * buf_begin = (char *) chunk->begin(); + table_offset_t phrase_begin, phrase_end; + table_offset_t * index = (table_offset_t *) (buf_begin + offset); + phrase_end = *index; + + for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) { + phrase_begin = phrase_end; + index++; + phrase_end = *index; + if ( phrase_begin == phrase_end ) //null pointer + continue; + + /* after reset() all phrases are null pointer. 
*/ + PhraseLengthIndexLevel2 * phrases = new PhraseLengthIndexLevel2; + m_phrase_length_indexes[i] = phrases; + + phrases->load(chunk, phrase_begin, phrase_end - 1); + assert( phrase_end <= end ); + assert( *(buf_begin + phrase_end - 1) == c_separate); + } + offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t); + assert( c_separate == *(buf_begin + offset) ); + return true; +} + +bool PhraseBitmapIndexLevel2::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & end){ + table_offset_t phrase_end; + table_offset_t index = offset; + offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t); + //add '#' + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset +=sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) { + PhraseLengthIndexLevel2 * phrases = m_phrase_length_indexes[i]; + if ( !phrases ) { //null pointer + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + continue; + } + phrases->store(new_chunk, offset, phrase_end); //has a end '#' + offset = phrase_end; + //add '#' + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + } + end = offset; + return true; +} + +bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk, + table_offset_t offset, + table_offset_t end) { + char * buf_begin = (char *) chunk->begin(); + guint32 nindex = *((guint32 *)(buf_begin + offset)); + table_offset_t * index = (table_offset_t *) + (buf_begin + offset + sizeof(guint32)); + + table_offset_t phrase_begin, phrase_end = *index; + g_array_set_size(m_phrase_array_indexes, 0); + for (size_t i = 1; i <= nindex; ++i) { + phrase_begin = phrase_end; + index++; + phrase_end = *index; + if ( phrase_begin == phrase_end ){ + 
void * null = NULL; + g_array_append_val(m_phrase_array_indexes, null); + continue; + } + +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2<len> * phrase = \ + new PhraseArrayIndexLevel2<len>; \ + phrase->load(chunk, phrase_begin, phrase_end - 1); \ + assert( *(buf_begin + phrase_end - 1) == c_separate ); \ + assert( phrase_end <= end ); \ + g_array_append_val(m_phrase_array_indexes, phrase); \ + break; \ + } + switch ( i ){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } +#undef CASE + } + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + assert ( c_separate == * (buf_begin + offset) ); + return true; +} + +bool PhraseLengthIndexLevel2::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & end) { + guint32 nindex = m_phrase_array_indexes->len; + new_chunk->set_content(offset, &nindex, sizeof(guint32)); + table_offset_t index = offset + sizeof(guint32); + + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + + table_offset_t phrase_end; + for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) { +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2<len> * phrase = g_array_index \ + (m_phrase_array_indexes, PhraseArrayIndexLevel2<len> *, len - 1); \ + if ( !phrase ){ \ + new_chunk->set_content \ + (index, &offset, sizeof(table_offset_t)); \ + index += sizeof(table_offset_t); \ + continue; \ + } \ + phrase->store(new_chunk, offset, phrase_end); \ + offset = phrase_end; \ + break; \ + } + switch ( i ){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + 
CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } + //add '#' + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + +#undef CASE + } + end = offset; + return true; +} + +template<size_t phrase_length> +bool PhraseArrayIndexLevel2<phrase_length>:: +load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){ + char * buf_begin = (char *) chunk->begin(); + m_chunk.set_chunk(buf_begin + offset, end - offset, NULL); + return true; +} + +template<size_t phrase_length> +bool PhraseArrayIndexLevel2<phrase_length>:: +store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) { + new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size()); + end = offset + m_chunk.size(); + return true; +} + + +/* get length method */ + +int PhraseLengthIndexLevel2::get_length() const { + int length = m_phrase_array_indexes->len; + + /* trim trailing zero. 
*/ + for (int i = length - 1; i >= 0; --i) { + void * array = g_array_index(m_phrase_array_indexes, void *, i); + + if (NULL != array) + break; + + --length; + } + + return length; +} + +template<size_t phrase_length> +int PhraseArrayIndexLevel2<phrase_length>::get_length() const { + IndexItem * chunk_begin = NULL, * chunk_end = NULL; + chunk_begin = (IndexItem *) m_chunk.begin(); + chunk_end = (IndexItem *) m_chunk.end(); + + return chunk_end - chunk_begin; +} + + +/* mask out method */ + +bool PhraseBitmapIndexLevel2::mask_out(phrase_token_t mask, + phrase_token_t value){ + for (size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) { + PhraseLengthIndexLevel2 * & length_array = + m_phrase_length_indexes[i]; + + if (NULL == length_array) + continue; + + length_array->mask_out(mask, value); + + if (0 == length_array->get_length()) { + delete length_array; + length_array = NULL; + } + } + + return true; +} + +bool PhraseLengthIndexLevel2::mask_out(phrase_token_t mask, + phrase_token_t value){ +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2<len> * & array = g_array_index \ + (m_phrase_array_indexes, \ + PhraseArrayIndexLevel2<len> *, len - 1); \ + \ + if (NULL == array) \ + continue; \ + \ + array->mask_out(mask, value); \ + \ + if (0 == array->get_length()) { \ + delete array; \ + array = NULL; \ + } \ + break; \ + } + + for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) { + switch (i) { + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } + } + /* shrink self array. 
*/ + g_array_set_size(m_phrase_array_indexes, get_length()); +#undef CASE + return true; +} + +template<size_t phrase_length> +bool PhraseArrayIndexLevel2<phrase_length>::mask_out +(phrase_token_t mask, phrase_token_t value) { + IndexItem * begin = NULL, * end = NULL; + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + for (IndexItem * cur = begin; cur != end; ++cur) { + if ((cur->m_token & mask) != value) + continue; + + int offset = (cur - begin) * sizeof(IndexItem); + m_chunk.remove_content(offset, sizeof(IndexItem)); + + /* update chunk end. */ + end = (IndexItem *) m_chunk.end(); + --cur; + } + + return true; +} diff --git a/src/storage/phrase_large_table2.h b/src/storage/phrase_large_table2.h new file mode 100644 index 0000000..cf6807c --- /dev/null +++ b/src/storage/phrase_large_table2.h @@ -0,0 +1,157 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#ifndef PHRASE_LARGE_TABLE2_H +#define PHRASE_LARGE_TABLE2_H + +#include <stdio.h> +#include "novel_types.h" +#include "memory_chunk.h" + +namespace pinyin{ + +const size_t PHRASE_NUMBER_OF_BITMAP_INDEX = 1<<(sizeof(ucs4_t) / 4 * 8); + +class PhraseLengthIndexLevel2; + +class PhraseBitmapIndexLevel2{ +protected: + PhraseLengthIndexLevel2 * m_phrase_length_indexes[PHRASE_NUMBER_OF_BITMAP_INDEX]; + /* use the third byte of ucs4_t for class PhraseLengthIndexLevel2. */ + void reset(); +public: + PhraseBitmapIndexLevel2(); + ~PhraseBitmapIndexLevel2(){ + reset(); + } + + /* load/store method */ + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end); + + /* search method */ + int search(int phrase_length, /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const; + + /* add_index/remove_index method */ + int add_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token); + + int remove_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token); + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + + +class PhraseLargeTable2{ +protected: + PhraseBitmapIndexLevel2 m_bitmap_table; + MemoryChunk * m_chunk; + + void reset(){ + if ( m_chunk ){ + delete m_chunk; + m_chunk = NULL; + } + } +public: + PhraseLargeTable2(){ + m_chunk = NULL; + } + + ~PhraseLargeTable2(){ + reset(); + } + + /* load/store method */ + bool load(MemoryChunk * chunk){ + reset(); + m_chunk = chunk; + return m_bitmap_table.load(chunk, 0, chunk->size()); + } + + bool store(MemoryChunk * new_chunk){ + table_offset_t end; + return m_bitmap_table.store(new_chunk, 0, end); + } + + bool load_text(FILE * file); + + /* search method */ + int search(int phrase_length, /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + return m_bitmap_table.search(phrase_length, phrase, 
tokens); + } + + /* add_index/remove_index method */ + int add_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token) { + return m_bitmap_table.add_index(phrase_length, phrase, token); + } + + int remove_index(int phrase_length, /* in */ const ucs4_t phrase[], /* in */ phrase_token_t token) { + return m_bitmap_table.remove_index(phrase_length, phrase, token); + } + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value) { + return m_bitmap_table.mask_out(mask, value); + } +}; + + +static inline int reduce_tokens(const PhraseTokens tokens, + TokenVector tokenarray) { + int num = 0; + g_array_set_size(tokenarray, 0); + + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * array = tokens[i]; + if (NULL == array) + continue; + + num += array->len; + + g_array_append_vals(tokenarray, array->data, array->len); + } + + /* the following line will be removed in future after code are verified. */ + assert(0 <= num && num <= 4); + + return num; +} + +/* for compatibility. */ +static inline int get_first_token(const PhraseTokens tokens, + /* out */ phrase_token_t & token){ + token = null_token; + + TokenVector tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + int num = reduce_tokens(tokens, tokenarray); + if (num) + token = g_array_index(tokenarray, phrase_token_t, 0); + g_array_free(tokenarray, TRUE); + + return num; +} + +}; + +#endif diff --git a/src/storage/pinyin_custom2.h b/src/storage/pinyin_custom2.h new file mode 100644 index 0000000..4685a07 --- /dev/null +++ b/src/storage/pinyin_custom2.h @@ -0,0 +1,111 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef PINYIN_CUSTOM2_H +#define PINYIN_CUSTOM2_H + +#include <glib.h> + +G_BEGIN_DECLS + +/** + * PinyinTableFlag: + */ +enum PinyinTableFlag{ + IS_CHEWING = 1U << 1, + IS_PINYIN = 1U << 2, + PINYIN_INCOMPLETE = 1U << 3, + CHEWING_INCOMPLETE = 1U << 4, + USE_TONE = 1U << 5, + USE_DIVIDED_TABLE = 1U << 6, + USE_RESPLIT_TABLE = 1U << 7, + DYNAMIC_ADJUST = 1U << 8 +}; + +/** + * PinyinAmbiguity2: + * + * The enums of pinyin ambiguities. + * + */ +enum PinyinAmbiguity2{ + PINYIN_AMB_C_CH = 1U << 9, + PINYIN_AMB_S_SH = 1U << 10, + PINYIN_AMB_Z_ZH = 1U << 11, + PINYIN_AMB_F_H = 1U << 12, + PINYIN_AMB_G_K = 1U << 13, + PINYIN_AMB_L_N = 1U << 14, + PINYIN_AMB_L_R = 1U << 15, + PINYIN_AMB_AN_ANG = 1U << 16, + PINYIN_AMB_EN_ENG = 1U << 17, + PINYIN_AMB_IN_ING = 1U << 18, + PINYIN_AMB_ALL = 0x3FFU << 9 +}; + +/** + * PinyinCorrection2: + * + * The enums of pinyin corrections. + * + */ + +enum PinyinCorrection2{ + PINYIN_CORRECT_GN_NG = 1U << 21, + PINYIN_CORRECT_MG_NG = 1U << 22, + PINYIN_CORRECT_IOU_IU = 1U << 23, + PINYIN_CORRECT_UEI_UI = 1U << 24, + PINYIN_CORRECT_UEN_UN = 1U << 25, + PINYIN_CORRECT_UE_VE = 1U << 26, + PINYIN_CORRECT_V_U = 1U << 27, + PINYIN_CORRECT_ON_ONG = 1U << 28, + PINYIN_CORRECT_ALL = 0xFFU << 21 +}; + +/** + * @brief enums of Double Pinyin Schemes. 
+ */ +enum DoublePinyinScheme +{ + DOUBLE_PINYIN_ZRM = 1, + DOUBLE_PINYIN_MS = 2, + DOUBLE_PINYIN_ZIGUANG = 3, + DOUBLE_PINYIN_ABC = 4, + DOUBLE_PINYIN_PYJJ = 6, + DOUBLE_PINYIN_XHE = 7, + DOUBLE_PINYIN_CUSTOMIZED = 30, /* for user's keyboard */ + DOUBLE_PINYIN_DEFAULT = DOUBLE_PINYIN_MS +}; + +/** + * @brief enums of Chewing Schemes. + */ +enum ChewingScheme +{ + CHEWING_STANDARD = 1, + CHEWING_IBM = 2, + CHEWING_GINYIEH = 3, + CHEWING_ETEN = 4, + CHEWING_DEFAULT = CHEWING_STANDARD +}; + +G_END_DECLS + +#endif diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp new file mode 100644 index 0000000..5d406ae --- /dev/null +++ b/src/storage/pinyin_parser2.cpp @@ -0,0 +1,989 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include "pinyin_parser2.h" +#include <ctype.h> +#include <assert.h> +#include <stdio.h> +#include <string.h> +#include "stl_lite.h" +#include "pinyin_phrase2.h" +#include "pinyin_custom2.h" +#include "chewing_key.h" +#include "pinyin_parser_table.h" +#include "double_pinyin_table.h" +#include "chewing_table.h" + + +using namespace pinyin; + +static bool check_pinyin_options(pinyin_option_t options, const pinyin_index_item_t * item) { + guint32 flags = item->m_flags; + assert (flags & IS_PINYIN); + + /* handle incomplete pinyin. */ + if (flags & PINYIN_INCOMPLETE) { + if (!(options & PINYIN_INCOMPLETE)) + return false; + } + + /* handle correct pinyin, currently only one flag per item. */ + flags &= PINYIN_CORRECT_ALL; + options &= PINYIN_CORRECT_ALL; + + if (flags) { + if ((flags & options) != flags) + return false; + } + + return true; +} + +static bool check_chewing_options(pinyin_option_t options, const chewing_index_item_t * item) { + guint32 flags = item->m_flags; + assert (flags & IS_CHEWING); + + /* handle incomplete chewing. */ + if (flags & CHEWING_INCOMPLETE) { + if (!(options & CHEWING_INCOMPLETE)) + return false; + } + + return true; +} + + +gint _ChewingKey::get_table_index() { + assert(m_initial < CHEWING_NUMBER_OF_INITIALS); + assert(m_middle < CHEWING_NUMBER_OF_MIDDLES); + assert(m_final < CHEWING_NUMBER_OF_FINALS); + + gint index = chewing_key_table[(m_initial * CHEWING_NUMBER_OF_MIDDLES + m_middle) * CHEWING_NUMBER_OF_FINALS + m_final]; + return index == -1 ? 
0 : index; +} + +gchar * _ChewingKey::get_pinyin_string() { + assert(m_tone < CHEWING_NUMBER_OF_TONES); + gint index = get_table_index(); + assert(index < G_N_ELEMENTS(content_table)); + const content_table_item_t & item = content_table[index]; + + if (CHEWING_ZERO_TONE == m_tone) { + return g_strdup(item.m_pinyin_str); + } else { + return g_strdup_printf("%s%d", item.m_pinyin_str, m_tone); + } +} + +gchar * _ChewingKey::get_shengmu_string() { + gint index = get_table_index(); + assert(index < G_N_ELEMENTS(content_table)); + const content_table_item_t & item = content_table[index]; + return g_strdup(item.m_shengmu_str); +} + +gchar * _ChewingKey::get_yunmu_string() { + gint index = get_table_index(); + assert(index < G_N_ELEMENTS(content_table)); + const content_table_item_t & item = content_table[index]; + return g_strdup(item.m_yunmu_str); +} + +gchar * _ChewingKey::get_chewing_string() { + assert(m_tone < CHEWING_NUMBER_OF_TONES); + gint index = get_table_index(); + assert(index < G_N_ELEMENTS(content_table)); + const content_table_item_t & item = content_table[index]; + + if (CHEWING_ZERO_TONE == m_tone) { + return g_strdup(item.m_chewing_str); + } else { + return g_strdup_printf("%s%s", item.m_chewing_str, + chewing_tone_table[m_tone]); + } +} + + +/* Pinyin Parsers */ + +/* internal information for pinyin parsers. */ +struct parse_value_t{ + ChewingKey m_key; + ChewingKeyRest m_key_rest; + gint16 m_num_keys; + gint16 m_parsed_len; + gint16 m_last_step; + + /* constructor */ +public: + parse_value_t(){ + m_num_keys = 0; + m_parsed_len = 0; + m_last_step = -1; + } +}; + +const guint16 max_full_pinyin_length = 7; /* include tone. */ + +const guint16 max_double_pinyin_length = 3; /* include tone. */ + +const guint16 max_chewing_length = 4; /* include tone. 
*/ + +static bool compare_pinyin_less_than(const pinyin_index_item_t & lhs, + const pinyin_index_item_t & rhs){ + return 0 > strcmp(lhs.m_pinyin_input, rhs.m_pinyin_input); +} + +static inline bool search_pinyin_index(pinyin_option_t options, + const char * pinyin, + ChewingKey & key){ + pinyin_index_item_t item; + memset(&item, 0, sizeof(item)); + item.m_pinyin_input = pinyin; + + std_lite::pair<const pinyin_index_item_t *, + const pinyin_index_item_t *> range; + range = std_lite::equal_range + (pinyin_index, pinyin_index + G_N_ELEMENTS(pinyin_index), + item, compare_pinyin_less_than); + + guint16 range_len = range.second - range.first; + assert(range_len <= 1); + if (range_len == 1) { + const pinyin_index_item_t * index = range.first; + + if (!check_pinyin_options(options, index)) + return false; + + key = content_table[index->m_table_index].m_chewing_key; + assert(key.get_table_index() == index->m_table_index); + return true; + } + + return false; +} + +static bool compare_chewing_less_than(const chewing_index_item_t & lhs, + const chewing_index_item_t & rhs){ + return 0 > strcmp(lhs.m_chewing_input, rhs.m_chewing_input); +} + +static inline bool search_chewing_index(pinyin_option_t options, + const char * chewing, + ChewingKey & key){ + chewing_index_item_t item; + memset(&item, 0, sizeof(item)); + item.m_chewing_input = chewing; + + std_lite::pair<const chewing_index_item_t *, + const chewing_index_item_t *> range; + range = std_lite::equal_range + (chewing_index, chewing_index + G_N_ELEMENTS(chewing_index), + item, compare_chewing_less_than); + + guint16 range_len = range.second - range.first; + assert (range_len <= 1); + + if (range_len == 1) { + const chewing_index_item_t * index = range.first; + + if (!check_chewing_options(options, index)) + return false; + + key = content_table[index->m_table_index].m_chewing_key; + assert(key.get_table_index() == index->m_table_index); + return true; + } + + return false; +} + +/* Full Pinyin Parser */ 
+FullPinyinParser2::FullPinyinParser2 (){ + m_parse_steps = g_array_new(TRUE, FALSE, sizeof(parse_value_t)); +} + + +bool FullPinyinParser2::parse_one_key (pinyin_option_t options, + ChewingKey & key, + const char * pinyin, int len) const { + /* "'" are not accepted in parse_one_key. */ + gchar * input = g_strndup(pinyin, len); + assert(NULL == strchr(input, '\'')); + + guint16 tone = CHEWING_ZERO_TONE; guint16 tone_pos = 0; + guint16 parsed_len = len; + key = ChewingKey(); + + if (options & USE_TONE) { + /* find the tone in the last character. */ + char chr = input[parsed_len - 1]; + if ( '0' < chr && chr <= '5' ) { + tone = chr - '0'; + parsed_len --; + tone_pos = parsed_len; + } + } + + /* parse pinyin core staff here. */ + + /* Note: optimize here? */ + input[parsed_len] = '\0'; + if (!search_pinyin_index(options, input, key)) { + g_free(input); + return false; + } + + if (options & USE_TONE) { + /* post processing tone. */ + if ( parsed_len == tone_pos ) { + if (tone != CHEWING_ZERO_TONE) { + key.m_tone = tone; + parsed_len ++; + } + } + } + + g_free(input); + return parsed_len == len; +} + + +int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys, + ChewingKeyRestVector & key_rests, + const char *str, int len) const { + int i; + /* clear arrays. */ + g_array_set_size(keys, 0); + g_array_set_size(key_rests, 0); + + /* init m_parse_steps, and prepare dynamic programming. */ + int step_len = len + 1; + g_array_set_size(m_parse_steps, 0); + parse_value_t value; + for (i = 0; i < step_len; ++i) { + g_array_append_val(m_parse_steps, value); + } + + size_t next_sep = 0; + gchar * input = g_strndup(str, len); + parse_value_t * curstep = NULL, * nextstep = NULL; + + for (i = 0; i < len; ++i) { + if (input[i] == '\'') { + curstep = &g_array_index(m_parse_steps, parse_value_t, i); + nextstep = &g_array_index(m_parse_steps, parse_value_t, i + 1); + + /* propagate current step into next step. 
*/ + nextstep->m_key = ChewingKey(); + nextstep->m_key_rest = ChewingKeyRest(); + nextstep->m_num_keys = curstep->m_num_keys; + nextstep->m_parsed_len = curstep->m_parsed_len + 1; + nextstep->m_last_step = i; + next_sep = 0; + continue; + } + + /* forward to next "'" */ + if ( 0 == next_sep ) { + int k; + for (k = i; k < len; ++k) { + if (input[k] == '\'') + break; + } + next_sep = k; + } + + /* dynamic programming here. */ + /* for (size_t m = i; m < next_sep; ++m) */ + { + size_t m = i; + curstep = &g_array_index(m_parse_steps, parse_value_t, m); + size_t try_len = std_lite::min + (m + max_full_pinyin_length, next_sep); + for (size_t n = m + 1; n < try_len + 1; ++n) { + nextstep = &g_array_index(m_parse_steps, parse_value_t, n); + + /* gen next step */ + const char * onepinyin = input + m; + gint16 onepinyinlen = n - m; + value = parse_value_t(); + + ChewingKey key; ChewingKeyRest rest; + bool parsed = parse_one_key + (options, key, onepinyin, onepinyinlen); + rest.m_raw_begin = m; rest.m_raw_end = n; + if (!parsed) + continue; + + //printf("onepinyin:%s len:%d\n", onepinyin, onepinyinlen); + + value.m_key = key; value.m_key_rest = rest; + value.m_num_keys = curstep->m_num_keys + 1; + value.m_parsed_len = curstep->m_parsed_len + onepinyinlen; + value.m_last_step = m; + + /* save next step */ + /* no previous result */ + if (-1 == nextstep->m_last_step) + *nextstep = value; + /* prefer the longest pinyin */ + if (value.m_parsed_len > nextstep->m_parsed_len) + *nextstep = value; + /* prefer the shortest keys with the same pinyin length */ + if (value.m_parsed_len == nextstep->m_parsed_len && + value.m_num_keys < nextstep->m_num_keys) + *nextstep = value; + + /* handle with the same pinyin length and the number of keys */ + if (value.m_parsed_len == nextstep->m_parsed_len && + value.m_num_keys == nextstep->m_num_keys) { + +#if 0 + /* prefer the complete pinyin with shengmu + * over without shengmu, + * ex: "kaneiji" -> "ka'nei'ji". 
+ */ + if ((value.m_key.m_initial != CHEWING_ZERO_INITIAL && + !(value.m_key.m_middle == CHEWING_ZERO_MIDDLE && + value.m_key.m_final == CHEWING_ZERO_FINAL)) && + nextstep->m_key.m_initial == CHEWING_ZERO_INITIAL) + *nextstep = value; + + /* prefer the complete pinyin 'er' + * over the in-complete pinyin 'r', + * ex: "xierqi" -> "xi'er'qi." + */ + if ((value.m_key.m_initial == CHEWING_ZERO_INITIAL && + value.m_key.m_middle == CHEWING_ZERO_MIDDLE && + value.m_key.m_final == CHEWING_ER) && + (nextstep->m_key.m_initial == CHEWING_R && + nextstep->m_key.m_middle == CHEWING_ZERO_MIDDLE && + nextstep->m_key.m_final == CHEWING_ZERO_FINAL)) + *nextstep = value; +#endif + + /* prefer the 'a' at the end of clause, + * ex: "zheyanga$" -> "zhe'yang'a$". + */ + if (value.m_parsed_len == len && + (nextstep->m_key.m_initial != CHEWING_ZERO_INITIAL && + nextstep->m_key.m_final == CHEWING_A) && + (value.m_key.m_initial == CHEWING_ZERO_INITIAL && + value.m_key.m_middle == CHEWING_ZERO_MIDDLE && + value.m_key.m_final == CHEWING_A)) + *nextstep = value; + } + } + } + } + + /* final step for back tracing. */ + gint16 parsed_len = final_step(step_len, keys, key_rests); + + /* post processing for re-split table. */ + if (options & USE_RESPLIT_TABLE) { + post_process2(options, keys, key_rests, str, len); + } + + g_free(input); + return parsed_len; +} + +int FullPinyinParser2::final_step(size_t step_len, ChewingKeyVector & keys, + ChewingKeyRestVector & key_rests) const{ + int i; + gint16 parsed_len = 0; + parse_value_t * curstep = NULL; + + /* find longest match, which starts from the beginning of input. */ + for (i = step_len - 1; i >= 0; --i) { + curstep = &g_array_index(m_parse_steps, parse_value_t, i); + if (i == curstep->m_parsed_len) + break; + } + /* prepare saving. */ + parsed_len = curstep->m_parsed_len; + gint16 num_keys = curstep->m_num_keys; + g_array_set_size(keys, num_keys); + g_array_set_size(key_rests, num_keys); + + /* save the match. 
*/ + while (curstep->m_last_step != -1) { + gint16 pos = curstep->m_num_keys - 1; + + /* skip "'" */ + if (0 != curstep->m_key.get_table_index()) { + ChewingKey * key = &g_array_index(keys, ChewingKey, pos); + ChewingKeyRest * rest = &g_array_index + (key_rests, ChewingKeyRest, pos); + *key = curstep->m_key; *rest = curstep->m_key_rest; + } + + /* back ward */ + curstep = &g_array_index(m_parse_steps, parse_value_t, + curstep->m_last_step); + } + return parsed_len; +} + +bool FullPinyinParser2::post_process2(pinyin_option_t options, + ChewingKeyVector & keys, + ChewingKeyRestVector & key_rests, + const char * str, + int len) const { + int i; + assert(keys->len == key_rests->len); + gint num_keys = keys->len; + + ChewingKey * cur_key = NULL, * next_key = NULL; + ChewingKeyRest * cur_rest = NULL, * next_rest = NULL; + guint16 next_tone = CHEWING_ZERO_TONE; + + for (i = 0; i < num_keys - 1; ++i) { + cur_rest = &g_array_index(key_rests, ChewingKeyRest, i); + next_rest = &g_array_index(key_rests, ChewingKeyRest, i + 1); + + /* some "'" here */ + if (cur_rest->m_raw_end != next_rest->m_raw_begin) + continue; + + cur_key = &g_array_index(keys, ChewingKey, i); + next_key = &g_array_index(keys, ChewingKey, i + 1); + + /* some tone here */ + if (CHEWING_ZERO_TONE != cur_key->m_tone) + continue; + + /* back up tone */ + if (options & USE_TONE) { + next_tone = next_key->m_tone; + if (CHEWING_ZERO_TONE != next_tone) { + next_key->m_tone = CHEWING_ZERO_TONE; + next_rest->m_raw_end --; + } + } + + /* lookup re-split table */ + const resplit_table_item_t * item = NULL; + + item = retrieve_resplit_item_by_original_pinyins + (options, cur_key, cur_rest, next_key, next_rest, str, len); + + if (item) { + /* no ops */ + if (item->m_orig_freq >= item->m_new_freq) + continue; + + /* do re-split */ + const char * onepinyin = str + cur_rest->m_raw_begin; + size_t len = strlen(item->m_new_keys[0]); + + assert(parse_one_key(options, *cur_key, onepinyin, len)); + cur_rest->m_raw_end = 
cur_rest->m_raw_begin + len; + + next_rest->m_raw_begin = cur_rest->m_raw_end; + onepinyin = str + next_rest->m_raw_begin; + len = strlen(item->m_new_keys[1]); + + assert(parse_one_key(options, *next_key, onepinyin, len)); + } + + /* restore tones */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != next_tone) { + next_key->m_tone = next_tone; + next_rest->m_raw_end ++; + } + } + } + + return true; +} + +const divided_table_item_t * FullPinyinParser2::retrieve_divided_item +(pinyin_option_t options, ChewingKey * key, ChewingKeyRest * rest, + const char * str, int len) const { + + /* lookup divided table */ + size_t k; + const divided_table_item_t * item = NULL; + for (k = 0; k < G_N_ELEMENTS(divided_table); ++k) { + item = divided_table + k; + + const char * onepinyin = str + rest->m_raw_begin; + size_t len = strlen(item->m_orig_key); + + if (rest->length() != len) + continue; + + if (0 == strncmp(onepinyin, item->m_orig_key, len)) + break; + } + + /* found the match */ + if (k < G_N_ELEMENTS(divided_table)) { + /* do divided */ + item = divided_table + k; + return item; + } + + return NULL; +} + + +const resplit_table_item_t * FullPinyinParser2::retrieve_resplit_item_by_original_pinyins +(pinyin_option_t options, + ChewingKey * cur_key, ChewingKeyRest * cur_rest, + ChewingKey * next_key, ChewingKeyRest * next_rest, + const char * str, int len) const{ + /* lookup re-split table */ + size_t k; + const resplit_table_item_t * item = NULL; + + for (k = 0; k < G_N_ELEMENTS(resplit_table); ++k) { + item = resplit_table + k; + + const char * onepinyin = str + cur_rest->m_raw_begin; + size_t len = strlen(item->m_orig_keys[0]); + + if (cur_rest->length() != len) + continue; + + if (0 != strncmp(onepinyin, item->m_orig_keys[0], len)) + continue; + + onepinyin = str + next_rest->m_raw_begin; + len = strlen(item->m_orig_keys[1]); + + if (next_rest->length() != len) + continue; + + if (0 == strncmp(onepinyin, item->m_orig_keys[1], len)) + break; + } + + /* found the match 
*/ + if (k < G_N_ELEMENTS(resplit_table)) { + item = resplit_table + k; + return item; + } + + return NULL; +} + +const resplit_table_item_t * FullPinyinParser2::retrieve_resplit_item_by_resplit_pinyins +(pinyin_option_t options, + ChewingKey * cur_key, ChewingKeyRest * cur_rest, + ChewingKey * next_key, ChewingKeyRest * next_rest, + const char * str, int len) const { + /* lookup divide table */ + size_t k; + const resplit_table_item_t * item = NULL; + + for (k = 0; k < G_N_ELEMENTS(resplit_table); ++k) { + item = resplit_table + k; + + const char * onepinyin = str + cur_rest->m_raw_begin; + size_t len = strlen(item->m_new_keys[0]); + + if (cur_rest->length() != len) + continue; + + if (0 != strncmp(onepinyin, item->m_new_keys[0], len)) + continue; + + onepinyin = str + next_rest->m_raw_begin; + len = strlen(item->m_new_keys[1]); + + if (next_rest->length() != len) + continue; + + if (0 == strncmp(onepinyin, item->m_new_keys[1], len)) + break; + } + + /* found the match */ + if (k < G_N_ELEMENTS(resplit_table)) { + item = resplit_table + k; + return item; + } + + return NULL; +} + +#define IS_KEY(x) (('a' <= x && x <= 'z') || x == ';') + +bool DoublePinyinParser2::parse_one_key(pinyin_option_t options, + ChewingKey & key, + const char *str, int len) const { + options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL); + + if (1 == len) { + if (!(options & PINYIN_INCOMPLETE)) + return false; + + char ch = str[0]; + if (!IS_KEY(ch)) + return false; + + int charid = ch == ';' ? 
26 : ch - 'a'; + const char * sheng = m_shengmu_table[charid].m_shengmu; + if (NULL == sheng || strcmp(sheng, "'") == 0) + return false; + + if (search_pinyin_index(options, sheng, key)) { + return true; + } else { + return false; + } + } + + ChewingTone tone = CHEWING_ZERO_TONE; + options &= ~(PINYIN_INCOMPLETE|CHEWING_INCOMPLETE); + options |= PINYIN_CORRECT_UE_VE | PINYIN_CORRECT_V_U; + + /* parse tone */ + if (3 == len) { + if (!(options & USE_TONE)) + return false; + char ch = str[2]; + if (!('0' < ch && ch <= '5')) + return false; + tone = (ChewingTone) (ch - '0'); + } + + if (2 == len || 3 == len) { + /* parse shengmu here. */ + char ch = str[0]; + if (!IS_KEY(ch)) + return false; + + int charid = ch == ';' ? 26 : ch - 'a'; + const char * sheng = m_shengmu_table[charid].m_shengmu; + if (NULL == sheng) + return false; + if (0 == strcmp(sheng, "'")) + sheng = ""; + + /* parse yunmu here. */ + ch = str[1]; + if (!IS_KEY(ch)) + return false; + + gchar * pinyin = NULL; + do { + + charid = ch == ';' ? 26 : ch - 'a'; + /* first yunmu */ + const char * yun = m_yunmu_table[charid].m_yunmus[0]; + if (NULL == yun) + break; + + pinyin = g_strdup_printf("%s%s", sheng, yun); + if (search_pinyin_index(options, pinyin, key)) { + key.m_tone = tone; + g_free(pinyin); + return true; + } + g_free(pinyin); + + /* second yunmu */ + yun = m_yunmu_table[charid].m_yunmus[1]; + if (NULL == yun) + break; + + pinyin = g_strdup_printf("%s%s", sheng, yun); + if (search_pinyin_index(options, pinyin, key)) { + key.m_tone = tone; + g_free(pinyin); + return true; + } + g_free(pinyin); + } while(0); + +#if 1 + /* support two letter yunmu from full pinyin */ + if (0 == strcmp(sheng, "")) { + pinyin = g_strndup(str, 2); + if (search_pinyin_index(options, pinyin, key)) { + key.m_tone = tone; + g_free(pinyin); + return true; + } + g_free(pinyin); + } +#endif + } + + return false; +} + + +/* only 'a'-'z' and ';' are accepted here. 
*/ +int DoublePinyinParser2::parse(pinyin_option_t options, ChewingKeyVector & keys, + ChewingKeyRestVector & key_rests, + const char *str, int len) const { + g_array_set_size(keys, 0); + g_array_set_size(key_rests, 0); + + int maximum_len = 0; int i; + /* probe the longest possible double pinyin string. */ + for (i = 0; i < len; ++i) { + const char ch = str[i]; + if (!(IS_KEY(ch) || ('0' < ch && ch <= '5'))) + break; + } + maximum_len = i; + + /* maximum forward match for double pinyin. */ + int parsed_len = 0; + while (parsed_len < maximum_len) { + const char * cur_str = str + parsed_len; + i = std_lite::min(maximum_len - parsed_len, + (int)max_double_pinyin_length); + + ChewingKey key; ChewingKeyRest key_rest; + for (; i > 0; --i) { + bool success = parse_one_key(options, key, cur_str, i); + if (success) + break; + } + + if (0 == i) /* no more possible double pinyins. */ + break; + + key_rest.m_raw_begin = parsed_len; key_rest.m_raw_end = parsed_len + i; + parsed_len += i; + + /* save the pinyin */ + g_array_append_val(keys, key); + g_array_append_val(key_rests, key_rest); + } + + return parsed_len; +} + +#undef IS_KEY + +bool DoublePinyinParser2::set_scheme(DoublePinyinScheme scheme) { + + switch (scheme) { + case DOUBLE_PINYIN_ZRM: + m_shengmu_table = double_pinyin_zrm_sheng; + m_yunmu_table = double_pinyin_zrm_yun; + return true; + case DOUBLE_PINYIN_MS: + m_shengmu_table = double_pinyin_mspy_sheng; + m_yunmu_table = double_pinyin_mspy_yun; + return true; + case DOUBLE_PINYIN_ZIGUANG: + m_shengmu_table = double_pinyin_zgpy_sheng; + m_yunmu_table = double_pinyin_zgpy_yun; + return true; + case DOUBLE_PINYIN_ABC: + m_shengmu_table = double_pinyin_abc_sheng; + m_yunmu_table = double_pinyin_abc_yun; + return true; + case DOUBLE_PINYIN_PYJJ: + m_shengmu_table = double_pinyin_pyjj_sheng; + m_yunmu_table = double_pinyin_pyjj_yun; + return true; + case DOUBLE_PINYIN_XHE: + m_shengmu_table = double_pinyin_xhe_sheng; + m_yunmu_table = double_pinyin_xhe_yun; + return 
true; + case DOUBLE_PINYIN_CUSTOMIZED: + assert(FALSE); + }; + + return false; /* no such scheme. */ +} + +/* the chewing string must be freed with g_free. */ +static bool search_chewing_symbols(const chewing_symbol_item_t * symbol_table, + const char key, const char ** chewing) { + *chewing = NULL; + /* just iterate the table, as we only have < 50 items. */ + while (symbol_table->m_input != '\0') { + if (symbol_table->m_input == key) { + *chewing = symbol_table->m_chewing; + return true; + } + symbol_table ++; + } + return false; +} + +static bool search_chewing_tones(const chewing_tone_item_t * tone_table, + const char key, char * tone) { + *tone = CHEWING_ZERO_TONE; + /* just iterate the table, as we only have < 10 items. */ + while (tone_table->m_input != '\0') { + if (tone_table->m_input == key) { + *tone = tone_table->m_tone; + return true; + } + tone_table ++; + } + return false; +} + + +bool ChewingParser2::parse_one_key(pinyin_option_t options, + ChewingKey & key, + const char *str, int len) const { + options &= ~(PINYIN_CORRECT_ALL|PINYIN_AMB_ALL); + char tone = CHEWING_ZERO_TONE; + + int symbols_len = len; + /* probe whether the last key is tone key in str. */ + if (options & USE_TONE) { + char ch = str[len - 1]; + /* remove tone from input */ + if (search_chewing_tones(m_tone_table, ch, &tone)) + symbols_len --; + } + + int i; + gchar * chewing = NULL; const char * onechar = NULL; + + /* probe the possible chewing map in the rest of str. */ + for (i = 0; i < symbols_len; ++i) { + if (!search_chewing_symbols(m_symbol_table, str[i], &onechar)) { + g_free(chewing); + return false; + } + + if (!chewing) { + chewing = g_strdup(onechar); + } else { + gchar * tmp = chewing; + chewing = g_strconcat(chewing, onechar, NULL); + g_free(tmp); + } + } + + /* search the chewing in the chewing index table. */ + if (chewing && search_chewing_index(options, chewing, key)) { + /* save back tone if available. 
*/ + key.m_tone = tone; + g_free(chewing); + return true; + } + + g_free(chewing); + return false; +} + + +/* only characters in chewing keyboard scheme are accepted here. */ +int ChewingParser2::parse(pinyin_option_t options, ChewingKeyVector & keys, + ChewingKeyRestVector & key_rests, + const char *str, int len) const { + g_array_set_size(keys, 0); + g_array_set_size(key_rests, 0); + + int maximum_len = 0; int i; + /* probe the longest possible chewing string. */ + for (i = 0; i < len; ++i) { + if (!in_chewing_scheme(options, str[i], NULL)) + break; + } + maximum_len = i; + + /* maximum forward match for chewing. */ + int parsed_len = 0; + while (parsed_len < maximum_len) { + const char * cur_str = str + parsed_len; + i = std_lite::min(maximum_len - parsed_len, + (int)max_chewing_length); + + ChewingKey key; ChewingKeyRest key_rest; + for (; i > 0; --i) { + bool success = parse_one_key(options, key, cur_str, i); + if (success) + break; + } + + if (0 == i) /* no more possible chewings. */ + break; + + key_rest.m_raw_begin = parsed_len; key_rest.m_raw_end = parsed_len + i; + parsed_len += i; + + /* save the pinyin. 
*/ + g_array_append_val(keys, key); + g_array_append_val(key_rests, key_rest); + } + + return parsed_len; +} + + +bool ChewingParser2::set_scheme(ChewingScheme scheme) { + switch(scheme) { + case CHEWING_STANDARD: + m_symbol_table = chewing_standard_symbols; + m_tone_table = chewing_standard_tones; + return true; + case CHEWING_IBM: + m_symbol_table = chewing_ibm_symbols; + m_tone_table = chewing_ibm_tones; + return true; + case CHEWING_GINYIEH: + m_symbol_table = chewing_ginyieh_symbols; + m_tone_table = chewing_ginyieh_tones; + return true; + case CHEWING_ETEN: + m_symbol_table = chewing_eten_symbols; + m_tone_table = chewing_eten_tones; + return true; + } + + return false; +} + + +bool ChewingParser2::in_chewing_scheme(pinyin_option_t options, + const char key, const char ** symbol) + const { + const gchar * chewing = NULL; + char tone = CHEWING_ZERO_TONE; + + if (search_chewing_symbols(m_symbol_table, key, &chewing)) { + if (symbol) + *symbol = chewing; + return true; + } + + if (!(options & USE_TONE)) + return false; + + if (search_chewing_tones(m_tone_table, key, &tone)) { + if (symbol) + *symbol = chewing_tone_table[tone]; + return true; + } + + return false; +} diff --git a/src/storage/pinyin_parser2.h b/src/storage/pinyin_parser2.h new file mode 100644 index 0000000..e40b30c --- /dev/null +++ b/src/storage/pinyin_parser2.h @@ -0,0 +1,361 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef PINYIN_PARSER2_H +#define PINYIN_PARSER2_H + +#include <glib.h> +#include "novel_types.h" +#include "chewing_key.h" +#include "pinyin_custom2.h" + +namespace pinyin{ + +typedef struct { + const char * m_pinyin_str; + const char * m_shengmu_str; + const char * m_yunmu_str; + const char * m_chewing_str; + ChewingKey m_chewing_key; +} content_table_item_t; + +typedef struct { + const char * m_pinyin_input; + guint32 m_flags; + guint16 m_table_index; +} pinyin_index_item_t; + +typedef struct { + const char * m_chewing_input; + guint32 m_flags; + guint16 m_table_index; +} chewing_index_item_t; + +typedef struct { + const char * m_orig_key; + guint32 m_orig_freq; + const char * m_new_keys[2]; + guint32 m_new_freq; +} divided_table_item_t; + +typedef struct { + const char * m_orig_keys[2]; + guint32 m_orig_freq; + const char * m_new_keys[2]; + guint32 m_new_freq; +} resplit_table_item_t; + +typedef struct { + const char * m_shengmu; +} double_pinyin_scheme_shengmu_item_t; + +typedef struct { + const char * m_yunmus[2]; +} double_pinyin_scheme_yunmu_item_t; + +typedef struct { + const char m_input; + const char * m_chewing; +} chewing_symbol_item_t; + +typedef struct { + const char m_input; + const char m_tone; +} chewing_tone_item_t; + +typedef GArray * ParseValueVector; + + +/** + * PinyinParser2: + * + * Parse the ascii string into an array of the struct ChewingKeys. + * + */ +class PinyinParser2 +{ +public: + /** + * PinyinParser2::~PinyinParser2: + * + * The destructor of the PinyinParser2. + * + */ + virtual ~PinyinParser2() {} + +public: + /** + * PinyinParser2::parse_one_key: + * @options: the pinyin options from pinyin_custom2.h. + * @key: the parsed result of struct ChewingKey. + * @str: the input of the ascii string. 
+ * @len: the length of the str. + * @returns: whether the entire string is parsed as one key. + * + * Parse only one struct ChewingKey from a string. + * + */ + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const = 0; + + /** + * PinyinParser2::parse: + * @options: the pinyin options from pinyin_custom2.h. + * @keys: the parsed result of struct ChewingKeys. + * @str: the input of the ascii string. + * @len: the length of the str. + * @returns: the number of chars were actually used. + * + * Parse the ascii string into an array of struct ChewingKeys. + * + */ + virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const = 0; + +}; + + +/** + * FullPinyinParser2: + * + * Parses the full pinyin string into an array of struct ChewingKeys. + * + */ +class FullPinyinParser2 : public PinyinParser2 +{ + /* Note: some internal pointers to full pinyin table. */ + +protected: + ParseValueVector m_parse_steps; + + int final_step(size_t step_len, ChewingKeyVector & keys, + ChewingKeyRestVector & key_rests) const; + + bool post_process2(pinyin_option_t options, ChewingKeyVector & keys, + ChewingKeyRestVector & key_rests, + const char * str, int len) const; + +public: + const divided_table_item_t * retrieve_divided_item + (pinyin_option_t options, ChewingKey * key, ChewingKeyRest * rest, + const char * str, int len) const; + + const resplit_table_item_t * retrieve_resplit_item_by_original_pinyins + (pinyin_option_t options, + ChewingKey * cur_key, ChewingKeyRest * cur_rest, + ChewingKey * next_key, ChewingKeyRest * next_rest, + const char * str, int len) const; + const resplit_table_item_t * retrieve_resplit_item_by_resplit_pinyins + (pinyin_option_t options, + ChewingKey * cur_key, ChewingKeyRest * cur_rest, + ChewingKey * next_key, ChewingKeyRest * next_rest, + const char * str, int len) const; + +public: + FullPinyinParser2(); + virtual 
~FullPinyinParser2() { + g_array_free(m_parse_steps, TRUE); + } + + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + + /* Note: + * the parse method will use dynamic programming to drive parse_one_key. + */ + virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; +}; + + +/** + * DoublePinyinParser2: + * + * Parse the double pinyin string into an array of struct ChewingKeys. + * + */ +/* The valid input chars of ShuangPin is a-z and ';' + */ +class DoublePinyinParser2 : public PinyinParser2 +{ + /* Note: two internal pointers to double pinyin scheme table. */ +protected: + const double_pinyin_scheme_shengmu_item_t * m_shengmu_table; + const double_pinyin_scheme_yunmu_item_t * m_yunmu_table; + +public: + DoublePinyinParser2() { + m_shengmu_table = NULL; m_yunmu_table = NULL; + set_scheme(DOUBLE_PINYIN_DEFAULT); + } + + virtual ~DoublePinyinParser2() {} + + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + + virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; + +public: + bool set_scheme(DoublePinyinScheme scheme); +}; + + +/** + * ChewingParser2: + * + * Parse the chewing string into an array of struct ChewingKeys. + * + * Several keyboard scheme are supported: + * * Chewing_STANDARD Standard ZhuYin keyboard, which maps 1 to Bo(ㄅ), q to Po(ㄆ) etc. + * * Chewing_IBM IBM ZhuYin keyboard, which maps 1 to Bo(ㄅ), 2 to Po(ㄆ) etc. + * * Chewing_GINYIEH Gin-Yieh ZhuYin keyboard. + * * Chewing_ETEN Eten (倚天) ZhuYin keyboard. + * + */ + +/* Note: maybe yunmus shuffle will be supported later. + * currently this feature is postponed. + */ +class ChewingParser2 : public PinyinParser2 +{ + /* Note: some internal pointers to chewing scheme table. 
*/ +protected: + const chewing_symbol_item_t * m_symbol_table; + const chewing_tone_item_t * m_tone_table; + +public: + ChewingParser2() { + m_symbol_table = NULL; m_tone_table = NULL; + set_scheme(CHEWING_DEFAULT); + } + + virtual ~ChewingParser2() {} + + virtual bool parse_one_key(pinyin_option_t options, ChewingKey & key, const char *str, int len) const; + + virtual int parse(pinyin_option_t options, ChewingKeyVector & keys, ChewingKeyRestVector & key_rests, const char *str, int len) const; + +public: + bool set_scheme(ChewingScheme scheme); + bool in_chewing_scheme(pinyin_option_t options, const char key, const char ** symbol) const; +}; + + +/* compare pinyins with chewing internal representations. */ +inline int pinyin_compare_initial2(pinyin_option_t options, + ChewingInitial lhs, + ChewingInitial rhs) { + if (lhs == rhs) + return 0; + + if ((options & PINYIN_AMB_C_CH) && + ((lhs == CHEWING_C && rhs == CHEWING_CH) || + (lhs == CHEWING_CH && rhs == CHEWING_C))) + return 0; + + if ((options & PINYIN_AMB_S_SH) && + ((lhs == CHEWING_S && rhs == CHEWING_SH) || + (lhs == CHEWING_SH && rhs == CHEWING_S))) + return 0; + + if ((options & PINYIN_AMB_Z_ZH) && + ((lhs == CHEWING_Z && rhs == CHEWING_ZH) || + (lhs == CHEWING_ZH && rhs == CHEWING_Z))) + return 0; + + if ((options & PINYIN_AMB_F_H) && + ((lhs == CHEWING_F && rhs == CHEWING_H) || + (lhs == CHEWING_H && rhs == CHEWING_F))) + return 0; + + if ((options & PINYIN_AMB_L_N) && + ((lhs == CHEWING_L && rhs == CHEWING_N) || + (lhs == CHEWING_N && rhs == CHEWING_L))) + return 0; + + if ((options & PINYIN_AMB_L_R) && + ((lhs == CHEWING_L && rhs == CHEWING_R) || + (lhs == CHEWING_R && rhs == CHEWING_L))) + return 0; + + if ((options & PINYIN_AMB_G_K) && + ((lhs == CHEWING_G && rhs == CHEWING_K) || + (lhs == CHEWING_K && rhs == CHEWING_G))) + return 0; + + return (lhs - rhs); +} + + +inline int pinyin_compare_middle_and_final2(pinyin_option_t options, + ChewingMiddle middle_lhs, + ChewingMiddle middle_rhs, + ChewingFinal 
final_lhs, + ChewingFinal final_rhs) { + if (middle_lhs == middle_rhs && final_lhs == final_rhs) + return 0; + + /* both pinyin and chewing incomplete options will enable this. */ + if (options & (PINYIN_INCOMPLETE | CHEWING_INCOMPLETE)) { + if (middle_lhs == CHEWING_ZERO_MIDDLE && + final_lhs == CHEWING_ZERO_FINAL) + return 0; + if (middle_rhs == CHEWING_ZERO_MIDDLE && + final_rhs == CHEWING_ZERO_FINAL) + return 0; + } + + /* compare chewing middle first. */ + int middle_diff = middle_lhs - middle_rhs; + if (middle_diff) + return middle_diff; + + if ((options & PINYIN_AMB_AN_ANG) && + ((final_lhs == CHEWING_AN && final_rhs == CHEWING_ANG) || + (final_lhs == CHEWING_ANG && final_rhs == CHEWING_AN))) + return 0; + + if ((options & PINYIN_AMB_EN_ENG) && + ((final_lhs == CHEWING_EN && final_rhs == CHEWING_ENG) || + (final_lhs == CHEWING_ENG && final_rhs == CHEWING_EN))) + return 0; + + if ((options & PINYIN_AMB_IN_ING) && + ((final_lhs == PINYIN_IN && final_rhs == PINYIN_ING) || + (final_lhs == PINYIN_ING && final_rhs == PINYIN_IN))) + return 0; + + return (final_lhs - final_rhs); +} + + +inline int pinyin_compare_tone2(pinyin_option_t options, + ChewingTone lhs, + ChewingTone rhs) { + if (lhs == rhs) + return 0; + if (lhs == CHEWING_ZERO_TONE) + return 0; + if (rhs == CHEWING_ZERO_TONE) + return 0; + return (lhs - rhs); +} + + +}; + +#endif diff --git a/src/storage/pinyin_parser_table.h b/src/storage/pinyin_parser_table.h new file mode 100644 index 0000000..f633604 --- /dev/null +++ b/src/storage/pinyin_parser_table.h @@ -0,0 +1,3393 @@ +/* This file is generated by python scripts. Don't edit this file directly. 
+ */ + +#ifndef PINYIN_PARSER_TABLE_H +#define PINYIN_PARSER_TABLE_H + +namespace pinyin{ + +const pinyin_index_item_t pinyin_index[] = { +{"a", IS_CHEWING|IS_PINYIN, 1}, +{"agn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 4}, +{"ai", IS_CHEWING|IS_PINYIN, 2}, +{"amg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 4}, +{"an", IS_CHEWING|IS_PINYIN, 3}, +{"ang", IS_CHEWING|IS_PINYIN, 4}, +{"ao", IS_CHEWING|IS_PINYIN, 5}, +{"b", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 6}, +{"ba", IS_CHEWING|IS_PINYIN, 7}, +{"bagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 10}, +{"bai", IS_CHEWING|IS_PINYIN, 8}, +{"bamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 10}, +{"ban", IS_CHEWING|IS_PINYIN, 9}, +{"bang", IS_CHEWING|IS_PINYIN, 10}, +{"bao", IS_CHEWING|IS_PINYIN, 11}, +{"begn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 14}, +{"bei", IS_CHEWING|IS_PINYIN, 12}, +{"bemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 14}, +{"ben", IS_CHEWING|IS_PINYIN, 13}, +{"beng", IS_CHEWING|IS_PINYIN, 14}, +{"bi", IS_CHEWING|IS_PINYIN, 15}, +{"bian", IS_CHEWING|IS_PINYIN, 16}, +{"biao", IS_CHEWING|IS_PINYIN, 17}, +{"bie", IS_CHEWING|IS_PINYIN, 18}, +{"bign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 20}, +{"bimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 20}, +{"bin", IS_CHEWING|IS_PINYIN, 19}, +{"bing", IS_CHEWING|IS_PINYIN, 20}, +{"bo", IS_CHEWING|IS_PINYIN, 21}, +{"bu", IS_CHEWING|IS_PINYIN, 22}, +{"c", IS_PINYIN|PINYIN_INCOMPLETE, 23}, +{"ca", IS_CHEWING|IS_PINYIN, 24}, +{"cagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 27}, +{"cai", IS_CHEWING|IS_PINYIN, 25}, +{"camg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 27}, +{"can", IS_CHEWING|IS_PINYIN, 26}, +{"cang", IS_CHEWING|IS_PINYIN, 27}, +{"cao", IS_CHEWING|IS_PINYIN, 28}, +{"ce", IS_CHEWING|IS_PINYIN, 29}, +{"cegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 31}, +{"cemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 31}, +{"cen", IS_CHEWING|IS_PINYIN, 30}, +{"ceng", IS_CHEWING|IS_PINYIN, 31}, +{"ch", IS_PINYIN|PINYIN_INCOMPLETE, 32}, +{"cha", IS_CHEWING|IS_PINYIN, 33}, +{"chagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 36}, +{"chai", IS_CHEWING|IS_PINYIN, 
34}, +{"chamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 36}, +{"chan", IS_CHEWING|IS_PINYIN, 35}, +{"chang", IS_CHEWING|IS_PINYIN, 36}, +{"chao", IS_CHEWING|IS_PINYIN, 37}, +{"che", IS_CHEWING|IS_PINYIN, 38}, +{"chegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 40}, +{"chemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 40}, +{"chen", IS_CHEWING|IS_PINYIN, 39}, +{"cheng", IS_CHEWING|IS_PINYIN, 40}, +{"chi", IS_CHEWING|IS_PINYIN, 41}, +{"chogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 42}, +{"chomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 42}, +{"chon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 42}, +{"chong", IS_CHEWING|IS_PINYIN, 42}, +{"chou", IS_CHEWING|IS_PINYIN, 43}, +{"chu", IS_CHEWING|IS_PINYIN, 44}, +{"chuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 48}, +{"chuai", IS_CHEWING|IS_PINYIN, 46}, +{"chuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 48}, +{"chuan", IS_CHEWING|IS_PINYIN, 47}, +{"chuang", IS_CHEWING|IS_PINYIN, 48}, +{"chuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 49}, +{"chuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 50}, +{"chui", IS_CHEWING|IS_PINYIN, 49}, +{"chun", IS_CHEWING|IS_PINYIN, 50}, +{"chuo", IS_CHEWING|IS_PINYIN, 51}, +{"ci", IS_CHEWING|IS_PINYIN, 52}, +{"cogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 53}, +{"comg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 53}, +{"con", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 53}, +{"cong", IS_CHEWING|IS_PINYIN, 53}, +{"cou", IS_CHEWING|IS_PINYIN, 54}, +{"cu", IS_CHEWING|IS_PINYIN, 55}, +{"cuan", IS_CHEWING|IS_PINYIN, 56}, +{"cuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 57}, +{"cuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 58}, +{"cui", IS_CHEWING|IS_PINYIN, 57}, +{"cun", IS_CHEWING|IS_PINYIN, 58}, +{"cuo", IS_CHEWING|IS_PINYIN, 59}, +{"d", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 60}, +{"da", IS_CHEWING|IS_PINYIN, 61}, +{"dagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 64}, +{"dai", IS_CHEWING|IS_PINYIN, 62}, +{"damg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 64}, +{"dan", IS_CHEWING|IS_PINYIN, 63}, +{"dang", IS_CHEWING|IS_PINYIN, 64}, +{"dao", IS_CHEWING|IS_PINYIN, 65}, +{"de", IS_CHEWING|IS_PINYIN, 66}, +{"degn", 
IS_PINYIN|PINYIN_CORRECT_GN_NG, 69}, +{"dei", IS_CHEWING|IS_PINYIN, 67}, +{"demg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 69}, +{"deng", IS_CHEWING|IS_PINYIN, 69}, +{"di", IS_CHEWING|IS_PINYIN, 70}, +{"dia", IS_CHEWING|IS_PINYIN, 71}, +{"dian", IS_CHEWING|IS_PINYIN, 72}, +{"diao", IS_CHEWING|IS_PINYIN, 73}, +{"die", IS_CHEWING|IS_PINYIN, 74}, +{"dign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 76}, +{"dimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 76}, +{"ding", IS_CHEWING|IS_PINYIN, 76}, +{"diou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 77}, +{"diu", IS_CHEWING|IS_PINYIN, 77}, +{"dogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 78}, +{"domg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 78}, +{"don", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 78}, +{"dong", IS_CHEWING|IS_PINYIN, 78}, +{"dou", IS_CHEWING|IS_PINYIN, 79}, +{"du", IS_CHEWING|IS_PINYIN, 80}, +{"duan", IS_CHEWING|IS_PINYIN, 81}, +{"duei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 82}, +{"duen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 83}, +{"dui", IS_CHEWING|IS_PINYIN, 82}, +{"dun", IS_CHEWING|IS_PINYIN, 83}, +{"duo", IS_CHEWING|IS_PINYIN, 84}, +{"e", IS_CHEWING|IS_PINYIN, 85}, +{"egn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 88}, +{"ei", IS_CHEWING|IS_PINYIN, 86}, +{"emg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 88}, +{"en", IS_CHEWING|IS_PINYIN, 87}, +{"er", IS_CHEWING|IS_PINYIN, 89}, +{"f", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 90}, +{"fa", IS_CHEWING|IS_PINYIN, 91}, +{"fagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 93}, +{"famg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 93}, +{"fan", IS_CHEWING|IS_PINYIN, 92}, +{"fang", IS_CHEWING|IS_PINYIN, 93}, +{"fegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 97}, +{"fei", IS_CHEWING|IS_PINYIN, 95}, +{"femg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 97}, +{"fen", IS_CHEWING|IS_PINYIN, 96}, +{"feng", IS_CHEWING|IS_PINYIN, 97}, +{"fo", IS_CHEWING|IS_PINYIN, 98}, +{"fou", IS_CHEWING|IS_PINYIN, 99}, +{"fu", IS_CHEWING|IS_PINYIN, 100}, +{"g", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 101}, +{"ga", IS_CHEWING|IS_PINYIN, 102}, +{"gagn", 
IS_PINYIN|PINYIN_CORRECT_GN_NG, 105}, +{"gai", IS_CHEWING|IS_PINYIN, 103}, +{"gamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 105}, +{"gan", IS_CHEWING|IS_PINYIN, 104}, +{"gang", IS_CHEWING|IS_PINYIN, 105}, +{"gao", IS_CHEWING|IS_PINYIN, 106}, +{"ge", IS_CHEWING|IS_PINYIN, 107}, +{"gegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 110}, +{"gei", IS_CHEWING|IS_PINYIN, 108}, +{"gemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 110}, +{"gen", IS_CHEWING|IS_PINYIN, 109}, +{"geng", IS_CHEWING|IS_PINYIN, 110}, +{"gogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 111}, +{"gomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 111}, +{"gon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 111}, +{"gong", IS_CHEWING|IS_PINYIN, 111}, +{"gou", IS_CHEWING|IS_PINYIN, 112}, +{"gu", IS_CHEWING|IS_PINYIN, 113}, +{"gua", IS_CHEWING|IS_PINYIN, 114}, +{"guagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 117}, +{"guai", IS_CHEWING|IS_PINYIN, 115}, +{"guamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 117}, +{"guan", IS_CHEWING|IS_PINYIN, 116}, +{"guang", IS_CHEWING|IS_PINYIN, 117}, +{"guei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 118}, +{"guen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 119}, +{"gui", IS_CHEWING|IS_PINYIN, 118}, +{"gun", IS_CHEWING|IS_PINYIN, 119}, +{"guo", IS_CHEWING|IS_PINYIN, 120}, +{"h", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 121}, +{"ha", IS_CHEWING|IS_PINYIN, 122}, +{"hagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 125}, +{"hai", IS_CHEWING|IS_PINYIN, 123}, +{"hamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 125}, +{"han", IS_CHEWING|IS_PINYIN, 124}, +{"hang", IS_CHEWING|IS_PINYIN, 125}, +{"hao", IS_CHEWING|IS_PINYIN, 126}, +{"he", IS_CHEWING|IS_PINYIN, 127}, +{"hegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 130}, +{"hei", IS_CHEWING|IS_PINYIN, 128}, +{"hemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 130}, +{"hen", IS_CHEWING|IS_PINYIN, 129}, +{"heng", IS_CHEWING|IS_PINYIN, 130}, +{"hogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 131}, +{"homg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 131}, +{"hon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 131}, +{"hong", IS_CHEWING|IS_PINYIN, 131}, +{"hou", 
IS_CHEWING|IS_PINYIN, 132}, +{"hu", IS_CHEWING|IS_PINYIN, 133}, +{"hua", IS_CHEWING|IS_PINYIN, 134}, +{"huagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 137}, +{"huai", IS_CHEWING|IS_PINYIN, 135}, +{"huamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 137}, +{"huan", IS_CHEWING|IS_PINYIN, 136}, +{"huang", IS_CHEWING|IS_PINYIN, 137}, +{"huei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 138}, +{"huen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 139}, +{"hui", IS_CHEWING|IS_PINYIN, 138}, +{"hun", IS_CHEWING|IS_PINYIN, 139}, +{"huo", IS_CHEWING|IS_PINYIN, 140}, +{"j", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 141}, +{"ji", IS_CHEWING|IS_PINYIN, 142}, +{"jia", IS_CHEWING|IS_PINYIN, 143}, +{"jiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 145}, +{"jiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 145}, +{"jian", IS_CHEWING|IS_PINYIN, 144}, +{"jiang", IS_CHEWING|IS_PINYIN, 145}, +{"jiao", IS_CHEWING|IS_PINYIN, 146}, +{"jie", IS_CHEWING|IS_PINYIN, 147}, +{"jign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 149}, +{"jimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 149}, +{"jin", IS_CHEWING|IS_PINYIN, 148}, +{"jing", IS_CHEWING|IS_PINYIN, 149}, +{"jiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 150}, +{"jiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 150}, +{"jion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 150}, +{"jiong", IS_CHEWING|IS_PINYIN, 150}, +{"jiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 151}, +{"jiu", IS_CHEWING|IS_PINYIN, 151}, +{"ju", IS_CHEWING|IS_PINYIN, 152}, +{"juan", IS_CHEWING|IS_PINYIN, 153}, +{"jue", IS_CHEWING|IS_PINYIN, 154}, +{"juen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 155}, +{"jun", IS_CHEWING|IS_PINYIN, 155}, +{"jv", IS_PINYIN|PINYIN_CORRECT_V_U, 152}, +{"jvan", IS_PINYIN|PINYIN_CORRECT_V_U, 153}, +{"jve", IS_PINYIN|PINYIN_CORRECT_V_U, 154}, +{"jvn", IS_PINYIN|PINYIN_CORRECT_V_U, 155}, +{"k", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 156}, +{"ka", IS_CHEWING|IS_PINYIN, 157}, +{"kagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 160}, +{"kai", IS_CHEWING|IS_PINYIN, 158}, +{"kamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 160}, +{"kan", 
IS_CHEWING|IS_PINYIN, 159}, +{"kang", IS_CHEWING|IS_PINYIN, 160}, +{"kao", IS_CHEWING|IS_PINYIN, 161}, +{"ke", IS_CHEWING|IS_PINYIN, 162}, +{"kegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 165}, +{"kemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 165}, +{"ken", IS_CHEWING|IS_PINYIN, 164}, +{"keng", IS_CHEWING|IS_PINYIN, 165}, +{"kogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 166}, +{"komg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 166}, +{"kon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 166}, +{"kong", IS_CHEWING|IS_PINYIN, 166}, +{"kou", IS_CHEWING|IS_PINYIN, 167}, +{"ku", IS_CHEWING|IS_PINYIN, 168}, +{"kua", IS_CHEWING|IS_PINYIN, 169}, +{"kuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 172}, +{"kuai", IS_CHEWING|IS_PINYIN, 170}, +{"kuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 172}, +{"kuan", IS_CHEWING|IS_PINYIN, 171}, +{"kuang", IS_CHEWING|IS_PINYIN, 172}, +{"kuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 173}, +{"kuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 174}, +{"kui", IS_CHEWING|IS_PINYIN, 173}, +{"kun", IS_CHEWING|IS_PINYIN, 174}, +{"kuo", IS_CHEWING|IS_PINYIN, 175}, +{"l", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 176}, +{"la", IS_CHEWING|IS_PINYIN, 177}, +{"lagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 180}, +{"lai", IS_CHEWING|IS_PINYIN, 178}, +{"lamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 180}, +{"lan", IS_CHEWING|IS_PINYIN, 179}, +{"lang", IS_CHEWING|IS_PINYIN, 180}, +{"lao", IS_CHEWING|IS_PINYIN, 181}, +{"le", IS_CHEWING|IS_PINYIN, 182}, +{"legn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 185}, +{"lei", IS_CHEWING|IS_PINYIN, 183}, +{"lemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 185}, +{"leng", IS_CHEWING|IS_PINYIN, 185}, +{"li", IS_CHEWING|IS_PINYIN, 186}, +{"lia", IS_CHEWING|IS_PINYIN, 187}, +{"liagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 189}, +{"liamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 189}, +{"lian", IS_CHEWING|IS_PINYIN, 188}, +{"liang", IS_CHEWING|IS_PINYIN, 189}, +{"liao", IS_CHEWING|IS_PINYIN, 190}, +{"lie", IS_CHEWING|IS_PINYIN, 191}, +{"lign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 193}, +{"limg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 
193}, +{"lin", IS_CHEWING|IS_PINYIN, 192}, +{"ling", IS_CHEWING|IS_PINYIN, 193}, +{"liou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 194}, +{"liu", IS_CHEWING|IS_PINYIN, 194}, +{"lo", IS_CHEWING|IS_PINYIN, 195}, +{"logn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 196}, +{"lomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 196}, +{"lon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 196}, +{"long", IS_CHEWING|IS_PINYIN, 196}, +{"lou", IS_CHEWING|IS_PINYIN, 197}, +{"lu", IS_CHEWING|IS_PINYIN, 198}, +{"luan", IS_CHEWING|IS_PINYIN, 199}, +{"lue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 203}, +{"luen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 200}, +{"lun", IS_CHEWING|IS_PINYIN, 200}, +{"luo", IS_CHEWING|IS_PINYIN, 201}, +{"lv", IS_CHEWING|IS_PINYIN, 202}, +{"lve", IS_CHEWING|IS_PINYIN, 203}, +{"m", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 204}, +{"ma", IS_CHEWING|IS_PINYIN, 205}, +{"magn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 208}, +{"mai", IS_CHEWING|IS_PINYIN, 206}, +{"mamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 208}, +{"man", IS_CHEWING|IS_PINYIN, 207}, +{"mang", IS_CHEWING|IS_PINYIN, 208}, +{"mao", IS_CHEWING|IS_PINYIN, 209}, +{"me", IS_CHEWING|IS_PINYIN, 210}, +{"megn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 213}, +{"mei", IS_CHEWING|IS_PINYIN, 211}, +{"memg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 213}, +{"men", IS_CHEWING|IS_PINYIN, 212}, +{"meng", IS_CHEWING|IS_PINYIN, 213}, +{"mi", IS_CHEWING|IS_PINYIN, 214}, +{"mian", IS_CHEWING|IS_PINYIN, 215}, +{"miao", IS_CHEWING|IS_PINYIN, 216}, +{"mie", IS_CHEWING|IS_PINYIN, 217}, +{"mign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 219}, +{"mimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 219}, +{"min", IS_CHEWING|IS_PINYIN, 218}, +{"ming", IS_CHEWING|IS_PINYIN, 219}, +{"miou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 220}, +{"miu", IS_CHEWING|IS_PINYIN, 220}, +{"mo", IS_CHEWING|IS_PINYIN, 221}, +{"mou", IS_CHEWING|IS_PINYIN, 222}, +{"mu", IS_CHEWING|IS_PINYIN, 223}, +{"n", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 224}, +{"na", IS_CHEWING|IS_PINYIN, 225}, +{"nagn", 
IS_PINYIN|PINYIN_CORRECT_GN_NG, 228}, +{"nai", IS_CHEWING|IS_PINYIN, 226}, +{"namg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 228}, +{"nan", IS_CHEWING|IS_PINYIN, 227}, +{"nang", IS_CHEWING|IS_PINYIN, 228}, +{"nao", IS_CHEWING|IS_PINYIN, 229}, +{"ne", IS_CHEWING|IS_PINYIN, 230}, +{"negn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 233}, +{"nei", IS_CHEWING|IS_PINYIN, 231}, +{"nemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 233}, +{"nen", IS_CHEWING|IS_PINYIN, 232}, +{"neng", IS_CHEWING|IS_PINYIN, 233}, +{"ng", IS_CHEWING|IS_PINYIN, 234}, +{"ni", IS_CHEWING|IS_PINYIN, 235}, +{"niagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 238}, +{"niamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 238}, +{"nian", IS_CHEWING|IS_PINYIN, 237}, +{"niang", IS_CHEWING|IS_PINYIN, 238}, +{"niao", IS_CHEWING|IS_PINYIN, 239}, +{"nie", IS_CHEWING|IS_PINYIN, 240}, +{"nign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 242}, +{"nimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 242}, +{"nin", IS_CHEWING|IS_PINYIN, 241}, +{"ning", IS_CHEWING|IS_PINYIN, 242}, +{"niou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 243}, +{"niu", IS_CHEWING|IS_PINYIN, 243}, +{"nogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 244}, +{"nomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 244}, +{"non", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 244}, +{"nong", IS_CHEWING|IS_PINYIN, 244}, +{"nou", IS_CHEWING|IS_PINYIN, 245}, +{"nu", IS_CHEWING|IS_PINYIN, 246}, +{"nuan", IS_CHEWING|IS_PINYIN, 247}, +{"nue", IS_PINYIN|PINYIN_CORRECT_UE_VE, 251}, +{"nuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 248}, +{"nuo", IS_CHEWING|IS_PINYIN, 249}, +{"nv", IS_CHEWING|IS_PINYIN, 250}, +{"nve", IS_CHEWING|IS_PINYIN, 251}, +{"o", IS_CHEWING|IS_PINYIN, 252}, +{"ou", IS_CHEWING|IS_PINYIN, 253}, +{"p", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 254}, +{"pa", IS_CHEWING|IS_PINYIN, 255}, +{"pagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 258}, +{"pai", IS_CHEWING|IS_PINYIN, 256}, +{"pamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 258}, +{"pan", IS_CHEWING|IS_PINYIN, 257}, +{"pang", IS_CHEWING|IS_PINYIN, 258}, +{"pao", IS_CHEWING|IS_PINYIN, 259}, +{"pegn", 
IS_PINYIN|PINYIN_CORRECT_GN_NG, 262}, +{"pei", IS_CHEWING|IS_PINYIN, 260}, +{"pemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 262}, +{"pen", IS_CHEWING|IS_PINYIN, 261}, +{"peng", IS_CHEWING|IS_PINYIN, 262}, +{"pi", IS_CHEWING|IS_PINYIN, 263}, +{"pian", IS_CHEWING|IS_PINYIN, 264}, +{"piao", IS_CHEWING|IS_PINYIN, 265}, +{"pie", IS_CHEWING|IS_PINYIN, 266}, +{"pign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 268}, +{"pimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 268}, +{"pin", IS_CHEWING|IS_PINYIN, 267}, +{"ping", IS_CHEWING|IS_PINYIN, 268}, +{"po", IS_CHEWING|IS_PINYIN, 269}, +{"pou", IS_CHEWING|IS_PINYIN, 270}, +{"pu", IS_CHEWING|IS_PINYIN, 271}, +{"q", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 272}, +{"qi", IS_CHEWING|IS_PINYIN, 273}, +{"qia", IS_CHEWING|IS_PINYIN, 274}, +{"qiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 276}, +{"qiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 276}, +{"qian", IS_CHEWING|IS_PINYIN, 275}, +{"qiang", IS_CHEWING|IS_PINYIN, 276}, +{"qiao", IS_CHEWING|IS_PINYIN, 277}, +{"qie", IS_CHEWING|IS_PINYIN, 278}, +{"qign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 280}, +{"qimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 280}, +{"qin", IS_CHEWING|IS_PINYIN, 279}, +{"qing", IS_CHEWING|IS_PINYIN, 280}, +{"qiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 281}, +{"qiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 281}, +{"qion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 281}, +{"qiong", IS_CHEWING|IS_PINYIN, 281}, +{"qiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 282}, +{"qiu", IS_CHEWING|IS_PINYIN, 282}, +{"qu", IS_CHEWING|IS_PINYIN, 283}, +{"quan", IS_CHEWING|IS_PINYIN, 284}, +{"que", IS_CHEWING|IS_PINYIN, 285}, +{"quen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 286}, +{"qun", IS_CHEWING|IS_PINYIN, 286}, +{"qv", IS_PINYIN|PINYIN_CORRECT_V_U, 283}, +{"qvan", IS_PINYIN|PINYIN_CORRECT_V_U, 284}, +{"qve", IS_PINYIN|PINYIN_CORRECT_V_U, 285}, +{"qvn", IS_PINYIN|PINYIN_CORRECT_V_U, 286}, +{"r", IS_PINYIN|PINYIN_INCOMPLETE, 287}, +{"ragn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 289}, +{"ramg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 289}, +{"ran", 
IS_CHEWING|IS_PINYIN, 288}, +{"rang", IS_CHEWING|IS_PINYIN, 289}, +{"rao", IS_CHEWING|IS_PINYIN, 290}, +{"re", IS_CHEWING|IS_PINYIN, 291}, +{"regn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 293}, +{"remg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 293}, +{"ren", IS_CHEWING|IS_PINYIN, 292}, +{"reng", IS_CHEWING|IS_PINYIN, 293}, +{"ri", IS_CHEWING|IS_PINYIN, 294}, +{"rogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 295}, +{"romg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 295}, +{"ron", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 295}, +{"rong", IS_CHEWING|IS_PINYIN, 295}, +{"rou", IS_CHEWING|IS_PINYIN, 296}, +{"ru", IS_CHEWING|IS_PINYIN, 297}, +{"ruan", IS_CHEWING|IS_PINYIN, 299}, +{"ruei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 300}, +{"ruen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 301}, +{"rui", IS_CHEWING|IS_PINYIN, 300}, +{"run", IS_CHEWING|IS_PINYIN, 301}, +{"ruo", IS_CHEWING|IS_PINYIN, 302}, +{"s", IS_PINYIN|PINYIN_INCOMPLETE, 303}, +{"sa", IS_CHEWING|IS_PINYIN, 304}, +{"sagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 307}, +{"sai", IS_CHEWING|IS_PINYIN, 305}, +{"samg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 307}, +{"san", IS_CHEWING|IS_PINYIN, 306}, +{"sang", IS_CHEWING|IS_PINYIN, 307}, +{"sao", IS_CHEWING|IS_PINYIN, 308}, +{"se", IS_CHEWING|IS_PINYIN, 309}, +{"segn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 311}, +{"semg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 311}, +{"sen", IS_CHEWING|IS_PINYIN, 310}, +{"seng", IS_CHEWING|IS_PINYIN, 311}, +{"sh", IS_PINYIN|PINYIN_INCOMPLETE, 312}, +{"sha", IS_CHEWING|IS_PINYIN, 313}, +{"shagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 316}, +{"shai", IS_CHEWING|IS_PINYIN, 314}, +{"shamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 316}, +{"shan", IS_CHEWING|IS_PINYIN, 315}, +{"shang", IS_CHEWING|IS_PINYIN, 316}, +{"shao", IS_CHEWING|IS_PINYIN, 317}, +{"she", IS_CHEWING|IS_PINYIN, 318}, +{"shegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 321}, +{"shei", IS_CHEWING|IS_PINYIN, 319}, +{"shemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 321}, +{"shen", IS_CHEWING|IS_PINYIN, 320}, +{"sheng", IS_CHEWING|IS_PINYIN, 321}, +{"shi", IS_CHEWING|IS_PINYIN, 322}, 
+{"shou", IS_CHEWING|IS_PINYIN, 323}, +{"shu", IS_CHEWING|IS_PINYIN, 324}, +{"shua", IS_CHEWING|IS_PINYIN, 325}, +{"shuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 328}, +{"shuai", IS_CHEWING|IS_PINYIN, 326}, +{"shuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 328}, +{"shuan", IS_CHEWING|IS_PINYIN, 327}, +{"shuang", IS_CHEWING|IS_PINYIN, 328}, +{"shuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 329}, +{"shuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 330}, +{"shui", IS_CHEWING|IS_PINYIN, 329}, +{"shun", IS_CHEWING|IS_PINYIN, 330}, +{"shuo", IS_CHEWING|IS_PINYIN, 331}, +{"si", IS_CHEWING|IS_PINYIN, 332}, +{"sogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 333}, +{"somg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 333}, +{"son", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 333}, +{"song", IS_CHEWING|IS_PINYIN, 333}, +{"sou", IS_CHEWING|IS_PINYIN, 334}, +{"su", IS_CHEWING|IS_PINYIN, 335}, +{"suan", IS_CHEWING|IS_PINYIN, 336}, +{"suei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 337}, +{"suen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 338}, +{"sui", IS_CHEWING|IS_PINYIN, 337}, +{"sun", IS_CHEWING|IS_PINYIN, 338}, +{"suo", IS_CHEWING|IS_PINYIN, 339}, +{"t", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 340}, +{"ta", IS_CHEWING|IS_PINYIN, 341}, +{"tagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 344}, +{"tai", IS_CHEWING|IS_PINYIN, 342}, +{"tamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 344}, +{"tan", IS_CHEWING|IS_PINYIN, 343}, +{"tang", IS_CHEWING|IS_PINYIN, 344}, +{"tao", IS_CHEWING|IS_PINYIN, 345}, +{"te", IS_CHEWING|IS_PINYIN, 346}, +{"tegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 347}, +{"temg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 347}, +{"teng", IS_CHEWING|IS_PINYIN, 347}, +{"ti", IS_CHEWING|IS_PINYIN, 348}, +{"tian", IS_CHEWING|IS_PINYIN, 349}, +{"tiao", IS_CHEWING|IS_PINYIN, 350}, +{"tie", IS_CHEWING|IS_PINYIN, 351}, +{"tign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 352}, +{"timg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 352}, +{"ting", IS_CHEWING|IS_PINYIN, 352}, +{"togn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 353}, +{"tomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 353}, +{"ton", 
IS_PINYIN|PINYIN_CORRECT_ON_ONG, 353}, +{"tong", IS_CHEWING|IS_PINYIN, 353}, +{"tou", IS_CHEWING|IS_PINYIN, 354}, +{"tu", IS_CHEWING|IS_PINYIN, 355}, +{"tuan", IS_CHEWING|IS_PINYIN, 356}, +{"tuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 357}, +{"tuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 358}, +{"tui", IS_CHEWING|IS_PINYIN, 357}, +{"tun", IS_CHEWING|IS_PINYIN, 358}, +{"tuo", IS_CHEWING|IS_PINYIN, 359}, +{"w", IS_PINYIN|PINYIN_INCOMPLETE, 360}, +{"wa", IS_CHEWING|IS_PINYIN, 361}, +{"wagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 364}, +{"wai", IS_CHEWING|IS_PINYIN, 362}, +{"wamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 364}, +{"wan", IS_CHEWING|IS_PINYIN, 363}, +{"wang", IS_CHEWING|IS_PINYIN, 364}, +{"wegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 367}, +{"wei", IS_CHEWING|IS_PINYIN, 365}, +{"wemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 367}, +{"wen", IS_CHEWING|IS_PINYIN, 366}, +{"weng", IS_CHEWING|IS_PINYIN, 367}, +{"wo", IS_CHEWING|IS_PINYIN, 368}, +{"wu", IS_CHEWING|IS_PINYIN, 369}, +{"x", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 370}, +{"xi", IS_CHEWING|IS_PINYIN, 371}, +{"xia", IS_CHEWING|IS_PINYIN, 372}, +{"xiagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 374}, +{"xiamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 374}, +{"xian", IS_CHEWING|IS_PINYIN, 373}, +{"xiang", IS_CHEWING|IS_PINYIN, 374}, +{"xiao", IS_CHEWING|IS_PINYIN, 375}, +{"xie", IS_CHEWING|IS_PINYIN, 376}, +{"xign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 378}, +{"ximg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 378}, +{"xin", IS_CHEWING|IS_PINYIN, 377}, +{"xing", IS_CHEWING|IS_PINYIN, 378}, +{"xiogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 379}, +{"xiomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 379}, +{"xion", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 379}, +{"xiong", IS_CHEWING|IS_PINYIN, 379}, +{"xiou", IS_PINYIN|PINYIN_CORRECT_IOU_IU, 380}, +{"xiu", IS_CHEWING|IS_PINYIN, 380}, +{"xu", IS_CHEWING|IS_PINYIN, 381}, +{"xuan", IS_CHEWING|IS_PINYIN, 382}, +{"xue", IS_CHEWING|IS_PINYIN, 383}, +{"xuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 384}, +{"xun", IS_CHEWING|IS_PINYIN, 384}, 
+{"xv", IS_PINYIN|PINYIN_CORRECT_V_U, 381}, +{"xvan", IS_PINYIN|PINYIN_CORRECT_V_U, 382}, +{"xve", IS_PINYIN|PINYIN_CORRECT_V_U, 383}, +{"xvn", IS_PINYIN|PINYIN_CORRECT_V_U, 384}, +{"y", IS_PINYIN|PINYIN_INCOMPLETE, 385}, +{"ya", IS_CHEWING|IS_PINYIN, 386}, +{"yagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 389}, +{"yamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 389}, +{"yan", IS_CHEWING|IS_PINYIN, 388}, +{"yang", IS_CHEWING|IS_PINYIN, 389}, +{"yao", IS_CHEWING|IS_PINYIN, 390}, +{"ye", IS_CHEWING|IS_PINYIN, 391}, +{"yi", IS_CHEWING|IS_PINYIN, 392}, +{"yign", IS_PINYIN|PINYIN_CORRECT_GN_NG, 394}, +{"yimg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 394}, +{"yin", IS_CHEWING|IS_PINYIN, 393}, +{"ying", IS_CHEWING|IS_PINYIN, 394}, +{"yo", IS_CHEWING|IS_PINYIN, 395}, +{"yogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 396}, +{"yomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 396}, +{"yon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 396}, +{"yong", IS_CHEWING|IS_PINYIN, 396}, +{"you", IS_CHEWING|IS_PINYIN, 397}, +{"yu", IS_CHEWING|IS_PINYIN, 398}, +{"yuan", IS_CHEWING|IS_PINYIN, 399}, +{"yue", IS_CHEWING|IS_PINYIN, 400}, +{"yuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 401}, +{"yun", IS_CHEWING|IS_PINYIN, 401}, +{"yv", IS_PINYIN|PINYIN_CORRECT_V_U, 398}, +{"yvan", IS_PINYIN|PINYIN_CORRECT_V_U, 399}, +{"yve", IS_PINYIN|PINYIN_CORRECT_V_U, 400}, +{"yvn", IS_PINYIN|PINYIN_CORRECT_V_U, 401}, +{"z", IS_PINYIN|PINYIN_INCOMPLETE, 402}, +{"za", IS_CHEWING|IS_PINYIN, 403}, +{"zagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 406}, +{"zai", IS_CHEWING|IS_PINYIN, 404}, +{"zamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 406}, +{"zan", IS_CHEWING|IS_PINYIN, 405}, +{"zang", IS_CHEWING|IS_PINYIN, 406}, +{"zao", IS_CHEWING|IS_PINYIN, 407}, +{"ze", IS_CHEWING|IS_PINYIN, 408}, +{"zegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 411}, +{"zei", IS_CHEWING|IS_PINYIN, 409}, +{"zemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 411}, +{"zen", IS_CHEWING|IS_PINYIN, 410}, +{"zeng", IS_CHEWING|IS_PINYIN, 411}, +{"zh", IS_PINYIN|PINYIN_INCOMPLETE, 412}, +{"zha", IS_CHEWING|IS_PINYIN, 413}, +{"zhagn", 
IS_PINYIN|PINYIN_CORRECT_GN_NG, 416}, +{"zhai", IS_CHEWING|IS_PINYIN, 414}, +{"zhamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 416}, +{"zhan", IS_CHEWING|IS_PINYIN, 415}, +{"zhang", IS_CHEWING|IS_PINYIN, 416}, +{"zhao", IS_CHEWING|IS_PINYIN, 417}, +{"zhe", IS_CHEWING|IS_PINYIN, 418}, +{"zhegn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 421}, +{"zhemg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 421}, +{"zhen", IS_CHEWING|IS_PINYIN, 420}, +{"zheng", IS_CHEWING|IS_PINYIN, 421}, +{"zhi", IS_CHEWING|IS_PINYIN, 422}, +{"zhogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 423}, +{"zhomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 423}, +{"zhon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 423}, +{"zhong", IS_CHEWING|IS_PINYIN, 423}, +{"zhou", IS_CHEWING|IS_PINYIN, 424}, +{"zhu", IS_CHEWING|IS_PINYIN, 425}, +{"zhua", IS_CHEWING|IS_PINYIN, 426}, +{"zhuagn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 429}, +{"zhuai", IS_CHEWING|IS_PINYIN, 427}, +{"zhuamg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 429}, +{"zhuan", IS_CHEWING|IS_PINYIN, 428}, +{"zhuang", IS_CHEWING|IS_PINYIN, 429}, +{"zhuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 430}, +{"zhuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 431}, +{"zhui", IS_CHEWING|IS_PINYIN, 430}, +{"zhun", IS_CHEWING|IS_PINYIN, 431}, +{"zhuo", IS_CHEWING|IS_PINYIN, 432}, +{"zi", IS_CHEWING|IS_PINYIN, 433}, +{"zogn", IS_PINYIN|PINYIN_CORRECT_GN_NG, 434}, +{"zomg", IS_PINYIN|PINYIN_CORRECT_MG_NG, 434}, +{"zon", IS_PINYIN|PINYIN_CORRECT_ON_ONG, 434}, +{"zong", IS_CHEWING|IS_PINYIN, 434}, +{"zou", IS_CHEWING|IS_PINYIN, 435}, +{"zu", IS_CHEWING|IS_PINYIN, 436}, +{"zuan", IS_CHEWING|IS_PINYIN, 437}, +{"zuei", IS_PINYIN|PINYIN_CORRECT_UEI_UI, 438}, +{"zuen", IS_PINYIN|PINYIN_CORRECT_UEN_UN, 439}, +{"zui", IS_CHEWING|IS_PINYIN, 438}, +{"zun", IS_CHEWING|IS_PINYIN, 439}, +{"zuo", IS_CHEWING|IS_PINYIN, 440} +}; + +const chewing_index_item_t chewing_index[] = { +{"ㄅ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 6}, +{"ㄅㄚ", IS_CHEWING|IS_PINYIN, 7}, +{"ㄅㄛ", IS_CHEWING|IS_PINYIN, 21}, +{"ㄅㄞ", IS_CHEWING|IS_PINYIN, 8}, +{"ㄅㄟ", 
IS_CHEWING|IS_PINYIN, 12}, +{"ㄅㄠ", IS_CHEWING|IS_PINYIN, 11}, +{"ㄅㄢ", IS_CHEWING|IS_PINYIN, 9}, +{"ㄅㄣ", IS_CHEWING|IS_PINYIN, 13}, +{"ㄅㄤ", IS_CHEWING|IS_PINYIN, 10}, +{"ㄅㄥ", IS_CHEWING|IS_PINYIN, 14}, +{"ㄅㄧ", IS_CHEWING|IS_PINYIN, 15}, +{"ㄅㄧㄝ", IS_CHEWING|IS_PINYIN, 18}, +{"ㄅㄧㄠ", IS_CHEWING|IS_PINYIN, 17}, +{"ㄅㄧㄢ", IS_CHEWING|IS_PINYIN, 16}, +{"ㄅㄧㄣ", IS_CHEWING|IS_PINYIN, 19}, +{"ㄅㄧㄥ", IS_CHEWING|IS_PINYIN, 20}, +{"ㄅㄨ", IS_CHEWING|IS_PINYIN, 22}, +{"ㄆ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 254}, +{"ㄆㄚ", IS_CHEWING|IS_PINYIN, 255}, +{"ㄆㄛ", IS_CHEWING|IS_PINYIN, 269}, +{"ㄆㄞ", IS_CHEWING|IS_PINYIN, 256}, +{"ㄆㄟ", IS_CHEWING|IS_PINYIN, 260}, +{"ㄆㄠ", IS_CHEWING|IS_PINYIN, 259}, +{"ㄆㄡ", IS_CHEWING|IS_PINYIN, 270}, +{"ㄆㄢ", IS_CHEWING|IS_PINYIN, 257}, +{"ㄆㄣ", IS_CHEWING|IS_PINYIN, 261}, +{"ㄆㄤ", IS_CHEWING|IS_PINYIN, 258}, +{"ㄆㄥ", IS_CHEWING|IS_PINYIN, 262}, +{"ㄆㄧ", IS_CHEWING|IS_PINYIN, 263}, +{"ㄆㄧㄝ", IS_CHEWING|IS_PINYIN, 266}, +{"ㄆㄧㄠ", IS_CHEWING|IS_PINYIN, 265}, +{"ㄆㄧㄢ", IS_CHEWING|IS_PINYIN, 264}, +{"ㄆㄧㄣ", IS_CHEWING|IS_PINYIN, 267}, +{"ㄆㄧㄥ", IS_CHEWING|IS_PINYIN, 268}, +{"ㄆㄨ", IS_CHEWING|IS_PINYIN, 271}, +{"ㄇ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 204}, +{"ㄇㄚ", IS_CHEWING|IS_PINYIN, 205}, +{"ㄇㄛ", IS_CHEWING|IS_PINYIN, 221}, +{"ㄇㄜ", IS_CHEWING|IS_PINYIN, 210}, +{"ㄇㄞ", IS_CHEWING|IS_PINYIN, 206}, +{"ㄇㄟ", IS_CHEWING|IS_PINYIN, 211}, +{"ㄇㄠ", IS_CHEWING|IS_PINYIN, 209}, +{"ㄇㄡ", IS_CHEWING|IS_PINYIN, 222}, +{"ㄇㄢ", IS_CHEWING|IS_PINYIN, 207}, +{"ㄇㄣ", IS_CHEWING|IS_PINYIN, 212}, +{"ㄇㄤ", IS_CHEWING|IS_PINYIN, 208}, +{"ㄇㄥ", IS_CHEWING|IS_PINYIN, 213}, +{"ㄇㄧ", IS_CHEWING|IS_PINYIN, 214}, +{"ㄇㄧㄝ", IS_CHEWING|IS_PINYIN, 217}, +{"ㄇㄧㄠ", IS_CHEWING|IS_PINYIN, 216}, +{"ㄇㄧㄡ", IS_CHEWING|IS_PINYIN, 220}, +{"ㄇㄧㄢ", IS_CHEWING|IS_PINYIN, 215}, +{"ㄇㄧㄣ", IS_CHEWING|IS_PINYIN, 218}, +{"ㄇㄧㄥ", IS_CHEWING|IS_PINYIN, 219}, +{"ㄇㄨ", IS_CHEWING|IS_PINYIN, 223}, +{"ㄈ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 90}, +{"ㄈㄚ", 
IS_CHEWING|IS_PINYIN, 91}, +{"ㄈㄛ", IS_CHEWING|IS_PINYIN, 98}, +{"ㄈㄜ", IS_CHEWING, 94}, +{"ㄈㄟ", IS_CHEWING|IS_PINYIN, 95}, +{"ㄈㄡ", IS_CHEWING|IS_PINYIN, 99}, +{"ㄈㄢ", IS_CHEWING|IS_PINYIN, 92}, +{"ㄈㄣ", IS_CHEWING|IS_PINYIN, 96}, +{"ㄈㄤ", IS_CHEWING|IS_PINYIN, 93}, +{"ㄈㄥ", IS_CHEWING|IS_PINYIN, 97}, +{"ㄈㄨ", IS_CHEWING|IS_PINYIN, 100}, +{"ㄉ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 60}, +{"ㄉㄚ", IS_CHEWING|IS_PINYIN, 61}, +{"ㄉㄜ", IS_CHEWING|IS_PINYIN, 66}, +{"ㄉㄞ", IS_CHEWING|IS_PINYIN, 62}, +{"ㄉㄟ", IS_CHEWING|IS_PINYIN, 67}, +{"ㄉㄠ", IS_CHEWING|IS_PINYIN, 65}, +{"ㄉㄡ", IS_CHEWING|IS_PINYIN, 79}, +{"ㄉㄢ", IS_CHEWING|IS_PINYIN, 63}, +{"ㄉㄣ", IS_CHEWING, 68}, +{"ㄉㄤ", IS_CHEWING|IS_PINYIN, 64}, +{"ㄉㄥ", IS_CHEWING|IS_PINYIN, 69}, +{"ㄉㄧ", IS_CHEWING|IS_PINYIN, 70}, +{"ㄉㄧㄚ", IS_CHEWING|IS_PINYIN, 71}, +{"ㄉㄧㄝ", IS_CHEWING|IS_PINYIN, 74}, +{"ㄉㄧㄠ", IS_CHEWING|IS_PINYIN, 73}, +{"ㄉㄧㄡ", IS_CHEWING|IS_PINYIN, 77}, +{"ㄉㄧㄢ", IS_CHEWING|IS_PINYIN, 72}, +{"ㄉㄧㄣ", IS_CHEWING, 75}, +{"ㄉㄧㄥ", IS_CHEWING|IS_PINYIN, 76}, +{"ㄉㄨ", IS_CHEWING|IS_PINYIN, 80}, +{"ㄉㄨㄛ", IS_CHEWING|IS_PINYIN, 84}, +{"ㄉㄨㄟ", IS_CHEWING|IS_PINYIN, 82}, +{"ㄉㄨㄢ", IS_CHEWING|IS_PINYIN, 81}, +{"ㄉㄨㄣ", IS_CHEWING|IS_PINYIN, 83}, +{"ㄉㄨㄥ", IS_CHEWING|IS_PINYIN, 78}, +{"ㄊ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 340}, +{"ㄊㄚ", IS_CHEWING|IS_PINYIN, 341}, +{"ㄊㄜ", IS_CHEWING|IS_PINYIN, 346}, +{"ㄊㄞ", IS_CHEWING|IS_PINYIN, 342}, +{"ㄊㄠ", IS_CHEWING|IS_PINYIN, 345}, +{"ㄊㄡ", IS_CHEWING|IS_PINYIN, 354}, +{"ㄊㄢ", IS_CHEWING|IS_PINYIN, 343}, +{"ㄊㄤ", IS_CHEWING|IS_PINYIN, 344}, +{"ㄊㄥ", IS_CHEWING|IS_PINYIN, 347}, +{"ㄊㄧ", IS_CHEWING|IS_PINYIN, 348}, +{"ㄊㄧㄝ", IS_CHEWING|IS_PINYIN, 351}, +{"ㄊㄧㄠ", IS_CHEWING|IS_PINYIN, 350}, +{"ㄊㄧㄢ", IS_CHEWING|IS_PINYIN, 349}, +{"ㄊㄧㄥ", IS_CHEWING|IS_PINYIN, 352}, +{"ㄊㄨ", IS_CHEWING|IS_PINYIN, 355}, +{"ㄊㄨㄛ", IS_CHEWING|IS_PINYIN, 359}, +{"ㄊㄨㄟ", IS_CHEWING|IS_PINYIN, 357}, +{"ㄊㄨㄢ", IS_CHEWING|IS_PINYIN, 356}, +{"ㄊㄨㄣ", IS_CHEWING|IS_PINYIN, 358}, +{"ㄊㄨㄥ", IS_CHEWING|IS_PINYIN, 
353}, +{"ㄋ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 224}, +{"ㄋㄚ", IS_CHEWING|IS_PINYIN, 225}, +{"ㄋㄜ", IS_CHEWING|IS_PINYIN, 230}, +{"ㄋㄞ", IS_CHEWING|IS_PINYIN, 226}, +{"ㄋㄟ", IS_CHEWING|IS_PINYIN, 231}, +{"ㄋㄠ", IS_CHEWING|IS_PINYIN, 229}, +{"ㄋㄡ", IS_CHEWING|IS_PINYIN, 245}, +{"ㄋㄢ", IS_CHEWING|IS_PINYIN, 227}, +{"ㄋㄣ", IS_CHEWING|IS_PINYIN, 232}, +{"ㄋㄤ", IS_CHEWING|IS_PINYIN, 228}, +{"ㄋㄥ", IS_CHEWING|IS_PINYIN, 233}, +{"ㄋㄧ", IS_CHEWING|IS_PINYIN, 235}, +{"ㄋㄧㄚ", IS_CHEWING, 236}, +{"ㄋㄧㄝ", IS_CHEWING|IS_PINYIN, 240}, +{"ㄋㄧㄠ", IS_CHEWING|IS_PINYIN, 239}, +{"ㄋㄧㄡ", IS_CHEWING|IS_PINYIN, 243}, +{"ㄋㄧㄢ", IS_CHEWING|IS_PINYIN, 237}, +{"ㄋㄧㄣ", IS_CHEWING|IS_PINYIN, 241}, +{"ㄋㄧㄤ", IS_CHEWING|IS_PINYIN, 238}, +{"ㄋㄧㄥ", IS_CHEWING|IS_PINYIN, 242}, +{"ㄋㄨ", IS_CHEWING|IS_PINYIN, 246}, +{"ㄋㄨㄛ", IS_CHEWING|IS_PINYIN, 249}, +{"ㄋㄨㄢ", IS_CHEWING|IS_PINYIN, 247}, +{"ㄋㄨㄣ", IS_CHEWING, 248}, +{"ㄋㄨㄥ", IS_CHEWING|IS_PINYIN, 244}, +{"ㄋㄩ", IS_CHEWING|IS_PINYIN, 250}, +{"ㄋㄩㄝ", IS_CHEWING|IS_PINYIN, 251}, +{"ㄌ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 176}, +{"ㄌㄚ", IS_CHEWING|IS_PINYIN, 177}, +{"ㄌㄛ", IS_CHEWING|IS_PINYIN, 195}, +{"ㄌㄜ", IS_CHEWING|IS_PINYIN, 182}, +{"ㄌㄞ", IS_CHEWING|IS_PINYIN, 178}, +{"ㄌㄟ", IS_CHEWING|IS_PINYIN, 183}, +{"ㄌㄠ", IS_CHEWING|IS_PINYIN, 181}, +{"ㄌㄡ", IS_CHEWING|IS_PINYIN, 197}, +{"ㄌㄢ", IS_CHEWING|IS_PINYIN, 179}, +{"ㄌㄣ", IS_CHEWING, 184}, +{"ㄌㄤ", IS_CHEWING|IS_PINYIN, 180}, +{"ㄌㄥ", IS_CHEWING|IS_PINYIN, 185}, +{"ㄌㄧ", IS_CHEWING|IS_PINYIN, 186}, +{"ㄌㄧㄚ", IS_CHEWING|IS_PINYIN, 187}, +{"ㄌㄧㄝ", IS_CHEWING|IS_PINYIN, 191}, +{"ㄌㄧㄠ", IS_CHEWING|IS_PINYIN, 190}, +{"ㄌㄧㄡ", IS_CHEWING|IS_PINYIN, 194}, +{"ㄌㄧㄢ", IS_CHEWING|IS_PINYIN, 188}, +{"ㄌㄧㄣ", IS_CHEWING|IS_PINYIN, 192}, +{"ㄌㄧㄤ", IS_CHEWING|IS_PINYIN, 189}, +{"ㄌㄧㄥ", IS_CHEWING|IS_PINYIN, 193}, +{"ㄌㄨ", IS_CHEWING|IS_PINYIN, 198}, +{"ㄌㄨㄛ", IS_CHEWING|IS_PINYIN, 201}, +{"ㄌㄨㄢ", IS_CHEWING|IS_PINYIN, 199}, +{"ㄌㄨㄣ", IS_CHEWING|IS_PINYIN, 200}, +{"ㄌㄨㄥ", IS_CHEWING|IS_PINYIN, 196}, +{"ㄌㄩ", 
IS_CHEWING|IS_PINYIN, 202}, +{"ㄌㄩㄝ", IS_CHEWING|IS_PINYIN, 203}, +{"ㄍ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 101}, +{"ㄍㄚ", IS_CHEWING|IS_PINYIN, 102}, +{"ㄍㄜ", IS_CHEWING|IS_PINYIN, 107}, +{"ㄍㄞ", IS_CHEWING|IS_PINYIN, 103}, +{"ㄍㄟ", IS_CHEWING|IS_PINYIN, 108}, +{"ㄍㄠ", IS_CHEWING|IS_PINYIN, 106}, +{"ㄍㄡ", IS_CHEWING|IS_PINYIN, 112}, +{"ㄍㄢ", IS_CHEWING|IS_PINYIN, 104}, +{"ㄍㄣ", IS_CHEWING|IS_PINYIN, 109}, +{"ㄍㄤ", IS_CHEWING|IS_PINYIN, 105}, +{"ㄍㄥ", IS_CHEWING|IS_PINYIN, 110}, +{"ㄍㄨ", IS_CHEWING|IS_PINYIN, 113}, +{"ㄍㄨㄚ", IS_CHEWING|IS_PINYIN, 114}, +{"ㄍㄨㄛ", IS_CHEWING|IS_PINYIN, 120}, +{"ㄍㄨㄞ", IS_CHEWING|IS_PINYIN, 115}, +{"ㄍㄨㄟ", IS_CHEWING|IS_PINYIN, 118}, +{"ㄍㄨㄢ", IS_CHEWING|IS_PINYIN, 116}, +{"ㄍㄨㄣ", IS_CHEWING|IS_PINYIN, 119}, +{"ㄍㄨㄤ", IS_CHEWING|IS_PINYIN, 117}, +{"ㄍㄨㄥ", IS_CHEWING|IS_PINYIN, 111}, +{"ㄎ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 156}, +{"ㄎㄚ", IS_CHEWING|IS_PINYIN, 157}, +{"ㄎㄜ", IS_CHEWING|IS_PINYIN, 162}, +{"ㄎㄞ", IS_CHEWING|IS_PINYIN, 158}, +{"ㄎㄟ", IS_CHEWING, 163}, +{"ㄎㄠ", IS_CHEWING|IS_PINYIN, 161}, +{"ㄎㄡ", IS_CHEWING|IS_PINYIN, 167}, +{"ㄎㄢ", IS_CHEWING|IS_PINYIN, 159}, +{"ㄎㄣ", IS_CHEWING|IS_PINYIN, 164}, +{"ㄎㄤ", IS_CHEWING|IS_PINYIN, 160}, +{"ㄎㄥ", IS_CHEWING|IS_PINYIN, 165}, +{"ㄎㄨ", IS_CHEWING|IS_PINYIN, 168}, +{"ㄎㄨㄚ", IS_CHEWING|IS_PINYIN, 169}, +{"ㄎㄨㄛ", IS_CHEWING|IS_PINYIN, 175}, +{"ㄎㄨㄞ", IS_CHEWING|IS_PINYIN, 170}, +{"ㄎㄨㄟ", IS_CHEWING|IS_PINYIN, 173}, +{"ㄎㄨㄢ", IS_CHEWING|IS_PINYIN, 171}, +{"ㄎㄨㄣ", IS_CHEWING|IS_PINYIN, 174}, +{"ㄎㄨㄤ", IS_CHEWING|IS_PINYIN, 172}, +{"ㄎㄨㄥ", IS_CHEWING|IS_PINYIN, 166}, +{"ㄏ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 121}, +{"ㄏㄚ", IS_CHEWING|IS_PINYIN, 122}, +{"ㄏㄜ", IS_CHEWING|IS_PINYIN, 127}, +{"ㄏㄞ", IS_CHEWING|IS_PINYIN, 123}, +{"ㄏㄟ", IS_CHEWING|IS_PINYIN, 128}, +{"ㄏㄠ", IS_CHEWING|IS_PINYIN, 126}, +{"ㄏㄡ", IS_CHEWING|IS_PINYIN, 132}, +{"ㄏㄢ", IS_CHEWING|IS_PINYIN, 124}, +{"ㄏㄣ", IS_CHEWING|IS_PINYIN, 129}, +{"ㄏㄤ", IS_CHEWING|IS_PINYIN, 125}, +{"ㄏㄥ", 
IS_CHEWING|IS_PINYIN, 130}, +{"ㄏㄨ", IS_CHEWING|IS_PINYIN, 133}, +{"ㄏㄨㄚ", IS_CHEWING|IS_PINYIN, 134}, +{"ㄏㄨㄛ", IS_CHEWING|IS_PINYIN, 140}, +{"ㄏㄨㄞ", IS_CHEWING|IS_PINYIN, 135}, +{"ㄏㄨㄟ", IS_CHEWING|IS_PINYIN, 138}, +{"ㄏㄨㄢ", IS_CHEWING|IS_PINYIN, 136}, +{"ㄏㄨㄣ", IS_CHEWING|IS_PINYIN, 139}, +{"ㄏㄨㄤ", IS_CHEWING|IS_PINYIN, 137}, +{"ㄏㄨㄥ", IS_CHEWING|IS_PINYIN, 131}, +{"ㄐ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 141}, +{"ㄐㄧ", IS_CHEWING|IS_PINYIN, 142}, +{"ㄐㄧㄚ", IS_CHEWING|IS_PINYIN, 143}, +{"ㄐㄧㄝ", IS_CHEWING|IS_PINYIN, 147}, +{"ㄐㄧㄠ", IS_CHEWING|IS_PINYIN, 146}, +{"ㄐㄧㄡ", IS_CHEWING|IS_PINYIN, 151}, +{"ㄐㄧㄢ", IS_CHEWING|IS_PINYIN, 144}, +{"ㄐㄧㄣ", IS_CHEWING|IS_PINYIN, 148}, +{"ㄐㄧㄤ", IS_CHEWING|IS_PINYIN, 145}, +{"ㄐㄧㄥ", IS_CHEWING|IS_PINYIN, 149}, +{"ㄐㄩ", IS_CHEWING|IS_PINYIN, 152}, +{"ㄐㄩㄝ", IS_CHEWING|IS_PINYIN, 154}, +{"ㄐㄩㄢ", IS_CHEWING|IS_PINYIN, 153}, +{"ㄐㄩㄣ", IS_CHEWING|IS_PINYIN, 155}, +{"ㄐㄩㄥ", IS_CHEWING|IS_PINYIN, 150}, +{"ㄑ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 272}, +{"ㄑㄧ", IS_CHEWING|IS_PINYIN, 273}, +{"ㄑㄧㄚ", IS_CHEWING|IS_PINYIN, 274}, +{"ㄑㄧㄝ", IS_CHEWING|IS_PINYIN, 278}, +{"ㄑㄧㄠ", IS_CHEWING|IS_PINYIN, 277}, +{"ㄑㄧㄡ", IS_CHEWING|IS_PINYIN, 282}, +{"ㄑㄧㄢ", IS_CHEWING|IS_PINYIN, 275}, +{"ㄑㄧㄣ", IS_CHEWING|IS_PINYIN, 279}, +{"ㄑㄧㄤ", IS_CHEWING|IS_PINYIN, 276}, +{"ㄑㄧㄥ", IS_CHEWING|IS_PINYIN, 280}, +{"ㄑㄩ", IS_CHEWING|IS_PINYIN, 283}, +{"ㄑㄩㄝ", IS_CHEWING|IS_PINYIN, 285}, +{"ㄑㄩㄢ", IS_CHEWING|IS_PINYIN, 284}, +{"ㄑㄩㄣ", IS_CHEWING|IS_PINYIN, 286}, +{"ㄑㄩㄥ", IS_CHEWING|IS_PINYIN, 281}, +{"ㄒ", IS_CHEWING|IS_PINYIN|PINYIN_INCOMPLETE|CHEWING_INCOMPLETE, 370}, +{"ㄒㄧ", IS_CHEWING|IS_PINYIN, 371}, +{"ㄒㄧㄚ", IS_CHEWING|IS_PINYIN, 372}, +{"ㄒㄧㄝ", IS_CHEWING|IS_PINYIN, 376}, +{"ㄒㄧㄠ", IS_CHEWING|IS_PINYIN, 375}, +{"ㄒㄧㄡ", IS_CHEWING|IS_PINYIN, 380}, +{"ㄒㄧㄢ", IS_CHEWING|IS_PINYIN, 373}, +{"ㄒㄧㄣ", IS_CHEWING|IS_PINYIN, 377}, +{"ㄒㄧㄤ", IS_CHEWING|IS_PINYIN, 374}, +{"ㄒㄧㄥ", IS_CHEWING|IS_PINYIN, 378}, +{"ㄒㄩ", IS_CHEWING|IS_PINYIN, 381}, +{"ㄒㄩㄝ", 
IS_CHEWING|IS_PINYIN, 383}, +{"ㄒㄩㄢ", IS_CHEWING|IS_PINYIN, 382}, +{"ㄒㄩㄣ", IS_CHEWING|IS_PINYIN, 384}, +{"ㄒㄩㄥ", IS_CHEWING|IS_PINYIN, 379}, +{"ㄓ", IS_CHEWING|IS_PINYIN, 422}, +{"ㄓㄚ", IS_CHEWING|IS_PINYIN, 413}, +{"ㄓㄜ", IS_CHEWING|IS_PINYIN, 418}, +{"ㄓㄞ", IS_CHEWING|IS_PINYIN, 414}, +{"ㄓㄟ", IS_CHEWING, 419}, +{"ㄓㄠ", IS_CHEWING|IS_PINYIN, 417}, +{"ㄓㄡ", IS_CHEWING|IS_PINYIN, 424}, +{"ㄓㄢ", IS_CHEWING|IS_PINYIN, 415}, +{"ㄓㄣ", IS_CHEWING|IS_PINYIN, 420}, +{"ㄓㄤ", IS_CHEWING|IS_PINYIN, 416}, +{"ㄓㄥ", IS_CHEWING|IS_PINYIN, 421}, +{"ㄓㄨ", IS_CHEWING|IS_PINYIN, 425}, +{"ㄓㄨㄚ", IS_CHEWING|IS_PINYIN, 426}, +{"ㄓㄨㄛ", IS_CHEWING|IS_PINYIN, 432}, +{"ㄓㄨㄞ", IS_CHEWING|IS_PINYIN, 427}, +{"ㄓㄨㄟ", IS_CHEWING|IS_PINYIN, 430}, +{"ㄓㄨㄢ", IS_CHEWING|IS_PINYIN, 428}, +{"ㄓㄨㄣ", IS_CHEWING|IS_PINYIN, 431}, +{"ㄓㄨㄤ", IS_CHEWING|IS_PINYIN, 429}, +{"ㄓㄨㄥ", IS_CHEWING|IS_PINYIN, 423}, +{"ㄔ", IS_CHEWING|IS_PINYIN, 41}, +{"ㄔㄚ", IS_CHEWING|IS_PINYIN, 33}, +{"ㄔㄜ", IS_CHEWING|IS_PINYIN, 38}, +{"ㄔㄞ", IS_CHEWING|IS_PINYIN, 34}, +{"ㄔㄠ", IS_CHEWING|IS_PINYIN, 37}, +{"ㄔㄡ", IS_CHEWING|IS_PINYIN, 43}, +{"ㄔㄢ", IS_CHEWING|IS_PINYIN, 35}, +{"ㄔㄣ", IS_CHEWING|IS_PINYIN, 39}, +{"ㄔㄤ", IS_CHEWING|IS_PINYIN, 36}, +{"ㄔㄥ", IS_CHEWING|IS_PINYIN, 40}, +{"ㄔㄨ", IS_CHEWING|IS_PINYIN, 44}, +{"ㄔㄨㄚ", IS_CHEWING, 45}, +{"ㄔㄨㄛ", IS_CHEWING|IS_PINYIN, 51}, +{"ㄔㄨㄞ", IS_CHEWING|IS_PINYIN, 46}, +{"ㄔㄨㄟ", IS_CHEWING|IS_PINYIN, 49}, +{"ㄔㄨㄢ", IS_CHEWING|IS_PINYIN, 47}, +{"ㄔㄨㄣ", IS_CHEWING|IS_PINYIN, 50}, +{"ㄔㄨㄤ", IS_CHEWING|IS_PINYIN, 48}, +{"ㄔㄨㄥ", IS_CHEWING|IS_PINYIN, 42}, +{"ㄕ", IS_CHEWING|IS_PINYIN, 322}, +{"ㄕㄚ", IS_CHEWING|IS_PINYIN, 313}, +{"ㄕㄜ", IS_CHEWING|IS_PINYIN, 318}, +{"ㄕㄞ", IS_CHEWING|IS_PINYIN, 314}, +{"ㄕㄟ", IS_CHEWING|IS_PINYIN, 319}, +{"ㄕㄠ", IS_CHEWING|IS_PINYIN, 317}, +{"ㄕㄡ", IS_CHEWING|IS_PINYIN, 323}, +{"ㄕㄢ", IS_CHEWING|IS_PINYIN, 315}, +{"ㄕㄣ", IS_CHEWING|IS_PINYIN, 320}, +{"ㄕㄤ", IS_CHEWING|IS_PINYIN, 316}, +{"ㄕㄥ", IS_CHEWING|IS_PINYIN, 321}, +{"ㄕㄨ", IS_CHEWING|IS_PINYIN, 324}, +{"ㄕㄨㄚ", IS_CHEWING|IS_PINYIN, 325}, +{"ㄕㄨㄛ", 
IS_CHEWING|IS_PINYIN, 331}, +{"ㄕㄨㄞ", IS_CHEWING|IS_PINYIN, 326}, +{"ㄕㄨㄟ", IS_CHEWING|IS_PINYIN, 329}, +{"ㄕㄨㄢ", IS_CHEWING|IS_PINYIN, 327}, +{"ㄕㄨㄣ", IS_CHEWING|IS_PINYIN, 330}, +{"ㄕㄨㄤ", IS_CHEWING|IS_PINYIN, 328}, +{"ㄖ", IS_CHEWING|IS_PINYIN, 294}, +{"ㄖㄜ", IS_CHEWING|IS_PINYIN, 291}, +{"ㄖㄠ", IS_CHEWING|IS_PINYIN, 290}, +{"ㄖㄡ", IS_CHEWING|IS_PINYIN, 296}, +{"ㄖㄢ", IS_CHEWING|IS_PINYIN, 288}, +{"ㄖㄣ", IS_CHEWING|IS_PINYIN, 292}, +{"ㄖㄤ", IS_CHEWING|IS_PINYIN, 289}, +{"ㄖㄥ", IS_CHEWING|IS_PINYIN, 293}, +{"ㄖㄨ", IS_CHEWING|IS_PINYIN, 297}, +{"ㄖㄨㄚ", IS_CHEWING, 298}, +{"ㄖㄨㄛ", IS_CHEWING|IS_PINYIN, 302}, +{"ㄖㄨㄟ", IS_CHEWING|IS_PINYIN, 300}, +{"ㄖㄨㄢ", IS_CHEWING|IS_PINYIN, 299}, +{"ㄖㄨㄣ", IS_CHEWING|IS_PINYIN, 301}, +{"ㄖㄨㄥ", IS_CHEWING|IS_PINYIN, 295}, +{"ㄗ", IS_CHEWING|IS_PINYIN, 433}, +{"ㄗㄚ", IS_CHEWING|IS_PINYIN, 403}, +{"ㄗㄜ", IS_CHEWING|IS_PINYIN, 408}, +{"ㄗㄞ", IS_CHEWING|IS_PINYIN, 404}, +{"ㄗㄟ", IS_CHEWING|IS_PINYIN, 409}, +{"ㄗㄠ", IS_CHEWING|IS_PINYIN, 407}, +{"ㄗㄡ", IS_CHEWING|IS_PINYIN, 435}, +{"ㄗㄢ", IS_CHEWING|IS_PINYIN, 405}, +{"ㄗㄣ", IS_CHEWING|IS_PINYIN, 410}, +{"ㄗㄤ", IS_CHEWING|IS_PINYIN, 406}, +{"ㄗㄥ", IS_CHEWING|IS_PINYIN, 411}, +{"ㄗㄨ", IS_CHEWING|IS_PINYIN, 436}, +{"ㄗㄨㄛ", IS_CHEWING|IS_PINYIN, 440}, +{"ㄗㄨㄟ", IS_CHEWING|IS_PINYIN, 438}, +{"ㄗㄨㄢ", IS_CHEWING|IS_PINYIN, 437}, +{"ㄗㄨㄣ", IS_CHEWING|IS_PINYIN, 439}, +{"ㄗㄨㄥ", IS_CHEWING|IS_PINYIN, 434}, +{"ㄘ", IS_CHEWING|IS_PINYIN, 52}, +{"ㄘㄚ", IS_CHEWING|IS_PINYIN, 24}, +{"ㄘㄜ", IS_CHEWING|IS_PINYIN, 29}, +{"ㄘㄞ", IS_CHEWING|IS_PINYIN, 25}, +{"ㄘㄠ", IS_CHEWING|IS_PINYIN, 28}, +{"ㄘㄡ", IS_CHEWING|IS_PINYIN, 54}, +{"ㄘㄢ", IS_CHEWING|IS_PINYIN, 26}, +{"ㄘㄣ", IS_CHEWING|IS_PINYIN, 30}, +{"ㄘㄤ", IS_CHEWING|IS_PINYIN, 27}, +{"ㄘㄥ", IS_CHEWING|IS_PINYIN, 31}, +{"ㄘㄨ", IS_CHEWING|IS_PINYIN, 55}, +{"ㄘㄨㄛ", IS_CHEWING|IS_PINYIN, 59}, +{"ㄘㄨㄟ", IS_CHEWING|IS_PINYIN, 57}, +{"ㄘㄨㄢ", IS_CHEWING|IS_PINYIN, 56}, +{"ㄘㄨㄣ", IS_CHEWING|IS_PINYIN, 58}, +{"ㄘㄨㄥ", IS_CHEWING|IS_PINYIN, 53}, +{"ㄙ", IS_CHEWING|IS_PINYIN, 332}, +{"ㄙㄚ", IS_CHEWING|IS_PINYIN, 304}, 
+{"ㄙㄜ", IS_CHEWING|IS_PINYIN, 309}, +{"ㄙㄞ", IS_CHEWING|IS_PINYIN, 305}, +{"ㄙㄠ", IS_CHEWING|IS_PINYIN, 308}, +{"ㄙㄡ", IS_CHEWING|IS_PINYIN, 334}, +{"ㄙㄢ", IS_CHEWING|IS_PINYIN, 306}, +{"ㄙㄣ", IS_CHEWING|IS_PINYIN, 310}, +{"ㄙㄤ", IS_CHEWING|IS_PINYIN, 307}, +{"ㄙㄥ", IS_CHEWING|IS_PINYIN, 311}, +{"ㄙㄨ", IS_CHEWING|IS_PINYIN, 335}, +{"ㄙㄨㄛ", IS_CHEWING|IS_PINYIN, 339}, +{"ㄙㄨㄟ", IS_CHEWING|IS_PINYIN, 337}, +{"ㄙㄨㄢ", IS_CHEWING|IS_PINYIN, 336}, +{"ㄙㄨㄣ", IS_CHEWING|IS_PINYIN, 338}, +{"ㄙㄨㄥ", IS_CHEWING|IS_PINYIN, 333}, +{"ㄚ", IS_CHEWING|IS_PINYIN, 1}, +{"ㄛ", IS_CHEWING|IS_PINYIN, 252}, +{"ㄜ", IS_CHEWING|IS_PINYIN, 85}, +{"ㄞ", IS_CHEWING|IS_PINYIN, 2}, +{"ㄟ", IS_CHEWING|IS_PINYIN, 86}, +{"ㄠ", IS_CHEWING|IS_PINYIN, 5}, +{"ㄡ", IS_CHEWING|IS_PINYIN, 253}, +{"ㄢ", IS_CHEWING|IS_PINYIN, 3}, +{"ㄣ", IS_CHEWING|IS_PINYIN, 87}, +{"ㄤ", IS_CHEWING|IS_PINYIN, 4}, +{"ㄥ", IS_CHEWING, 88}, +{"ㄦ", IS_CHEWING|IS_PINYIN, 89}, +{"ㄧ", IS_CHEWING|IS_PINYIN, 392}, +{"ㄧㄚ", IS_CHEWING|IS_PINYIN, 386}, +{"ㄧㄛ", IS_CHEWING|IS_PINYIN, 395}, +{"ㄧㄝ", IS_CHEWING|IS_PINYIN, 391}, +{"ㄧㄞ", IS_CHEWING, 387}, +{"ㄧㄠ", IS_CHEWING|IS_PINYIN, 390}, +{"ㄧㄡ", IS_CHEWING|IS_PINYIN, 397}, +{"ㄧㄢ", IS_CHEWING|IS_PINYIN, 388}, +{"ㄧㄣ", IS_CHEWING|IS_PINYIN, 393}, +{"ㄧㄤ", IS_CHEWING|IS_PINYIN, 389}, +{"ㄧㄥ", IS_CHEWING|IS_PINYIN, 394}, +{"ㄨ", IS_CHEWING|IS_PINYIN, 369}, +{"ㄨㄚ", IS_CHEWING|IS_PINYIN, 361}, +{"ㄨㄛ", IS_CHEWING|IS_PINYIN, 368}, +{"ㄨㄞ", IS_CHEWING|IS_PINYIN, 362}, +{"ㄨㄟ", IS_CHEWING|IS_PINYIN, 365}, +{"ㄨㄢ", IS_CHEWING|IS_PINYIN, 363}, +{"ㄨㄣ", IS_CHEWING|IS_PINYIN, 366}, +{"ㄨㄤ", IS_CHEWING|IS_PINYIN, 364}, +{"ㄨㄥ", IS_CHEWING|IS_PINYIN, 367}, +{"ㄩ", IS_CHEWING|IS_PINYIN, 398}, +{"ㄩㄝ", IS_CHEWING|IS_PINYIN, 400}, +{"ㄩㄢ", IS_CHEWING|IS_PINYIN, 399}, +{"ㄩㄣ", IS_CHEWING|IS_PINYIN, 401}, +{"ㄩㄥ", IS_CHEWING|IS_PINYIN, 396}, +{"ㄫ", IS_CHEWING|IS_PINYIN, 234} +}; + +const content_table_item_t content_table[] = { +{"", "", "", "", ChewingKey()}, +{"a", "", "a", "ㄚ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_A)}, 
+{"ai", "", "ai", "ㄞ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"an", "", "an", "ㄢ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"ang", "", "ang", "ㄤ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"ao", "", "ao", "ㄠ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"b", "b", "", "ㄅ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ba", "b", "a", "ㄅㄚ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"bai", "b", "ai", "ㄅㄞ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"ban", "b", "an", "ㄅㄢ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"bang", "b", "ang", "ㄅㄤ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"bao", "b", "ao", "ㄅㄠ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"bei", "b", "ei", "ㄅㄟ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"ben", "b", "en", "ㄅㄣ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"beng", "b", "eng", "ㄅㄥ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"bi", "b", "i", "ㄅㄧ", ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"bian", "b", "ian", "ㄅㄧㄢ", ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AN)}, +{"biao", "b", "iao", "ㄅㄧㄠ", ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AO)}, +{"bie", "b", "ie", "ㄅㄧㄝ", ChewingKey(CHEWING_B, CHEWING_I, CHEWING_E)}, +{"bin", "b", "in", "ㄅㄧㄣ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"bing", "b", "ing", "ㄅㄧㄥ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"bo", "b", "o", "ㄅㄛ", ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_O)}, +{"bu", "b", "u", "ㄅㄨ", ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"c", "c", "", "ㄘ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ca", "c", "a", "ㄘㄚ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"cai", "c", "ai", "ㄘㄞ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, 
CHEWING_AI)}, +{"can", "c", "an", "ㄘㄢ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"cang", "c", "ang", "ㄘㄤ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"cao", "c", "ao", "ㄘㄠ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"ce", "c", "e", "ㄘㄜ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"cen", "c", "en", "ㄘㄣ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"ceng", "c", "eng", "ㄘㄥ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"ch", "ch", "", "ㄔ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"cha", "ch", "a", "ㄔㄚ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"chai", "ch", "ai", "ㄔㄞ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"chan", "ch", "an", "ㄔㄢ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"chang", "ch", "ang", "ㄔㄤ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"chao", "ch", "ao", "ㄔㄠ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"che", "ch", "e", "ㄔㄜ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"chen", "ch", "en", "ㄔㄣ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"cheng", "ch", "eng", "ㄔㄥ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"chi", "ch", "i", "ㄔ", ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"chong", "ch", "ong", "ㄔㄨㄥ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"chou", "ch", "ou", "ㄔㄡ", ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"chu", "ch", "u", "ㄔㄨ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"chua", "ch", "ua", "ㄔㄨㄚ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_A)}, +{"chuai", "ch", "uai", "ㄔㄨㄞ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AI)}, +{"chuan", "ch", "uan", "ㄔㄨㄢ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AN)}, +{"chuang", "ch", "uang", "ㄔㄨㄤ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ANG)}, +{"chui", "ch", "ui", "ㄔㄨㄟ", ChewingKey(CHEWING_CH, 
CHEWING_U, CHEWING_EI)}, +{"chun", "ch", "un", "ㄔㄨㄣ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EN)}, +{"chuo", "ch", "uo", "ㄔㄨㄛ", ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_O)}, +{"ci", "c", "i", "ㄘ", ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"cong", "c", "ong", "ㄘㄨㄥ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"cou", "c", "ou", "ㄘㄡ", ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"cu", "c", "u", "ㄘㄨ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"cuan", "c", "uan", "ㄘㄨㄢ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_AN)}, +{"cui", "c", "ui", "ㄘㄨㄟ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_EI)}, +{"cun", "c", "un", "ㄘㄨㄣ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_EN)}, +{"cuo", "c", "uo", "ㄘㄨㄛ", ChewingKey(CHEWING_C, CHEWING_U, CHEWING_O)}, +{"d", "d", "", "ㄉ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"da", "d", "a", "ㄉㄚ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"dai", "d", "ai", "ㄉㄞ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"dan", "d", "an", "ㄉㄢ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"dang", "d", "ang", "ㄉㄤ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"dao", "d", "ao", "ㄉㄠ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"de", "d", "e", "ㄉㄜ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"dei", "d", "ei", "ㄉㄟ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"den", "d", "en", "ㄉㄣ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"deng", "d", "eng", "ㄉㄥ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"di", "d", "i", "ㄉㄧ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"dia", "d", "ia", "ㄉㄧㄚ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_A)}, +{"dian", "d", "ian", "ㄉㄧㄢ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AN)}, +{"diao", "d", "iao", "ㄉㄧㄠ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AO)}, +{"die", "d", "ie", "ㄉㄧㄝ", ChewingKey(CHEWING_D, CHEWING_I, 
CHEWING_E)}, +{"din", "d", "in", "ㄉㄧㄣ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"ding", "d", "ing", "ㄉㄧㄥ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"diu", "d", "iu", "ㄉㄧㄡ", ChewingKey(CHEWING_D, CHEWING_I, CHEWING_OU)}, +{"dong", "d", "ong", "ㄉㄨㄥ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"dou", "d", "ou", "ㄉㄡ", ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"du", "d", "u", "ㄉㄨ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"duan", "d", "uan", "ㄉㄨㄢ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_AN)}, +{"dui", "d", "ui", "ㄉㄨㄟ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_EI)}, +{"dun", "d", "un", "ㄉㄨㄣ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_EN)}, +{"duo", "d", "uo", "ㄉㄨㄛ", ChewingKey(CHEWING_D, CHEWING_U, CHEWING_O)}, +{"e", "", "e", "ㄜ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"ei", "", "ei", "ㄟ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"en", "", "en", "ㄣ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"eng", "", "eng", "ㄥ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"er", "", "er", "ㄦ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ER)}, +{"f", "f", "", "ㄈ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"fa", "f", "a", "ㄈㄚ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"fan", "f", "an", "ㄈㄢ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"fang", "f", "ang", "ㄈㄤ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"fe", "f", "e", "ㄈㄜ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"fei", "f", "ei", "ㄈㄟ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"fen", "f", "en", "ㄈㄣ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"feng", "f", "eng", "ㄈㄥ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"fo", "f", "o", "ㄈㄛ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, 
CHEWING_O)}, +{"fou", "f", "ou", "ㄈㄡ", ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"fu", "f", "u", "ㄈㄨ", ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"g", "g", "", "ㄍ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ga", "g", "a", "ㄍㄚ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"gai", "g", "ai", "ㄍㄞ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"gan", "g", "an", "ㄍㄢ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"gang", "g", "ang", "ㄍㄤ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"gao", "g", "ao", "ㄍㄠ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"ge", "g", "e", "ㄍㄜ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"gei", "g", "ei", "ㄍㄟ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"gen", "g", "en", "ㄍㄣ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"geng", "g", "eng", "ㄍㄥ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"gong", "g", "ong", "ㄍㄨㄥ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"gou", "g", "ou", "ㄍㄡ", ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"gu", "g", "u", "ㄍㄨ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"gua", "g", "ua", "ㄍㄨㄚ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_A)}, +{"guai", "g", "uai", "ㄍㄨㄞ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AI)}, +{"guan", "g", "uan", "ㄍㄨㄢ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AN)}, +{"guang", "g", "uang", "ㄍㄨㄤ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ANG)}, +{"gui", "g", "ui", "ㄍㄨㄟ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_EI)}, +{"gun", "g", "un", "ㄍㄨㄣ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_EN)}, +{"guo", "g", "uo", "ㄍㄨㄛ", ChewingKey(CHEWING_G, CHEWING_U, CHEWING_O)}, +{"h", "h", "", "ㄏ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ha", "h", "a", "ㄏㄚ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"hai", "h", "ai", "ㄏㄞ", ChewingKey(CHEWING_H, 
CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"han", "h", "an", "ㄏㄢ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"hang", "h", "ang", "ㄏㄤ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"hao", "h", "ao", "ㄏㄠ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"he", "h", "e", "ㄏㄜ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"hei", "h", "ei", "ㄏㄟ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"hen", "h", "en", "ㄏㄣ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"heng", "h", "eng", "ㄏㄥ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"hong", "h", "ong", "ㄏㄨㄥ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"hou", "h", "ou", "ㄏㄡ", ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"hu", "h", "u", "ㄏㄨ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"hua", "h", "ua", "ㄏㄨㄚ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_A)}, +{"huai", "h", "uai", "ㄏㄨㄞ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AI)}, +{"huan", "h", "uan", "ㄏㄨㄢ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AN)}, +{"huang", "h", "uang", "ㄏㄨㄤ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ANG)}, +{"hui", "h", "ui", "ㄏㄨㄟ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_EI)}, +{"hun", "h", "un", "ㄏㄨㄣ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_EN)}, +{"huo", "h", "uo", "ㄏㄨㄛ", ChewingKey(CHEWING_H, CHEWING_U, CHEWING_O)}, +{"j", "j", "", "ㄐ", ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ji", "j", "i", "ㄐㄧ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"jia", "j", "ia", "ㄐㄧㄚ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A)}, +{"jian", "j", "ian", "ㄐㄧㄢ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN)}, +{"jiang", "j", "iang", "ㄐㄧㄤ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ANG)}, +{"jiao", "j", "iao", "ㄐㄧㄠ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AO)}, +{"jie", "j", "ie", "ㄐㄧㄝ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_E)}, +{"jin", "j", "in", "ㄐㄧㄣ", ChewingKey(CHEWING_J, 
CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"jing", "j", "ing", "ㄐㄧㄥ", ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"jiong", "j", "iong", "ㄐㄩㄥ", ChewingKey(CHEWING_J, CHEWING_I, PINYIN_ONG)}, +{"jiu", "j", "iu", "ㄐㄧㄡ", ChewingKey(CHEWING_J, CHEWING_I, CHEWING_OU)}, +{"ju", "j", "u", "ㄐㄩ", ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ZERO_FINAL)}, +{"juan", "j", "uan", "ㄐㄩㄢ", ChewingKey(CHEWING_J, CHEWING_V, CHEWING_AN)}, +{"jue", "j", "ue", "ㄐㄩㄝ", ChewingKey(CHEWING_J, CHEWING_V, CHEWING_E)}, +{"jun", "j", "un", "ㄐㄩㄣ", ChewingKey(CHEWING_J, CHEWING_V, CHEWING_EN)}, +{"k", "k", "", "ㄎ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ka", "k", "a", "ㄎㄚ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"kai", "k", "ai", "ㄎㄞ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"kan", "k", "an", "ㄎㄢ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"kang", "k", "ang", "ㄎㄤ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"kao", "k", "ao", "ㄎㄠ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"ke", "k", "e", "ㄎㄜ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"kei", "k", "ei", "ㄎㄟ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"ken", "k", "en", "ㄎㄣ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"keng", "k", "eng", "ㄎㄥ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"kong", "k", "ong", "ㄎㄨㄥ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"kou", "k", "ou", "ㄎㄡ", ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"ku", "k", "u", "ㄎㄨ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"kua", "k", "ua", "ㄎㄨㄚ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_A)}, +{"kuai", "k", "uai", "ㄎㄨㄞ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AI)}, +{"kuan", "k", "uan", "ㄎㄨㄢ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AN)}, +{"kuang", "k", "uang", "ㄎㄨㄤ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ANG)}, +{"kui", "k", "ui", "ㄎㄨㄟ", 
ChewingKey(CHEWING_K, CHEWING_U, CHEWING_EI)}, +{"kun", "k", "un", "ㄎㄨㄣ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_EN)}, +{"kuo", "k", "uo", "ㄎㄨㄛ", ChewingKey(CHEWING_K, CHEWING_U, CHEWING_O)}, +{"l", "l", "", "ㄌ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"la", "l", "a", "ㄌㄚ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"lai", "l", "ai", "ㄌㄞ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"lan", "l", "an", "ㄌㄢ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"lang", "l", "ang", "ㄌㄤ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"lao", "l", "ao", "ㄌㄠ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"le", "l", "e", "ㄌㄜ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"lei", "l", "ei", "ㄌㄟ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"len", "l", "en", "ㄌㄣ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"leng", "l", "eng", "ㄌㄥ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"li", "l", "i", "ㄌㄧ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"lia", "l", "ia", "ㄌㄧㄚ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_A)}, +{"lian", "l", "ian", "ㄌㄧㄢ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AN)}, +{"liang", "l", "iang", "ㄌㄧㄤ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ANG)}, +{"liao", "l", "iao", "ㄌㄧㄠ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AO)}, +{"lie", "l", "ie", "ㄌㄧㄝ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_E)}, +{"lin", "l", "in", "ㄌㄧㄣ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"ling", "l", "ing", "ㄌㄧㄥ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"liu", "l", "iu", "ㄌㄧㄡ", ChewingKey(CHEWING_L, CHEWING_I, CHEWING_OU)}, +{"lo", "l", "o", "ㄌㄛ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_O)}, +{"long", "l", "ong", "ㄌㄨㄥ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"lou", "l", "ou", "ㄌㄡ", ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"lu", "l", "u", "ㄌㄨ", 
ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"luan", "l", "uan", "ㄌㄨㄢ", ChewingKey(CHEWING_L, CHEWING_U, CHEWING_AN)}, +{"lun", "l", "un", "ㄌㄨㄣ", ChewingKey(CHEWING_L, CHEWING_U, CHEWING_EN)}, +{"luo", "l", "uo", "ㄌㄨㄛ", ChewingKey(CHEWING_L, CHEWING_U, CHEWING_O)}, +{"lv", "l", "v", "ㄌㄩ", ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ZERO_FINAL)}, +{"lve", "l", "ve", "ㄌㄩㄝ", ChewingKey(CHEWING_L, CHEWING_V, CHEWING_E)}, +{"m", "m", "", "ㄇ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ma", "m", "a", "ㄇㄚ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"mai", "m", "ai", "ㄇㄞ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"man", "m", "an", "ㄇㄢ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"mang", "m", "ang", "ㄇㄤ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"mao", "m", "ao", "ㄇㄠ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"me", "m", "e", "ㄇㄜ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"mei", "m", "ei", "ㄇㄟ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"men", "m", "en", "ㄇㄣ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"meng", "m", "eng", "ㄇㄥ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"mi", "m", "i", "ㄇㄧ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"mian", "m", "ian", "ㄇㄧㄢ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AN)}, +{"miao", "m", "iao", "ㄇㄧㄠ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AO)}, +{"mie", "m", "ie", "ㄇㄧㄝ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_E)}, +{"min", "m", "in", "ㄇㄧㄣ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"ming", "m", "ing", "ㄇㄧㄥ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"miu", "m", "iu", "ㄇㄧㄡ", ChewingKey(CHEWING_M, CHEWING_I, CHEWING_OU)}, +{"mo", "m", "o", "ㄇㄛ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_O)}, +{"mou", "m", "ou", "ㄇㄡ", ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"mu", "m", "u", "ㄇㄨ", 
ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"n", "n", "", "ㄋ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"na", "n", "a", "ㄋㄚ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"nai", "n", "ai", "ㄋㄞ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"nan", "n", "an", "ㄋㄢ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"nang", "n", "ang", "ㄋㄤ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"nao", "n", "ao", "ㄋㄠ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"ne", "n", "e", "ㄋㄜ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"nei", "n", "ei", "ㄋㄟ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"nen", "n", "en", "ㄋㄣ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"neng", "n", "eng", "ㄋㄥ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"ng", "", "ng", "ㄫ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_NG)}, +{"ni", "n", "i", "ㄋㄧ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"nia", "n", "ia", "ㄋㄧㄚ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_A)}, +{"nian", "n", "ian", "ㄋㄧㄢ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AN)}, +{"niang", "n", "iang", "ㄋㄧㄤ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ANG)}, +{"niao", "n", "iao", "ㄋㄧㄠ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AO)}, +{"nie", "n", "ie", "ㄋㄧㄝ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_E)}, +{"nin", "n", "in", "ㄋㄧㄣ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"ning", "n", "ing", "ㄋㄧㄥ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"niu", "n", "iu", "ㄋㄧㄡ", ChewingKey(CHEWING_N, CHEWING_I, CHEWING_OU)}, +{"nong", "n", "ong", "ㄋㄨㄥ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"nou", "n", "ou", "ㄋㄡ", ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"nu", "n", "u", "ㄋㄨ", ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"nuan", "n", "uan", "ㄋㄨㄢ", ChewingKey(CHEWING_N, CHEWING_U, 
CHEWING_AN)}, +{"nun", "n", "un", "ㄋㄨㄣ", ChewingKey(CHEWING_N, CHEWING_U, CHEWING_EN)}, +{"nuo", "n", "uo", "ㄋㄨㄛ", ChewingKey(CHEWING_N, CHEWING_U, CHEWING_O)}, +{"nv", "n", "v", "ㄋㄩ", ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ZERO_FINAL)}, +{"nve", "n", "ve", "ㄋㄩㄝ", ChewingKey(CHEWING_N, CHEWING_V, CHEWING_E)}, +{"o", "", "o", "ㄛ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_O)}, +{"ou", "", "ou", "ㄡ", ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"p", "p", "", "ㄆ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"pa", "p", "a", "ㄆㄚ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"pai", "p", "ai", "ㄆㄞ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"pan", "p", "an", "ㄆㄢ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"pang", "p", "ang", "ㄆㄤ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"pao", "p", "ao", "ㄆㄠ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"pei", "p", "ei", "ㄆㄟ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"pen", "p", "en", "ㄆㄣ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"peng", "p", "eng", "ㄆㄥ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"pi", "p", "i", "ㄆㄧ", ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"pian", "p", "ian", "ㄆㄧㄢ", ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AN)}, +{"piao", "p", "iao", "ㄆㄧㄠ", ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AO)}, +{"pie", "p", "ie", "ㄆㄧㄝ", ChewingKey(CHEWING_P, CHEWING_I, CHEWING_E)}, +{"pin", "p", "in", "ㄆㄧㄣ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"ping", "p", "ing", "ㄆㄧㄥ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"po", "p", "o", "ㄆㄛ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_O)}, +{"pou", "p", "ou", "ㄆㄡ", ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"pu", "p", "u", "ㄆㄨ", ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"q", "q", "", "ㄑ", 
ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"qi", "q", "i", "ㄑㄧ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"qia", "q", "ia", "ㄑㄧㄚ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_A)}, +{"qian", "q", "ian", "ㄑㄧㄢ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AN)}, +{"qiang", "q", "iang", "ㄑㄧㄤ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ANG)}, +{"qiao", "q", "iao", "ㄑㄧㄠ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AO)}, +{"qie", "q", "ie", "ㄑㄧㄝ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_E)}, +{"qin", "q", "in", "ㄑㄧㄣ", ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"qing", "q", "ing", "ㄑㄧㄥ", ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"qiong", "q", "iong", "ㄑㄩㄥ", ChewingKey(CHEWING_Q, CHEWING_I, PINYIN_ONG)}, +{"qiu", "q", "iu", "ㄑㄧㄡ", ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_OU)}, +{"qu", "q", "u", "ㄑㄩ", ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ZERO_FINAL)}, +{"quan", "q", "uan", "ㄑㄩㄢ", ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_AN)}, +{"que", "q", "ue", "ㄑㄩㄝ", ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_E)}, +{"qun", "q", "un", "ㄑㄩㄣ", ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_EN)}, +{"r", "r", "", "ㄖ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ran", "r", "an", "ㄖㄢ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"rang", "r", "ang", "ㄖㄤ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"rao", "r", "ao", "ㄖㄠ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"re", "r", "e", "ㄖㄜ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"ren", "r", "en", "ㄖㄣ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"reng", "r", "eng", "ㄖㄥ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"ri", "r", "i", "ㄖ", ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"rong", "r", "ong", "ㄖㄨㄥ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"rou", "r", "ou", "ㄖㄡ", ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"ru", "r", "u", 
"ㄖㄨ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"rua", "r", "ua", "ㄖㄨㄚ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_A)}, +{"ruan", "r", "uan", "ㄖㄨㄢ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_AN)}, +{"rui", "r", "ui", "ㄖㄨㄟ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_EI)}, +{"run", "r", "un", "ㄖㄨㄣ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_EN)}, +{"ruo", "r", "uo", "ㄖㄨㄛ", ChewingKey(CHEWING_R, CHEWING_U, CHEWING_O)}, +{"s", "s", "", "ㄙ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"sa", "s", "a", "ㄙㄚ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"sai", "s", "ai", "ㄙㄞ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"san", "s", "an", "ㄙㄢ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"sang", "s", "ang", "ㄙㄤ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"sao", "s", "ao", "ㄙㄠ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"se", "s", "e", "ㄙㄜ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"sen", "s", "en", "ㄙㄣ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"seng", "s", "eng", "ㄙㄥ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"sh", "sh", "", "ㄕ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"sha", "sh", "a", "ㄕㄚ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"shai", "sh", "ai", "ㄕㄞ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"shan", "sh", "an", "ㄕㄢ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"shang", "sh", "ang", "ㄕㄤ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"shao", "sh", "ao", "ㄕㄠ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"she", "sh", "e", "ㄕㄜ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"shei", "sh", "ei", "ㄕㄟ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"shen", "sh", "en", "ㄕㄣ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"sheng", "sh", "eng", "ㄕㄥ", 
ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"shi", "sh", "i", "ㄕ", ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"shou", "sh", "ou", "ㄕㄡ", ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"shu", "sh", "u", "ㄕㄨ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"shua", "sh", "ua", "ㄕㄨㄚ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_A)}, +{"shuai", "sh", "uai", "ㄕㄨㄞ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AI)}, +{"shuan", "sh", "uan", "ㄕㄨㄢ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AN)}, +{"shuang", "sh", "uang", "ㄕㄨㄤ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ANG)}, +{"shui", "sh", "ui", "ㄕㄨㄟ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_EI)}, +{"shun", "sh", "un", "ㄕㄨㄣ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_EN)}, +{"shuo", "sh", "uo", "ㄕㄨㄛ", ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_O)}, +{"si", "s", "i", "ㄙ", ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"song", "s", "ong", "ㄙㄨㄥ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"sou", "s", "ou", "ㄙㄡ", ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"su", "s", "u", "ㄙㄨ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"suan", "s", "uan", "ㄙㄨㄢ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_AN)}, +{"sui", "s", "ui", "ㄙㄨㄟ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_EI)}, +{"sun", "s", "un", "ㄙㄨㄣ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_EN)}, +{"suo", "s", "uo", "ㄙㄨㄛ", ChewingKey(CHEWING_S, CHEWING_U, CHEWING_O)}, +{"t", "t", "", "ㄊ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ta", "t", "a", "ㄊㄚ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"tai", "t", "ai", "ㄊㄞ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"tan", "t", "an", "ㄊㄢ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"tang", "t", "ang", "ㄊㄤ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"tao", "t", "ao", "ㄊㄠ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"te", "t", "e", 
"ㄊㄜ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"teng", "t", "eng", "ㄊㄥ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"ti", "t", "i", "ㄊㄧ", ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"tian", "t", "ian", "ㄊㄧㄢ", ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AN)}, +{"tiao", "t", "iao", "ㄊㄧㄠ", ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AO)}, +{"tie", "t", "ie", "ㄊㄧㄝ", ChewingKey(CHEWING_T, CHEWING_I, CHEWING_E)}, +{"ting", "t", "ing", "ㄊㄧㄥ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"tong", "t", "ong", "ㄊㄨㄥ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"tou", "t", "ou", "ㄊㄡ", ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"tu", "t", "u", "ㄊㄨ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"tuan", "t", "uan", "ㄊㄨㄢ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_AN)}, +{"tui", "t", "ui", "ㄊㄨㄟ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_EI)}, +{"tun", "t", "un", "ㄊㄨㄣ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_EN)}, +{"tuo", "t", "uo", "ㄊㄨㄛ", ChewingKey(CHEWING_T, CHEWING_U, CHEWING_O)}, +{"w", "w", "", "PINYIN_W", ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"wa", "w", "a", "ㄨㄚ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_A)}, +{"wai", "w", "ai", "ㄨㄞ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AI)}, +{"wan", "w", "an", "ㄨㄢ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AN)}, +{"wang", "w", "ang", "ㄨㄤ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ANG)}, +{"wei", "w", "ei", "ㄨㄟ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_EI)}, +{"wen", "w", "en", "ㄨㄣ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_EN)}, +{"weng", "w", "eng", "ㄨㄥ", ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"wo", "w", "o", "ㄨㄛ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_O)}, +{"wu", "w", "u", "ㄨ", ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"x", "x", "", "ㄒ", ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"xi", "x", "i", "ㄒㄧ", ChewingKey(CHEWING_X, CHEWING_I, 
CHEWING_ZERO_FINAL)}, +{"xia", "x", "ia", "ㄒㄧㄚ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_A)}, +{"xian", "x", "ian", "ㄒㄧㄢ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AN)}, +{"xiang", "x", "iang", "ㄒㄧㄤ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ANG)}, +{"xiao", "x", "iao", "ㄒㄧㄠ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AO)}, +{"xie", "x", "ie", "ㄒㄧㄝ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_E)}, +{"xin", "x", "in", "ㄒㄧㄣ", ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"xing", "x", "ing", "ㄒㄧㄥ", ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"xiong", "x", "iong", "ㄒㄩㄥ", ChewingKey(CHEWING_X, CHEWING_I, PINYIN_ONG)}, +{"xiu", "x", "iu", "ㄒㄧㄡ", ChewingKey(CHEWING_X, CHEWING_I, CHEWING_OU)}, +{"xu", "x", "u", "ㄒㄩ", ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ZERO_FINAL)}, +{"xuan", "x", "uan", "ㄒㄩㄢ", ChewingKey(CHEWING_X, CHEWING_V, CHEWING_AN)}, +{"xue", "x", "ue", "ㄒㄩㄝ", ChewingKey(CHEWING_X, CHEWING_V, CHEWING_E)}, +{"xun", "x", "un", "ㄒㄩㄣ", ChewingKey(CHEWING_X, CHEWING_V, CHEWING_EN)}, +{"y", "y", "", "PINYIN_Y", ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"ya", "y", "a", "ㄧㄚ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_A)}, +{"yai", "y", "ai", "ㄧㄞ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AI)}, +{"yan", "y", "an", "ㄧㄢ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AN)}, +{"yang", "y", "ang", "ㄧㄤ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ANG)}, +{"yao", "y", "ao", "ㄧㄠ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AO)}, +{"ye", "y", "e", "ㄧㄝ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_E)}, +{"yi", "y", "i", "ㄧ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"yin", "y", "in", "ㄧㄣ", ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_IN)}, +{"ying", "y", "ing", "ㄧㄥ", ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_ING)}, +{"yo", "y", "o", "ㄧㄛ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_O)}, +{"yong", "y", "ong", "ㄩㄥ", ChewingKey(PINYIN_Y, CHEWING_I, PINYIN_ONG)}, +{"you", "y", "ou", "ㄧㄡ", ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_OU)}, 
+{"yu", "y", "u", "ㄩ", ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ZERO_FINAL)}, +{"yuan", "y", "uan", "ㄩㄢ", ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_AN)}, +{"yue", "y", "ue", "ㄩㄝ", ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_E)}, +{"yun", "y", "un", "ㄩㄣ", ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_EN)}, +{"z", "z", "", "ㄗ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"za", "z", "a", "ㄗㄚ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"zai", "z", "ai", "ㄗㄞ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"zan", "z", "an", "ㄗㄢ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"zang", "z", "ang", "ㄗㄤ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"zao", "z", "ao", "ㄗㄠ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"ze", "z", "e", "ㄗㄜ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"zei", "z", "ei", "ㄗㄟ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"zen", "z", "en", "ㄗㄣ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"zeng", "z", "eng", "ㄗㄥ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"zh", "zh", "", "ㄓ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL)}, +{"zha", "zh", "a", "ㄓㄚ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_A)}, +{"zhai", "zh", "ai", "ㄓㄞ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AI)}, +{"zhan", "zh", "an", "ㄓㄢ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AN)}, +{"zhang", "zh", "ang", "ㄓㄤ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ANG)}, +{"zhao", "zh", "ao", "ㄓㄠ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AO)}, +{"zhe", "zh", "e", "ㄓㄜ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_E)}, +{"zhei", "zh", "ei", "ㄓㄟ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_EI)}, +{"zhen", "zh", "en", "ㄓㄣ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_EN)}, +{"zheng", "zh", "eng", "ㄓㄥ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ENG)}, +{"zhi", 
"zh", "i", "ㄓ", ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"zhong", "zh", "ong", "ㄓㄨㄥ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"zhou", "zh", "ou", "ㄓㄡ", ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"zhu", "zh", "u", "ㄓㄨ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"zhua", "zh", "ua", "ㄓㄨㄚ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_A)}, +{"zhuai", "zh", "uai", "ㄓㄨㄞ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AI)}, +{"zhuan", "zh", "uan", "ㄓㄨㄢ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AN)}, +{"zhuang", "zh", "uang", "ㄓㄨㄤ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ANG)}, +{"zhui", "zh", "ui", "ㄓㄨㄟ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_EI)}, +{"zhun", "zh", "un", "ㄓㄨㄣ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_EN)}, +{"zhuo", "zh", "uo", "ㄓㄨㄛ", ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_O)}, +{"zi", "z", "i", "ㄗ", ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ZERO_FINAL)}, +{"zong", "z", "ong", "ㄗㄨㄥ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, PINYIN_ONG)}, +{"zou", "z", "ou", "ㄗㄡ", ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_OU)}, +{"zu", "z", "u", "ㄗㄨ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ZERO_FINAL)}, +{"zuan", "z", "uan", "ㄗㄨㄢ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_AN)}, +{"zui", "z", "ui", "ㄗㄨㄟ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_EI)}, +{"zun", "z", "un", "ㄗㄨㄣ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_EN)}, +{"zuo", "z", "uo", "ㄗㄨㄛ", ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_O)} +}; + +const divided_table_item_t divided_table[] = { +{"bian", 182478, {"bi", "an"}, 100}, +{"bie", 63919, {"bi", "e"}, 100}, +{"dian", 179799, {"di", "an"}, 100}, +{"jian", 435752, {"ji", "an"}, 200}, +{"jiang", 139834, {"ji", "ang"}, 100}, +{"jie", 294175, {"ji", "e"}, 100}, +{"jue", 119987, {"ju", "e"}, 100}, +{"kuai", 63367, {"ku", "ai"}, 100}, +{"lian", 130021, {"li", "an"}, 100}, +{"liang", 185438, {"li", "ang"}, 100}, +{"liao", 39355, {"li", "ao"}, 100}, +{"luan", 17609, {"lu", "an"}, 
100}, +{"qian", 195129, {"qi", "an"}, 100}, +{"qie", 70219, {"qi", "e"}, 100}, +{"shuan", 1114, {"shu", "an"}, 100}, +{"tian", 185905, {"ti", "an"}, 100}, +{"tuan", 17287, {"tu", "an"}, 100}, +{"xian", 280991, {"xi", "an"}, 300}, +{"yuan", 280423, {"yu", "an"}, 100}, +{"zuan", 4016, {"zu", "an"}, 100} +}; + +const resplit_table_item_t resplit_table[] = { +{{"a", "nan"}, 0, {"an", "an"}, 100}, +{{"an", "gang"}, 0, {"ang", "ang"}, 100}, +{{"ba", "nan"}, 0, {"ban", "an"}, 100}, +{{"ca", "nan"}, 0, {"can", "an"}, 100}, +{{"chan", "gan"}, 0, {"chang", "an"}, 100}, +{{"chan", "ge"}, 0, {"chang", "e"}, 100}, +{{"che", "nai"}, 0, {"chen", "ai"}, 100}, +{{"chen", "gan"}, 0, {"cheng", "an"}, 100}, +{{"chu", "nan"}, 100, {"chun", "an"}, 100}, +{{"dan", "gan"}, 0, {"dang", "an"}, 100}, +{{"e", "nai"}, 0, {"en", "ai"}, 100}, +{{"fa", "nan"}, 100, {"fan", "an"}, 100}, +{{"fan", "gai"}, 0, {"fang", "ai"}, 100}, +{{"fan", "gan"}, 100, {"fang", "an"}, 100}, +{{"fan", "ge"}, 0, {"fang", "e"}, 100}, +{{"ga", "nai"}, 0, {"gan", "ai"}, 100}, +{{"ga", "nen"}, 0, {"gan", "en"}, 100}, +{{"gan", "gao"}, 0, {"gang", "ao"}, 100}, +{{"guan", "gan"}, 100, {"guang", "an"}, 100}, +{{"hu", "nan"}, 100, {"hun", "an"}, 100}, +{{"huan", "gan"}, 0, {"huang", "an"}, 100}, +{{"ji", "ne"}, 0, {"jin", "e"}, 100}, +{{"ji", "nou"}, 0, {"jin", "ou"}, 100}, +{{"jia", "nai"}, 0, {"jian", "ai"}, 100}, +{{"jia", "nan"}, 100, {"jian", "an"}, 100}, +{{"jia", "ne"}, 0, {"jian", "e"}, 100}, +{{"jia", "nou"}, 0, {"jian", "ou"}, 100}, +{{"jian", "gan"}, 100, {"jiang", "an"}, 100}, +{{"jin", "gai"}, 0, {"jing", "ai"}, 100}, +{{"jin", "gan"}, 0, {"jing", "an"}, 100}, +{{"jin", "ge"}, 0, {"jing", "e"}, 100}, +{{"kuan", "gao"}, 0, {"kuang", "ao"}, 100}, +{{"li", "nan"}, 100, {"lin", "an"}, 100}, +{{"lia", "nai"}, 0, {"lian", "ai"}, 100}, +{{"lia", "ne"}, 0, {"lian", "e"}, 100}, +{{"lian", "gan"}, 0, {"liang", "an"}, 100}, +{{"ma", "ne"}, 0, {"man", "e"}, 100}, +{{"men", "gen"}, 0, {"meng", "en"}, 100}, +{{"min", "gan"}, 
100, {"ming", "an"}, 100}, +{{"min", "ge"}, 100, {"ming", "e"}, 100}, +{{"na", "nai"}, 0, {"nan", "ai"}, 100}, +{{"na", "nan"}, 0, {"nan", "an"}, 200}, +{{"na", "nao"}, 0, {"nan", "ao"}, 100}, +{{"na", "nou"}, 0, {"nan", "ou"}, 100}, +{{"nin", "gan"}, 0, {"ning", "an"}, 100}, +{{"pa", "nan"}, 0, {"pan", "an"}, 100}, +{{"pen", "gan"}, 0, {"peng", "an"}, 100}, +{{"pin", "gan"}, 0, {"ping", "an"}, 100}, +{{"qi", "nai"}, 0, {"qin", "ai"}, 100}, +{{"qi", "nan"}, 0, {"qin", "an"}, 100}, +{{"qia", "nan"}, 0, {"qian", "an"}, 200}, +{{"qia", "ne"}, 0, {"qian", "e"}, 100}, +{{"qin", "gai"}, 0, {"qing", "ai"}, 100}, +{{"qin", "gan"}, 0, {"qing", "an"}, 100}, +{{"re", "nai"}, 0, {"ren", "ai"}, 100}, +{{"re", "nan"}, 0, {"ren", "an"}, 100}, +{{"san", "gou"}, 0, {"sang", "ou"}, 100}, +{{"shan", "gan"}, 100, {"shang", "an"}, 100}, +{{"she", "nai"}, 0, {"shen", "ai"}, 100}, +{{"she", "nao"}, 0, {"shen", "ao"}, 200}, +{{"wa", "nan"}, 0, {"wan", "an"}, 200}, +{{"wa", "ne"}, 0, {"wan", "e"}, 100}, +{{"wa", "nou"}, 0, {"wan", "ou"}, 100}, +{{"wen", "gan"}, 0, {"weng", "an"}, 100}, +{{"xi", "nai"}, 200, {"xin", "ai"}, 100}, +{{"xi", "nan"}, 100, {"xin", "an"}, 100}, +{{"xia", "nai"}, 0, {"xian", "ai"}, 100}, +{{"xia", "nan"}, 0, {"xian", "an"}, 100}, +{{"xia", "ne"}, 0, {"xian", "e"}, 100}, +{{"xian", "gai"}, 0, {"xiang", "ai"}, 100}, +{{"xian", "gan"}, 200, {"xiang", "an"}, 100}, +{{"xian", "ge"}, 100, {"xiang", "e"}, 100}, +{{"xin", "gai"}, 0, {"xing", "ai"}, 100}, +{{"xin", "gan"}, 200, {"xing", "an"}, 200}, +{{"ya", "nan"}, 0, {"yan", "an"}, 200}, +{{"yi", "nan"}, 300, {"yin", "an"}, 100}, +{{"yi", "ne"}, 0, {"yin", "e"}, 100}, +{{"zhan", "gai"}, 0, {"zhang", "ai"}, 100}, +{{"zhe", "nai"}, 0, {"zhen", "ai"}, 200}, +{{"zhe", "nan"}, 0, {"zhen", "an"}, 100}, +{{"zhen", "gan"}, 100, {"zheng", "an"}, 100}, +{{"zhua", "nan"}, 0, {"zhuan", "an"}, 100} +}; + +const gint chewing_key_table[CHEWING_NUMBER_OF_INITIALS * + CHEWING_NUMBER_OF_MIDDLES * + CHEWING_NUMBER_OF_FINALS] = { +-1 /* 
ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +2 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +3 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +4 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +5 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +85 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +86 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +87 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +88 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +89 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +234 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +252 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +253 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_EI) */, +-1 /* 
ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_I, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, 
CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_ZERO_INITIAL, CHEWING_V, PINYIN_ING) */, +6 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +7 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +8 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +9 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +10 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +11 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +12 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +13 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +14 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +21 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, 
PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +19 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +20 /* ChewingKey(CHEWING_B, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +15 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AI) */, +16 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ANG) */, +17 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_AO) */, +18 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_I, PINYIN_ING) */, +22 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_B, 
CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_B, CHEWING_V, PINYIN_ING) */, +23 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +24 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +25 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +26 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +27 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +28 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +29 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +30 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +31 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, 
+-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +53 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +54 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +52 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_I, PINYIN_ING) */, +55 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_AI) */, +56 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, INVALID_EA) */, +57 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_EI) */, +58 /* 
ChewingKey(CHEWING_C, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_NG) */, +59 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_C, CHEWING_V, PINYIN_ING) */, +32 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +33 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +34 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +35 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +36 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +37 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +38 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* 
ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +39 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +40 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +42 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +43 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +41 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_I, PINYIN_ING) */, +44 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ZERO_FINAL) */, +45 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_A) */, +46 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AI) */, +47 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AN) */, +48 
/* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, INVALID_EA) */, +49 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EI) */, +50 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_NG) */, +51 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_CH, CHEWING_V, PINYIN_ING) */, +60 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +61 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +62 /* ChewingKey(CHEWING_D, 
CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +63 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +64 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +65 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +66 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +67 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +68 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +69 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +78 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +79 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +75 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +76 /* ChewingKey(CHEWING_D, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +70 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ZERO_FINAL) */, +71 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AI) */, +72 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ANG) */, +73 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_AO) */, +74 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, PINYIN_ONG) */, +77 /* ChewingKey(CHEWING_D, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_I, PINYIN_IN) */, +-1 /* 
ChewingKey(CHEWING_D, CHEWING_I, PINYIN_ING) */, +80 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_AI) */, +81 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, INVALID_EA) */, +82 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_EI) */, +83 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_NG) */, +84 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_D, 
CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_D, CHEWING_V, PINYIN_ING) */, +90 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +91 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +92 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +93 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +94 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +95 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +96 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +97 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +98 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +99 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, 
CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_I, PINYIN_ING) */, +100 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_ER) */, +-1 /* 
ChewingKey(CHEWING_F, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_F, CHEWING_V, PINYIN_ING) */, +121 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +122 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +123 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +124 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +125 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +126 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +127 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +128 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +129 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +130 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +131 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +132 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, INVALID_EA) */, +-1 /* 
ChewingKey(CHEWING_H, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_I, PINYIN_ING) */, +133 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ZERO_FINAL) */, +134 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_A) */, +135 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AI) */, +136 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AN) */, +137 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, INVALID_EA) */, +138 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_EI) */, +139 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_NG) */, +140 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_E) */, +-1 /* 
ChewingKey(CHEWING_H, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_H, CHEWING_V, PINYIN_ING) */, +101 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +102 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +103 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +104 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +105 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +106 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +107 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +108 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +109 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +110 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +111 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +112 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_AI) */, +-1 /* 
ChewingKey(CHEWING_G, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_I, PINYIN_ING) */, +113 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ZERO_FINAL) */, +114 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_A) */, +115 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AI) */, +116 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AN) */, +117 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, INVALID_EA) */, +118 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_EI) */, +119 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_NG) */, +120 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_A) */, +-1 /* 
ChewingKey(CHEWING_G, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_G, CHEWING_V, PINYIN_ING) */, +156 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +157 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +158 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +159 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +160 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +161 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +162 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +163 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +164 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +165 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +166 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +167 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_K, 
CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_I, PINYIN_ING) */, +168 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ZERO_FINAL) */, +169 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_A) */, +170 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AI) */, +171 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AN) */, +172 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, INVALID_EA) */, +173 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_EI) */, +174 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_NG) */, +175 /* ChewingKey(CHEWING_K, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_K, 
CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_K, CHEWING_V, PINYIN_ING) */, +141 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_J, 
CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +148 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +149 /* ChewingKey(CHEWING_J, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +142 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ZERO_FINAL) */, +143 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AI) */, +144 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AN) */, +145 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ANG) */, +146 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_AO) */, +147 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_O) */, +150 /* ChewingKey(CHEWING_J, CHEWING_I, PINYIN_ONG) */, +151 /* ChewingKey(CHEWING_J, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_I, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, 
CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_U, PINYIN_ING) */, +152 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_AI) */, +153 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_AO) */, +154 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_EI) */, +155 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_J, CHEWING_V, PINYIN_ING) */, +204 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +205 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +206 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +207 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +208 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +209 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +210 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +211 /* ChewingKey(CHEWING_M, 
CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +212 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +213 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +221 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +222 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +218 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +219 /* ChewingKey(CHEWING_M, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +214 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AI) */, +215 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ANG) */, +216 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_AO) */, +217 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, PINYIN_ONG) */, +220 /* ChewingKey(CHEWING_M, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_I, PINYIN_ING) */, +223 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_AO) */, 
+-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_M, CHEWING_V, PINYIN_ING) */, +224 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +225 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +226 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +227 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +228 /* ChewingKey(CHEWING_N, 
CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +229 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +230 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +231 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +232 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +233 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +244 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +245 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +241 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +242 /* ChewingKey(CHEWING_N, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +235 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ZERO_FINAL) */, +236 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AI) */, +237 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AN) */, +238 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ANG) */, +239 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_AO) */, +240 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, PINYIN_ONG) */, +243 /* ChewingKey(CHEWING_N, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_I, PINYIN_ING) */, +246 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* 
ChewingKey(CHEWING_N, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_AI) */, +247 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_EI) */, +248 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_NG) */, +249 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_U, PINYIN_ING) */, +250 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_AO) */, +251 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_N, CHEWING_V, PINYIN_ING) */, +176 /* ChewingKey(CHEWING_L, 
CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +177 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +178 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +179 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +180 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +181 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +182 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +183 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +184 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +185 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +195 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +196 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +197 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +192 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +193 /* ChewingKey(CHEWING_L, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +186 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ZERO_FINAL) */, +187 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AI) */, +188 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AN) */, +189 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ANG) */, +190 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_AO) */, +191 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_O) */, +-1 /* 
ChewingKey(CHEWING_L, CHEWING_I, PINYIN_ONG) */, +194 /* ChewingKey(CHEWING_L, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_I, PINYIN_ING) */, +198 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_AI) */, +199 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_EI) */, +200 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_NG) */, +201 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_U, PINYIN_ING) */, +202 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_AO) */, +203 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_NG) */, +-1 /* 
ChewingKey(CHEWING_L, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_L, CHEWING_V, PINYIN_ING) */, +287 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +288 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +289 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +290 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +291 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +292 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +293 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +295 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +296 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +294 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_EI) */, +-1 /* 
ChewingKey(CHEWING_R, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_I, PINYIN_ING) */, +297 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ZERO_FINAL) */, +298 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_AI) */, +299 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, INVALID_EA) */, +300 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_EI) */, +301 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_NG) */, +302 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_R, 
CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_R, CHEWING_V, PINYIN_ING) */, +254 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +255 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +256 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +257 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +258 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +259 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +260 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +261 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +262 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +269 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +270 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +267 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +268 /* ChewingKey(CHEWING_P, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +263 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AI) */, +264 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_P, 
CHEWING_I, CHEWING_ANG) */, +265 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_AO) */, +266 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_I, PINYIN_ING) */, +271 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_AN) 
*/, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_P, CHEWING_V, PINYIN_ING) */, +272 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +279 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +280 /* ChewingKey(CHEWING_Q, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +273 /* 
ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ZERO_FINAL) */, +274 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AI) */, +275 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AN) */, +276 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ANG) */, +277 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_AO) */, +278 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_O) */, +281 /* ChewingKey(CHEWING_Q, CHEWING_I, PINYIN_ONG) */, +282 /* ChewingKey(CHEWING_Q, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_I, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_U, PINYIN_IN) */, +-1 /* 
ChewingKey(CHEWING_Q, CHEWING_U, PINYIN_ING) */, +283 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_AI) */, +284 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_AO) */, +285 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_EI) */, +286 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_Q, CHEWING_V, PINYIN_ING) */, +303 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +304 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +305 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +306 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +307 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +308 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +309 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +310 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +311 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, 
CHEWING_O) */, +333 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +334 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +332 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_I, PINYIN_ING) */, +335 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_AI) */, +336 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, INVALID_EA) */, +337 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_EI) */, +338 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_ER) */, +-1 /* 
ChewingKey(CHEWING_S, CHEWING_U, CHEWING_NG) */, +339 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_S, CHEWING_V, PINYIN_ING) */, +312 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +313 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +314 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +315 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +316 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +317 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +318 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +319 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +320 /* ChewingKey(CHEWING_SH, 
CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +321 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +323 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +322 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_I, PINYIN_ING) */, +324 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ZERO_FINAL) */, +325 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_A) */, +326 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AI) */, +327 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AN) */, +328 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_SH, 
CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, INVALID_EA) */, +329 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_EI) */, +330 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_NG) */, +331 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_SH, CHEWING_V, PINYIN_ING) */, +340 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +341 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +342 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +343 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +344 /* ChewingKey(CHEWING_T, 
CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +345 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +346 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +347 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +353 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +354 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +352 /* ChewingKey(CHEWING_T, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +348 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AI) */, +349 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ANG) */, +350 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_AO) */, +351 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_I, PINYIN_ING) */, +355 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* 
ChewingKey(CHEWING_T, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_AI) */, +356 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, INVALID_EA) */, +357 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_EI) */, +358 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_NG) */, +359 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_T, CHEWING_V, PINYIN_ING) */, +360 /* ChewingKey(PINYIN_W, 
CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +367 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, PINYIN_ONG) */, +-1 /* 
ChewingKey(PINYIN_W, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_I, PINYIN_ING) */, +369 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ZERO_FINAL) */, +361 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_A) */, +362 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AI) */, +363 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AN) */, +364 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, INVALID_EA) */, +365 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_EI) */, +366 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_NG) */, +368 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, PINYIN_ONG) */, +-1 /* 
ChewingKey(PINYIN_W, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(PINYIN_W, CHEWING_V, PINYIN_ING) */, +370 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +377 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +378 /* ChewingKey(CHEWING_X, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +371 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ZERO_FINAL) */, +372 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AI) */, +373 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AN) */, +374 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ANG) */, +375 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_AO) */, +376 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ENG) */, +-1 /* 
ChewingKey(CHEWING_X, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_O) */, +379 /* ChewingKey(CHEWING_X, CHEWING_I, PINYIN_ONG) */, +380 /* ChewingKey(CHEWING_X, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_I, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_U, PINYIN_ING) */, +381 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_AI) */, +382 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_AO) */, +383 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_EI) */, +384 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_X, 
CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_X, CHEWING_V, PINYIN_ING) */, +385 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +393 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +394 /* ChewingKey(PINYIN_Y, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +392 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ZERO_FINAL) */, +386 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_A) */, +387 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AI) */, +388 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AN) */, +389 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ANG) */, +390 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_AO) */, +391 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_E) */, +-1 
/* ChewingKey(PINYIN_Y, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_NG) */, +395 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_O) */, +396 /* ChewingKey(PINYIN_Y, CHEWING_I, PINYIN_ONG) */, +397 /* ChewingKey(PINYIN_Y, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_I, PINYIN_ING) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_AI) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, INVALID_EA) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_EI) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_NG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_U, PINYIN_ING) */, +398 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_AI) */, +399 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_AO) */, +400 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_E) */, +-1 
/* ChewingKey(PINYIN_Y, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_EI) */, +401 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(PINYIN_Y, CHEWING_V, PINYIN_ING) */, +402 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +403 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +404 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +405 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +406 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +407 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +408 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +409 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +410 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +411 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +434 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +435 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +433 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_AI) */, +-1 /* 
ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_I, PINYIN_ING) */, +436 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_AI) */, +437 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, INVALID_EA) */, +438 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_EI) */, +439 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_NG) */, +440 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_Z, 
CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_Z, CHEWING_V, PINYIN_ING) */, +412 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ZERO_FINAL) */, +413 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_A) */, +414 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AI) */, +415 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AN) */, +416 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ANG) */, +417 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_AO) */, +418 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, INVALID_EA) */, +419 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_EI) */, +420 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_EN) */, +421 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_O) */, +423 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, PINYIN_ONG) */, +424 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_ZH, 
CHEWING_ZERO_MIDDLE, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_ZERO_MIDDLE, PINYIN_ING) */, +422 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_I, PINYIN_ING) */, +425 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ZERO_FINAL) */, +426 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_A) */, +427 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AI) */, +428 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AN) */, +429 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, INVALID_EA) */, +430 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_EI) */, +431 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_NG) */, +432 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, 
PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_U, PINYIN_ING) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_ZERO_FINAL) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_A) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_AI) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_AN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_ANG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_AO) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_E) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, INVALID_EA) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_EI) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_EN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_ENG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_ER) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_NG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_O) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, PINYIN_ONG) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, CHEWING_OU) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, PINYIN_IN) */, +-1 /* ChewingKey(CHEWING_ZH, CHEWING_V, PINYIN_ING) */ +}; + +}; + +#endif diff --git a/src/storage/pinyin_phrase2.h b/src/storage/pinyin_phrase2.h new file mode 100644 index 0000000..ba2f32e --- /dev/null +++ b/src/storage/pinyin_phrase2.h @@ -0,0 +1,267 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef PINYIN_PHRASE2_H +#define PINYIN_PHRASE2_H + +#include "novel_types.h" +#include "chewing_key.h" +#include "pinyin_custom2.h" +#include "pinyin_parser2.h" + +namespace pinyin{ + +inline int pinyin_exact_compare2(const ChewingKey * key_lhs, + const ChewingKey * key_rhs, + int phrase_length){ + int i; + int result; + + /* compare initial */ + for (i = 0; i < phrase_length; ++i) { + result = key_lhs[i].m_initial - key_rhs[i].m_initial; + if (0 != result) + return result; + } + + /* compare middle and final */ + for (i = 0; i < phrase_length; ++i) { + result = key_lhs[i].m_middle - key_rhs[i].m_middle; + if (0 != result) + return result; + result = key_lhs[i].m_final - key_rhs[i].m_final; + if (0 != result) + return result; + } + + /* compare tone */ + for (i = 0; i < phrase_length; ++i) { + result = key_lhs[i].m_tone - key_rhs[i].m_tone; + if (0 != result) + return result; + } + + return 0; +} + + +inline int pinyin_compare_with_ambiguities2(pinyin_option_t options, + const ChewingKey * key_lhs, + const ChewingKey * key_rhs, + int phrase_length){ + int i; + int result; + + /* compare initial */ + for (i = 0; i < phrase_length; ++i) { + result = pinyin_compare_initial2 + (options, + (ChewingInitial)key_lhs[i].m_initial, + (ChewingInitial)key_rhs[i].m_initial); + if (0 != result) + return result; + } + + /* compare middle and final */ + for (i = 0; i < phrase_length; ++i) { + result = pinyin_compare_middle_and_final2 + (options, + (ChewingMiddle)key_lhs[i].m_middle, + (ChewingMiddle)key_rhs[i].m_middle, + (ChewingFinal) key_lhs[i].m_final, + (ChewingFinal) key_rhs[i].m_final); + if (0 != result) + return result; + } + + /* compare tone */ + for (i = 0; i < phrase_length; ++i) { + 
result = pinyin_compare_tone2 + (options, + (ChewingTone)key_lhs[i].m_tone, + (ChewingTone)key_rhs[i].m_tone); + if (0 != result) + return result; + } + + return 0; +} + +/* compute pinyin lower bound */ +inline void compute_lower_value2(pinyin_option_t options, + const ChewingKey * in_keys, + ChewingKey * out_keys, + int phrase_length) { + ChewingKey aKey; + + for (int i = 0; i < phrase_length; ++i) { + int k; int sel; + aKey = in_keys[i]; + + /* compute lower initial */ + sel = aKey.m_initial; + for (k = aKey.m_initial - 1; k >= CHEWING_ZERO_INITIAL; --k) { + if (0 != pinyin_compare_initial2 + (options, (ChewingInitial)aKey.m_initial, (ChewingInitial)k)) + break; + else + sel = k; + } + aKey.m_initial = (ChewingInitial)sel; + + /* compute lower middle, skipped as no fuzzy pinyin here. + * if needed in future, still use pinyin_compare_middle_and_final2 + * to check lower bound. + */ + + /* as chewing zero middle is the first item, and its value is zero, + * no need to adjust it for incomplete pinyin. 
+ */ + + /* compute lower final */ + sel = aKey.m_final; + for (k = aKey.m_final - 1; k >= CHEWING_ZERO_FINAL; --k) { + if (0 != pinyin_compare_middle_and_final2 + (options, + (ChewingMiddle)aKey.m_middle, (ChewingMiddle) aKey.m_middle, + (ChewingFinal)aKey.m_final, (ChewingFinal)k)) + break; + else + sel = k; + } + aKey.m_final = (ChewingFinal)sel; + + /* compute lower tone */ + sel = aKey.m_tone; + for (k = aKey.m_tone - 1; k >= CHEWING_ZERO_TONE; --k) { + if (0 != pinyin_compare_tone2 + (options, (ChewingTone)aKey.m_tone, (ChewingTone)k)) + break; + else + sel = k; + } + aKey.m_tone = (ChewingTone)sel; + + /* save the result */ + out_keys[i] = aKey; + } +} + +/* compute pinyin upper bound */ +inline void compute_upper_value2(pinyin_option_t options, + const ChewingKey * in_keys, + ChewingKey * out_keys, + int phrase_length) { + ChewingKey aKey; + + for (int i = 0; i < phrase_length; ++i) { + int k; int sel; + aKey = in_keys[i]; + + /* compute upper initial */ + sel = aKey.m_initial; + for (k = aKey.m_initial + 1; k <= CHEWING_LAST_INITIAL; ++k) { + if (0 != pinyin_compare_initial2 + (options, (ChewingInitial)aKey.m_initial, (ChewingInitial)k)) + break; + else + sel = k; + } + aKey.m_initial = (ChewingInitial)sel; + + /* adjust it for incomplete pinyin. 
*/ + + /* compute upper middle */ + sel = aKey.m_middle; + for (k = aKey.m_middle + 1; k <= CHEWING_LAST_MIDDLE; ++k) { + if (0 != pinyin_compare_middle_and_final2 + (options, + (ChewingMiddle)aKey.m_middle, (ChewingMiddle)k, + (ChewingFinal)aKey.m_final, (ChewingFinal)aKey.m_final)) + break; + else + sel = k; + } + aKey.m_middle = (ChewingMiddle)sel; + + /* compute upper final */ + sel = aKey.m_final; + for (k = aKey.m_final + 1; k <= CHEWING_LAST_FINAL; ++k) { + if (0 != pinyin_compare_middle_and_final2 + (options, + (ChewingMiddle)aKey.m_middle, (ChewingMiddle)aKey.m_middle, + (ChewingFinal)aKey.m_final, (ChewingFinal)k)) + break; + else + sel = k; + } + aKey.m_final = (ChewingFinal)sel; + + /* compute upper tone */ + sel = aKey.m_tone; + for (k = aKey.m_tone + 1; k <= CHEWING_LAST_TONE; ++k) { + if (0 != pinyin_compare_tone2 + (options, (ChewingTone)aKey.m_tone, (ChewingTone)k)) + break; + else + sel = k; + } + aKey.m_tone = (ChewingTone)sel; + + /* save the result */ + out_keys[i] = aKey; + } +} + + +template<size_t phrase_length> +struct PinyinIndexItem2{ + phrase_token_t m_token; + ChewingKey m_keys[phrase_length]; +public: + PinyinIndexItem2<phrase_length> (const ChewingKey * keys, + phrase_token_t token) { + memmove(m_keys, keys, sizeof(ChewingKey) * phrase_length); + m_token = token; + } +}; + + +/* for find the element in the phrase array */ +template<size_t phrase_length> +inline int phrase_exact_compare2(const PinyinIndexItem2<phrase_length> &lhs, + const PinyinIndexItem2<phrase_length> &rhs) +{ + ChewingKey * keys_lhs = (ChewingKey *) lhs.m_keys; + ChewingKey * keys_rhs = (ChewingKey *) rhs.m_keys; + return pinyin_exact_compare2(keys_lhs, keys_rhs, phrase_length); +} + +template<size_t phrase_length> +inline bool phrase_exact_less_than2(const PinyinIndexItem2<phrase_length> &lhs, + const PinyinIndexItem2<phrase_length> &rhs) +{ + return 0 > phrase_exact_compare2<phrase_length>(lhs, rhs); +} + +}; + +#endif diff --git a/src/storage/table_info.cpp 
b/src/storage/table_info.cpp new file mode 100644 index 0000000..795d93d --- /dev/null +++ b/src/storage/table_info.cpp @@ -0,0 +1,272 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2013 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "table_info.h" +#include <stdio.h> +#include <assert.h> +#include <string.h> + +using namespace pinyin; + + +static const pinyin_table_info_t reserved_tables[] = { + {RESERVED, NULL, NULL, NULL, NOT_USED}, + {GB_DICTIONARY, "gb_char.table", "gb_char.bin", "gb_char.dbin", SYSTEM_FILE}, + {GBK_DICTIONARY, "gbk_char.table", "gbk_char.bin", "gbk_char.dbin", SYSTEM_FILE}, + + {MERGED_DICTIONARY, "merged.table", "merged.bin", "merged.dbin", SYSTEM_FILE}, + + {USER_DICTIONARY, NULL, NULL, "user.bin", USER_FILE} +}; + + +SystemTableInfo::SystemTableInfo() { + m_binary_format_version = 0; + m_model_data_version = 0; + m_lambda = 0.; + + size_t i; + for (i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + pinyin_table_info_t * table_info = &m_table_info[i]; + + table_info->m_dict_index = i; + table_info->m_table_filename = NULL; + table_info->m_system_filename = NULL; + table_info->m_user_filename = NULL; + table_info->m_file_type = NOT_USED; + } +} + +SystemTableInfo::~SystemTableInfo() { + reset(); 
+} + +void SystemTableInfo::reset() { + m_binary_format_version = 0; + m_model_data_version = 0; + m_lambda = 0.; + + size_t i; + for (i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + pinyin_table_info_t * table_info = &m_table_info[i]; + + g_free((gchar *)table_info->m_table_filename); + table_info->m_table_filename = NULL; + g_free((gchar *)table_info->m_system_filename); + table_info->m_system_filename = NULL; + g_free((gchar *)table_info->m_user_filename); + table_info->m_user_filename = NULL; + + table_info->m_file_type = NOT_USED; + } +} + +void SystemTableInfo::postfix_tables() { + size_t i; + for (i = 0; i < G_N_ELEMENTS(reserved_tables); ++i) { + const pinyin_table_info_t * postfix = &reserved_tables[i]; + + guint8 index = postfix->m_dict_index; + pinyin_table_info_t * table_info = &m_table_info[index]; + assert(table_info->m_dict_index == index); + + table_info->m_table_filename = g_strdup(postfix->m_table_filename); + table_info->m_system_filename = g_strdup(postfix->m_system_filename); + table_info->m_user_filename = g_strdup(postfix->m_user_filename); + table_info->m_file_type = postfix->m_file_type; + } +} + +static gchar * to_string(const char * str) { + if (0 == strcmp(str, "NULL")) + return NULL; + + return g_strdup(str); +} + +static PHRASE_FILE_TYPE to_file_type(const char * str) { +#define HANDLE(x) { \ + if (0 == strcmp(str, #x)) \ + return x; \ + } + + HANDLE(NOT_USED); + HANDLE(SYSTEM_FILE); + HANDLE(DICTIONARY); + HANDLE(USER_FILE); + + assert(false); + +#undef HANDLE +} + +bool SystemTableInfo::load(const char * filename) { + reset(); + + FILE * input = fopen(filename, "r"); + if (NULL == input) { + fprintf(stderr, "open %s failed.\n", filename); + return false; + } + + int binver = 0, modelver = 0; + gfloat lambda = 0.; + + int num = fscanf(input, "binary format version:%d\n", &binver); + if (1 != num) { + fclose(input); + return false; + } + + num = fscanf(input, "model data version:%d\n", &modelver); + if (1 != num) { + fclose(input); + 
return false; + } + + num = fscanf(input, "lambda parameter:%f\n", &lambda); + if (1 != num) { + fclose(input); + return false; + } + +#if 0 + printf("binver:%d modelver:%d lambda:%f\n", binver, modelver, lambda); +#endif + + m_binary_format_version = binver; + m_model_data_version = modelver; + m_lambda = lambda; + + int index = 0; + char tablefile[256], sysfile[256], userfile[256], filetype[256]; + while (!feof(input)) { + num = fscanf(input, "%d %s %s %s %s\n", + &index, tablefile, sysfile, userfile, filetype); + + if (5 != num) + continue; + + if (!(0 <= index && index < PHRASE_INDEX_LIBRARY_COUNT)) + continue; + + /* save into m_table_info. */ + pinyin_table_info_t * table_info = &m_table_info[index]; + assert(index == table_info->m_dict_index); + + table_info->m_table_filename = to_string(tablefile); + table_info->m_system_filename = to_string(sysfile); + table_info->m_user_filename = to_string(userfile); + + table_info->m_file_type = to_file_type(filetype); + } + + fclose(input); + + /* postfix reserved tables. 
*/ + postfix_tables(); + return true; +} + +const pinyin_table_info_t * SystemTableInfo::get_table_info() { + return m_table_info; +} + +gfloat SystemTableInfo::get_lambda() { + return m_lambda; +} + + +UserTableInfo::UserTableInfo() { + m_binary_format_version = 0; + m_model_data_version = 0; +} + +void UserTableInfo::reset() { + m_binary_format_version = 0; + m_model_data_version = 0; +} + +bool UserTableInfo::load(const char * filename) { + reset(); + + FILE * input = fopen(filename, "r"); + if (NULL == input) { + fprintf(stderr, "open %s failed.", filename); + return false; + } + + int binver = 0, modelver = 0; + + int num = fscanf(input, "binary format version:%d\n", &binver); + if (1 != num) { + fclose(input); + return false; + } + + num = fscanf(input, "model data version:%d\n", &modelver); + if (1 != num) { + fclose(input); + return false; + } + +#if 0 + printf("binver:%d modelver:%d\n", binver, modelver); +#endif + + m_binary_format_version = binver; + m_model_data_version = modelver; + + fclose(input); + + return true; +} + +bool UserTableInfo::save(const char * filename) { + FILE * output = fopen(filename, "w"); + if (NULL == output) { + fprintf(stderr, "write %s failed.\n", filename); + return false; + } + + fprintf(output, "binary format version:%d\n", m_binary_format_version); + fprintf(output, "model data version:%d\n", m_model_data_version); + + fclose(output); + + return true; +} + +bool UserTableInfo::is_conform(const SystemTableInfo * sysinfo) { + if (sysinfo->m_binary_format_version != m_binary_format_version) + return false; + + if (sysinfo->m_model_data_version != m_model_data_version) + return false; + + return true; +} + +bool UserTableInfo::make_conform(const SystemTableInfo * sysinfo) { + m_binary_format_version = sysinfo->m_binary_format_version; + m_model_data_version = sysinfo->m_model_data_version; + return true; +} diff --git a/src/storage/table_info.h b/src/storage/table_info.h new file mode 100644 index 0000000..8d7fa05 --- 
/dev/null +++ b/src/storage/table_info.h @@ -0,0 +1,97 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2013 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef TABLE_INFO_H +#define TABLE_INFO_H + +#include "novel_types.h" + + +namespace pinyin{ + +typedef enum { + NOT_USED, /* not used. */ + SYSTEM_FILE, /* system phrase file. */ + DICTIONARY, /* professional dictionary. */ + USER_FILE, /* user only phrase file. */ +} PHRASE_FILE_TYPE; + +typedef struct { + guint8 m_dict_index; /* for assert purpose. 
*/ + const gchar * m_table_filename; + const gchar * m_system_filename; + const gchar * m_user_filename; + PHRASE_FILE_TYPE m_file_type; +} pinyin_table_info_t; + + +class UserTableInfo; + +class SystemTableInfo{ + friend class UserTableInfo; +private: + int m_binary_format_version; + int m_model_data_version; + gfloat m_lambda; + + pinyin_table_info_t m_table_info[PHRASE_INDEX_LIBRARY_COUNT]; + +private: + void reset(); + + void postfix_tables(); + +public: + SystemTableInfo(); + + ~SystemTableInfo(); + + bool load(const char * filename); + + const pinyin_table_info_t * get_table_info(); + + gfloat get_lambda(); +}; + +class UserTableInfo{ +private: + int m_binary_format_version; + int m_model_data_version; + +private: + void reset(); + +public: + UserTableInfo(); + + bool load(const char * filename); + + bool save(const char * filename); + + bool is_conform(const SystemTableInfo * sysinfo); + + bool make_conform(const SystemTableInfo * sysinfo); +}; + +}; + + +#endif diff --git a/src/storage/tag_utility.cpp b/src/storage/tag_utility.cpp new file mode 100644 index 0000000..081e931 --- /dev/null +++ b/src/storage/tag_utility.cpp @@ -0,0 +1,420 @@ +#include <glib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "novel_types.h" +#include "phrase_index.h" +#include "phrase_large_table2.h" +#include "tag_utility.h" + +namespace pinyin{ + +/* internal taglib structure */ +struct tag_entry{ + int m_line_type; + char * m_line_tag; + int m_num_of_values; + char ** m_required_tags; + /* char ** m_optional_tags; */ + /* int m_optional_count = 0; */ + char ** m_ignored_tags; +}; + +tag_entry tag_entry_copy(int line_type, const char * line_tag, + int num_of_values, + char * required_tags[], + char * ignored_tags[]){ + tag_entry entry; + entry.m_line_type = line_type; + entry.m_line_tag = g_strdup( line_tag ); + entry.m_num_of_values = num_of_values; + entry.m_required_tags = g_strdupv( required_tags ); + entry.m_ignored_tags = g_strdupv( ignored_tags 
); + return entry; +} + +tag_entry tag_entry_clone(tag_entry * entry){ + return tag_entry_copy(entry->m_line_type, entry->m_line_tag, + entry->m_num_of_values, + entry->m_required_tags, entry->m_ignored_tags); +} + +void tag_entry_reclaim(tag_entry * entry){ + g_free( entry->m_line_tag ); + g_strfreev( entry->m_required_tags ); + g_strfreev(entry->m_ignored_tags); +} + +static bool taglib_free_tag_array(GArray * tag_array){ + for ( size_t i = 0; i < tag_array->len; ++i) { + tag_entry * entry = &g_array_index(tag_array, tag_entry, i); + tag_entry_reclaim(entry); + } + g_array_free(tag_array, TRUE); + return true; +} + +/* special unichar to be handled in split_line. */ +static gunichar backslash = 0; +static gunichar quote = 0; + +static gboolean split_line_init(){ + backslash = g_utf8_get_char("\\"); + quote = g_utf8_get_char("\""); + return TRUE; +} + +/* Pointer Array of Array of tag_entry */ +static GPtrArray * g_tagutils_stack = NULL; + +bool taglib_init(){ + assert( g_tagutils_stack == NULL); + g_tagutils_stack = g_ptr_array_new(); + GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry)); + g_ptr_array_add(g_tagutils_stack, tag_array); + + /* init split_line. */ + split_line_init(); + return true; +} + +bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, + const char * required_tags, const char * ignored_tags){ + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, + g_tagutils_stack->len - 1); + + /* some duplicate tagname or line_type check here. 
*/ + for ( size_t i = 0; i < tag_array->len; ++i) { + tag_entry * entry = &g_array_index(tag_array, tag_entry, i); + if ( entry->m_line_type == line_type || + strcmp( entry->m_line_tag, line_tag ) == 0 ) + return false; + } + + char ** required = g_strsplit_set(required_tags, ",:", -1); + char ** ignored = g_strsplit_set(ignored_tags, ",:", -1); + + tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values, + required, ignored); + g_array_append_val(tag_array, entry); + + g_strfreev(required); + g_strfreev(ignored); + return true; +} + +static void ptr_array_entry_free(gpointer data, gpointer user_data){ + g_free(data); +} + +static gboolean hash_table_key_value_free(gpointer key, gpointer value, + gpointer user_data){ + g_free(key); + g_free(value); + return TRUE; +} + +/* split the line into tokens. */ +static gchar ** split_line(const gchar * line){ + /* array for tokens. */ + GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *)); + + for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){ + gunichar unichar = g_utf8_get_char(cur); + const gchar * begin = cur; + gchar * token = NULL; + + if ( g_unichar_isspace (unichar) ) { + continue; + }else if ( unichar == quote ) { + /* handles "\"". */ + /* skip the first '"'. */ + begin = cur = g_utf8_next_char(cur); + while (*cur) { + unichar = g_utf8_get_char(cur); + if ( unichar == backslash ) { + cur = g_utf8_next_char(cur); + g_return_val_if_fail(*cur, NULL); + } else if ( unichar == quote ){ + break; + } + cur = g_utf8_next_char(cur); + } + gchar * tmp = g_strndup( begin, cur - begin); + /* TODO: switch to own strdup_escape implementation + for \"->" transforming. */ + token = g_strdup_printf("%s", tmp); + g_free(tmp); + } else { + /* handles other tokens. */ + while(*cur) { + unichar = g_utf8_get_char(cur); + if ( g_unichar_isgraph(unichar) ) { + /* next unichar */ + cur = g_utf8_next_char(cur); + } else { + /* space and other characters handles. 
*/ + break; + } + } + token = g_strndup( begin, cur - begin ); + } + + g_array_append_val(tokens, token); + if ( !*cur ) + break; + } + + return (gchar **)g_array_free(tokens, FALSE); +} + +bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, + GHashTable * required){ + /* reset values and required. */ + g_ptr_array_foreach(values, ptr_array_entry_free, NULL); + g_ptr_array_set_size(values, 0); + g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL); + + /* use own version of split_line + instead of g_strsplit_set for special token.*/ + char ** tokens = split_line(input_line); + int num_of_tokens = g_strv_length(tokens); + + char * line_tag = tokens[0]; + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); + + tag_entry * cur_entry = NULL; + /* find line type. */ + for ( size_t i = 0; i < tag_array->len; ++i) { + tag_entry * entry = &g_array_index(tag_array, tag_entry, i); + if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) { + cur_entry = entry; + break; + } + } + + if ( !cur_entry ) + return false; + + line_type = cur_entry->m_line_type; + + for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) { + g_return_val_if_fail(i < num_of_tokens, false); + char * value = g_strdup( tokens[i] ); + g_ptr_array_add(values, value); + } + + int ignored_len = g_strv_length( cur_entry->m_ignored_tags ); + int required_len = g_strv_length( cur_entry->m_required_tags); + + for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){ + g_return_val_if_fail(i < num_of_tokens, false); + const char * tmp = tokens[i]; + + /* check ignored tags. */ + bool tag_ignored = false; + for ( int m = 0; m < ignored_len; ++m) { + if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) { + tag_ignored = true; + break; + } + } + + if ( tag_ignored ) { + ++i; + continue; + } + + /* check required tags. 
*/ + bool tag_required = false; + for ( int m = 0; m < required_len; ++m) { + if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) { + tag_required = true; + break; + } + } + + /* warning on the un-expected tags. */ + if ( !tag_required ) { + g_warning("un-expected tags:%s.\n", tmp); + ++i; + continue; + } + + char * key = g_strdup(tokens[i]); + ++i; + g_return_val_if_fail(i < num_of_tokens, false); + char * value = g_strdup(tokens[i]); + g_hash_table_insert(required, key, value); + } + + /* check for all required tags. */ + for ( int i = 0; i < required_len; ++i) { + const char * required_tag_str = cur_entry->m_required_tags[i]; + gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL); + if ( !result ) { + g_warning("missed required tags: %s.\n", required_tag_str); + g_strfreev(tokens); + return false; + } + } + + g_strfreev(tokens); + return true; +} + +bool taglib_remove_tag(int line_type){ + /* Note: duplicate entry check is in taglib_add_tag. */ + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); + for ( size_t i = 0; i < tag_array->len; ++i) { + tag_entry * entry = &g_array_index(tag_array, tag_entry, i); + if (entry->m_line_type != line_type) + continue; + tag_entry_reclaim(entry); + g_array_remove_index(tag_array, i); + return true; + } + return false; +} + +bool taglib_push_state(){ + assert(g_tagutils_stack->len >= 1); + GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry)); + GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); + for ( size_t i = 0; i < prev_tag_array->len; ++i) { + tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i); + tag_entry new_entry = tag_entry_clone(entry); + g_array_append_val(next_tag_array, new_entry); + } + g_ptr_array_add(g_tagutils_stack, next_tag_array); + return true; +} + +bool taglib_pop_state(){ + assert(g_tagutils_stack->len > 1); + GArray * tag_array = (GArray *) 
g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); + g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1); + taglib_free_tag_array(tag_array); + return true; +} + +bool taglib_fini(){ + for ( size_t i = 0; i < g_tagutils_stack->len; ++i){ + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i); + taglib_free_tag_array(tag_array); + } + g_ptr_array_free(g_tagutils_stack, TRUE); + g_tagutils_stack = NULL; + return true; +} + +#if 0 + +static phrase_token_t taglib_special_string_to_token(const char * string){ + struct token_pair{ + phrase_token_t token; + const char * string; + }; + + static const token_pair tokens [] = { + {sentence_start, "<start>"}, + {0, NULL} + }; + + const token_pair * pair = tokens; + while (pair->string) { + if ( strcmp(string, pair->string ) == 0 ) + return pair->token; + pair++; + } + + fprintf(stderr, "error: unknown token:%s.\n", string); + return 0; +} + +phrase_token_t taglib_string_to_token(PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + const char * string){ + phrase_token_t token = null_token; + if ( string[0] == '<' ) { + return taglib_special_string_to_token(string); + } + + glong phrase_len = g_utf8_strlen(string, -1); + ucs4_t * phrase = g_utf8_to_ucs4(string, -1, NULL, NULL, NULL); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + int result = phrase_table->search(phrase_len, phrase, tokens); + int num = get_first_token(tokens, token); + phrase_index->destroy_tokens(tokens); + + if ( !(result & SEARCH_OK) ) + fprintf(stderr, "error: unknown token:%s.\n", string); + + g_free(phrase); + return token; +} + +#endif + +static const char * taglib_special_token_to_string(phrase_token_t token){ + struct token_pair{ + phrase_token_t token; + const char * string; + }; + + static const token_pair tokens [] = { + {sentence_start, "<start>"}, + {0, NULL} + }; + + const token_pair * pair = tokens; + while 
(pair->token) { + if ( token == pair->token ) + return pair->string; + pair++; + } + + fprintf(stderr, "error: unknown token:%d.\n", token); + return NULL; +} + +char * taglib_token_to_string(FacadePhraseIndex * phrase_index, + phrase_token_t token) { + PhraseItem item; + ucs4_t buffer[MAX_PHRASE_LENGTH]; + + gchar * phrase; + /* deal with the special phrase index, for "<start>..." */ + if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) { + return g_strdup(taglib_special_token_to_string(token)); + } + + int result = phrase_index->get_phrase_item(token, item); + if (result != ERROR_OK) { + fprintf(stderr, "error: unknown token:%d.\n", token); + return NULL; + } + + item.get_phrase_string(buffer); + guint8 length = item.get_phrase_length(); + phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + return phrase; +} + +bool taglib_validate_token_with_string(FacadePhraseIndex * phrase_index, + phrase_token_t token, + const char * string){ + bool result = false; + + char * str = taglib_token_to_string(phrase_index, token); + result = (0 == strcmp(str, string)); + g_free(str); + + return result; +} + + +}; diff --git a/src/storage/tag_utility.h b/src/storage/tag_utility.h new file mode 100644 index 0000000..ceb1d6c --- /dev/null +++ b/src/storage/tag_utility.h @@ -0,0 +1,151 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef TAG_UTILITY_H +#define TAG_UTILITY_H + +#include "novel_types.h" + +/* Note: the optional tag has been removed from the first implementation. + * Maybe the optional tag will be added back later. + */ + +namespace pinyin{ + +/** + * taglib_init: + * @returns: whether the initialize operation is successful. + * + * Initialize the n-gram tag parse library. + * + */ +bool taglib_init(); + +/** + * taglib_add_tag: + * @line_type: the line type. + * @line_tag: the line tag. + * @num_of_values: the number of values following the line tag. + * @required_tags: the required tags of the line. + * @ignored_tags: the ignored tags of the line. + * @returns: whether the add operation is successful. + * + * Add one line tag to the tag parse library. + * + * Note: the required and ignored tags are separated by ',' or ':' . + * + */ +bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags); + +/** + * taglib_read: + * @input_line: one input line. + * @line_type: the line type. + * @values: the values following the line tag. + * @required: the required tags of the line type. + * @returns: whether the line is parsed ok. + * + * Parse one input line into line_type, values and required tags. + * + * Note: most parameters are hash table of string (const char *). + * + */ +bool taglib_read(const char * input_line, int & line_type, + GPtrArray * values, GHashTable * required); + +/** + * taglib_remove_tag: + * @line_type: the type of the line tag. + * @returns: whether the remove operation is successful. + * + * Remove one line tag. + * + */ +bool taglib_remove_tag(int line_type); + +/** + * taglib_push_state: + * @returns: whether the push operation is successful. 
+ * + * Push the current state onto the stack. + * + * Note: the taglib_push/pop_state functions are used to save + * the current known tag list in stack. + * Used when the parsing context is changed. + */ +bool taglib_push_state(); + +/** + * taglib_pop_state: + * @returns: whether the pop operation is successful. + * + * Pop the current state off the stack. + * + */ +bool taglib_pop_state(); + +/** + * taglib_fini: + * @returns: whether the finish operation is successful. + * + * Finish the n-gram tag parse library. + * + */ +bool taglib_fini(); + +class PhraseLargeTable2; +class FacadePhraseIndex; + + +/** + * taglib_token_to_string: + * @phrase_index: the phrase index for phrase string lookup. + * @token: the phrase token. + * @returns: the phrase string found in phrase index. + * + * Translate one token into the phrase string. + * + */ +char * taglib_token_to_string(FacadePhraseIndex * phrase_index, + phrase_token_t token); + +/** + * taglib_validate_token_with_string: + * @phrase_index: the phrase index. + * @token: the phrase token. + * @string: the phrase string. + * @returns: whether the token is validated with the phrase string. + * + * Validate the token with the phrase string. + * + */ +bool taglib_validate_token_with_string(FacadePhraseIndex * phrase_index, + phrase_token_t token, + const char * string); + +/* Note: the following function is only available when the optional tag exists. + bool taglib_report_status(int line_type); */ + +/* Note: taglib_write is omited, as printf is more suitable for this. 
*/ + +}; + +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..5783407 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,33 @@ +add_subdirectory(include) +add_subdirectory(storage) +add_subdirectory(lookup) + +add_executable( + test_pinyin + test_pinyin.cpp +) + +target_link_libraries( + test_pinyin + libpinyin +) + +add_executable( + test_phrase + test_phrase.cpp +) + +target_link_libraries( + test_phrase + libpinyin +) + +add_executable( + test_chewing + test_chewing.cpp +) + +target_link_libraries( + test_chewing + libpinyin +) diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..8208214 --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,50 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +AUTOMAKE_OPTIONS = gnu +SUBDIRS = include storage lookup + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I $(ac_aux_dir) + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + @GLIB2_CFLAGS@ + +noinst_HEADERS = timer.h \ + tests_helper.h + +noinst_PROGRAMS = test_pinyin \ + test_phrase \ + test_chewing + +test_pinyin_SOURCES = test_pinyin.cpp + +test_pinyin_LDADD = ../src/libpinyin.la @GLIB2_LIBS@ + +test_phrase_SOURCES = test_phrase.cpp + +test_phrase_LDADD = ../src/libpinyin.la @GLIB2_LIBS@ + +test_chewing_SOURCES = test_chewing.cpp + +test_chewing_LDADD = ../src/libpinyin.la @GLIB2_LIBS@ diff --git a/tests/include/CMakeLists.txt b/tests/include/CMakeLists.txt new file mode 100644 index 0000000..f51c87e --- /dev/null +++ b/tests/include/CMakeLists.txt @@ -0,0 +1,9 @@ +add_executable( + test_memory_chunk + test_memory_chunk.cpp +) + +target_link_libraries( + test_memory_chunk + libpinyin +)
\ No newline at end of file diff --git a/tests/include/Makefile.am b/tests/include/Makefile.am new file mode 100644 index 0000000..7174bec --- /dev/null +++ b/tests/include/Makefile.am @@ -0,0 +1,31 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + @GLIB2_CFLAGS@ + +TESTS = test_memory_chunk + +noinst_PROGRAMS = test_memory_chunk + +test_memory_chunk_SOURCES = test_memory_chunk.cpp + +test_memory_chunk_LDADD = @GLIB2_LIBS@ + diff --git a/tests/include/test_memory_chunk.cpp b/tests/include/test_memory_chunk.cpp new file mode 100644 index 0000000..9779c8f --- /dev/null +++ b/tests/include/test_memory_chunk.cpp @@ -0,0 +1,64 @@ +#include <stdio.h> +#include "pinyin_internal.h" + +//Test Memory Chunk Functionality +int main(int argc, char * argv[]){ + MemoryChunk* chunk; + chunk = new MemoryChunk(); + int i = 12; + chunk->set_content(0, &i, sizeof(int)); + + int * p = (int *)chunk->begin(); + assert(chunk->size() == sizeof(int)); + printf("%d\n", *p); + printf("%ld\n", chunk->capacity()); + + p = & i; + chunk->set_chunk(p, sizeof(int), NULL); + short t = 5; + chunk->set_content(sizeof(int), &t, sizeof(short)); + assert( sizeof(int) + sizeof(short) == chunk->size()); + printf("%ld\n", chunk->capacity()); + + p = (int *)chunk->begin(); + short * p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); + printf("%d\t%d\n", *p, *p2); + + chunk->set_content(sizeof(int) + sizeof(short), &t, sizeof(short)); + + assert( sizeof(int) + (sizeof(short) << 1) == chunk->size()); + printf("%ld\n", chunk->capacity()); + p = (int *)chunk->begin(); + p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); + printf("%d\t%d\t%d\n", *p, *p2, *(p2 + 1)); + + chunk->set_size(sizeof(int) + sizeof(short) *3); + p = (int *)chunk->begin(); + p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); + + chunk->set_content(0, &i, sizeof(int)); + + *(p2+2) = 3; + printf("%d\t%d\t%d\t%d\n", *p, *p2, *(p2 + 1), *(p2+2)); + + int m = 10; + chunk->set_chunk(&m, sizeof(int), NULL); + int n = 12; + chunk->insert_content(sizeof(int), &n, sizeof(int)); + n = 11; + chunk->insert_content(sizeof(int), &n, sizeof(int)); 
+ + int * p3 = (int *)chunk->begin(); + printf("%d\t%d\t%d\n", *p3, *(p3+1), *(p3+2)); + + chunk->remove_content(sizeof(int), sizeof(int)); + printf("%d\t%d\n", *p3, *(p3+1)); + + int tmp; + assert(chunk->get_content(sizeof(int), &tmp, sizeof(int))); + printf("%d\n", tmp); + + delete chunk; + + return 0; +} diff --git a/tests/lookup/CMakeLists.txt b/tests/lookup/CMakeLists.txt new file mode 100644 index 0000000..3304c47 --- /dev/null +++ b/tests/lookup/CMakeLists.txt @@ -0,0 +1,21 @@ +include_directories(..) + +add_executable( + test_pinyin_lookup + test_pinyin_lookup.cpp +) + +target_link_libraries( + test_pinyin_lookup + libpinyin +) + +add_executable( + test_phrase_lookup + test_phrase_lookup.cpp +) + +target_link_libraries( + test_phrase_lookup + libpinyin +) diff --git a/tests/lookup/Makefile.am b/tests/lookup/Makefile.am new file mode 100644 index 0000000..4bcc176 --- /dev/null +++ b/tests/lookup/Makefile.am @@ -0,0 +1,34 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/tests \ + @GLIB2_CFLAGS@ + +noinst_PROGRAMS = test_pinyin_lookup \ + test_phrase_lookup + +test_pinyin_lookup_SOURCES = test_pinyin_lookup.cpp + +test_pinyin_lookup_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_phrase_lookup_SOURCES = test_phrase_lookup.cpp + +test_phrase_lookup_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
\ No newline at end of file diff --git a/tests/lookup/test_phrase_lookup.cpp b/tests/lookup/test_phrase_lookup.cpp new file mode 100644 index 0000000..c7bfd19 --- /dev/null +++ b/tests/lookup/test_phrase_lookup.cpp @@ -0,0 +1,118 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include <stdio.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "tests_helper.h" + + +bool try_phrase_lookup(PhraseLookup * phrase_lookup, + ucs4_t * ucs4_str, glong ucs4_len){ + char * result_string = NULL; + MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + phrase_lookup->get_best_match(ucs4_len, ucs4_str, results); +#if 0 + for ( size_t i = 0; i < results->len; ++i) { + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if ( *token == null_token ) + continue; + printf("%d:%d\t", i, *token); + } + printf("\n"); +#endif + phrase_lookup->convert_to_utf8(results, result_string); + if (result_string) + printf("%s\n", result_string); + else + fprintf(stderr, "Error: Un-segmentable sentence encountered!\n"); + g_array_free(results, TRUE); + g_free(result_string); + return true; +} + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* init phrase table */ + FacadePhraseTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/phrase_index.bin"); + phrase_table.load(chunk, NULL); + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + /* init bi-gram */ + Bigram system_bigram; + system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); + Bigram user_bigram; + + gfloat lambda = system_table_info.get_lambda(); + + /* init phrase lookup */ + PhraseLookup phrase_lookup(lambda, + &phrase_table, &phrase_index, + &system_bigram, &user_bigram); + + /* try one sentence */ + char * linebuf = NULL; + size_t size = 0; + ssize_t read; + while( (read = getline(&linebuf, &size, stdin)) != -1 ){ + if ( '\n' == 
linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + /* check non-ucs4 characters */ + const glong num_of_chars = g_utf8_strlen(linebuf, -1); + glong len = 0; + ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); + if ( len != num_of_chars ) { + fprintf(stderr, "non-ucs4 characters are not accepted.\n"); + g_free(sentence); + continue; + } + + try_phrase_lookup(&phrase_lookup, sentence, len); + g_free(sentence); + } + + free(linebuf); + return 0; +} diff --git a/tests/lookup/test_pinyin_lookup.cpp b/tests/lookup/test_pinyin_lookup.cpp new file mode 100644 index 0000000..3175db0 --- /dev/null +++ b/tests/lookup/test_pinyin_lookup.cpp @@ -0,0 +1,126 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include "timer.h" +#include <string.h> +#include "pinyin_internal.h" +#include "tests_helper.h" + +size_t bench_times = 100; + +int main( int argc, char * argv[]){ + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + pinyin_option_t options = + USE_TONE | USE_RESPLIT_TABLE | PINYIN_CORRECT_ALL | PINYIN_AMB_ALL; + FacadeChewingTable largetable; + + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/pinyin_index.bin"); + largetable.load(options, chunk, NULL); + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + FacadePhraseIndex phrase_index; + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram system_bigram; + system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); + Bigram user_bigram; + user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); + + gfloat lambda = system_table_info.get_lambda(); + + PinyinLookup2 pinyin_lookup(lambda, options, + &largetable, &phrase_index, + &system_bigram, &user_bigram); + + /* prepare the prefixes for get_best_match. 
*/ + TokenVector prefixes = g_array_new + (FALSE, FALSE, sizeof(phrase_token_t)); + g_array_append_val(prefixes, sentence_start); + + CandidateConstraints constraints = g_array_new + (TRUE, FALSE, sizeof(lookup_constraint_t)); + + MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + char* linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, stdin)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + FullPinyinParser2 parser; + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); + + if ( 0 == keys->len ) /* invalid pinyin */ + continue; + + /* initialize constraints. */ + g_array_set_size(constraints, keys->len); + for ( size_t i = 0; i < constraints->len; ++i){ + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + guint32 start_time = record_time(); + for ( size_t i = 0; i < bench_times; ++i) + pinyin_lookup.get_best_match(prefixes, keys, constraints, results); + print_time(start_time, bench_times); + for ( size_t i = 0; i < results->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if ( null_token == *token) + continue; + printf("pos:%ld,token:%d\t", i, *token); + } + printf("\n"); + char * sentence = NULL; + pinyin_lookup.convert_to_utf8(results, sentence); + printf("%s\n", sentence); + + g_array_free(keys, TRUE); + g_array_free(key_rests, TRUE); + g_free(sentence); + } + + g_array_free(prefixes, TRUE); + g_array_free(constraints, TRUE); + g_array_free(results, TRUE); + + free(linebuf); + return 0; +} diff --git a/tests/storage/CMakeLists.txt b/tests/storage/CMakeLists.txt new file mode 100644 index 0000000..96b12fc 
--- /dev/null +++ b/tests/storage/CMakeLists.txt @@ -0,0 +1,71 @@ +include_directories(..) + +add_executable( + test_parser2 + test_parser2.cpp +) + +target_link_libraries( + test_parser2 + libpinyin +) + +add_executable( + test_chewing_table + test_chewing_table.cpp +) + +target_link_libraries( + test_chewing_table + libpinyin +) + +add_executable( + test_phrase_index + test_phrase_index.cpp +) + +target_link_libraries( + test_phrase_index + libpinyin +) + +add_executable( + test_phrase_index_logger + test_phrase_index_logger.cpp +) + +target_link_libraries( + test_phrase_index_logger + libpinyin +) + +add_executable( + test_phrase_table + test_phrase_table.cpp +) + +target_link_libraries( + test_phrase_table + libpinyin +) + +add_executable( + test_ngram + test_ngram.cpp +) + +target_link_libraries( + test_ngram + libpinyin +) + +add_executable( + test_flexible_ngram + test_flexible_ngram.cpp +) + +target_link_libraries( + test_flexible_ngram + libpinyin +) diff --git a/tests/storage/Makefile.am b/tests/storage/Makefile.am new file mode 100644 index 0000000..b7ed8b6 --- /dev/null +++ b/tests/storage/Makefile.am @@ -0,0 +1,71 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/tests \ + @GLIB2_CFLAGS@ + +TESTS = test_phrase_index_logger \ + test_ngram \ + test_flexible_ngram + +noinst_PROGRAMS = test_phrase_index \ + test_phrase_index_logger \ + test_phrase_table \ + test_ngram \ + test_flexible_ngram \ + test_parser2 \ + test_chewing_table \ + test_table_info + + +test_phrase_index_SOURCES = test_phrase_index.cpp + +test_phrase_index_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_phrase_index_logger_SOURCES = test_phrase_index_logger.cpp + +test_phrase_index_logger_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +test_phrase_table_SOURCES = test_phrase_table.cpp + +test_phrase_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_ngram_SOURCES = test_ngram.cpp + +test_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_flexible_ngram_SOURCES = test_flexible_ngram.cpp + +test_flexible_ngram_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +test_parser2_SOURCES = test_parser2.cpp + +test_parser2_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_chewing_table_SOURCES = test_chewing_table.cpp + +test_chewing_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +test_table_info_SOURCES = test_table_info.cpp + +test_table_info_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/tests/storage/test_chewing_table.cpp b/tests/storage/test_chewing_table.cpp new file mode 100644 index 0000000..f3d0f5d --- /dev/null +++ b/tests/storage/test_chewing_table.cpp @@ -0,0 +1,148 @@ +/* + * libpinyin + * Library to deal with pinyin. 
 *
 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include "timer.h"
#include <string.h>
#include "pinyin_internal.h"
#include "tests_helper.h"

/* Number of repeated searches per input line when timing the table. */
size_t bench_times = 1000;

/* Interactive benchmark/dump tool for ChewingLargeTable: for each stdin
 * line, parses it as full pinyin, times bench_times searches, then
 * prints every phrase (with pronunciations and frequencies) in the
 * matched ranges.  Stops at EOF or the literal line "quit". */
int main(int argc, char * argv[]) {
    SystemTableInfo system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE;
    ChewingLargeTable largetable(options);
    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    if (!load_phrase_table(phrase_files, &largetable, NULL, &phrase_index))
        exit(ENOENT);

    /* round-trip the table through a MemoryChunk (store then load);
       presumably load() takes ownership of new_chunk — TODO confirm. */
    MemoryChunk * new_chunk = new MemoryChunk;
    largetable.store(new_chunk);
    largetable.load(new_chunk);

    char* linebuf = NULL; size_t size = 0; ssize_t read;
    while ((read = getline(&linebuf, &size, stdin)) != -1) {
        /* strip the trailing newline left by getline. */
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        FullPinyinParser2 parser;
        ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
        ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));

        parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));
        if (0 == keys->len) {
            fprintf(stderr, "Invalid input.\n");
            continue;
        }

        /* timed section: repeat the search bench_times times. */
        guint32 start = record_time();
        PhraseIndexRanges ranges;
        memset(ranges, 0, sizeof(PhraseIndexRanges));

        phrase_index.prepare_ranges(ranges);

        for (size_t i = 0; i < bench_times; ++i) {
            phrase_index.clear_ranges(ranges);
            largetable.search(keys->len, (ChewingKey *)keys->data, ranges);
        }
        print_time(start, bench_times);

        /* one more (un-timed) search to produce the ranges to dump. */
        phrase_index.clear_ranges(ranges);
        largetable.search(keys->len, (ChewingKey *)keys->data, ranges);

        /* dump each matched range, one sub-phrase-index at a time. */
        for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
            GArray * & range = ranges[i];
            if (!range)
                continue;

            if (range->len)
                printf("range items number:%d\n", range->len);

            for (size_t k = 0; k < range->len; ++k) {
                PhraseIndexRange * onerange =
                    &g_array_index(range, PhraseIndexRange, k);
                printf("start:%d\tend:%d\n", onerange->m_range_begin,
                       onerange->m_range_end);

                PhraseItem item;
                for ( phrase_token_t token = onerange->m_range_begin;
                      token != onerange->m_range_end; ++token){

                    phrase_index.get_phrase_item( token, item);

                    /* get phrase string */
                    ucs4_t buffer[MAX_PHRASE_LENGTH + 1];
                    item.get_phrase_string(buffer);
                    char * string = g_ucs4_to_utf8
                        ( buffer, item.get_phrase_length(),
                          NULL, NULL, NULL);
                    printf("%s\t", string);
                    g_free(string);

                    /* print every pronunciation with its frequency;
                       the pinyin keys are joined with '\'' and the
                       trailing quote is erased with a backspace. */
                    ChewingKey chewing_buffer[MAX_PHRASE_LENGTH];
                    size_t npron = item.get_n_pronunciation();
                    guint32 freq;
                    for (size_t m = 0; m < npron; ++m){
                        item.get_nth_pronunciation(m, chewing_buffer, freq);
                        for (size_t n = 0; n < item.get_phrase_length();
                             ++n){
                            gchar * pinyins =
                                chewing_buffer[n].get_pinyin_string();
                            printf("%s'", pinyins);
                            g_free(pinyins);
                        }
                        printf("\b\t%d\t", freq);
                    }
                }
                printf("\n");
            }
            g_array_set_size(range, 0);
        }

        phrase_index.destroy_ranges(ranges);
        g_array_free(keys, TRUE);
        g_array_free(key_rests, TRUE);
    }

    if (linebuf)
        free(linebuf);

    /* mask out all index items. */
    largetable.mask_out(0x0, 0x0);

    return 0;
}
diff --git a/tests/storage/test_flexible_ngram.cpp b/tests/storage/test_flexible_ngram.cpp
new file mode 100644
index 0000000..d7d7950
--- /dev/null
+++ b/tests/storage/test_flexible_ngram.cpp
@@ -0,0 +1,138 @@
/*
 * libpinyin
 * Library to deal with pinyin.
 *
 * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */ + + +#include "pinyin_internal.h" + +int main(int argc, char * argv[]) { + FlexibleSingleGram<guint32, guint32> single_gram; + typedef FlexibleSingleGram<guint32, guint32>::ArrayItemWithToken array_item_t; + + const guint32 total_freq = 16; + assert(single_gram.set_array_header(total_freq)); + + phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3 }; + guint32 freqs[6] = { 1, 2, 4, 8, 16, 32}; + + guint32 freq; + + for ( size_t i = 0; i < G_N_ELEMENTS(tokens); ++i ){ + if ( single_gram.get_array_item(tokens[i], freq) ) + assert(single_gram.set_array_item(tokens[i], freqs[i])); + else + assert(single_gram.insert_array_item(tokens[i], freqs[i])); + } + + single_gram.get_array_item(3, freq); + assert(freq == 32); + + printf("--------------------------------------------------------\n"); + PhraseIndexRange range; + FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(array_item_t)); + range.m_range_begin = 0; range.m_range_end = 8; + single_gram.search(&range, array); + for ( size_t i = 0; i < array->len; ++i ){ + array_item_t * item = &g_array_index(array, array_item_t, i); + printf("item:%d:%d\n", item->m_token, item->m_item); + } + + assert(single_gram.get_array_header(freq)); + assert(freq == total_freq); + + FlexibleBigram<guint32, guint32, guint32> bigram("TEST"); + assert(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE)); + bigram.store(1, &single_gram); + assert(single_gram.insert_array_item(5, 8)); + assert(single_gram.remove_array_item(1, freq)); + assert(single_gram.set_array_header(32)); + assert(single_gram.get_array_header(freq)); + printf("new array header:%d\n", freq); + bigram.store(2, &single_gram); + + for (int m = 1; m <= 2; ++m ){ + printf("--------------------------------------------------------\n"); + FlexibleSingleGram<guint32, guint32> * train_gram; + bigram.load(m, train_gram); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + train_gram->search(&range, array); + for ( size_t i = 0; i 
< array->len; ++i ){ + array_item_t * item = &g_array_index(array, array_item_t, i); + printf("item:%d:%d\n", item->m_token, item->m_item); + } + delete train_gram; + } + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); + printf("-----------------------items----------------------------\n"); + for ( size_t i = 0; i < items->len; ++i ){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + printf("item:%d\n", *token); + } + + printf("-----------------------magic header---------------------\n"); + bigram.set_magic_header(total_freq); + bigram.get_magic_header(freq); + assert(total_freq == freq); + printf("magic header:%d\n", freq); + + printf("-----------------------array header---------------------\n"); + for ( int i = 1; i <= 2; ++i){ + bigram.get_array_header(i, freq); + printf("single gram: %d, freq:%d\n", i, freq); + } + + bigram.set_array_header(1, 1); + + printf("-----------------------array header---------------------\n"); + for ( int i = 1; i <= 2; ++i){ + bigram.get_array_header(i, freq); + printf("single gram: %d, freq:%d\n", i, freq); + } + + for (int m = 1; m <= 2; ++m ){ + printf("--------------------------------------------------------\n"); + FlexibleSingleGram<guint32, guint32> * train_gram; + bigram.load(m, train_gram); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + train_gram->search(&range, array); + for ( size_t i = 0; i < array->len; ++i ){ + array_item_t * item = &g_array_index(array, array_item_t, i); + printf("item:%d:%d\n", item->m_token, item->m_item); + } + delete train_gram; + } + + assert(bigram.remove(1)); + + bigram.get_all_items(items); + printf("-----------------------items----------------------------\n"); + for ( size_t i = 0; i < items->len; ++i ){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + printf("item:%d\n", *token); + } + + g_array_free(items, TRUE); + g_array_free(array, TRUE); + return 0; +} diff 
--git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp new file mode 100644 index 0000000..f82cf1f --- /dev/null +++ b/tests/storage/test_ngram.cpp @@ -0,0 +1,87 @@ +#include <stdio.h> +#include "pinyin_internal.h" + + +int main(int argc, char * argv[]){ + SingleGram single_gram; + + const guint32 total_freq = 16; + assert(single_gram.set_total_freq(total_freq)); + + phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3}; + guint32 freqs[6] = { 1, 2, 4, 8, 16, 32}; + + guint32 freq; + + for(size_t i = 0; i < 6 ;++i){ + if ( single_gram.get_freq(tokens[i], freq)) + assert(single_gram.set_freq(tokens[i], freqs[i])); + else + assert(single_gram.insert_freq(tokens[i], freqs[i])); + } + + single_gram.get_freq(3, freq); + assert(freq == 32); + + printf("--------------------------------------------------------\n"); + PhraseIndexRange range; + BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem)); + range.m_range_begin = 0; range.m_range_end = 8; + single_gram.search(&range,array); + for ( size_t i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + + assert(single_gram.get_total_freq(freq)); + assert(freq == total_freq); + + Bigram bigram; + assert(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE)); + bigram.store(1, &single_gram); + assert(single_gram.insert_freq(5, 8)); + assert(single_gram.remove_freq(1, freq)); + single_gram.set_total_freq(32); + + bigram.store(2, &single_gram); + + + SingleGram * gram = NULL; + for ( int m = 1; m <= 2; ++m ){ + printf("--------------------------------------------------------\n"); + bigram.load(m, gram); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + gram->search(&range,array); + for ( size_t i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + 
delete gram; + } + + printf("--------------------------------------------------------\n"); + assert(single_gram.get_total_freq(freq)); + printf("total_freq:%d\n", freq); + + g_array_free(array, TRUE); + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); + + printf("----------------------system----------------------------\n"); + for ( size_t i = 0; i < items->len; ++i){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + printf("item:%d\n", *token); + } + + assert(bigram.load_db("/tmp/test.db")); + assert(bigram.save_db("/tmp/test.db")); + + g_array_free(items, TRUE); + + /* mask out all index items. */ + bigram.mask_out(0x0, 0x0); + + return 0; +} diff --git a/tests/storage/test_parser2.cpp b/tests/storage/test_parser2.cpp new file mode 100644 index 0000000..638cd96 --- /dev/null +++ b/tests/storage/test_parser2.cpp @@ -0,0 +1,144 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include "timer.h" +#include <errno.h> +#include <stdio.h> +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include "pinyin_parser2.h" + + +static const gchar * parsername = ""; +static gboolean incomplete = FALSE; + +static GOptionEntry entries[] = +{ + {"parser", 'p', 0, G_OPTION_ARG_STRING, &parsername, "parser", "fullpinyin doublepinyin chewing"}, + {"incomplete", 'i', 0, G_OPTION_ARG_NONE, &incomplete, "incomplete pinyin", NULL}, + {NULL} +}; + +#if 0 + " -s <scheme> specify scheme for doublepinyin/chewing.\n" + " schemes for doublepinyin: zrm, ms, ziguang, abc, pyjj, xhe.\n" + " schemes for chewing: standard, ibm, ginyieh, eten.\n" +#endif + + +size_t bench_times = 1000; + +using namespace pinyin; + + +int main(int argc, char * argv[]) { + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- test pinyin parser"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE | USE_RESPLIT_TABLE; + if (incomplete) + options |= PINYIN_INCOMPLETE | CHEWING_INCOMPLETE; + + PinyinParser2 * parser = NULL; + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + /* create the parser */ + if (strcmp("fullpinyin", parsername) == 0) { + parser = new FullPinyinParser2(); + } else if (strcmp("doublepinyin", parsername) == 0) { + parser = new DoublePinyinParser2(); + } else if (strcmp("chewing", parsername) == 0) { + parser = new ChewingParser2(); + } + + if (!parser) + parser = new FullPinyinParser2(); + + char* linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, stdin)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + 
if ( strcmp ( linebuf, "quit" ) == 0) + break; + +#if 0 + ChewingKey key; + bool success = parser->parse_one_key(options, key, + linebuf, strlen(linebuf)); + if (success) { + gchar * pinyins = key.get_pinyin_string(); + printf("pinyin:%s\n", pinyins); + g_free(pinyins); + } +#endif + +#if 1 + int len = 0; + guint32 start_time = record_time(); + for ( size_t i = 0; i < bench_times; ++i) + len = parser->parse(options, keys, key_rests, + linebuf, strlen(linebuf)); + + print_time(start_time, bench_times); + + printf("parsed %d chars, %d keys.\n", len, keys->len); + + assert(keys->len == key_rests->len); + + for (size_t i = 0; i < keys->len; ++i) { + ChewingKey * key = + &g_array_index(keys, ChewingKey, i); + ChewingKeyRest * key_rest = + &g_array_index(key_rests, ChewingKeyRest, i); + + gchar * pinyins = key->get_pinyin_string(); + printf("%s %d %d\t", pinyins, + key_rest->m_raw_begin, key_rest->m_raw_end); + g_free(pinyins); + } + printf("\n"); +#endif + + } + + if (linebuf) + free(linebuf); + + delete parser; + + g_array_free(key_rests, TRUE); + g_array_free(keys, TRUE); + + return 0; +} diff --git a/tests/storage/test_phrase_index.cpp b/tests/storage/test_phrase_index.cpp new file mode 100644 index 0000000..79a3ca4 --- /dev/null +++ b/tests/storage/test_phrase_index.cpp @@ -0,0 +1,122 @@ +#include "timer.h" +#include <stdio.h> +#include <errno.h> +#include "pinyin_internal.h" +#include "tests_helper.h" + +size_t bench_times = 100000; + +int main(int argc, char * argv[]){ + PhraseItem phrase_item; + ucs4_t string1 = 2; + ChewingKey key1 = ChewingKey(CHEWING_CH, CHEWING_ZERO_MIDDLE, CHEWING_ENG); + ChewingKey key2 = ChewingKey(CHEWING_SH, CHEWING_ZERO_MIDDLE, CHEWING_ANG); + + + phrase_item.set_phrase_string(1, &string1); + phrase_item.add_pronunciation(&key1, 100); + phrase_item.add_pronunciation(&key2, 300); + + assert(phrase_item.get_phrase_length() == 1); + + ChewingKey key3; + guint32 freq; + phrase_item.get_nth_pronunciation(0, &key3, freq); + assert(key3 == 
key1); + assert(freq == 100); + phrase_item.get_nth_pronunciation(1, &key3, freq); + assert(key3 == key2); + assert(freq == 300); + + pinyin_option_t options = 0; + gfloat poss = phrase_item.get_pronunciation_possibility(options, &key1); + printf("pinyin possiblitiy:%f\n", poss); + + assert(phrase_item.get_unigram_frequency() == 0); + + ucs4_t string2; + phrase_item.get_phrase_string(&string2); + assert(string1 == string2); + + FacadePhraseIndex phrase_index_test; + assert(!phrase_index_test.add_phrase_item(1, &phrase_item)); + + MemoryChunk* chunk = new MemoryChunk; + assert(phrase_index_test.store(0, chunk)); + assert(phrase_index_test.load(0, chunk)); + + PhraseItem item2; + guint32 time = record_time(); + for ( size_t i = 0; i < bench_times; ++i){ + phrase_index_test.get_phrase_item(1, item2); + assert(item2.get_unigram_frequency() == 0); + assert(item2.get_n_pronunciation() == 2); + assert(item2.get_phrase_length() == 1); + assert(item2.get_pronunciation_possibility(options, &key2) == 0.75); + } + print_time(time, bench_times); + + { + PhraseItem item3; + phrase_index_test.get_phrase_item(1, item3); + item3.increase_pronunciation_possibility(options, &key1, 200); + assert(item3.get_pronunciation_possibility(options, &key1) == 0.5) ; + } + + { + PhraseItem item5; + phrase_index_test.get_phrase_item(1, item5); + gfloat poss = item5.get_pronunciation_possibility(options, &key1); + printf("pinyin poss:%f\n", poss); + assert(poss == 0.5); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_table(phrase_files, NULL, NULL, &phrase_index)) + exit(ENOENT); + + phrase_index.compact(); + + MemoryChunk* store1 = new MemoryChunk; + phrase_index.store(1, store1); + phrase_index.load(1, store1); 
+ + MemoryChunk* store2 = new MemoryChunk; + phrase_index.store(2, store2); + phrase_index.load(2, store2); + + phrase_index.compact(); + + phrase_index.get_phrase_item(16870553, item2); + assert( item2.get_phrase_length() == 14); + assert( item2.get_n_pronunciation() == 1); + + ucs4_t buf[1024]; + item2.get_phrase_string(buf); + char * string = g_ucs4_to_utf8( buf, 14, NULL, NULL, NULL); + printf("%s\n", string); + g_free(string); + + guint32 delta = 3; + phrase_index.add_unigram_frequency(16870553, delta); + phrase_index.get_phrase_item(16870553, item2); + assert( item2.get_unigram_frequency() == 3); + + phrase_index.get_phrase_item(16777222, item2); + assert(item2.get_phrase_length() == 1); + assert(item2.get_n_pronunciation() == 2); + + return 0; +} diff --git a/tests/storage/test_phrase_index_logger.cpp b/tests/storage/test_phrase_index_logger.cpp new file mode 100644 index 0000000..c423c40 --- /dev/null +++ b/tests/storage/test_phrase_index_logger.cpp @@ -0,0 +1,67 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "pinyin_internal.h" + + +/* TODO: check whether gb_char.bin and gb_char2.bin should be the same. 
*/ + +int main(int argc, char * argv[]){ + FacadePhraseIndex phrase_index; + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + + PhraseIndexRange range; + assert(ERROR_OK == phrase_index.get_range(1, range)); + for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) { + phrase_index.add_unigram_frequency(i, 1); + } + + printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq()); + + MemoryChunk * new_chunk = new MemoryChunk; + phrase_index.store(1, new_chunk); + new_chunk->save("/tmp/gb_char.bin"); + delete new_chunk; + + chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + new_chunk = new MemoryChunk; + assert(phrase_index.diff(1, chunk, new_chunk)); + new_chunk->save("/tmp/gb_char.dbin"); + delete new_chunk; + + chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + new_chunk = new MemoryChunk; + new_chunk->load("/tmp/gb_char.dbin"); + assert(phrase_index.merge(1, new_chunk)); + chunk = new MemoryChunk; + phrase_index.store(1, chunk); + chunk->save("/tmp/gb_char2.bin"); + delete chunk; + + printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq()); + + return 0; +} diff --git a/tests/storage/test_phrase_table.cpp b/tests/storage/test_phrase_table.cpp new file mode 100644 index 0000000..a9c8ed5 --- /dev/null +++ b/tests/storage/test_phrase_table.cpp @@ -0,0 +1,86 @@ +#include "timer.h" +#include <string.h> +#include "pinyin_internal.h" +#include "tests_helper.h" + +size_t bench_times = 1000; + +int main(int argc, char * argv[]){ + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + PhraseLargeTable2 largetable; + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_table(phrase_files, NULL, 
&largetable, &phrase_index)) + exit(ENOENT); + + MemoryChunk * chunk = new MemoryChunk; + largetable.store(chunk); + largetable.load(chunk); + + char* linebuf = NULL; size_t size = 0; ssize_t read; + while ((read = getline(&linebuf, &size, stdin)) != -1) { + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + glong phrase_len = g_utf8_strlen(linebuf, -1); + ucs4_t * new_phrase = g_utf8_to_ucs4(linebuf, -1, NULL, NULL, NULL); + + if (0 == phrase_len) + continue; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index.prepare_tokens(tokens); + + guint32 start = record_time(); + for (size_t i = 0; i < bench_times; ++i){ + phrase_index.clear_tokens(tokens); + largetable.search(phrase_len, new_phrase, tokens); + } + print_time(start, bench_times); + + phrase_index.clear_tokens(tokens); + int retval = largetable.search(phrase_len, new_phrase, tokens); + + if (retval & SEARCH_OK) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * array = tokens[i]; + if (NULL == array) + continue; + + for (size_t k = 0; k < array->len; ++k) { + phrase_token_t token = g_array_index + (array, phrase_token_t, k); + + printf("token:%d\t", token); + } + } + printf("\n"); + } + + phrase_index.destroy_tokens(tokens); + g_free(new_phrase); + } + + if ( linebuf ) + free(linebuf); + + /* mask out all index items. */ + largetable.mask_out(0x0, 0x0); + + return 0; +} diff --git a/tests/storage/test_table_info.cpp b/tests/storage/test_table_info.cpp new file mode 100644 index 0000000..68b4735 --- /dev/null +++ b/tests/storage/test_table_info.cpp @@ -0,0 +1,84 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2013 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include "pinyin_internal.h" + + +int main(int argc, char * argv[]) { + SystemTableInfo system_table_info; + + bool retval = system_table_info.load("../../data/table.conf"); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + printf("lambda:%f\n", system_table_info.get_lambda()); + + size_t i; + for (i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = + system_table_info.get_table_info() + i; + + assert(i == table_info->m_dict_index); + printf("table index:%d\n", table_info->m_dict_index); + + switch(table_info->m_file_type) { + case NOT_USED: + printf("not used.\n"); + break; + + case SYSTEM_FILE: + printf("system file:%s %s %s.\n", table_info->m_table_filename, + table_info->m_system_filename, table_info->m_user_filename); + break; + + case DICTIONARY: + printf("dictionary:%s %s %s.\n", table_info->m_table_filename, + table_info->m_system_filename, table_info->m_user_filename); + break; + + case USER_FILE: + printf("user file:%s.\n", table_info->m_user_filename); + break; + + default: + assert(false); + } + } + + UserTableInfo user_table_info; + retval = 
user_table_info.is_conform(&system_table_info); + assert(!retval); + + user_table_info.make_conform(&system_table_info); + retval = user_table_info.is_conform(&system_table_info); + assert(retval); + + assert(user_table_info.save("/tmp/user.conf")); + assert(user_table_info.load("/tmp/user.conf")); + + retval = user_table_info.is_conform(&system_table_info); + assert(retval); + + return 0; +} diff --git a/tests/test_chewing.cpp b/tests/test_chewing.cpp new file mode 100644 index 0000000..5a5701f --- /dev/null +++ b/tests/test_chewing.cpp @@ -0,0 +1,68 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include "pinyin.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int main(int argc, char * argv[]){ + pinyin_context_t * context = + pinyin_init("../data", "../data"); + + pinyin_instance_t * instance = pinyin_alloc_instance(context); + + char* linebuf = NULL; + size_t size = 0; + ssize_t read; + while( (read = getline(&linebuf, &size, stdin)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + pinyin_parse_more_chewings + (instance, linebuf); + pinyin_guess_sentence(instance); + + char * sentence = NULL; + pinyin_get_sentence (instance, &sentence); + if (sentence) + printf("%s\n", sentence); + g_free(sentence); + + pinyin_train(instance); + pinyin_reset(instance); + pinyin_save(context); + } + + pinyin_free_instance(instance); + + pinyin_mask_out(context, 0x0, 0x0); + pinyin_save(context); + pinyin_fini(context); + + free(linebuf); + return 0; +} diff --git a/tests/test_phrase.cpp b/tests/test_phrase.cpp new file mode 100644 index 0000000..6e5ef3b --- /dev/null +++ b/tests/test_phrase.cpp @@ -0,0 +1,74 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include "pinyin.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int main(int argc, char * argv[]){ + pinyin_context_t * context = + pinyin_init("../data", "../data"); + + pinyin_instance_t * instance = pinyin_alloc_instance(context); + + char* linebuf = NULL; + size_t size = 0; + ssize_t read; + while( (read = getline(&linebuf, &size, stdin)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + pinyin_phrase_segment(instance, linebuf); + guint len = 0; + pinyin_get_n_phrase(instance, &len); + + for ( size_t i = 0; i < len; ++i ){ + phrase_token_t token = null_token; + pinyin_get_phrase_token(instance, i, &token); + + if ( null_token == token ) + continue; + + char * word = NULL; + pinyin_token_get_phrase(instance, token, NULL, &word); + printf("%s\t", word); + g_free(word); + } + printf("\n"); + + pinyin_save(context); + } + + pinyin_free_instance(instance); + + pinyin_mask_out(context, 0x0, 0x0); + pinyin_save(context); + pinyin_fini(context); + + free(linebuf); + return 0; +} diff --git a/tests/test_pinyin.cpp b/tests/test_pinyin.cpp new file mode 100644 index 0000000..f94263b --- /dev/null +++ b/tests/test_pinyin.cpp @@ -0,0 +1,97 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "pinyin.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int main(int argc, char * argv[]){ + pinyin_context_t * context = + pinyin_init("../data", "../data"); + + pinyin_option_t options = + PINYIN_CORRECT_ALL | USE_DIVIDED_TABLE | USE_RESPLIT_TABLE | + DYNAMIC_ADJUST; + pinyin_set_options(context, options); + + pinyin_instance_t * instance = pinyin_alloc_instance(context); + + char * prefixbuf = NULL; size_t prefixsize = 0; + char * linebuf = NULL; size_t linesize = 0; + ssize_t read; + + while( TRUE ){ + fprintf(stdout, "prefix:"); + fflush(stdout); + + if ((read = getline(&prefixbuf, &prefixsize, stdin)) == -1) + break; + + if ( '\n' == prefixbuf[strlen(prefixbuf) - 1] ) { + prefixbuf[strlen(prefixbuf) - 1] = '\0'; + } + + fprintf(stdout, "pinyin:"); + fflush(stdout); + + if ((read = getline(&linebuf, &linesize, stdin)) == -1) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + pinyin_parse_more_full_pinyins(instance, linebuf); + pinyin_guess_sentence_with_prefix(instance, prefixbuf); + pinyin_guess_full_pinyin_candidates(instance, 0); + + guint len = 0; + pinyin_get_n_candidate(instance, &len); + for (size_t i = 0; i < len; ++i) { + lookup_candidate_t * candidate = NULL; + pinyin_get_candidate(instance, i, &candidate); + + const char * word = NULL; + pinyin_get_candidate_string(instance, candidate, &word); + + printf("%s\t", word); + } + printf("\n"); + + pinyin_train(instance); + pinyin_reset(instance); + pinyin_save(context); + } + + pinyin_free_instance(instance); + + pinyin_mask_out(context, 0x0, 0x0); + pinyin_save(context); + pinyin_fini(context); + + free(prefixbuf); free(linebuf); + return 0; +} 
diff --git a/tests/tests_helper.h b/tests/tests_helper.h new file mode 100644 index 0000000..431dbc8 --- /dev/null +++ b/tests/tests_helper.h @@ -0,0 +1,86 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef TESTS_HELPER_H +#define TESTS_HELPER_H + +static bool load_phrase_index(const pinyin_table_info_t * phrase_files, + FacadePhraseIndex * phrase_index){ + MemoryChunk * chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (SYSTEM_FILE != table_info->m_file_type) + continue; + + const char * binfile = table_info->m_system_filename; + + gchar * filename = g_build_filename("..", "..", "data", + binfile, NULL); + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if (!retval) { + fprintf(stderr, "open %s failed!\n", binfile); + delete chunk; + return false; + } + + phrase_index->load(i, chunk); + g_free(filename); + } + return true; +} + +static bool load_phrase_table(const pinyin_table_info_t * phrase_files, + ChewingLargeTable * chewing_table, + PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index){ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; 
++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (SYSTEM_FILE != table_info->m_file_type) + continue; + + const char * tablename = table_info->m_table_filename; + + gchar * filename = g_build_filename("..", "..", "data", + tablename, NULL); + FILE * tablefile = fopen(filename, "r"); + if (NULL == tablefile) { + fprintf(stderr, "open %s failed!\n", tablename); + return false; + } + g_free(filename); + + if (chewing_table) + chewing_table->load_text(tablefile); + fseek(tablefile, 0L, SEEK_SET); + if (phrase_table) + phrase_table->load_text(tablefile); + fseek(tablefile, 0L, SEEK_SET); + if (phrase_index) + phrase_index->load_text(i, tablefile); + fclose(tablefile); + } + return true; +} + +#endif diff --git a/tests/timer.h b/tests/timer.h new file mode 100644 index 0000000..d3f0822 --- /dev/null +++ b/tests/timer.h @@ -0,0 +1,48 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#ifndef TIMER_H +#define TIMER_H + +#include <sys/time.h> +#include <stdio.h> +#include <glib.h> + + +static guint32 record_time () +{ + timeval tv; + gettimeofday (&tv, NULL); + return (guint32) tv.tv_sec * 1000000 + tv.tv_usec; +} + +static void print_time (guint32 old_time, guint32 times) +{ + timeval tv; + gettimeofday (&tv, NULL); + + guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time; + + printf("Spent %d us for %d operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted ); +} + + +#endif diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt new file mode 100644 index 0000000..dbd7855 --- /dev/null +++ b/utils/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(segment) +add_subdirectory(storage) +add_subdirectory(training)
\ No newline at end of file diff --git a/utils/Makefile.am b/utils/Makefile.am new file mode 100644 index 0000000..bc0f3e5 --- /dev/null +++ b/utils/Makefile.am @@ -0,0 +1,27 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +AUTOMAKE_OPTIONS = gnu +SUBDIRS = storage segment training + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I $(ac_aux_dir) + +noinst_HEADERS = utils_helper.h diff --git a/utils/segment/CMakeLists.txt b/utils/segment/CMakeLists.txt new file mode 100644 index 0000000..82e4deb --- /dev/null +++ b/utils/segment/CMakeLists.txt @@ -0,0 +1,19 @@ +add_executable( + spseg + spseg.cpp +) + +target_link_libraries( + spseg + libpinyin +) + +add_executable( + ngseg + ngseg.cpp +) + +target_link_libraries( + ngseg + libpinyin +)
\ No newline at end of file diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am new file mode 100644 index 0000000..579d6e4 --- /dev/null +++ b/utils/segment/Makefile.am @@ -0,0 +1,39 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ + +noinst_PROGRAMS = spseg ngseg mergeseq + +spseg_SOURCES = spseg.cpp + +spseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +ngseg_SOURCES = ngseg.cpp + +ngseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +mergeseq_SOURCES = mergeseq.cpp + +mergeseq_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp new file mode 100644 index 0000000..1a26064 --- /dev/null +++ b/utils/segment/mergeseq.cpp @@ -0,0 +1,278 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2013 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <locale.h> +#include <string.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: mergeseq [-o outputfile] [inputfile]\n"); +} + + +static gboolean gen_extra_enter = FALSE; +static gchar * outputfile = NULL; + +static GOptionEntry entries[] = +{ + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, + {NULL} +}; + + +/* data structure definition. */ +typedef struct{ + phrase_token_t m_token; + gint m_token_len; +} TokenInfo; + + +/* GArray of ucs4 characters. */ +typedef GArray * UnicodeCharVector; +/* GArray of TokenInfo. */ +typedef GArray * TokenInfoVector; + +gint calculate_sequence_length(TokenInfoVector tokeninfos) { + gint len = 0; + + size_t i = 0; + for (i = 0; i < tokeninfos->len; ++i) { + TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, i); + len += token_info->m_token_len; + } + + return len; +} + +/* if merge sequence found, merge and output it, + * if not, just output the first token; + * pop the first token or sequence. 
+ */ +bool merge_sequence(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + UnicodeCharVector unichars, + TokenInfoVector tokeninfos) { + assert(tokeninfos->len > 0); + + bool found = false; + TokenInfo * token_info = NULL; + phrase_token_t token = null_token; + + ucs4_t * ucs4_str = (ucs4_t *) unichars->data; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + + /* search the merge sequence. */ + size_t index = tokeninfos->len; + gint seq_len = calculate_sequence_length(tokeninfos); + while (seq_len > 0) { + /* do phrase table search. */ + int retval = phrase_table->search(seq_len, ucs4_str, tokens); + + if (retval & SEARCH_OK) { + int num = get_first_token(tokens, token); + found = true; + break; + } + + --index; + token_info = &g_array_index(tokeninfos, TokenInfo, index); + seq_len -= token_info->m_token_len; + } + + phrase_index->destroy_tokens(tokens); + + /* push the merged sequence back. */ + if (found) { + /* pop up the origin sequence. */ + g_array_remove_range(tokeninfos, 0, index); + + TokenInfo info; + info.m_token = token; + info.m_token_len = seq_len; + g_array_prepend_val(tokeninfos, info); + } + + return found; +} + +bool pop_first_token(UnicodeCharVector unichars, + TokenInfoVector tokeninfos, + FILE * output) { + ucs4_t * ucs4_str = (ucs4_t *) unichars->data; + + /* pop it. 
*/ + TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, 0); + phrase_token_t token = token_info->m_token; + gint token_len = token_info->m_token_len; + + glong read = 0; + gchar * utf8_str = g_ucs4_to_utf8(ucs4_str, token_len, &read, NULL, NULL); + assert(read == token_len); + fprintf(output, "%d %s\n", token, utf8_str); + g_free(utf8_str); + + g_array_remove_range(unichars, 0, token_len); + g_array_remove_index(tokeninfos, 0); + + return true; +} + +bool feed_line(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + UnicodeCharVector unichars, + TokenInfoVector tokeninfos, + const char * linebuf, + FILE * output) { + + TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); + + if (null_token == token) { + /* empty the queue. */ + while (0 != tokeninfos->len) { + merge_sequence(phrase_table, phrase_index, unichars, tokeninfos); + pop_first_token(unichars, tokeninfos, output); + } + + assert(0 == unichars->len); + assert(0 == tokeninfos->len); + + /* restore the null token line. */ + fprintf(output, "%s\n", linebuf); + + return false; + } + + PhraseItem item; + phrase_index->get_phrase_item(token, item); + gint len = item.get_phrase_length(); + + TokenInfo info; + info.m_token = token; + info.m_token_len = len; + g_array_append_val(tokeninfos, info); + + ucs4_t buffer[MAX_PHRASE_LENGTH]; + item.get_phrase_string(buffer); + g_array_append_vals(unichars, buffer, len); + + /* probe merge sequence. 
*/ + len = calculate_sequence_length(tokeninfos); + while (len >= MAX_PHRASE_LENGTH) { + merge_sequence(phrase_table, phrase_index, unichars, tokeninfos); + pop_first_token(unichars, tokeninfos, output); + len = calculate_sequence_length(tokeninfos); + } + + return true; +} + + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- merge word sequence"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (outputfile) { + output = fopen(outputfile, "w"); + if (NULL == output) { + perror("open file failed"); + exit(EINVAL); + } + } + + if (argc > 2) { + fprintf(stderr, "too many arguments.\n"); + exit(EINVAL); + } + + if (2 == argc) { + input = fopen(argv[1], "r"); + if (NULL == input) { + perror("open file failed"); + exit(EINVAL); + } + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* init phrase table */ + FacadePhraseTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); + GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo)); + + char * linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, input)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if (0 == 
strlen(linebuf)) + continue; + + feed_line(&phrase_table, &phrase_index, + unichars, tokeninfos, + linebuf, output); + } + + g_array_free(unichars, TRUE); + g_array_free(tokeninfos, TRUE); + free(linebuf); + fclose(input); + fclose(output); + return 0; +} diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp new file mode 100644 index 0000000..03fe5b4 --- /dev/null +++ b/utils/segment/ngseg.cpp @@ -0,0 +1,261 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: ngseg [--generate-extra-enter] [-o outputfile] [inputfile]\n"); +} + + +static gboolean gen_extra_enter = FALSE; +static gchar * outputfile = NULL; + +static GOptionEntry entries[] = +{ + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, + {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL}, + {NULL} +}; + + +/* n-gram based sentence segment. */ + +/* Note: + * Currently libpinyin supports ucs4 characters. + * This is a pre-processor tool for raw corpus, + * and skips non-Chinese characters. 
+ */ + +/* TODO: + * Try to add punctuation mark and english support, + * such as ',', '.', '?', '!', <english>, and other punctuations. + */ + +enum CONTEXT_STATE{ + CONTEXT_INIT, + CONTEXT_SEGMENTABLE, + CONTEXT_UNKNOWN +}; + +bool deal_with_segmentable(PhraseLookup * phrase_lookup, + GArray * current_ucs4, + FILE * output){ + char * result_string = NULL; + MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + phrase_lookup->get_best_match(current_ucs4->len, + (ucs4_t *) current_ucs4->data, results); + + phrase_lookup->convert_to_utf8(results, result_string); + + if (result_string) { + fprintf(output, "%s\n", result_string); + } else { + char * tmp_string = g_ucs4_to_utf8 + ( (ucs4_t *) current_ucs4->data, current_ucs4->len, + NULL, NULL, NULL); + fprintf(stderr, "Un-segmentable sentence encountered:%s\n", + tmp_string); + g_array_free(results, TRUE); + return false; + } + g_array_free(results, TRUE); + g_free(result_string); + return true; +} + +bool deal_with_unknown(GArray * current_ucs4, FILE * output){ + char * result_string = g_ucs4_to_utf8 + ( (ucs4_t *) current_ucs4->data, current_ucs4->len, + NULL, NULL, NULL); + fprintf(output, "%d %s\n", null_token, result_string); + g_free(result_string); + return true; +} + + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- n-gram segment"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (outputfile) { + output = fopen(outputfile, "w"); + if (NULL == output) { + perror("open file failed"); + exit(EINVAL); + } + } + + if (argc > 2) { + fprintf(stderr, "too many arguments.\n"); + exit(EINVAL); + } + + if (2 == argc) { + input = fopen(argv[1], "r"); + if (NULL == input) { + 
perror("open file failed"); + exit(EINVAL); + } + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* init phrase table */ + FacadePhraseTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + /* init bi-gram */ + Bigram system_bigram; + system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); + Bigram user_bigram; + + gfloat lambda = system_table_info.get_lambda(); + + /* init phrase lookup */ + PhraseLookup phrase_lookup(lambda, + &phrase_table, &phrase_index, + &system_bigram, &user_bigram); + + + CONTEXT_STATE state, next_state; + GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index.prepare_tokens(tokens); + + /* split the sentence */ + char * linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, input)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + /* check non-ucs4 characters */ + const glong num_of_chars = g_utf8_strlen(linebuf, -1); + glong len = 0; + ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); + if ( len != num_of_chars ) { + fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); + fprintf(output, "%d \n", null_token); + continue; + } + + /* only new-line persists. 
*/ + if ( 0 == num_of_chars ) { + fprintf(output, "%d \n", null_token); + continue; + } + + state = CONTEXT_INIT; + int result = phrase_table.search( 1, sentence, tokens); + g_array_append_val( current_ucs4, sentence[0]); + if ( result & SEARCH_OK ) + state = CONTEXT_SEGMENTABLE; + else + state = CONTEXT_UNKNOWN; + + for ( int i = 1; i < num_of_chars; ++i) { + int result = phrase_table.search( 1, sentence + i, tokens); + if ( result & SEARCH_OK ) + next_state = CONTEXT_SEGMENTABLE; + else + next_state = CONTEXT_UNKNOWN; + + if ( state == next_state ){ + g_array_append_val(current_ucs4, sentence[i]); + continue; + } + + assert ( state != next_state ); + if ( state == CONTEXT_SEGMENTABLE ) + deal_with_segmentable(&phrase_lookup, current_ucs4, output); + + if ( state == CONTEXT_UNKNOWN ) + deal_with_unknown(current_ucs4, output); + + /* save the current character */ + g_array_set_size(current_ucs4, 0); + g_array_append_val(current_ucs4, sentence[i]); + state = next_state; + } + + if ( current_ucs4->len ) { + /* this seems always true. */ + if ( state == CONTEXT_SEGMENTABLE ) + deal_with_segmentable(&phrase_lookup, current_ucs4, output); + + if ( state == CONTEXT_UNKNOWN ) + deal_with_unknown(current_ucs4, output); + g_array_set_size(current_ucs4, 0); + } + + /* print extra enter */ + if ( gen_extra_enter ) + fprintf(output, "%d \n", null_token); + + g_free(sentence); + } + phrase_index.destroy_tokens(tokens); + + /* print enter at file tail */ + fprintf(output, "%d \n", null_token); + g_array_free(current_ucs4, TRUE); + free(linebuf); + fclose(input); + fclose(output); + return 0; +} diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp new file mode 100644 index 0000000..b543cc5 --- /dev/null +++ b/utils/segment/spseg.cpp @@ -0,0 +1,343 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2010,2013 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <string.h> +#include <locale.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n"); +} + +static gboolean gen_extra_enter = FALSE; +static gchar * outputfile = NULL; + +static GOptionEntry entries[] = +{ + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, + {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL}, + {NULL} +}; + + +/* graph shortest path sentence segment. */ + +/* Note: + * Currently libpinyin only supports ucs4 characters, as this is a + * pre-processor tool for raw corpus, it will skip all sentences + * which contains non-ucs4 characters. + */ + +enum CONTEXT_STATE{ + CONTEXT_INIT, + CONTEXT_SEGMENTABLE, + CONTEXT_UNKNOWN +}; + +struct SegmentStep{ + phrase_token_t m_handle; + ucs4_t * m_phrase; + size_t m_phrase_len; + //use formula W = number of words. Zero handle means one word. + guint m_nword; + //backtrace information, -1 one step backward. 
+ gint m_backward_nstep; +public: + SegmentStep(){ + m_handle = null_token; + m_phrase = NULL; + m_phrase_len = 0; + m_nword = UINT_MAX; + m_backward_nstep = -0; + } +}; + +bool backtrace(GArray * steps, glong phrase_len, GArray * strings); + +/* Note: do not free phrase, as it is used by strings (array of segment). */ +bool segment(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + GArray * current_ucs4, + GArray * strings /* Array of SegmentStep. */){ + ucs4_t * phrase = (ucs4_t *)current_ucs4->data; + guint phrase_len = current_ucs4->len; + + /* Prepare for shortest path segment dynamic programming. */ + GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); + SegmentStep step; + for ( glong i = 0; i < phrase_len + 1; ++i ){ + g_array_append_val(steps, step); + } + + SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0); + first_step->m_nword = 0; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + + for ( glong i = 0; i < phrase_len + 1; ++i ) { + SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i); + size_t nword = step_begin->m_nword; + for ( glong k = i + 1; k < phrase_len + 1; ++k ) { + size_t len = k - i; + ucs4_t * cur_phrase = phrase + i; + + phrase_token_t token = null_token; + int result = phrase_table->search(len, cur_phrase, tokens); + int num = get_first_token(tokens, token); + + if ( !(result & SEARCH_OK) ){ + token = null_token; + if ( 1 != len ) + continue; + } + ++nword; + + SegmentStep * step_end = &g_array_index(steps, SegmentStep, k); + if ( nword < step_end->m_nword ) { + step_end->m_handle = token; + step_end->m_phrase = cur_phrase; + step_end->m_phrase_len = len; + step_end->m_nword = nword; + step_end->m_backward_nstep = i - k; + } + if ( !(result & SEARCH_CONTINUED) ) + break; + } + } + phrase_index->destroy_tokens(tokens); + + return backtrace(steps, phrase_len, strings); +} + +bool backtrace(GArray * steps, glong phrase_len, 
GArray * strings){ + /* backtracing to get the result. */ + size_t cur_step = phrase_len; + g_array_set_size(strings, 0); + while ( cur_step ){ + SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step); + g_array_append_val(strings, *step); + cur_step = cur_step + step->m_backward_nstep; + /* intended to avoid leaking internal informations. */ + step->m_nword = 0; step->m_backward_nstep = 0; + } + + /* reverse the strings. */ + for ( size_t i = 0; i < strings->len / 2; ++i ) { + SegmentStep * head, * tail; + head = &g_array_index(strings, SegmentStep, i); + tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i ); + SegmentStep tmp; + tmp = *head; + *head = *tail; + *tail = tmp; + } + + g_array_free(steps, TRUE); + return true; +} + +bool deal_with_segmentable(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + GArray * current_ucs4, + FILE * output){ + + /* do segment stuff. */ + GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); + segment(phrase_table, phrase_index, current_ucs4, strings); + + /* print out the split phrase. 
*/ + for ( glong i = 0; i < strings->len; ++i ) { + SegmentStep * step = &g_array_index(strings, SegmentStep, i); + char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL); + fprintf(output, "%d %s\n", step->m_handle, string); + g_free(string); + } + + g_array_free(strings, TRUE); + return true; +} + +bool deal_with_unknown(GArray * current_ucs4, FILE * output){ + char * result_string = g_ucs4_to_utf8 + ( (ucs4_t *) current_ucs4->data, current_ucs4->len, + NULL, NULL, NULL); + fprintf(output, "%d %s\n", null_token, result_string); + g_free(result_string); + return true; +} + + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- shortest path segment"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (outputfile) { + output = fopen(outputfile, "w"); + if (NULL == output) { + perror("open file failed"); + exit(EINVAL); + } + } + + if (argc > 2) { + fprintf(stderr, "too many arguments.\n"); + exit(EINVAL); + } + + if (2 == argc) { + input = fopen(argv[1], "r"); + if (NULL == input) { + perror("open file failed"); + exit(EINVAL); + } + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* init phrase table */ + FacadePhraseTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + CONTEXT_STATE state, 
next_state; + GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index.prepare_tokens(tokens); + + char * linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, input)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + /* check non-ucs4 characters. */ + const glong num_of_chars = g_utf8_strlen(linebuf, -1); + glong len = 0; + ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); + if ( len != num_of_chars ) { + fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); + fprintf(output, "%d \n", null_token); + continue; + } + + /* only new-line persists. */ + if ( 0 == num_of_chars ) { + fprintf(output, "%d \n", null_token); + continue; + } + + state = CONTEXT_INIT; + int result = phrase_table.search( 1, sentence, tokens); + g_array_append_val( current_ucs4, sentence[0]); + if ( result & SEARCH_OK ) + state = CONTEXT_SEGMENTABLE; + else + state = CONTEXT_UNKNOWN; + + for ( int i = 1; i < num_of_chars; ++i) { + int result = phrase_table.search( 1, sentence + i, tokens); + if ( result & SEARCH_OK ) + next_state = CONTEXT_SEGMENTABLE; + else + next_state = CONTEXT_UNKNOWN; + + if ( state == next_state ){ + g_array_append_val(current_ucs4, sentence[i]); + continue; + } + + assert ( state != next_state ); + if ( state == CONTEXT_SEGMENTABLE ) + deal_with_segmentable(&phrase_table, &phrase_index, + current_ucs4, output); + + if ( state == CONTEXT_UNKNOWN ) + deal_with_unknown(current_ucs4, output); + + /* save the current character */ + g_array_set_size(current_ucs4, 0); + g_array_append_val(current_ucs4, sentence[i]); + state = next_state; + } + + if ( current_ucs4->len ) { + /* this seems always true. 
*/ + if ( state == CONTEXT_SEGMENTABLE ) + deal_with_segmentable(&phrase_table, &phrase_index, + current_ucs4, output); + + if ( state == CONTEXT_UNKNOWN ) + deal_with_unknown(current_ucs4, output); + g_array_set_size(current_ucs4, 0); + } + + /* print extra enter */ + if ( gen_extra_enter ) + fprintf(output, "%d \n", null_token); + + g_free(sentence); + } + phrase_index.destroy_tokens(tokens); + + /* print enter at file tail */ + fprintf(output, "%d \n", null_token); + g_array_free(current_ucs4, TRUE); + free(linebuf); + fclose(input); + fclose(output); + return 0; +} diff --git a/utils/storage/CMakeLists.txt b/utils/storage/CMakeLists.txt new file mode 100644 index 0000000..63cabcd --- /dev/null +++ b/utils/storage/CMakeLists.txt @@ -0,0 +1,29 @@ +add_executable( + gen_binary_files + gen_binary_files.cpp +) + +target_link_libraries( + gen_binary_files + libpinyin +) + +add_executable( + import_interpolation + import_interpolation.cpp +) + +target_link_libraries( + import_interpolation + libpinyin +) + +add_executable( + export_interpolation + export_interpolation.cpp +) + +target_link_libraries( + export_interpolation + libpinyin +) diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am new file mode 100644 index 0000000..db63488 --- /dev/null +++ b/utils/storage/Makefile.am @@ -0,0 +1,45 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. 
+## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ + +bin_PROGRAMS = gen_binary_files \ + import_interpolation + +noinst_PROGRAMS = export_interpolation \ + gen_pinyin_table + +gen_binary_files_SOURCES = gen_binary_files.cpp + +gen_binary_files_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +import_interpolation_SOURCES = import_interpolation.cpp + +import_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +export_interpolation_SOURCES = export_interpolation.cpp + +export_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_pinyin_table_SOURCES = gen_pinyin_table.cpp + +gen_pinyin_table_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp new file mode 100644 index 0000000..c43eefb --- /dev/null +++ b/utils/storage/export_interpolation.cpp @@ -0,0 +1,144 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include <stdio.h> +#include <assert.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +/* export interpolation model as textual format */ + +bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index); +bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram); + +bool begin_data(FILE * output){ + fprintf(output, "\\data model interpolation\n"); + return true; +} + +bool end_data(FILE * output){ + fprintf(output, "\\end\n"); + return true; +} + +int main(int argc, char * argv[]){ + FILE * output = stdout; + const char * bigram_filename = SYSTEM_BIGRAM; + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram bigram; + bigram.attach(bigram_filename, ATTACH_READONLY); + + begin_data(output); + + gen_unigram(output, &phrase_index); + gen_bigram(output, &phrase_index, &bigram); + + end_data(output); + return 0; +} + +bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) { + fprintf(output, "\\1-gram\n"); + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) { + + PhraseIndexRange range; + int result = phrase_index->get_range(i, range); + if (ERROR_OK != result ) + continue; + + PhraseItem item; + for (phrase_token_t token = range.m_range_begin; + token < range.m_range_end; token++) { + int result = phrase_index->get_phrase_item(token, item); + + if ( result == ERROR_NO_ITEM ) + continue; + assert( result == ERROR_OK); + + size_t freq = 
item.get_unigram_frequency(); + if ( 0 == freq ) + continue; + char * phrase = taglib_token_to_string(phrase_index, token); + if ( phrase ) + fprintf(output, "\\item %d %s count %ld\n", token, phrase, freq); + + g_free(phrase); + } + } + return true; +} + +bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){ + fprintf(output, "\\2-gram\n"); + + /* Retrieve all user items. */ + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + bigram->get_all_items(items); + + PhraseItem item; + + for(size_t i = 0; i < items->len; i++){ + phrase_token_t token = g_array_index(items, phrase_token_t, i); + SingleGram * single_gram = NULL; + bigram->load(token, single_gram); + + BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); + single_gram->retrieve_all(array); + for(size_t j = 0; j < array->len; j++) { + BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j); + + char * word1 = taglib_token_to_string(phrase_index, token); + char * word2 = taglib_token_to_string(phrase_index, item->m_token); + guint32 freq = item->m_count; + + if ( word1 && word2) + fprintf(output, "\\item %d %s %d %s count %d\n", + token, word1, item->m_token, word2, freq); + + g_free(word1); g_free(word2); + } + + g_array_free(array, TRUE); + delete single_gram; + } + + g_array_free(items, TRUE); + return true; +} diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp new file mode 100644 index 0000000..4216b44 --- /dev/null +++ b/utils/storage/gen_binary_files.cpp @@ -0,0 +1,115 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +static const gchar * table_dir = "."; + +static GOptionEntry entries[] = +{ + {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL}, + {NULL} +}; + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate binary files"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + bool retval = system_table_info.load(filename); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + g_free(filename); + + /* generate pinyin index*/ + pinyin_option_t options = USE_TONE; + ChewingLargeTable chewing_table(options); + PhraseLargeTable2 phrase_table; + + /* generate phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + assert(table_info->m_dict_index == i); + + if (SYSTEM_FILE != table_info->m_file_type && + DICTIONARY != table_info->m_file_type) + continue; + + const char * tablename 
= table_info->m_table_filename; + + filename = g_build_filename(table_dir, tablename, NULL); + FILE * tablefile = fopen(filename, "r"); + + if (NULL == tablefile) { + fprintf(stderr, "open %s failed!\n", tablename); + exit(ENOENT); + } + + chewing_table.load_text(tablefile); + fseek(tablefile, 0L, SEEK_SET); + phrase_table.load_text(tablefile); + fseek(tablefile, 0L, SEEK_SET); + phrase_index.load_text(i, tablefile); + fclose(tablefile); + g_free(filename); + } + + MemoryChunk * new_chunk = new MemoryChunk; + chewing_table.store(new_chunk); + new_chunk->save(SYSTEM_PINYIN_INDEX); + chewing_table.load(new_chunk); + + new_chunk = new MemoryChunk; + phrase_table.store(new_chunk); + new_chunk->save(SYSTEM_PHRASE_INDEX); + phrase_table.load(new_chunk); + + phrase_index.compact(); + + if (!save_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + if (!save_dictionary(phrase_files, &phrase_index)) + exit(ENOENT); + + return 0; +} diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp new file mode 100644 index 0000000..3b541d1 --- /dev/null +++ b/utils/storage/gen_pinyin_table.cpp @@ -0,0 +1,330 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include <stdio.h> +#include <glib.h> +#include "pinyin_internal.h" + + +void print_help(){ + printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> \n" + "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n" + "<OUTPUTFILE> the result output file\n" + "<FILEi> input pinyin files\n" + "<PHRASE_INDEX> phrase index identifier\n"); +} + + +static gint phrase_index = 0; +static const gchar * outputfile = "temp.out"; + +static GOptionEntry entries[] = +{ + {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL}, + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL}, + {NULL} +}; + + +using namespace pinyin; + +/* map from phrase_item to GArray of chewing_and_freq_item */ +GTree * g_chewing_tree; +/* Array of GArray of phrase_and_array_item */ +GArray * g_item_array[MAX_PHRASE_LENGTH + 1]; + +struct phrase_item{ + size_t length; + gunichar * uniphrase; +}; + +struct chewing_and_freq_item{ + ChewingKeyVector keys; + ChewingKeyRestVector key_rests; + guint32 freq; +}; + +struct phrase_and_array_item{ + phrase_item phrase; /* the key of g_chewing_tree */ + /* Array of chewing_and_freq_item */ + GArray * chewing_and_freq_array; /* the value of g_chewing_tree */ +}; + + +void feed_file(const char * filename); + +void feed_line(const char * phrase, const char * pinyin, const guint32 freq); + +gboolean store_one_item(gpointer key, gpointer value, gpointer data); + +int phrase_array_compare(gconstpointer lhs, gconstpointer rhs, + gpointer userdata); + +void gen_phrase_file(const char * outputfile, int phrase_index); + + +gint phrase_item_compare(gconstpointer a, gconstpointer b){ + phrase_item * itema = (phrase_item *) a; + phrase_item * itemb = (phrase_item *) b; + if ( itema->length != itemb->length ) + return itema->length - itemb->length; + else + return memcmp(itema->uniphrase, itemb->uniphrase, + sizeof(gunichar) * itema->length); +} + + +int main(int argc, char * argv[]){ + int i; + + g_chewing_tree = 
g_tree_new(phrase_item_compare); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate pinyin table"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + for (i = 1; i < argc; ++i) { + feed_file(argv[i]); + } + + printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree)); + + /* store in item array */ + g_item_array[0] = NULL; + for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + g_item_array[i] = g_array_new + (FALSE, TRUE, sizeof(phrase_and_array_item)); + } + g_tree_foreach(g_chewing_tree, store_one_item, NULL); + + /* sort item array */ + for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i); + } + + gen_phrase_file(outputfile, phrase_index); + + return 0; +} + +void feed_file ( const char * filename){ + char phrase[1024], pinyin[1024]; + guint32 freq; + + FILE * infile = fopen(filename, "r"); + if ( NULL == infile ){ + fprintf(stderr, "Can't open file %s.\n", filename); + exit(ENOENT); + } + + while ( !feof(infile)){ + int num = fscanf(infile, "%s %s %u", + phrase, pinyin, &freq); + + if (3 != num) + continue; + + if (feof(infile)) + break; + + feed_line(phrase, pinyin, freq); + } + + fclose(infile); +} + +void feed_line(const char * phrase, const char * pinyin, const guint32 freq) { + phrase_item * item = new phrase_item; + item->length = g_utf8_strlen(phrase, -1); + + /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp + * where is the code which I don't want to touch. 
:-) + */ + + if (item->length >= MAX_PHRASE_LENGTH) { + fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq); + delete item; + return; + } + + item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); + + FullPinyinParser2 parser; + ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = g_array_new + (FALSE, FALSE, sizeof(ChewingKeyRest)); + + pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + assert(keys->len == key_rests->len); + + if (keys->len != item->length) { + fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq); + delete item; + return; + } + + GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item); + + chewing_and_freq_item value_item; + value_item.keys = keys; value_item.key_rests = key_rests; + value_item.freq = freq; + + assert(item->length == value_item.keys->len); + if (NULL == array) { + array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item)); + g_array_append_val(array, value_item); + g_tree_insert(g_chewing_tree, item, array); + return; + } + + bool found = false; + for (size_t i = 0; i < array->len; ++i) { + chewing_and_freq_item * cur_item = + &g_array_index(array, chewing_and_freq_item, i); + int result = pinyin_exact_compare2 + ((ChewingKey *) value_item.keys->data, + (ChewingKey *) cur_item->keys->data, + value_item.keys->len); + + if (0 == result) { + fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", + phrase, pinyin, freq); + cur_item->freq += freq; + found = true; + } + } + + if (!found) { + g_array_append_val(array, value_item); + g_tree_insert(g_chewing_tree, item, array); + } else { + /* clean up */ + g_array_free(keys, TRUE); + g_array_free(key_rests, TRUE); + } + + delete item; +} + + +gboolean store_one_item(gpointer key, gpointer value, gpointer data) { + phrase_and_array_item item; + item.phrase = *((phrase_item *) key); + 
item.chewing_and_freq_array = (GArray *) value; + int len = item.phrase.length; + g_array_append_val(g_item_array[len], item); + return FALSE; +} + + +int phrase_array_compare(gconstpointer lhs, gconstpointer rhs, + gpointer userdata) { + int phrase_length = *((int *) userdata); + phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs; + phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs; + + ChewingKeyVector keys_lhs = g_array_index + (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys; + ChewingKeyVector keys_rhs = g_array_index + (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys; + return pinyin_exact_compare2((ChewingKey *)keys_lhs->data, + (ChewingKey *)keys_rhs->data, phrase_length); +} + + +void gen_phrase_file(const char * outputfile, int phrase_index){ + FILE * outfile = fopen(outputfile, "w"); + if (NULL == outfile ) { + fprintf(stderr, "Can't write file %s.\n", outputfile); + exit(ENOENT); + } + + phrase_token_t token = 1; + + /* phrase length index */ + for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) { + GArray * item_array = g_item_array[i]; + + /* item array index */ + for (size_t m = 0; m < item_array->len; ++m) { + phrase_and_array_item * item = &g_array_index + (item_array, phrase_and_array_item, m); + phrase_item phrase = item->phrase; + GArray * chewing_and_freqs = item->chewing_and_freq_array; + + gchar * phrase_str = g_ucs4_to_utf8 + (phrase.uniphrase, phrase.length, NULL, NULL, NULL); + + /* iterate each pinyin */ + for (size_t n = 0; n < chewing_and_freqs->len; ++n) { + chewing_and_freq_item * chewing_and_freq = + &g_array_index + (chewing_and_freqs, chewing_and_freq_item, n); + + ChewingKeyVector keys = chewing_and_freq->keys; + ChewingKeyRestVector key_rests = chewing_and_freq->key_rests; + + GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *)); + gchar * pinyin = NULL; + + size_t k; + for (k = 0; k < keys->len; ++k) { + ChewingKey key = g_array_index(keys, ChewingKey, k); + 
ChewingKeyRest key_rest = g_array_index + (key_rests, ChewingKeyRest, k); + + //assert (CHEWING_ZERO_TONE != key.m_tone); + pinyin = key.get_pinyin_string(); + g_array_append_val(pinyins, pinyin); + } + gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data); + + for (k = 0; k < pinyins->len; ++k) { + g_free(g_array_index(pinyins, gchar *, k)); + } + g_array_free(pinyins, TRUE); + + guint32 freq = chewing_and_freq->freq; + + /* avoid zero freq */ + if (freq < 3) freq = 3; + + fprintf(outfile, "%s\t%s\t%d\t%d\n", + pinyin_str, phrase_str, + PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq); + + g_free(pinyin_str); + } + g_free(phrase_str); + token++; + } + } + + fclose(outfile); +} diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp new file mode 100644 index 0000000..205a27a --- /dev/null +++ b/utils/storage/import_interpolation.cpp @@ -0,0 +1,313 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <locale.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +static const gchar * table_dir = "."; + +static GOptionEntry entries[] = +{ + {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL}, + {NULL} +}; + + +enum LINE_TYPE{ + BEGIN_LINE = 1, + END_LINE, + GRAM_1_LINE, + GRAM_2_LINE, + GRAM_1_ITEM_LINE, + GRAM_2_ITEM_LINE +}; + +static int line_type = 0; +static GPtrArray * values = NULL; +static GHashTable * required = NULL; +/* variables for line buffer. */ +static char * linebuf = NULL; +static size_t len = 0; + +bool parse_headline(); + +bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index); + +bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * bigram); + +static ssize_t my_getline(FILE * input){ + ssize_t result = getline(&linebuf, &len, input); + if ( result == -1 ) + return result; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + return result; +} + +bool parse_headline(){ + /* enter "\data" line */ + assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "")); + + /* read "\data" line */ + if ( !taglib_read(linebuf, line_type, values, required) ) { + fprintf(stderr, "error: interpolation model expected.\n"); + return false; + } + + assert(line_type == BEGIN_LINE); + /* check header */ + TAGLIB_GET_TAGVALUE(const char *, model, (const char *)); + if ( !( strcmp("interpolation", model) == 0 ) ) { + fprintf(stderr, "error: interpolation model expected.\n"); + return false; + } + return true; +} + +bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", 
"")); + + do { + retry: + assert(taglib_read(linebuf, line_type, values, required)); + switch(line_type) { + case END_LINE: + goto end; + case GRAM_1_LINE: + my_getline(input); + parse_unigram(input, phrase_table, phrase_index); + goto retry; + case GRAM_2_LINE: + my_getline(input); + parse_bigram(input, phrase_table, phrase_index, bigram); + goto retry; + default: + assert(false); + } + } while (my_getline(input) != -1) ; + + end: + taglib_pop_state(); + return true; +} + +bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", "")); + + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_1_ITEM_LINE:{ + /* handle \item in \1-gram */ + TAGLIB_GET_TOKEN(token, 0); + TAGLIB_GET_PHRASE_STRING(word, 1); + assert(taglib_validate_token_with_string + (phrase_index, token, word)); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + phrase_index->add_unigram_frequency(token, count); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + taglib_pop_state(); + return true; +} + +bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", "")); + + phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL; + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_2_ITEM_LINE:{ + /* handle \item in \2-gram */ + /* two tokens */ + TAGLIB_GET_TOKEN(token1, 0); + TAGLIB_GET_PHRASE_STRING(word1, 1); + assert(taglib_validate_token_with_string + (phrase_index, token1, word1)); + + TAGLIB_GET_TOKEN(token2, 2); + TAGLIB_GET_PHRASE_STRING(word2, 3); + assert(taglib_validate_token_with_string + 
(phrase_index, token2, word2)); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + + if ( last_token != token1 ) { + if ( last_token && last_single_gram ) { + bigram->store(last_token, last_single_gram); + delete last_single_gram; + + /* safe guard */ + last_token = null_token; + last_single_gram = NULL; + } + SingleGram * single_gram = NULL; + bigram->load(token1, single_gram); + + /* create the new single gram */ + if ( single_gram == NULL ) + single_gram = new SingleGram; + last_token = token1; + last_single_gram = single_gram; + } + + /* save the freq */ + assert(NULL != last_single_gram); + guint32 total_freq = 0; + assert(last_single_gram->get_total_freq(total_freq)); + assert(last_single_gram->insert_freq(token2, count)); + total_freq += count; + assert(last_single_gram->set_total_freq(total_freq)); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + if ( last_token && last_single_gram ) { + bigram->store(last_token, last_single_gram); + delete last_single_gram; + //safe guard + last_token = 0; + last_single_gram = NULL; + } + + taglib_pop_state(); + return true; +} + +int main(int argc, char * argv[]){ + FILE * input = stdin; + const char * bigram_filename = SYSTEM_BIGRAM; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- import interpolation model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + bool retval = system_table_info.load(filename); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + g_free(filename); + + PhraseLargeTable2 phrase_table; + + MemoryChunk * chunk = new 
MemoryChunk; + retval = chunk->load(SYSTEM_PHRASE_INDEX); + if (!retval) { + fprintf(stderr, "open phrase_index.bin failed!\n"); + exit(ENOENT); + } + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram bigram; + retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bigram_filename); + exit(ENOENT); + } + + taglib_init(); + + values = g_ptr_array_new(); + required = g_hash_table_new(g_str_hash, g_str_equal); + + /* read first line */ + ssize_t result = my_getline(input); + if ( result == -1 ) { + fprintf(stderr, "empty file input.\n"); + exit(ENODATA); + } + + if (!parse_headline()) + exit(ENODATA); + + result = my_getline(input); + if ( result != -1 ) + parse_body(input, &phrase_table, &phrase_index, &bigram); + + taglib_fini(); + + if (!save_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + return 0; +} diff --git a/utils/training/CMakeLists.txt b/utils/training/CMakeLists.txt new file mode 100644 index 0000000..ee59bcd --- /dev/null +++ b/utils/training/CMakeLists.txt @@ -0,0 +1,129 @@ +add_executable( + gen_ngram + gen_ngram.cpp +) + +target_link_libraries( + gen_ngram + libpinyin +) + +add_executable( + gen_deleted_ngram + gen_deleted_ngram.cpp +) + +target_link_libraries( + gen_deleted_ngram + libpinyin +) + +add_executable( + gen_unigram + gen_unigram.cpp +) + +target_link_libraries( + gen_unigram + libpinyin +) + +add_executable( + gen_k_mixture_model + gen_k_mixture_model.cpp +) + +target_link_libraries( + gen_k_mixture_model + libpinyin +) + +add_executable( + estimate_interpolation + estimate_interpolation.cpp +) + +target_link_libraries( + estimate_interpolation + libpinyin +) + +add_executable( + estimate_k_mixture_model + estimate_k_mixture_model.cpp +) + +target_link_libraries( + 
estimate_k_mixture_model + libpinyin +) + +add_executable( + merge_k_mixture_model + merge_k_mixture_model.cpp +) + +target_link_libraries( + merge_k_mixture_model + libpinyin +) + +add_executable( + prune_k_mixture_model + prune_k_mixture_model.cpp +) + +target_link_libraries( + prune_k_mixture_model + libpinyin +) + +add_executable( + import_k_mixture_model + import_k_mixture_model.cpp +) + +target_link_libraries( + import_k_mixture_model + libpinyin +) + +add_executable( + export_k_mixture_model + export_k_mixture_model.cpp +) + +target_link_libraries( + export_k_mixture_model + libpinyin +) + +add_executable( + k_mixture_model_to_interpolation + k_mixture_model_to_interpolation.cpp +) + +target_link_libraries( + k_mixture_model_to_interpolation + libpinyin +) + +add_executable( + validate_k_mixture_model + validate_k_mixture_model.cpp +) + +target_link_libraries( + validate_k_mixture_model + libpinyin +) + +add_executable( + eval_correction_rate + eval_correction_rate.cpp +) + +target_link_libraries( + eval_correction_rate + libpinyin +)
\ No newline at end of file diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am new file mode 100644 index 0000000..dc834ec --- /dev/null +++ b/utils/training/Makefile.am @@ -0,0 +1,97 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ + @GLIB2_CFLAGS@ + +noinst_HEADERS = k_mixture_model.h + +bin_PROGRAMS = gen_unigram + +noinst_PROGRAMS = gen_ngram \ + gen_deleted_ngram \ + gen_k_mixture_model \ + estimate_interpolation \ + estimate_k_mixture_model \ + merge_k_mixture_model \ + prune_k_mixture_model \ + import_k_mixture_model \ + export_k_mixture_model \ + k_mixture_model_to_interpolation \ + validate_k_mixture_model \ + eval_correction_rate + +gen_ngram_SOURCES = gen_ngram.cpp + +gen_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_deleted_ngram_SOURCES = gen_deleted_ngram.cpp + +gen_deleted_ngram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_unigram_SOURCES = gen_unigram.cpp + +gen_unigram_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +gen_k_mixture_model_SOURCES = gen_k_mixture_model.cpp + +gen_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +estimate_interpolation_SOURCES = estimate_interpolation.cpp + +estimate_interpolation_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +estimate_k_mixture_model_SOURCES = estimate_k_mixture_model.cpp + +estimate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +merge_k_mixture_model_SOURCES = merge_k_mixture_model.cpp + +merge_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +prune_k_mixture_model_SOURCES = prune_k_mixture_model.cpp + +prune_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +import_k_mixture_model_SOURCES = import_k_mixture_model.cpp + +import_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +export_k_mixture_model_SOURCES = export_k_mixture_model.cpp + +export_k_mixture_model_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ + +k_mixture_model_to_interpolation_SOURCES = 
k_mixture_model_to_interpolation.cpp + +k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +validate_k_mixture_model_SOURCES = validate_k_mixture_model.cpp + +validate_k_mixture_model_LDADD = ../../src/libpinyin_internal.la \ + @GLIB2_LIBS@ + +eval_correction_rate_SOURCES = eval_correction_rate.cpp + +eval_correction_rate_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@ diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp new file mode 100644 index 0000000..5cdc680 --- /dev/null +++ b/utils/training/estimate_interpolation.cpp @@ -0,0 +1,144 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2008 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> +#include <math.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +parameter_t compute_interpolation(SingleGram * deleted_bigram, + FacadePhraseIndex * unigram, + SingleGram * bigram){ + bool success; + parameter_t lambda = 0, next_lambda = 0.6; + parameter_t epsilon = 0.001; + + while ( fabs(lambda - next_lambda) > epsilon){ + lambda = next_lambda; + next_lambda = 0; + guint32 table_num = 0; + parameter_t numerator = 0; + parameter_t part_of_denominator = 0; + + BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); + deleted_bigram->retrieve_all(array); + + for ( int i = 0; i < array->len; ++i){ + BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, i); + //get the phrase token + phrase_token_t token = item->m_token; + guint32 deleted_count = item->m_count; + + { + guint32 freq = 0; + parameter_t elem_poss = 0; + if (bigram && bigram->get_freq(token, freq)){ + guint32 total_freq; + assert(bigram->get_total_freq(total_freq)); + assert(0 != total_freq); + elem_poss = freq / (parameter_t) total_freq; + } + numerator = lambda * elem_poss; + } + + { + parameter_t elem_poss = 0; + PhraseItem item; + if (!unigram->get_phrase_item(token, item)){ + guint32 freq = item.get_unigram_frequency(); + guint32 total_freq = unigram->get_phrase_index_total_freq(); + elem_poss = freq / (parameter_t)total_freq; + } + part_of_denominator = (1 - lambda) * elem_poss; + } + + if (0 == (numerator + part_of_denominator)) + continue; + + next_lambda += deleted_count * (numerator / (numerator + part_of_denominator)); + } + assert(deleted_bigram->get_total_freq(table_num)); + next_lambda /= table_num; + + g_array_free(array, TRUE); + } + lambda = next_lambda; + return lambda; +} + +int main(int argc, char * argv[]){ + SystemTableInfo system_table_info; + + bool retval = 
system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram bigram; + bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); + + Bigram deleted_bigram; + deleted_bigram.attach(DELETED_BIGRAM, ATTACH_READONLY); + + GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + deleted_bigram.get_all_items(deleted_items); + + parameter_t lambda_sum = 0; + int lambda_count = 0; + + for ( int i = 0; i < deleted_items->len; ++i ){ + phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); + SingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + SingleGram * deleted_single_gram = NULL; + deleted_bigram.load(*token, deleted_single_gram); + + parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram); + + printf("token:%d lambda:%f\n", *token, lambda); + + lambda_sum += lambda; + lambda_count ++; + + if (single_gram) + delete single_gram; + delete deleted_single_gram; + } + + printf("average lambda:%f\n", (lambda_sum/lambda_count)); + g_array_free(deleted_items, TRUE); + return 0; +} + diff --git a/utils/training/estimate_k_mixture_model.cpp b/utils/training/estimate_k_mixture_model.cpp new file mode 100644 index 0000000..c0fa03f --- /dev/null +++ b/utils/training/estimate_k_mixture_model.cpp @@ -0,0 +1,159 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <locale.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" + +static const gchar * bigram_filename = "k_mixture_model_ngram.db"; +static const gchar * deleted_bigram_filename = "k_mixture_model_deleted_ngram.db"; + +static GOptionEntry entries[] = +{ + {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "the bigram file", NULL}, + {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &deleted_bigram_filename, "the deleted bigram file", NULL}, + {NULL} +}; + + +parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram, + KMixtureModelBigram * unigram, + KMixtureModelSingleGram * bigram){ + bool success; + parameter_t lambda = 0, next_lambda = 0.6; + parameter_t epsilon = 0.001; + + KMixtureModelMagicHeader magic_header; + assert(unigram->get_magic_header(magic_header)); + assert(0 != magic_header.m_total_freq); + + while (fabs(lambda - next_lambda) > epsilon){ + lambda = next_lambda; + next_lambda = 0; + parameter_t numerator = 0; + parameter_t part_of_denominator = 0; + + FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + deleted_bigram->retrieve_all(array); + + for ( size_t i = 0; i < array->len; ++i){ + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i); + //get the phrase token + phrase_token_t token = item->m_token; + guint32 deleted_count = item->m_item.m_WC; + + { + parameter_t elem_poss = 0; + KMixtureModelArrayHeader array_header; + 
KMixtureModelArrayItem array_item; + if ( bigram && bigram->get_array_item(token, array_item) ){ + assert(bigram->get_array_header(array_header)); + assert(0 != array_header.m_WC); + elem_poss = array_item.m_WC / (parameter_t) array_header.m_WC; + } + numerator = lambda * elem_poss; + } + + { + parameter_t elem_poss = 0; + KMixtureModelArrayHeader array_header; + if (unigram->get_array_header(token, array_header)){ + elem_poss = array_header.m_freq / (parameter_t) magic_header.m_total_freq; + } + part_of_denominator = (1 - lambda) * elem_poss; + } + if (0 == (numerator + part_of_denominator)) + continue; + + next_lambda += deleted_count * (numerator / (numerator + part_of_denominator)); + } + KMixtureModelArrayHeader header; + assert(deleted_bigram->get_array_header(header)); + assert(0 != header.m_WC); + next_lambda /= header.m_WC; + + g_array_free(array, TRUE); + } + lambda = next_lambda; + return lambda; +} + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- estimate k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + /* TODO: magic header signature check here. 
*/ + KMixtureModelBigram unigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + unigram.attach(bigram_filename, ATTACH_READONLY); + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(bigram_filename, ATTACH_READONLY); + + KMixtureModelBigram deleted_bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + deleted_bigram.attach(deleted_bigram_filename, ATTACH_READONLY); + + GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + deleted_bigram.get_all_items(deleted_items); + + parameter_t lambda_sum = 0; + int lambda_count = 0; + + for( size_t i = 0; i < deleted_items->len; ++i ){ + phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + KMixtureModelSingleGram * deleted_single_gram = NULL; + deleted_bigram.load(*token, deleted_single_gram); + + KMixtureModelArrayHeader array_header; + if (single_gram) + assert(single_gram->get_array_header(array_header)); + KMixtureModelArrayHeader deleted_array_header; + assert(deleted_single_gram->get_array_header(deleted_array_header)); + + if ( 0 != deleted_array_header.m_WC ) { + parameter_t lambda = compute_interpolation(deleted_single_gram, &unigram, single_gram); + + printf("token:%d lambda:%f\n", *token, lambda); + + lambda_sum += lambda; + lambda_count ++; + } + + if (single_gram) + delete single_gram; + delete deleted_single_gram; + } + + printf("average lambda:%f\n", (lambda_sum/lambda_count)); + g_array_free(deleted_items, TRUE); + return 0; +} diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp new file mode 100644 index 0000000..b45781d --- /dev/null +++ b/utils/training/eval_correction_rate.cpp @@ -0,0 +1,211 @@ +/* + * libpinyin + * Library to deal with pinyin. 
+ * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: eval_correction_rate\n"); +} + +bool get_possible_pinyin(FacadePhraseIndex * phrase_index, + TokenVector tokens, ChewingKeyVector keys){ + ChewingKey buffer[MAX_PHRASE_LENGTH]; + size_t key_index; guint32 max_freq; + guint32 freq; + g_array_set_size(keys, 0); + + for (size_t i = 0; i < tokens->len; ++i){ + phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i); + PhraseItem item; + phrase_index->get_phrase_item(*token, item); + key_index = 0; max_freq = 0; + for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) { + freq = 0; + assert(item.get_nth_pronunciation(m, buffer, freq)); + if ( freq > max_freq ) { + key_index = m; + max_freq = freq; + } + } + + assert(item.get_nth_pronunciation(key_index, buffer, freq)); + assert(max_freq == freq); + guint8 len = item.get_phrase_length(); + g_array_append_vals(keys, buffer, len); + } + return true; +} + +bool get_best_match(PinyinLookup2 * pinyin_lookup, + ChewingKeyVector keys, TokenVector tokens){ + /* prepare the prefixes for get_best_match. 
*/ + TokenVector prefixes = g_array_new + (FALSE, FALSE, sizeof(phrase_token_t)); + g_array_append_val(prefixes, sentence_start); + + /* initialize constraints. */ + CandidateConstraints constraints = g_array_new + (TRUE, FALSE, sizeof(lookup_constraint_t)); + g_array_set_size(constraints, keys->len); + for ( size_t i = 0; i < constraints->len; ++i ) { + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + bool retval = pinyin_lookup->get_best_match(prefixes, keys, constraints, tokens); + + g_array_free(prefixes, TRUE); + g_array_free(constraints, TRUE); + return retval; +} + +bool do_one_test(PinyinLookup2 * pinyin_lookup, + FacadePhraseIndex * phrase_index, + TokenVector tokens){ + bool retval = false; + + ChewingKeyVector keys = g_array_new(FALSE, TRUE, sizeof(ChewingKey)); + TokenVector guessed_tokens = g_array_new + (FALSE, TRUE, sizeof(phrase_token_t)); + + get_possible_pinyin(phrase_index, tokens, keys); + get_best_match(pinyin_lookup, keys, guessed_tokens); + /* compare the results */ + char * sentence = NULL; char * guessed_sentence = NULL; + pinyin_lookup->convert_to_utf8(tokens, sentence); + pinyin_lookup->convert_to_utf8 + (guessed_tokens, guessed_sentence); + + if ( strcmp(sentence, guessed_sentence) != 0 ) { + fprintf(stderr, "test sentence:%s\n", sentence); + fprintf(stderr, "guessed sentence:%s\n", guessed_sentence); + fprintf(stderr, "the result mis-matches.\n"); + retval = false; + } else { + retval = true; + } + + g_free(sentence); g_free(guessed_sentence); + g_array_free(keys, TRUE); + g_array_free(guessed_tokens, TRUE); + return retval; +} + +int main(int argc, char * argv[]){ + const char * evals_text = "evals2.text"; + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + pinyin_option_t options = USE_TONE; + FacadeChewingTable 
largetable; + + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PINYIN_INDEX); + largetable.load(options, chunk, NULL); + + FacadePhraseTable2 phrase_table; + chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram system_bigram; + system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); + Bigram user_bigram; + user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); + + gfloat lambda = system_table_info.get_lambda(); + + PinyinLookup2 pinyin_lookup(lambda, options, + &largetable, &phrase_index, + &system_bigram, &user_bigram); + + /* open evals text. */ + FILE * evals_file = fopen(evals_text, "r"); + if ( NULL == evals_file ) { + fprintf(stderr, "Can't open file:%s\n", evals_text); + exit(ENOENT); + } + + /* Evaluates the correction rate of test text documents. */ + size_t tested_count = 0; size_t passed_count = 0; + char* linebuf = NULL; size_t size = 0; + TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t)); + + phrase_token_t token = null_token; + while( getline(&linebuf, &size, evals_file) ) { + if ( feof(evals_file) ) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); + + if ( null_token == token ) { + if ( tokens->len ) { /* one test. */ + if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { + tested_count ++; passed_count ++; + } else { + tested_count ++; + } + g_array_set_size(tokens, 0); + } + } else { + g_array_append_val(tokens, token); + } + } + + if ( tokens->len ) { /* one test. 
*/ + if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { + tested_count ++; passed_count ++; + } else { + tested_count ++; + } + } + + parameter_t rate = passed_count / (parameter_t) tested_count; + printf("correction rate:%f\n", rate); + + g_array_free(tokens, TRUE); + fclose(evals_file); + free(linebuf); + + return 0; +} diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp new file mode 100644 index 0000000..e446e79 --- /dev/null +++ b/utils/training/export_k_mixture_model.cpp @@ -0,0 +1,156 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <locale.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" +#include "utils_helper.h" + +static const gchar * k_mixture_model_filename = NULL; + +static GOptionEntry entries[] = +{ + {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL}, + {NULL} +}; + + +bool print_k_mixture_model_magic_header(FILE * output, + KMixtureModelBigram * bigram){ + KMixtureModelMagicHeader magic_header; + if ( !bigram->get_magic_header(magic_header) ){ + fprintf(stderr, "no magic header in k mixture model.\n"); + exit(ENODATA); + } + fprintf(output, "\\data model \"k mixture model\" count %d N %d " + "total_freq %d\n", magic_header.m_WC, magic_header.m_N, + magic_header.m_total_freq); + return true; +} + +bool print_k_mixture_model_array_headers(FILE * output, + KMixtureModelBigram * bigram, + FacadePhraseIndex * phrase_index){ + fprintf(output, "\\1-gram\n"); + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram->get_all_items(items); + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t token = g_array_index(items, phrase_token_t, i); + KMixtureModelArrayHeader array_header; + assert(bigram->get_array_header(token, array_header)); + char * phrase = taglib_token_to_string(phrase_index, token); + if ( phrase ) + fprintf(output, "\\item %d %s count %d freq %d\n", + token, phrase, array_header.m_WC, array_header.m_freq); + + g_free(phrase); + } + return true; +} + +bool print_k_mixture_model_array_items(FILE * output, + KMixtureModelBigram * bigram, + FacadePhraseIndex * phrase_index){ + fprintf(output, "\\2-gram\n"); + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram->get_all_items(items); + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t token = g_array_index(items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + assert(bigram->load(token, single_gram)); + FlexibleBigramPhraseArray array = g_array_new + 
(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + single_gram->retrieve_all(array); + + for (size_t m = 0; m < array->len; ++m){ + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m); + char * word1 = taglib_token_to_string(phrase_index, token); + char * word2 = taglib_token_to_string(phrase_index, item->m_token); + + if (word1 && word2) + fprintf(output, "\\item %d %s %d %s count %d T %d N_n_0 %d n_1 %d Mr %d\n", + token, word1, item->m_token, word2, + item->m_item.m_WC, item->m_item.m_WC, + item->m_item.m_N_n_0, item->m_item.m_n_1, + item->m_item.m_Mr); + + g_free(word1); g_free(word2); + } + + g_array_free(array, TRUE); + delete single_gram; + } + + g_array_free(items, TRUE); + return true; +} + +bool end_data(FILE * output){ + fprintf(output, "\\end\n"); + return true; +} + +int main(int argc, char * argv[]){ + FILE * output = stdout; + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- export k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + if (!bigram.attach(k_mixture_model_filename, ATTACH_READONLY)) { + fprintf(stderr, "open %s failed.\n", k_mixture_model_filename); + exit(ENOENT); + } + + print_k_mixture_model_magic_header(output, &bigram); + print_k_mixture_model_array_headers(output, &bigram, &phrase_index); + 
print_k_mixture_model_array_items(output, &bigram, &phrase_index); + + end_data(output); + + return 0; +} diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp new file mode 100644 index 0000000..b6f96fa --- /dev/null +++ b/utils/training/gen_deleted_ngram.cpp @@ -0,0 +1,128 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007, 2011 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +static gboolean train_pi_gram = TRUE; +static const gchar * bigram_filename = DELETED_BIGRAM; + +static GOptionEntry entries[] = +{ + {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL}, + {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "deleted bi-gram file", NULL}, + {NULL} +}; + + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate deleted n-gram"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* load phrase table. */ + PhraseLargeTable2 phrase_table; + MemoryChunk * new_chunk = new MemoryChunk; + new_chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(new_chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENODATA); + + Bigram bigram; + bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); + + char* linebuf = NULL; size_t size = 0; + phrase_token_t last_token, cur_token = last_token = 0; + while( getline(&linebuf, &size, stdin) ){ + if ( feof(stdin) ) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); + + last_token = cur_token; + cur_token = token; + + /* skip null_token in second word. 
*/ + if ( null_token == cur_token ) + continue; + + /* skip pi-gram training. */ + if ( null_token == last_token ){ + if ( !train_pi_gram ) + continue; + last_token = sentence_start; + } + + /* train bi-gram */ + SingleGram * single_gram = NULL; + bigram.load(last_token, single_gram); + + if ( NULL == single_gram ){ + single_gram = new SingleGram; + } + guint32 freq, total_freq; + //increase freq + if (single_gram->get_freq(cur_token, freq)) + assert(single_gram->set_freq(cur_token, freq + 1)); + else + assert(single_gram->insert_freq(cur_token, 1)); + //increase total freq + single_gram->get_total_freq(total_freq); + single_gram->set_total_freq(total_freq + 1); + + bigram.store(last_token, single_gram); + delete single_gram; + } + + free(linebuf); + return 0; +} diff --git a/utils/training/gen_k_mixture_model.cpp b/utils/training/gen_k_mixture_model.cpp new file mode 100644 index 0000000..2dfb3d1 --- /dev/null +++ b/utils/training/gen_k_mixture_model.cpp @@ -0,0 +1,411 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#include <glib.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" +#include "k_mixture_model.h" + +/* Hash token of Hash token of word count. */ +typedef GHashTable * HashofDocument; +typedef GHashTable * HashofSecondWord; + +typedef GHashTable * HashofUnigram; + + +void print_help(){ + printf("Usage: gen_k_mixture_model [--skip-pi-gram-training]\n" + " [--maximum-occurs-allowed <INT>]\n" + " [--maximum-increase-rates-allowed <FLOAT>]\n" + " [--k-mixture-model-file <FILENAME>]\n" + " {<FILENAME>}+\n"); +} + + +static gint g_maximum_occurs = 20; +static parameter_t g_maximum_increase_rates = 3.; +static gboolean g_train_pi_gram = TRUE; +static const gchar * g_k_mixture_model_filename = NULL; + +static GOptionEntry entries[] = +{ + {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &g_train_pi_gram, "skip pi-gram training", NULL}, + {"maximum-occurs-allowed", 0, 0, G_OPTION_ARG_INT, &g_maximum_occurs, "maximum occurs allowed", NULL}, + {"maximum-increase-rates-allowed", 0, 0, G_OPTION_ARG_DOUBLE, &g_maximum_increase_rates, "maximum increase rates allowed", NULL}, + {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &g_k_mixture_model_filename, "k mixture model file", NULL}, + {NULL} +}; + + +bool read_document(PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + FILE * document, + HashofDocument hash_of_document, + HashofUnigram hash_of_unigram){ + + char * linebuf = NULL;size_t size = 0; + phrase_token_t last_token, cur_token = last_token = 0; + + while ( getline(&linebuf, &size, document) ){ + if ( feof(document) ) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); + + last_token = cur_token; + cur_token = token; + + /* skip null_token in second word. 
*/ + if ( null_token == cur_token ) + continue; + + gpointer value = NULL; + gboolean lookup_result = g_hash_table_lookup_extended + (hash_of_unigram, GUINT_TO_POINTER(cur_token), + NULL, &value); + if ( !lookup_result ){ + g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token), + GUINT_TO_POINTER(1)); + } else { + guint32 freq = GPOINTER_TO_UINT(value); + freq ++; + g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token), + GUINT_TO_POINTER(freq)); + } + + /* skip pi-gram training. */ + if ( null_token == last_token ){ + if ( !g_train_pi_gram ) + continue; + last_token = sentence_start; + } + + /* remember the (last_token, cur_token) word pair. */ + HashofSecondWord hash_of_second_word = NULL; + lookup_result = g_hash_table_lookup_extended + (hash_of_document, GUINT_TO_POINTER(last_token), + NULL, &value); + if ( !lookup_result ){ + hash_of_second_word = g_hash_table_new + (g_direct_hash, g_direct_equal); + } else { + hash_of_second_word = (HashofSecondWord) value; + } + + value = NULL; + lookup_result = g_hash_table_lookup_extended + (hash_of_second_word, GUINT_TO_POINTER(cur_token), + NULL, &value); + guint32 count = 0; + if ( lookup_result ) { + count = GPOINTER_TO_UINT(value); + } + count ++; + g_hash_table_insert(hash_of_second_word, + GUINT_TO_POINTER(cur_token), + GUINT_TO_POINTER(count)); + g_hash_table_insert(hash_of_document, + GUINT_TO_POINTER(last_token), + hash_of_second_word); + } + + free(linebuf); + + return true; +} + +static void train_word_pair(HashofUnigram hash_of_unigram, + KMixtureModelSingleGram * single_gram, + phrase_token_t token2, guint32 count){ + KMixtureModelArrayItem array_item; + + bool exists = single_gram->get_array_item(token2, array_item); + if ( exists ) { + guint32 maximum_occurs_allowed = std_lite::max + ((guint32)g_maximum_occurs, + (guint32)ceil(array_item.m_Mr * g_maximum_increase_rates)); + /* Exceeds the maximum occurs allowed of the word or phrase, + * in a single document. 
+ */ + if ( count > maximum_occurs_allowed ){ + gpointer value = NULL; + assert( g_hash_table_lookup_extended + (hash_of_unigram, GUINT_TO_POINTER(token2), + NULL, &value) ); + guint32 freq = GPOINTER_TO_UINT(value); + freq -= count; + if ( freq > 0 ) { + g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2), + GUINT_TO_POINTER(freq)); + } else if ( freq == 0 ) { + assert(g_hash_table_steal(hash_of_unigram, + GUINT_TO_POINTER(token2))); + } else { + assert(false); + } + return; + } + array_item.m_WC += count; + /* array_item.m_T += count; the same as m_WC. */ + array_item.m_N_n_0 ++; + if ( 1 == count ) + array_item.m_n_1 ++; + array_item.m_Mr = std_lite::max(array_item.m_Mr, count); + assert(single_gram->set_array_item(token2, array_item)); + } else { /* item doesn't exist. */ + /* the same as above. */ + if ( count > g_maximum_occurs ){ + gpointer value = NULL; + assert( g_hash_table_lookup_extended + (hash_of_unigram, GUINT_TO_POINTER(token2), + NULL, &value) ); + guint32 freq = GPOINTER_TO_UINT(value); + freq -= count; + if ( freq > 0 ) { + g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(token2), + GUINT_TO_POINTER(freq)); + } else if ( freq == 0 ) { + assert(g_hash_table_steal(hash_of_unigram, + GUINT_TO_POINTER(token2))); + } else { + assert(false); + } + return; + } + memset(&array_item, 0, sizeof(KMixtureModelArrayItem)); + array_item.m_WC = count; + /* array_item.m_T = count; the same as m_WC. */ + array_item.m_N_n_0 = 1; + if ( 1 == count ) + array_item.m_n_1 = 1; + array_item.m_Mr = count; + assert(single_gram->insert_array_item(token2, array_item)); + } + + /* save delta in the array header. 
*/ + KMixtureModelArrayHeader array_header; + single_gram->get_array_header(array_header); + array_header.m_WC += count; + single_gram->set_array_header(array_header); +} + +bool train_single_gram(HashofUnigram hash_of_unigram, + HashofDocument hash_of_document, + KMixtureModelSingleGram * single_gram, + phrase_token_t token1, + guint32 & delta){ + assert(NULL != single_gram); + delta = 0; /* delta in WC of single_gram. */ + KMixtureModelArrayHeader array_header; + assert(single_gram->get_array_header(array_header)); + guint32 saved_array_header_WC = array_header.m_WC; + + HashofSecondWord hash_of_second_word = NULL; + gpointer key, value = NULL; + assert(g_hash_table_lookup_extended + (hash_of_document, GUINT_TO_POINTER(token1), + NULL, &value)); + hash_of_second_word = (HashofSecondWord) value; + assert(NULL != hash_of_second_word); + + /* train word pair */ + GHashTableIter iter; + g_hash_table_iter_init(&iter, hash_of_second_word); + while (g_hash_table_iter_next(&iter, &key, &value)) { + phrase_token_t token2 = GPOINTER_TO_UINT(key); + guint32 count = GPOINTER_TO_UINT(value); + train_word_pair(hash_of_unigram, single_gram, token2, count); + } + + assert(single_gram->get_array_header(array_header)); + delta = array_header.m_WC - saved_array_header_WC; + return true; +} + +static bool train_second_word(HashofUnigram hash_of_unigram, + KMixtureModelBigram * bigram, + HashofDocument hash_of_document, + phrase_token_t token1){ + guint32 delta = 0; + + KMixtureModelSingleGram * single_gram = NULL; + bool exists = bigram->load(token1, single_gram); + if ( !exists ) + single_gram = new KMixtureModelSingleGram; + train_single_gram(hash_of_unigram, hash_of_document, + single_gram, token1, delta); + + if ( 0 == delta ){ /* Please consider maximum occurs allowed. */ + delete single_gram; + return false; + } + + /* save the single gram. 
*/ + assert(bigram->store(token1, single_gram)); + delete single_gram; + + KMixtureModelMagicHeader magic_header; + if (!bigram->get_magic_header(magic_header)){ + /* the first time to access the new k mixture model file. */ + memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader)); + } + + if ( magic_header.m_WC + delta < magic_header.m_WC ){ + fprintf(stderr, "the m_WC integer in magic header overflows.\n"); + return false; + } + magic_header.m_WC += delta; + assert(bigram->set_magic_header(magic_header)); + + return true; +} + +/* Note: this method is a post-processing method, run this last. */ +static bool post_processing_unigram(KMixtureModelBigram * bigram, + HashofUnigram hash_of_unigram){ + GHashTableIter iter; + gpointer key, value; + guint32 total_freq = 0; + + g_hash_table_iter_init(&iter, hash_of_unigram); + while (g_hash_table_iter_next(&iter, &key, &value)){ + guint32 token = GPOINTER_TO_UINT(key); + guint32 freq = GPOINTER_TO_UINT(value); + KMixtureModelArrayHeader array_header; + bool result = bigram->get_array_header(token, array_header); + array_header.m_freq += freq; + total_freq += freq; + bigram->set_array_header(token, array_header); + } + + KMixtureModelMagicHeader magic_header; + assert(bigram->get_magic_header(magic_header)); + if ( magic_header.m_total_freq + total_freq < magic_header.m_total_freq ){ + fprintf(stderr, "the m_total_freq in magic header overflows.\n"); + return false; + } + magic_header.m_total_freq += total_freq; + assert(bigram->set_magic_header(magic_header)); + + return true; +} + +int main(int argc, char * argv[]){ + int i = 1; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + 
bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + PhraseLargeTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); + + while ( i < argc ){ + const char * filename = argv[i]; + FILE * document = fopen(filename, "r"); + if ( NULL == document ){ + int err_saved = errno; + fprintf(stderr, "can't open file: %s.\n", filename); + fprintf(stderr, "error:%s.\n", strerror(err_saved)); + exit(err_saved); + } + + HashofDocument hash_of_document = g_hash_table_new + (g_direct_hash, g_direct_equal); + HashofUnigram hash_of_unigram = g_hash_table_new + (g_direct_hash, g_direct_equal); + + assert(read_document(&phrase_table, &phrase_index, document, + hash_of_document, hash_of_unigram)); + fclose(document); + document = NULL; + + GHashTableIter iter; + gpointer key, value; + + /* train the document, and convert it to k mixture model. 
*/ + g_hash_table_iter_init(&iter, hash_of_document); + while (g_hash_table_iter_next(&iter, &key, &value)) { + phrase_token_t token1 = GPOINTER_TO_UINT(key); + train_second_word(hash_of_unigram, &bigram, + hash_of_document, token1); + } + + KMixtureModelMagicHeader magic_header; + assert(bigram.get_magic_header(magic_header)); + magic_header.m_N ++; + assert(bigram.set_magic_header(magic_header)); + + post_processing_unigram(&bigram, hash_of_unigram); + + /* free resources of g_hash_of_document */ + g_hash_table_iter_init(&iter, hash_of_document); + while (g_hash_table_iter_next(&iter, &key, &value)) { + HashofSecondWord second_word = (HashofSecondWord) value; + g_hash_table_iter_steal(&iter); + g_hash_table_unref(second_word); + } + g_hash_table_unref(hash_of_document); + hash_of_document = NULL; + + g_hash_table_unref(hash_of_unigram); + hash_of_unigram = NULL; + + ++i; + } + + return 0; +} diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp new file mode 100644 index 0000000..1947959 --- /dev/null +++ b/utils/training/gen_ngram.cpp @@ -0,0 +1,136 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007, 2011 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +static gboolean train_pi_gram = TRUE; +static const gchar * bigram_filename = SYSTEM_BIGRAM; + +static GOptionEntry entries[] = +{ + {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL}, + {"bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "bi-gram file", NULL}, + {NULL} +}; + +int main(int argc, char * argv[]){ + FILE * input = stdin; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- generate n-gram"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + PhraseLargeTable2 phrase_table; + /* init phrase table */ + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + Bigram bigram; + bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); + + char* linebuf = NULL; size_t size = 0; + phrase_token_t last_token, cur_token = last_token = 0; + while( getline(&linebuf, &size, input) ){ + if ( feof(input) ) + break; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); + + last_token = cur_token; + cur_token = token; + + /* skip null_token in second word. 
*/ + if ( null_token == cur_token ) + continue; + + /* training uni-gram */ + phrase_index.add_unigram_frequency(cur_token, 1); + + /* skip pi-gram training. */ + if ( null_token == last_token ){ + if ( !train_pi_gram ) + continue; + last_token = sentence_start; + } + + /* train bi-gram */ + SingleGram * single_gram = NULL; + bigram.load(last_token, single_gram); + + if ( NULL == single_gram ){ + single_gram = new SingleGram; + } + guint32 freq, total_freq; + /* increase freq */ + if (single_gram->get_freq(cur_token, freq)) + assert(single_gram->set_freq(cur_token, freq + 1)); + else + assert(single_gram->insert_freq(cur_token, 1)); + /* increase total freq */ + single_gram->get_total_freq(total_freq); + single_gram->set_total_freq(total_freq + 1); + + bigram.store(last_token, single_gram); + delete single_gram; + } + + free(linebuf); + + if (!save_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + return 0; +} diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp new file mode 100644 index 0000000..f4c51af --- /dev/null +++ b/utils/training/gen_unigram.cpp @@ -0,0 +1,111 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + +static const gchar * table_dir = "."; + +static GOptionEntry entries[] = +{ + {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL}, + {NULL} +}; + +/* increase all unigram frequency by a constant. */ + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- increase uni-gram"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); + bool retval = system_table_info.load(filename); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + g_free(filename); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + /* Note: please increase the value when corpus size becomes larger. + * To avoid zero value when computing unigram frequency in float format. + */ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + assert(table_info->m_dict_index == i); + + if (SYSTEM_FILE != table_info->m_file_type && + DICTIONARY != table_info->m_file_type) + continue; + + guint32 freq = 1; +#if 0 + /* skip GBK_DICTIONARY. 
*/ + if (GBK_DICTIONARY == table_info->m_dict_index) + freq = 1; +#endif + + const char * binfile = table_info->m_system_filename; + + MemoryChunk * chunk = new MemoryChunk; + bool retval = chunk->load(binfile); + if (!retval) { + fprintf(stderr, "load %s failed!\n", binfile); + exit(ENOENT); + } + + phrase_index.load(i, chunk); + + PhraseIndexRange range; + int result = phrase_index.get_range(i, range); + if ( result == ERROR_OK ) { + for (size_t token = range.m_range_begin; + token <= range.m_range_end; ++token) { + phrase_index.add_unigram_frequency(token, freq); + } + } + } + + if (!save_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + if (!save_dictionary(phrase_files, &phrase_index)) + exit(ENOENT); + + return 0; +} diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp new file mode 100644 index 0000000..40870cf --- /dev/null +++ b/utils/training/import_k_mixture_model.cpp @@ -0,0 +1,322 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <stdio.h> +#include <locale.h> +#include "pinyin_internal.h" +#include "utils_helper.h" +#include "k_mixture_model.h" + +static const gchar * k_mixture_model_filename = NULL; + +static GOptionEntry entries[] = +{ + {"k-mixture-model-file", 0, 0, G_OPTION_ARG_FILENAME, &k_mixture_model_filename, "k mixture model file", NULL}, + {NULL} +}; + + +enum LINE_TYPE{ + BEGIN_LINE = 1, + END_LINE, + GRAM_1_LINE, + GRAM_2_LINE, + GRAM_1_ITEM_LINE, + GRAM_2_ITEM_LINE +}; + +static int line_type = 0; +static GPtrArray * values = NULL; +static GHashTable * required = NULL; +/* variables for line buffer. */ +static char * linebuf = NULL; +static size_t len = 0; + +bool parse_headline(KMixtureModelBigram * bigram); + +bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram); + +bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram); + + +static ssize_t my_getline(FILE * input){ + ssize_t result = getline(&linebuf, &len, input); + if ( result == -1 ) + return result; + + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + return result; +} + +bool parse_headline(KMixtureModelBigram * bigram){ + /* enter "\data" line */ + assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model:count:N:total_freq", "")); + + /* read "\data" line */ + if ( !taglib_read(linebuf, line_type, values, required) ) { + fprintf(stderr, "error: k mixture model expected.\n"); + return false; + } + + assert(line_type == BEGIN_LINE); + /* check header */ + TAGLIB_GET_TAGVALUE(const char *, model, (const char *)); + if ( !( strcmp("k mixture model", model) == 0 ) ) { + fprintf(stderr, "error: k mixture model expected.\n"); + return false; + } + + TAGLIB_GET_TAGVALUE(glong, count, atol); + TAGLIB_GET_TAGVALUE(glong, N, atol); + TAGLIB_GET_TAGVALUE(glong, total_freq, atol); + + KMixtureModelMagicHeader 
magic_header; + memset(&magic_header, 0, sizeof(KMixtureModelMagicHeader)); + magic_header.m_WC =count; magic_header.m_N = N; + magic_header.m_total_freq = total_freq; + bigram->set_magic_header(magic_header); + + return true; +} + +bool parse_body(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); + + do { + retry: + assert(taglib_read(linebuf, line_type, values, required)); + switch(line_type) { + case END_LINE: + goto end; + case GRAM_1_LINE: + my_getline(input); + parse_unigram(input, phrase_table, phrase_index, bigram); + goto retry; + case GRAM_2_LINE: + my_getline(input); + parse_bigram(input, phrase_table, phrase_index, bigram); + goto retry; + default: + assert(false); + } + } while (my_getline(input) != -1) ; + + end: + taglib_pop_state(); + return true; +} + +bool parse_unigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count:freq", "")); + + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_1_ITEM_LINE:{ + /* handle \item in \1-gram */ + TAGLIB_GET_TOKEN(token, 0); + TAGLIB_GET_PHRASE_STRING(word, 1); + assert(taglib_validate_token_with_string + (phrase_index, token, word)); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + TAGLIB_GET_TAGVALUE(glong, freq, atol); + + KMixtureModelArrayHeader array_header; + memset(&array_header, 0, sizeof(KMixtureModelArrayHeader)); + array_header.m_WC = count; array_header.m_freq = freq; + bigram->set_array_header(token, array_header); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } 
while (my_getline(input) != -1); + + end: + taglib_pop_state(); + return true; +} + +bool parse_bigram(FILE * input, PhraseLargeTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + KMixtureModelBigram * bigram){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, + "count:T:N_n_0:n_1:Mr", "")); + + phrase_token_t last_token = null_token; + KMixtureModelSingleGram * last_single_gram = NULL; + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_2_ITEM_LINE:{ + /* handle \item in \2-gram */ + /* two tokens */ + TAGLIB_GET_TOKEN(token1, 0); + TAGLIB_GET_PHRASE_STRING(word1, 1); + assert(taglib_validate_token_with_string + (phrase_index, token1, word1)); + + TAGLIB_GET_TOKEN(token2, 2); + TAGLIB_GET_PHRASE_STRING(word2, 3); + assert(taglib_validate_token_with_string + (phrase_index, token2, word2)); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + TAGLIB_GET_TAGVALUE(glong, T, atol); + assert(count == T); + TAGLIB_GET_TAGVALUE(glong, N_n_0, atol); + TAGLIB_GET_TAGVALUE(glong, n_1, atol); + TAGLIB_GET_TAGVALUE(glong, Mr, atol); + + KMixtureModelArrayItem array_item; + memset(&array_item, 0, sizeof(KMixtureModelArrayItem)); + array_item.m_WC = count; array_item.m_N_n_0 = N_n_0; + array_item.m_n_1 = n_1; array_item.m_Mr = Mr; + + if ( last_token != token1 ) { + if ( last_token && last_single_gram ) { + bigram->store(last_token, last_single_gram); + delete last_single_gram; + /* safe guard */ + last_token = null_token; + last_single_gram = NULL; + } + KMixtureModelSingleGram * single_gram = NULL; + bigram->load(token1, single_gram); + + /* create the new single gram */ + if ( single_gram == NULL ) + single_gram = new KMixtureModelSingleGram; + last_token = token1; + last_single_gram = single_gram; + } + + assert(NULL != last_single_gram); + assert(last_single_gram->insert_array_item(token2, array_item)); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + 
default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + if ( last_token && last_single_gram ) { + bigram->store(last_token, last_single_gram); + delete last_single_gram; + /* safe guard */ + last_token = null_token; + last_single_gram = NULL; + } + + taglib_pop_state(); + return true; +} + +int main(int argc, char * argv[]){ + FILE * input = stdin; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- import k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + PhraseLargeTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk); + + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); + + taglib_init(); + + /* prepare to read n-gram model */ + values = g_ptr_array_new(); + required = g_hash_table_new(g_str_hash, g_str_equal); + + ssize_t result = my_getline(input); + if ( result == -1 ) { + fprintf(stderr, "empty file input.\n"); + exit(ENODATA); + } + + if (!parse_headline(&bigram)) + exit(ENODATA); + + result = my_getline(input); + if ( result != -1 ) + parse_body(input, &phrase_table, &phrase_index, &bigram); + + taglib_fini(); + + return 0; +} diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h new file mode 100644 index 0000000..ad8d3d8 --- /dev/null +++ 
b/utils/training/k_mixture_model.h @@ -0,0 +1,172 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef K_MIXTURE_MODEL +#define K_MIXTURE_MODEL + +#include <math.h> +#include "novel_types.h" +#include "flexible_ngram.h" + +namespace pinyin{ + +typedef guint32 corpus_count_t; + +/* Note: storage parameters: N, T, n_r. + * N: the total number of documents. + * T: the total number of instances of the word or phrase. + * n_r: the number of documents having exactly <b>r</b> occurrences. + * only n_0, n_1 are used here. + */ + +static inline parameter_t compute_alpha(corpus_count_t N, corpus_count_t n_0){ + parameter_t alpha = 1 - n_0 / (parameter_t) N; + return alpha; +} + +static inline parameter_t compute_gamma(corpus_count_t N, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t gamma = 1 - n_1 / (parameter_t) (N - n_0); + return gamma; +} + +static inline parameter_t compute_B(corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + /* Note: re-check this, to see if we can remove if statement. */ + /* Please consider B_2 is no less than 2 in paper. 
*/ +#if 1 + if ( 0 == T - n_1 && 0 == N - n_0 - n_1 ) + return 2; +#endif + + parameter_t B = (T - n_1 ) / (parameter_t) (N - n_0 - n_1); + return B; +} + +/* three parameters model */ +static inline parameter_t compute_Pr_G_3(corpus_count_t k, + parameter_t alpha, + parameter_t gamma, + parameter_t B){ + if ( k == 0 ) + return 1 - alpha; + + if ( k == 1 ) + return alpha * (1 - gamma); + + if ( k > 1 ) { + return (alpha * gamma / (B - 1)) * pow((1 - 1 / (B - 1)) , k - 2); + } + + assert(false); +} + +static inline parameter_t compute_Pr_G_3_with_count(corpus_count_t k, + corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t alpha = compute_alpha(N, n_0); + parameter_t gamma = compute_gamma(N, n_0, n_1); + parameter_t B = compute_B(N, T, n_0, n_1); + + return compute_Pr_G_3(k, alpha, gamma, B); +} + +/* two parameters model */ +static inline parameter_t compute_Pr_G_2(corpus_count_t k, + parameter_t alpha, + parameter_t B){ + parameter_t gamma = 1 - 1 / (B - 1); + return compute_Pr_G_3(k, alpha, gamma, B); +} + +static inline parameter_t compute_Pr_G_2_with_count(corpus_count_t k, + corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t alpha = compute_alpha(N, n_0); + parameter_t B = compute_B(N, T, n_0, n_1); + return compute_Pr_G_2(k, alpha, B); +} + +#define K_MIXTURE_MODEL_MAGIC_NUMBER "KMMP" + +typedef struct{ + /* the total number of instances of all words. */ + guint32 m_WC; + /* the total number of documents. */ + guint32 m_N; + /* the total freq of uni-gram. */ + guint32 m_total_freq; +} KMixtureModelMagicHeader; + +typedef struct{ + /* the total number of instances of word W1. */ + guint32 m_WC; + /* the freq of uni-gram. see m_total_freq in magic header also. */ + guint32 m_freq; +} KMixtureModelArrayHeader; + +typedef struct{ + /* the total number of all W1,W2 word pair. */ + guint32 m_WC; + + /* the total number of instances of the word or phrase. 
+ (two word phrase) */ + /* guint32 m_T; Please use m_WC instead. + alias of m_WC, always the same. */ + + /* n_r: the number of documents having exactly r occurrences. */ + /* guint32 m_n_0; + Note: compute this value using the following equation. + m_n_0 = KMixtureModelMagicHeader.m_N - m_N_n_0; + m_N_n_0, the number of documents which contains the word or phrase. + (two word phrase) */ + guint32 m_N_n_0; + guint32 m_n_1; + + /* maximum instances of the word or phrase (two word phrase) + in previous documents last seen. */ + guint32 m_Mr; +} KMixtureModelArrayItem; + +typedef FlexibleBigram<KMixtureModelMagicHeader, + KMixtureModelArrayHeader, + KMixtureModelArrayItem> +KMixtureModelBigram; + +typedef FlexibleSingleGram<KMixtureModelArrayHeader, + KMixtureModelArrayItem> +KMixtureModelSingleGram; + +typedef KMixtureModelSingleGram::ArrayItemWithToken +KMixtureModelArrayItemWithToken; + +}; + + +#endif diff --git a/utils/training/k_mixture_model_to_interpolation.cpp b/utils/training/k_mixture_model_to_interpolation.cpp new file mode 100644 index 0000000..c5a66ec --- /dev/null +++ b/utils/training/k_mixture_model_to_interpolation.cpp @@ -0,0 +1,214 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "pinyin_internal.h" +#include "utils_helper.h" + +enum LINE_TYPE{ + BEGIN_LINE = 1, + END_LINE, + GRAM_1_LINE, + GRAM_2_LINE, + GRAM_1_ITEM_LINE, + GRAM_2_ITEM_LINE +}; + +static int line_type = 0; +static GPtrArray * values = NULL; +static GHashTable * required = NULL; +/* variables for line buffer. */ +static char * linebuf = NULL; +static size_t len = 0; + +bool parse_headline(FILE * input, FILE * output); + +bool parse_unigram(FILE * input, FILE * output); + +bool parse_bigram(FILE * input, FILE * output); + +static ssize_t my_getline(FILE * input){ + ssize_t result = getline(&linebuf, &len, input); + if ( result == -1 ) + return result; + + linebuf[strlen(linebuf) - 1] = '\0'; + return result; +} + +bool parse_headline(FILE * input, FILE * output) { + /* enter "\data" line */ + assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", + "count:N:total_freq")); + + /* read "\data" line */ + if ( !taglib_read(linebuf, line_type, values, required) ) { + fprintf(stderr, "error: k mixture model expected.\n"); + return false; + } + + assert(line_type == BEGIN_LINE); + TAGLIB_GET_TAGVALUE(const char *, model, (const char *)); + if ( !( strcmp("k mixture model", model) == 0 ) ){ + fprintf(stderr, "error: k mixture model expected.\n"); + return false; + } + + /* print header */ + fprintf(output, "\\data model interpolation\n"); + + return true; +} + +bool parse_body(FILE * input, FILE * output){ + taglib_push_state(); + + assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); + assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); + assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); + + do { + retry: + assert(taglib_read(linebuf, line_type, values, required)); + switch(line_type) { + case END_LINE: + fprintf(output, 
"\\end\n"); + goto end; + case GRAM_1_LINE: + fprintf(output, "\\1-gram\n"); + my_getline(input); + parse_unigram(input, output); + goto retry; + case GRAM_2_LINE: + fprintf(output, "\\2-gram\n"); + my_getline(input); + parse_bigram(input, output); + goto retry; + default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + taglib_pop_state(); + return true; +} + +bool parse_unigram(FILE * input, FILE * output){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "freq", "count")); + + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch(line_type) { + case GRAM_1_ITEM_LINE: { + /* handle \item in \1-gram */ + TAGLIB_GET_TOKEN(token, 0); + TAGLIB_GET_PHRASE_STRING(word, 1); + + /* remove the "<start>" in the uni-gram of interpolation model */ + if ( sentence_start == token ) + break; + + TAGLIB_GET_TAGVALUE(glong, freq, atol); + + /* ignore zero unigram freq item */ + if ( 0 != freq ) + fprintf(output, "\\item %d %s count %ld\n", token, word, freq); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } while (my_getline(input) != -1); + + end: + taglib_pop_state(); + return true; +} + +bool parse_bigram(FILE * input, FILE * output){ + taglib_push_state(); + + assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, + "count", "T:N_n_0:n_1:Mr")); + + do { + assert(taglib_read(linebuf, line_type, values, required)); + switch (line_type) { + case GRAM_2_ITEM_LINE:{ + /* handle \item in \2-gram */ + /* two strings */ + TAGLIB_GET_TOKEN(token1, 0); + TAGLIB_GET_PHRASE_STRING(word1, 1); + + TAGLIB_GET_TOKEN(token2, 2); + TAGLIB_GET_PHRASE_STRING(word2, 3); + + TAGLIB_GET_TAGVALUE(glong, count, atol); + fprintf(output, "\\item %d %s %d %s count %ld\n", + token1, word1, token2, word2, count); + break; + } + case END_LINE: + case GRAM_1_LINE: + case GRAM_2_LINE: + goto end; + default: + assert(false); + } + } while (my_getline(input) != -1); + + 
end: + taglib_pop_state(); + return true; +} + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + taglib_init(); + + values = g_ptr_array_new(); + required = g_hash_table_new(g_str_hash, g_str_equal); + + ssize_t result = my_getline(input); + if ( result == -1 ) { + fprintf(stderr, "empty file input.\n"); + exit(ENODATA); + } + + if (!parse_headline(input, output)) + exit(ENODATA); + + result = my_getline(input); + if ( result != -1 ) + parse_body(input, output); + + taglib_fini(); + + return 0; +} diff --git a/utils/training/merge_k_mixture_model.cpp b/utils/training/merge_k_mixture_model.cpp new file mode 100644 index 0000000..ab08010 --- /dev/null +++ b/utils/training/merge_k_mixture_model.cpp @@ -0,0 +1,239 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <locale.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" + +void print_help(){ + printf("Usage: merge_k_mixture_model [--result-file <RESULT_FILENAME>]\n"); + printf(" {<SOURCE_FILENAME>}+\n"); +} + +static const gchar * result_filename = NULL; + +static GOptionEntry entries[] = +{ + {"result-file", 0, 0, G_OPTION_ARG_FILENAME, &result_filename, "merged result file", NULL}, + {NULL} +}; + +static bool merge_two_phrase_array( /* in */ FlexibleBigramPhraseArray first, + /* in */ FlexibleBigramPhraseArray second, + /* out */ FlexibleBigramPhraseArray & merged ){ + /* avoid to do empty merge. */ + assert( NULL != first && NULL != second && NULL != merged ); + + /* merge two arrays. */ + guint first_index, second_index = first_index = 0; + KMixtureModelArrayItemWithToken * first_item, + * second_item = first_item = NULL; + while ( first_index < first->len && second_index < second->len ){ + first_item = &g_array_index(first, KMixtureModelArrayItemWithToken, + first_index); + second_item = &g_array_index(second, KMixtureModelArrayItemWithToken, + second_index); + if ( first_item->m_token > second_item->m_token ) { + g_array_append_val(merged, *second_item); + second_index ++; + } else if ( first_item->m_token < second_item->m_token ) { + g_array_append_val(merged, *first_item); + first_index ++; + } else /* first_item->m_token == second_item->m_token */ { + KMixtureModelArrayItemWithToken merged_item; + memset(&merged_item, 0, sizeof(KMixtureModelArrayItemWithToken)); + merged_item.m_token = first_item->m_token;/* same as second_item */ + merged_item.m_item.m_WC = first_item->m_item.m_WC + + second_item->m_item.m_WC; + /* merged_item.m_item.m_T = first_item->m_item.m_T + + second_item->m_item.m_T; */ + merged_item.m_item.m_N_n_0 = first_item->m_item.m_N_n_0 + + second_item->m_item.m_N_n_0; + merged_item.m_item.m_n_1 = first_item->m_item.m_n_1 + + second_item->m_item.m_n_1; + merged_item.m_item.m_Mr = std_lite::max(first_item->m_item.m_Mr, + 
second_item->m_item.m_Mr); + g_array_append_val(merged, merged_item); + first_index ++; second_index ++; + } + } + + /* add remained items. */ + while ( first_index < first->len ){ + first_item = &g_array_index(first, KMixtureModelArrayItemWithToken, + first_index); + g_array_append_val(merged, *first_item); + first_index++; + } + + while ( second_index < second->len ){ + second_item = &g_array_index(second, KMixtureModelArrayItemWithToken, + second_index); + g_array_append_val(merged, *second_item); + second_index++; + } + + return true; +} + +static bool merge_magic_header( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one ){ + + KMixtureModelMagicHeader target_magic_header; + KMixtureModelMagicHeader new_magic_header; + KMixtureModelMagicHeader merged_magic_header; + + memset(&merged_magic_header, 0, sizeof(KMixtureModelMagicHeader)); + if (!target->get_magic_header(target_magic_header)) { + memset(&target_magic_header, 0, sizeof(KMixtureModelMagicHeader)); + } + assert(new_one->get_magic_header(new_magic_header)); + if ( target_magic_header.m_WC + new_magic_header.m_WC < + std_lite::max( target_magic_header.m_WC, new_magic_header.m_WC ) ){ + fprintf(stderr, "the m_WC integer in magic header overflows.\n"); + return false; + } + if ( target_magic_header.m_total_freq + new_magic_header.m_total_freq < + std_lite::max( target_magic_header.m_total_freq, + new_magic_header.m_total_freq ) ){ + fprintf(stderr, "the m_total_freq in magic header overflows.\n"); + return false; + } + + merged_magic_header.m_WC = target_magic_header.m_WC + + new_magic_header.m_WC; + merged_magic_header.m_N = target_magic_header.m_N + + new_magic_header.m_N; + merged_magic_header.m_total_freq = target_magic_header.m_total_freq + + new_magic_header.m_total_freq; + + assert(target->set_magic_header(merged_magic_header)); + return true; +} + +static bool merge_array_items( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one 
){ + + GArray * new_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + new_one->get_all_items(new_items); + + for ( size_t i = 0; i < new_items->len; ++i ){ + phrase_token_t * token = &g_array_index(new_items, phrase_token_t, i); + KMixtureModelSingleGram * target_single_gram = NULL; + KMixtureModelSingleGram * new_single_gram = NULL; + + assert(new_one->load(*token, new_single_gram)); + bool exists_in_target = target->load(*token, target_single_gram); + if ( !exists_in_target ){ + target->store(*token, new_single_gram); + delete new_single_gram; + continue; + } + + /* word count in array header in parallel with array items */ + KMixtureModelArrayHeader target_array_header; + KMixtureModelArrayHeader new_array_header; + KMixtureModelArrayHeader merged_array_header; + + assert(new_one->get_array_header(*token, new_array_header)); + assert(target->get_array_header(*token, target_array_header)); + memset(&merged_array_header, 0, sizeof(KMixtureModelArrayHeader)); + + merged_array_header.m_WC = target_array_header.m_WC + + new_array_header.m_WC; + merged_array_header.m_freq = target_array_header.m_freq + + new_array_header.m_freq; + /* end of word count in array header computing. 
*/ + + assert(NULL != target_single_gram); + KMixtureModelSingleGram * merged_single_gram = + new KMixtureModelSingleGram; + + FlexibleBigramPhraseArray target_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + target_single_gram->retrieve_all(target_array); + + FlexibleBigramPhraseArray new_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + new_single_gram->retrieve_all(new_array); + FlexibleBigramPhraseArray merged_array = + g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + + assert(merge_two_phrase_array(target_array, new_array, merged_array)); + + g_array_free(target_array, TRUE); + g_array_free(new_array, TRUE); + delete target_single_gram; delete new_single_gram; + + for ( size_t m = 0; m < merged_array->len; ++m ){ + KMixtureModelArrayItemWithToken * item = + &g_array_index(merged_array, + KMixtureModelArrayItemWithToken, m); + merged_single_gram->insert_array_item(item->m_token, item->m_item); + } + + assert(merged_single_gram->set_array_header(merged_array_header)); + assert(target->store(*token, merged_single_gram)); + delete merged_single_gram; + g_array_free(merged_array, TRUE); + } + + g_array_free(new_items, TRUE); + return true; +} + +bool merge_two_k_mixture_model( /* in & out */ KMixtureModelBigram * target, + /* in */ KMixtureModelBigram * new_one ){ + assert(NULL != target); + assert(NULL != new_one); + return merge_array_items(target, new_one) && + merge_magic_header(target, new_one); +} + +int main(int argc, char * argv[]){ + int i = 1; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- merge k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + KMixtureModelBigram target(K_MIXTURE_MODEL_MAGIC_NUMBER); + 
target.attach(result_filename, ATTACH_READWRITE|ATTACH_CREATE); + + while (i < argc){ + const char * new_filename = argv[i]; + KMixtureModelBigram new_one(K_MIXTURE_MODEL_MAGIC_NUMBER); + new_one.attach(new_filename, ATTACH_READONLY); + if ( !merge_two_k_mixture_model(&target, &new_one) ) + exit(EOVERFLOW); + ++i; + } + + return 0; +} diff --git a/utils/training/prune_k_mixture_model.cpp b/utils/training/prune_k_mixture_model.cpp new file mode 100644 index 0000000..40dfb87 --- /dev/null +++ b/utils/training/prune_k_mixture_model.cpp @@ -0,0 +1,192 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + + +#include <errno.h> +#include <locale.h> +#include <limits.h> +#include "pinyin_internal.h" +#include "k_mixture_model.h" + + +void print_help(){ + printf("Usage: prune_k_mixture_model -k <INT> --CDF <DOUBLE> <FILENAME>\n"); +} + +static gint g_prune_k = 3; +static parameter_t g_prune_poss = 0.99; + +static GOptionEntry entries[] = +{ + {"pruneK", 'k', 0, G_OPTION_ARG_INT, &g_prune_k, "k parameter", NULL}, + {"CDF", 0, 0, G_OPTION_ARG_DOUBLE, &g_prune_poss, "CDF parameter", NULL}, + {NULL} +}; + + +bool prune_k_mixture_model(KMixtureModelMagicHeader * magic_header, + KMixtureModelSingleGram * & bigram, + FlexibleBigramPhraseArray removed_array){ + bool success; + + FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + bigram->retrieve_all(array); + + for ( size_t i = 0; i < array->len; ++i) { + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i); + phrase_token_t token = item->m_token; + parameter_t remained_poss = 1; parameter_t one_poss = 0; + bool errors = false; + for ( size_t k = 0; k < g_prune_k; ++k){ + one_poss = compute_Pr_G_3_with_count + (k, magic_header->m_N, item->m_item.m_WC, + magic_header->m_N - item->m_item.m_N_n_0, + item->m_item.m_n_1); + if ( !(0 <= one_poss && one_poss <= 1) ) + errors = true; + remained_poss -= one_poss; + } + + if ( fabs(remained_poss) < DBL_EPSILON ) + remained_poss = 0.; + + /* some wrong possibility. */ + if ( errors || !(0 <= remained_poss && remained_poss <= 1) ) { + fprintf(stderr, "some wrong possibility is encountered:%f.\n", + remained_poss); + fprintf(stderr, "k:%d N:%d WC:%d n_0:%d n_1:%d\n", + g_prune_k, magic_header->m_N, item->m_item.m_WC, + magic_header->m_N - item->m_item.m_N_n_0, + item->m_item.m_n_1); + exit(EDOM); + } + + if ( remained_poss < g_prune_poss ) { + /* prune this word or phrase. 
*/ + KMixtureModelArrayItem removed_item; + bigram->remove_array_item(token, removed_item); + assert( memcmp(&removed_item, &(item->m_item), + sizeof(KMixtureModelArrayItem)) == 0 ); + + KMixtureModelArrayItemWithToken removed_item_with_token; + removed_item_with_token.m_token = token; + removed_item_with_token.m_item = removed_item; + g_array_append_val(removed_array, removed_item_with_token); + + KMixtureModelArrayHeader array_header; + bigram->get_array_header(array_header); + guint32 removed_count = removed_item.m_WC; + array_header.m_WC -= removed_count; + bigram->set_array_header(array_header); + magic_header->m_WC -= removed_count; + magic_header->m_total_freq -= removed_count; + } + } + + return true; +} + +int main(int argc, char * argv[]){ + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- prune k mixture model"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (2 != argc) { + fprintf(stderr, "wrong arguments.\n"); + exit(EINVAL); + } + + const gchar * bigram_filename = argv[1]; + + /* TODO: magic header signature check here. 
*/ + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(bigram_filename, ATTACH_READWRITE); + + KMixtureModelMagicHeader magic_header; + if (!bigram.get_magic_header(magic_header)) { + fprintf(stderr, "no magic header in k mixture model.\n"); + exit(ENODATA); + } + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); + + /* print prune progress */ + size_t progress = 0; size_t onestep = items->len / 20; + for ( size_t i = 0; i < items->len; ++i ){ + if ( progress >= onestep ) { + progress = 0; fprintf(stderr, "*"); + } + progress ++; + + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + FlexibleBigramPhraseArray removed_array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + + prune_k_mixture_model(&magic_header, single_gram, removed_array); + bigram.store(*token, single_gram); + + delete single_gram; + + /* post processing for unigram reduce */ + for (size_t m = 0; m < removed_array->len; ++m ){ + KMixtureModelArrayItemWithToken * item = + &g_array_index(removed_array, + KMixtureModelArrayItemWithToken, m); + KMixtureModelArrayHeader array_header; + assert(bigram.get_array_header(item->m_token, array_header)); + array_header.m_freq -= item->m_item.m_WC; + assert(array_header.m_freq >= 0); + assert(bigram.set_array_header(item->m_token, array_header)); + } + + g_array_free(removed_array, TRUE); + removed_array = NULL; + } + + fprintf(stderr, "\n"); + + bigram.set_magic_header(magic_header); + + /* post processing clean up zero items */ + KMixtureModelArrayHeader array_header; + for ( size_t i = 0; i < items->len; ++i ){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + assert(bigram.get_array_header(*token, array_header)); + if ( 0 == array_header.m_WC && 0 == array_header.m_freq ) + assert(bigram.remove(*token)); + } + + g_array_free(items, TRUE); + 
+ return 0; +} diff --git a/utils/training/validate_k_mixture_model.cpp b/utils/training/validate_k_mixture_model.cpp new file mode 100644 index 0000000..7c057b9 --- /dev/null +++ b/utils/training/validate_k_mixture_model.cpp @@ -0,0 +1,174 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include "pinyin_internal.h" +#include "k_mixture_model.h" + +void print_help(){ + printf("Usage: validate_k_mixture_model <FILENAME>\n"); +} + +bool validate_unigram(KMixtureModelBigram * bigram){ + KMixtureModelMagicHeader magic_header; + if( !bigram->get_magic_header(magic_header) ){ + fprintf(stderr, "no magic header in k mixture model.\n"); + return false; + } + + guint32 expected_word_count = magic_header.m_WC; + if ( 0 == expected_word_count ){ + fprintf(stderr, "word count in magic header is unexpected zero.\n"); + return false; + } + guint32 expected_total_freq = magic_header.m_total_freq; + if ( 0 == expected_total_freq ){ + fprintf(stderr, "total freq in magic header is unexpected zero.\n"); + return false; + } + + if ( expected_word_count != expected_total_freq ){ + fprintf(stderr, "the word count doesn't match the total freq.\n"); + return false; + } + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram->get_all_items(items); + + guint32 word_count = 0; guint32 total_freq = 0; + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + KMixtureModelArrayHeader array_header; + assert(bigram->get_array_header(*token, array_header)); + word_count += array_header.m_WC; + total_freq += array_header.m_freq; + } + + if ( word_count != expected_word_count ){ + fprintf(stderr, "word count in magic header:%d\n", + expected_word_count); + fprintf(stderr, "sum of word count in array headers:%d\n", word_count); + fprintf(stderr, "the sum differs from word count.\n"); + return false; + } + if ( total_freq != expected_total_freq ){ + fprintf(stderr, "total freq in magic header:%d\n", + expected_total_freq); + fprintf(stderr, "sum of freqs in array headers:%d\n", total_freq); + fprintf(stderr, "the total freq differs from sum of freqs.\n"); + return false; + } + + g_array_free(items, TRUE); + return true; +} + +bool validate_bigram(KMixtureModelBigram * bigram){ + bool result = 
true; + + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram->get_all_items(items); + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); + KMixtureModelSingleGram * single_gram = NULL; + assert(bigram->load(*token, single_gram)); + + FlexibleBigramPhraseArray array = g_array_new + (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken)); + single_gram->retrieve_all(array); + + KMixtureModelArrayHeader array_header; + assert(single_gram->get_array_header(array_header)); + + guint32 expected_sum = array_header.m_WC; + guint32 freq = array_header.m_freq; + if ( 0 == expected_sum ){ + if ( 0 != array->len ){ + fprintf(stderr, "in the array header of token %d:\n", *token); + fprintf(stderr, "word count is zero but has array items.\n"); + result = false; + } + if ( 0 != freq ){ + delete single_gram; + continue; + } else { + fprintf(stderr, "in the array header of token %d:\n", *token); + fprintf(stderr, "both word count and freq are " + "unexpected zero.\n"); + result = false; + } + } + + guint32 sum = 0; + for (size_t m = 0; m< array->len; ++m){ + KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m); + + sum += item->m_item.m_WC; + } + + if ( sum != expected_sum ){ + fprintf(stderr, "word count in array header:%d\n", expected_sum); + fprintf(stderr, "sum of word count in array items:%d\n", sum); + fprintf(stderr, "the sum differs from word count.\n"); + result = false; + } + + g_array_free(array, TRUE); + delete single_gram; + } + + g_array_free(items, TRUE); + return result; +} + +int main(int argc, char * argv[]){ + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- validate k mixture model"); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (2 != argc) { + fprintf(stderr, "wrong arguments.\n"); + 
exit(EINVAL); + } + + const char * k_mixture_model_filename = argv[1]; + + KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); + bigram.attach(k_mixture_model_filename, ATTACH_READONLY); + + if (!validate_unigram(&bigram)) { + fprintf(stderr, "k mixture model validation failed.\n"); + exit(ENODATA); + } + + if (!validate_bigram(&bigram)) { + fprintf(stderr, "k mixture model validation failed.\n"); + exit(ENODATA); + } + + return 0; +} diff --git a/utils/utils_helper.h b/utils/utils_helper.h new file mode 100644 index 0000000..b91067b --- /dev/null +++ b/utils/utils_helper.h @@ -0,0 +1,147 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + + +#ifndef UTILS_HELPER_H +#define UTILS_HELPER_H + + +#define TAGLIB_GET_TOKEN(var, index) \ + phrase_token_t var = null_token; \ + { \ + const char * string = (const char *) g_ptr_array_index \ + (values, index); \ + var = atoi(string); \ + } + +#define TAGLIB_GET_PHRASE_STRING(var, index) \ + const char * var = NULL; \ + { \ + var = (const char *) g_ptr_array_index \ + (values, index); \ + } + +#define TAGLIB_GET_TAGVALUE(type, var, conv) \ + type var; \ + { \ + gpointer value = NULL; \ + assert(g_hash_table_lookup_extended \ + (required, #var, NULL, &value)); \ + var = conv((const char *)value); \ + } + +#define TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, var, line) \ + phrase_token_t var = null_token; \ + do { \ + if (0 == strlen(line)) \ + break; \ + \ + gchar ** strs = g_strsplit_set(line, " \t", 2); \ + if (2 != g_strv_length(strs)) \ + assert(false); \ + \ + phrase_token_t _token = atoi(strs[0]); \ + const char * phrase = strs[1]; \ + if (null_token != _token) \ + assert(taglib_validate_token_with_string \ + (phrase_index, _token, phrase)); \ + \ + var = _token; \ + \ + g_strfreev(strs); \ + } while(false); + + +static bool load_phrase_index(const pinyin_table_info_t * phrase_files, + FacadePhraseIndex * phrase_index) { + MemoryChunk * chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (SYSTEM_FILE != table_info->m_file_type) + continue; + + const char * binfile = table_info->m_system_filename; + + chunk = new MemoryChunk; + bool retval = chunk->load(binfile); + if (!retval) { + fprintf(stderr, "load %s failed!\n", binfile); + delete chunk; + return false; + } + + phrase_index->load(i, chunk); + } + return true; +} + +static bool save_phrase_index(const pinyin_table_info_t * phrase_files, + FacadePhraseIndex * phrase_index) { + MemoryChunk * new_chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = 
phrase_files + i; + + if (SYSTEM_FILE != table_info->m_file_type) + continue; + + const char * binfile = table_info->m_system_filename; + + new_chunk = new MemoryChunk; + phrase_index->store(i, new_chunk); + bool retval = new_chunk->save(binfile); + if (!retval) { + fprintf(stderr, "save %s failed.", binfile); + delete new_chunk; + return false; + } + + phrase_index->load(i, new_chunk); + } + return true; +} + +static bool save_dictionary(const pinyin_table_info_t * phrase_files, + FacadePhraseIndex * phrase_index) { + MemoryChunk * new_chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (DICTIONARY != table_info->m_file_type) + continue; + + const char * binfile = table_info->m_system_filename; + + new_chunk = new MemoryChunk; + phrase_index->store(i, new_chunk); + bool retval = new_chunk->save(binfile); + if (!retval) { + fprintf(stderr, "save %s failed.", binfile); + delete new_chunk; + return false; + } + + phrase_index->load(i, new_chunk); + } + return true; +} + +#endif |