diff options
author | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
commit | f41d1fdf83408e042ab07925710a8913bad0c27c (patch) | |
tree | 1757833ac4cdd0830834d2f9ef92be07c0bc1a5b | |
parent | 34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff) | |
download | libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip |
import from pinyin.
46 files changed, 9161 insertions, 0 deletions
@@ -0,0 +1,2 @@ +James Su 2002,2003,2006 <suzhe@tsinghua.edu.cn> +Peng Wu 2006-2007 <alexepico@gmail.com> @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..0dd9b9e --- /dev/null +++ b/Makefile.am @@ -0,0 +1,35 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +EXTRA_DIST = config.rpath m4/ChangeLog COPYING \ + intltool-extract.in \ + intltool-update.in \ + intltool-merge.in + +AUTOMAKE_OPTIONS = gnu +SUBDIRS = po intl src utils modules tests data + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak \ + intltool-extract \ + intltool-merge \ + intltool-update + + +ACLOCAL = aclocal -I . 
+ diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 0000000..cb8d4ae --- /dev/null +++ b/autogen.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +aclocal + +libtoolize --force + +autoheader + +automake -a + +autoconf + +./configure --enable-tests diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..1464c43 --- /dev/null +++ b/configure.ac @@ -0,0 +1,143 @@
#                                               -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
#
# Build configuration for novel-pinyin: checks the toolchain, gettext and
# intltool, then locates glib-2.0, SCIM (optionally scim-gtkutils) and
# Berkeley DB, and finally generates the Makefiles listed at the bottom.


AC_PREREQ(2.60)
AC_INIT(novel-pinyin, 0.2.5, alexepico@gmail.com)
AM_INIT_AUTOMAKE
AC_CONFIG_SRCDIR([config.h.in])
AC_CONFIG_HEADER([config.h])

# Minimum SCIM version required by the pkg-config checks below.
SCIM_VERSION=1.2.0
AC_SUBST(SCIM_VERSION)

# Checks for programs.
AC_PROG_CXX
AC_PROG_CC
AC_PROG_CPP
AC_PROG_INSTALL
AC_PROG_LN_S
AC_PROG_MAKE_SET

AC_GNU_SOURCE

# Init libtool
AC_LIBTOOL_WIN32_DLL
AC_LIBTOOL_DLOPEN
AC_PROG_LIBTOOL
AC_LIB_LTDL
AC_SUBST(LIBTOOL_DEPS)

# Init gettext
# Only the Chinese catalogs are enabled.  The disabled candidates are kept
# on a comment line of their own: a hash sign glued to the closing quote is
# not a shell comment and would be appended to the value.
# Disabled for now: ko ja de fr it
ALL_LINGUAS="zh_CN zh_TW"
AM_GNU_GETTEXT

# Check intltool
#AC_PROG_INTLTOOL
IT_PROG_INTLTOOL([0.33], [no-xml])

GETTEXT_PACKAGE="novel-pinyin"
AC_SUBST(GETTEXT_PACKAGE)

# libtool option to control which symbols are exported
# right now, symbols starting with _ are not exported
LIBTOOL_EXPORT_OPTIONS='-export-symbols-regex "^[[^_]].*"'
AC_SUBST(LIBTOOL_EXPORT_OPTIONS)

# Checks for libraries.
PKG_CHECK_MODULES(GLIB2, [glib-2.0 >= 2.4.0])

GLIB2_CPPFLAGS=`$PKG_CONFIG --cflags glib-2.0`

AC_SUBST(GLIB2_CPPFLAGS)

GLIB2_LDFLAGS=`$PKG_CONFIG --libs glib-2.0`

AC_SUBST(GLIB2_LDFLAGS)

# Check SCIM
PKG_CHECK_MODULES(SCIM,[scim >= $SCIM_VERSION])

# Check if we should build setup module
PKG_CHECK_MODULES(SCIM_GTKUTILS,[scim-gtkutils >= $SCIM_VERSION],
                  [SCIM_HAS_GTKUTILS=yes],
                  [SCIM_HAS_GTKUTILS=no])

if test "$SCIM_HAS_GTKUTILS" = "yes"; then
  SCIM_BUILD_SETUP=1
else
  SCIM_BUILD_SETUP=0
fi

AM_CONDITIONAL(SCIM_BUILD_SETUP, [test "$SCIM_HAS_GTKUTILS" = "yes"])
AC_SUBST(SCIM_BUILD_SETUP)


# Installation directories come from the scim pkg-config file; fall back to
# conventional locations when a variable is empty.
SCIM_ICONDIR=`$PKG_CONFIG --variable=icondir scim`
SCIM_MODULEDIR=`$PKG_CONFIG --variable=moduledir scim`
SCIM_DATADIR=`$PKG_CONFIG --variable=scimdatadir scim`

if test "x$SCIM_ICONDIR" = "x"; then
  SCIM_ICONDIR=${datadir}/scim/icons
fi

if test "x$SCIM_MODULEDIR" = "x"; then
  SCIM_MODULEDIR=${libdir}/scim-1.0
fi

if test "x$SCIM_DATADIR" = "x"; then
  SCIM_DATADIR=${datadir}/scim
fi

AC_SUBST(SCIM_ICONDIR)
AC_SUBST(SCIM_MODULEDIR)
AC_SUBST(SCIM_DATADIR)

NOVEL_PINYIN_DATADIR=$SCIM_DATADIR/novel-pinyin

AC_SUBST(NOVEL_PINYIN_DATADIR)

# Checks for header files.
AC_HEADER_STDC
AC_CHECK_HEADERS([locale.h stdlib.h string.h sys/time.h unistd.h])

# Checks for typedefs, structures, and compiler characteristics.
AC_HEADER_STDBOOL
AC_C_CONST
AC_C_INLINE
AC_TYPE_SIZE_T
AC_HEADER_TIME

# Checks for library functions.
AC_FUNC_MALLOC
AC_FUNC_MEMCMP
AC_FUNC_REALLOC
AC_FUNC_STAT
AC_CHECK_FUNCS([gettimeofday memmove memset setlocale])

# string.h is already covered by the header check above.
AC_CHECK_HEADERS([libintl.h])

# Berkeley DB is mandatory: stop configuring when the header or the library
# is missing.  The action arguments are quoted so that they are expanded
# only when the check actually fails.
AC_CHECK_HEADER([db.h], [],
                [AC_MSG_ERROR([Cannot find Berkeley DB library version 4])])

AC_SEARCH_LIBS([db_create], [db], [],
               [AC_MSG_ERROR([Cannot find Berkeley DB library version 4])])


AC_CONFIG_FILES([Makefile
                 po/Makefile.in
                 intl/Makefile
                 data/Makefile
                 src/Makefile
                 src/include/Makefile
                 src/storage/Makefile
                 src/segment/Makefile
                 src/training/Makefile
                 src/lookup/Makefile
                 modules/Makefile
                 modules/scim/Makefile
                 tests/Makefile
                 tests/include/Makefile
                 tests/storage/Makefile
                 tests/lookup/Makefile
                 utils/Makefile
                 utils/storage/Makefile])
AC_OUTPUT
diff --git a/src/Makefile.am b/src/Makefile.am new file mode 100644 index 0000000..59e009f --- /dev/null +++ b/src/Makefile.am @@ -0,0 +1,25 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +AUTOMAKE_OPTIONS = gnu +SUBDIRS = include storage segment training lookup + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I $(ac_aux_dir) diff --git a/src/include/Makefile.am b/src/include/Makefile.am new file mode 100644 index 0000000..bb605ee --- /dev/null +++ b/src/include/Makefile.am @@ -0,0 +1,22 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +MAINTAINERCLEANFILES = Makefile.in + +noinst_HEADERS = memory_chunk.h \ + novel_types.h \ + stl_lite.h diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h new file mode 100755 index 0000000..3571256 --- /dev/null +++ b/src/include/memory_chunk.h @@ -0,0 +1,264 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef MEMORY_CHUNK_H +#define MEMORY_CHUNK_H + +#include <assert.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <stdlib.h> +#include "stl_lite.h" + +/* for unmanaged mode + * m_free_func == free , when memory is allocated by malloc + * m_free_func == NULL, + * when memory is in small protion of allocated area + * m_free_func == other, + * malloc then free. + */ + +class MemoryChunk{ + typedef void (* free_func_t)(void *); +private: + char * m_data_begin; + char * m_data_end; //one data pass the end. + char * m_allocated; //one data pass the end. 
+ free_func_t m_free_func; + +private: + void reset(){ + if ( m_free_func ) + (*m_free_func)(m_data_begin); + m_data_begin = NULL; + m_data_end = NULL; + m_allocated = NULL; + m_free_func = NULL; + } + + void ensure_has_space(size_t new_size){ + int delta_size = m_data_begin + new_size - m_data_end; + if ( delta_size <= 0 ) return; + ensure_has_more_space ( delta_size ); + } + + /* enlarge function */ + void ensure_has_more_space(size_t extra_size){ + if ( 0 == extra_size ) return; + size_t newsize; + size_t cursize = size(); + if ( m_free_func != free ) { + /* copy on resize */ + newsize = cursize + extra_size; + /* do the copy */ + char * tmp = (char *) malloc(newsize); + assert(tmp); + memset(tmp, 0, newsize); + memmove(tmp, m_data_begin, cursize); + /* free the origin memory */ + if ( m_free_func){ + (*m_free_func)(m_data_begin); + } + + /* change varibles */ + m_data_begin = tmp; + m_data_end = m_data_begin + cursize; + m_allocated = m_data_begin + newsize; + m_free_func = free; + return; + } + /* the memory area is managed by this memory chunk */ + if ( extra_size <= (size_t) (m_allocated - m_data_end)) + return; + newsize = std_lite::max( capacity()<<1, cursize + extra_size); + m_data_begin = (char *) realloc(m_data_begin, newsize); + assert(m_data_begin); + memset(m_data_begin + cursize, 0, newsize - cursize); + m_data_end = m_data_begin + cursize; + m_allocated = m_data_begin + newsize; + return; + } + +public: + /* constructors */ + MemoryChunk(){ + m_data_begin = NULL; + m_data_end = NULL; + m_allocated = NULL; + m_free_func = NULL; + } + + /* destructors */ + ~MemoryChunk(){ + reset(); + } + + /* read access method */ + void* begin() const{ + return m_data_begin; + } + + void* end() const{ + return m_data_end; + } + + size_t size(){ + return m_data_end - m_data_begin; + } + + void set_size(size_t newsize){ + ensure_has_space(newsize); + m_data_end = m_data_begin + newsize; + } + + size_t capacity(){ + return m_allocated - m_data_begin; + } + + /* + * 
Transfer management of a memory chunk allocated by other part system + * to the memory chunk. + */ + void set_chunk(void* begin, size_t length, free_func_t free_func){ + if ( m_free_func ) + m_free_func( m_data_begin ); + + m_data_begin = (char *) begin; + m_data_end = (char *) m_data_begin + length; + m_allocated = (char *) m_data_begin + length; + m_free_func = free_func; + } + + /* subchunk + * use set_buffer internally. + * new chunk need to be deleted. + */ + MemoryChunk * get_sub_chunk(size_t offset, size_t length){ + MemoryChunk * retval = new MemoryChunk(); + char * begin_pos = m_data_begin + offset; + retval->set_chunk(begin_pos, length, NULL); + return retval; + } + /* write function + * Data are written directly to the memory area. + */ + bool set_content(size_t offset, const void * data, size_t len){ + size_t cursize = std_lite::max(size(), offset + len); + ensure_has_space(offset + len); + memmove(m_data_begin + offset, data, len); + m_data_end = m_data_begin + cursize; + return true; + } + /* insert function + * Data are written to the memory area, + * the original content are moved towards the rear. + * parameter offset start from zero. + */ + bool insert_content(size_t offset, const void * data, size_t length){ + ensure_has_more_space(length); + size_t move_size = size() - offset; + memmove(m_data_begin + offset + length, m_data_begin + offset, move_size); + memmove(m_data_begin + offset, data, length); + m_data_end += length; + return true; + } + /* remove function + * Data are removed directly, + * the following content are moved towards the front. 
+ */ + bool remove_content(size_t offset, size_t length){ + size_t move_size = size() - offset - length; + memmove(m_data_begin + offset, m_data_begin + offset + length, move_size); + m_data_end -= length; + return true; + } + + /* get_content function + * Get the binary data + */ + bool get_content(size_t offset, void * buffer, size_t length){ + if ( size() < offset + length ) + return false; + memcpy( buffer, m_data_begin + offset, length); + return true; + } + + /* compact memory, reduce the size */ + void compact_memory(){ + if ( m_free_func != free ) + return; + size_t newsize = size(); + m_data_begin = (char *) realloc(m_data_begin, newsize); + m_allocated = m_data_begin + newsize; + } + + /* file storage functions */ + bool load(const char * filename){ + /* free old data */ + reset(); + + struct stat stat_buf; + + int retval = stat(filename, &stat_buf); + + if ( retval ) + return false; + + FILE* file = fopen(filename, "r"); + if ( !file ) + return false; + int data_len = stat_buf.st_size; + void* data = malloc(data_len); + if ( !data ){ + fclose(file); + return false; + } + + data_len = fread(data, 1, data_len, file); + set_chunk(data, data_len, free); + //Fixes memory chunk end. + if ( stat_buf.st_size > data_len ) + m_allocated = (char *) m_data_begin + stat_buf.st_size; + fclose(file); + return true; + } + + bool save(const char * filename){ + FILE* file = fopen(filename, "w"); + if ( !file ) + return false; + + size_t data_len = fwrite(begin(), 1, size(), file); + if ( data_len != size()){ + fclose(file); + return false; + } + + fsync(fileno(file)); + fclose(file); + return true; + } +}; + +#endif diff --git a/src/include/novel_types.h b/src/include/novel_types.h new file mode 100755 index 0000000..a992e8e --- /dev/null +++ b/src/include/novel_types.h @@ -0,0 +1,117 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. 
+ * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef NOVEL_TYPES_H +#define NOVEL_TYPES_H + +#include <limits.h> +#include <glib.h> + +typedef guint32 phrase_token_t; +typedef gunichar2 utf16_t; + +/* + * Phrase Index Library Definition + * Reserve 4-bits for future usage. 
+ */ + +#define PHRASE_MASK 0x00FFFFFF +#define PHRASE_INDEX_LIBRARY_MASK 0x0F000000 +#define PHRASE_INDEX_LIBRARY_COUNT (1<<4) +#define PHRASE_INDEX_LIBRARY_INDEX(token) ((token&PHRASE_INDEX_LIBRARY_MASK)>>24) +#define PHRASE_INDEX_MAKE_TOKEN(phrase_index, token) \ + ( ( (phrase_index<<24) & PHRASE_INDEX_LIBRARY_MASK)|(token & PHRASE_MASK)) + + +/* + * PhraseIndexRanges definitions + */ + +struct PhraseIndexRange{ + phrase_token_t m_range_begin; + phrase_token_t m_range_end; /* pass the last item like stl */ +}; + +/*Array of PhraseIndexRange*/ +typedef GArray * PhraseIndexRanges[PHRASE_INDEX_LIBRARY_COUNT]; + +/* + * PinYin Table Definition + */ +class MemoryChunk; + + +/* For both PinYin Table and Phrase Table */ +enum SearchResult{ + SEARCH_NONE = 0x00, /* found nothing */ + SEARCH_OK = 0x01 , /* found items */ + SEARCH_CONTINUED = 0x02 /* has longer word in the storage to search */ +}; + +enum AddIndexResult{ + INSERT_OK = 0 , /* insert ok */ + INSERT_ITEM_EXISTS /* item already exists */ +}; + +enum RemoveIndexResult{ + REMOVE_OK = 0, /* remove ok */ + REMOVE_ITEM_DONOT_EXISTS /* item don't exists */ +}; +/* + * n-gram Definition + * no B parameter(there are duplicated items in uni-gram and bi-gram) + * used in system n-gram and user n-gram. + * using delta technique. 
 */

struct BigramPhraseItem{
    phrase_token_t m_token;
    gfloat m_freq; /* P(W2|W1) */
};

typedef GArray * BigramPhraseArray; /* Array of BigramPhraseItem */

/*
 * n-gram Definition
 * n-gram library
 */

enum AttachOption{
    ATTACH_NEW_FILE = 1,
    ATTACH_READ = 2,
    ATTACH_READ_WRITE = 3
};

#define MAX_PHRASE_LENGTH 16

const phrase_token_t sentence_start = 1;
const phrase_token_t token_min = 0;
const phrase_token_t token_max = UINT_MAX;

const char c_separate = '#';
typedef guint32 table_offset_t;

typedef double parameter_t;

#define LAMBDA_PARAMETER 0.588792

#endif
diff --git a/src/include/stl_lite.h b/src/include/stl_lite.h
new file mode 100644
index 0000000..0612782
--- /dev/null
+++ b/src/include/stl_lite.h
@@ -0,0 +1,285 @@
#ifndef STL_LITE_H
#define STL_LITE_H

#include <ctype.h>
#include <stdlib.h>
#include <string.h>

namespace std_lite{

  /**
   * @brief This does what you think it does.
   * @param a A thing of arbitrary type.
   * @param b Another thing of arbitrary type.
   * @return The lesser of the parameters.
   *
   * This is the simple classic generic implementation. It will work on
   * temporary expressions, since they are only evaluated once, unlike a
   * preprocessor macro.
   *
   * When neither argument is less than the other, @a a is returned.
   */
  template<typename _Tp>
    inline const _Tp&
    min(const _Tp& __a, const _Tp& __b)
    {
      //return __b < __a ? __b : __a;
      if (__b < __a)
        return __b;
      return __a;
    }


  /**
   * @brief This does what you think it does.
   * @param a A thing of arbitrary type.
   * @param b Another thing of arbitrary type.
   * @return The greater of the parameters.
   *
   * This is the simple classic generic implementation. It will work on
   * temporary expressions, since they are only evaluated once, unlike a
   * preprocessor macro.
   *
   * When neither argument is less than the other, @a a is returned.
   */
  template<typename _Tp>
    inline const _Tp&
    max(const _Tp& __a, const _Tp& __b)
    {
      //return __a < __b ? __b : __a;
      if (__a < __b)
        return __b;
      return __a;
    }

  /**
   * This is one of the @link s20_3_1_base functor base classes@endlink.
   * It only provides the three typedefs below.
   */
  template <class _Arg1, class _Arg2, class _Result>
    struct binary_function
    {
      typedef _Arg1 first_argument_type; ///< the type of the first argument
                                         /// (no surprises here)

      typedef _Arg2 second_argument_type; ///< the type of the second argument
      typedef _Result result_type; ///< type of the return type
    };
  /** @} */

  /// pair holds two objects of arbitrary type.
  template<class _T1, class _T2>
    struct pair
    {
      typedef _T1 first_type; ///< @c first_type is the first bound type
      typedef _T2 second_type; ///< @c second_type is the second bound type

      _T1 first; ///< @c first is a copy of the first object
      _T2 second; ///< @c second is a copy of the second object

      // _GLIBCXX_RESOLVE_LIB_DEFECTS
      // 265. std::pair::pair() effects overly restrictive
      /** The default constructor creates @c first and @c second using their
       * respective default constructors. */
      pair()
      : first(), second() { }

      /** Two objects may be passed to a @c pair constructor to be copied. */
      pair(const _T1& __a, const _T2& __b)
      : first(__a), second(__b) { }

      /** There is also a templated copy ctor for the @c pair class itself. */
      template<class _U1, class _U2>
        pair(const pair<_U1, _U2>& __p)
        : first(__p.first), second(__p.second) { }
    };

  /// Two pairs of the same type are equal iff their members are equal.
  template<class _T1, class _T2>
    inline bool
    operator==(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
    { return __x.first == __y.first && __x.second == __y.second; }

  /// <http://gcc.gnu.org/onlinedocs/libstdc++/20_util/howto.html#pairlt>
  /// Compares @c first, and compares @c second only when neither @c first
  /// is less than the other.
  template<class _T1, class _T2>
    inline bool
    operator<(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
    { return __x.first < __y.first
        || (!(__y.first < __x.first) && __x.second < __y.second); }

  /// Uses @c operator== to find the result.
  template<class _T1, class _T2>
    inline bool
    operator!=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
    { return !(__x == __y); }

  /// Uses @c operator< to find the result.
  template<class _T1, class _T2>
    inline bool
    operator>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
    { return __y < __x; }

  /// Uses @c operator< to find the result.
  template<class _T1, class _T2>
    inline bool
    operator<=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
    { return !(__y < __x); }

  /// Uses @c operator< to find the result.
  template<class _T1, class _T2>
    inline bool
    operator>=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
    { return !(__x < __y); }

  /**
   * @brief A convenience wrapper for creating a pair from two objects.
   * @param x The first object.
   * @param y The second object.
   * @return A newly-constructed pair<> object of the appropriate type.
   *
   * The standard requires that the objects be passed by reference-to-const,
   * but LWG issue #181 says they should be passed by const value. We follow
   * the LWG by default.
   */
  // _GLIBCXX_RESOLVE_LIB_DEFECTS
  // 181. make_pair() unintended behavior
  template<class _T1, class _T2>
    inline pair<_T1, _T2>
    make_pair(_T1 __x, _T2 __y)
    { return pair<_T1, _T2>(__x, __y); }

  /**
   * @brief Finds the first position in which @a val could be inserted
   * without changing the ordering.
   * @param first An iterator.
   * @param last Another iterator.
   * @param val The search term.
   * @param comp A functor to use for comparisons.
   * @return An iterator pointing to the first element "not less than" @a val,
   * or end() if every element is less than @a val.
   * @ingroup binarysearch
   *
   * The comparison function should have the same effects on ordering as
   * the function used for the initial sort.
   *
   * Despite the template parameter name, the iterators must support
   * @c last - @c first and @c +=, as the length is computed by subtraction.
   */
  template<typename _ForwardIterator, typename _Tp, typename _Compare>
    _ForwardIterator
    lower_bound(_ForwardIterator __first, _ForwardIterator __last,
                const _Tp& __val, _Compare __comp)
    {
      typedef size_t _DistanceType;

      _DistanceType __len = __last - __first;
      _DistanceType __half;
      _ForwardIterator __middle;

      while (__len > 0)
        {
          __half = __len >> 1;
          __middle = __first;
          __middle += __half;
          if (__comp(*__middle, __val))
            {
              // the middle element is less than val: search the upper half
              __first = __middle;
              ++__first;
              __len = __len - __half - 1;
            }
          else
            __len = __half;
        }
      return __first;
    }

  /**
   * @brief Finds the last position in which @a val could be inserted
   * without changing the ordering.
   * @param first An iterator.
   * @param last Another iterator.
   * @param val The search term.
   * @param comp A functor to use for comparisons.
   * @return An iterator pointing to the first element greater than @a val,
   * or end() if no elements are greater than @a val.
   * @ingroup binarysearch
   *
   * The comparison function should have the same effects on ordering as
   * the function used for the initial sort.
   *
   * Despite the template parameter name, the iterators must support
   * @c last - @c first and @c +=, as the length is computed by subtraction.
   */
  template<typename _ForwardIterator, typename _Tp, typename _Compare>
    _ForwardIterator
    upper_bound(_ForwardIterator __first, _ForwardIterator __last,
                const _Tp& __val, _Compare __comp)
    {
      typedef size_t _DistanceType;
      _DistanceType __len = __last - __first;
      _DistanceType __half;
      _ForwardIterator __middle;

      while (__len > 0)
        {
          __half = __len >> 1;
          __middle = __first;
          __middle += __half;
          if (__comp(__val, *__middle))
            __len = __half;
          else
            {
              // the middle element is not greater than val: search the upper half
              __first = __middle;
              ++__first;
              __len = __len - __half - 1;
            }
        }
      return __first;
    }

  /**
   * @brief Finds the largest subrange in which @a val could be inserted
   * at any place in it without changing the ordering.
   * @param first An iterator.
   * @param last Another iterator.
   * @param val The search term.
   * @param comp A functor to use for comparisons.
   * @return An pair of iterators defining the subrange.
   * @ingroup binarysearch
   *
   * This is equivalent to
   * @code
   * std::make_pair(lower_bound(first, last, val, comp),
   * upper_bound(first, last, val, comp))
   * @endcode
   * but does not actually call those functions.
   *
   * In this copy the std_lite lower_bound and upper_bound above are called
   * on the two halves once an element equivalent to @a val has been found.
   */
  template<typename _ForwardIterator, typename _Tp, typename _Compare>
    pair<_ForwardIterator, _ForwardIterator>
    equal_range(_ForwardIterator __first, _ForwardIterator __last,
                const _Tp& __val,
                _Compare __comp)
    {

      typedef size_t _DistanceType;

      _DistanceType __len = __last - __first;
      _DistanceType __half;
      _ForwardIterator __middle, __left, __right;

      while (__len > 0)
        {
          __half = __len >> 1;
          __middle = __first;
          __middle += __half;
          if (__comp(*__middle, __val))
            {
              __first = __middle;
              ++__first;
              __len = __len - __half - 1;
            }
          else if (__comp(__val, *__middle))
            __len = __half;
          else
            {
              // *__middle is equivalent to val: the range starts somewhere in
              // [__first, __middle) and ends somewhere after __middle.
              __left = lower_bound(__first, __middle, __val, __comp);
              __first += __len;
              __right = upper_bound(++__middle, __first, __val, __comp);
              return pair<_ForwardIterator, _ForwardIterator>(__left, __right);
            }
        }
      return pair<_ForwardIterator, _ForwardIterator>(__first, __first);
    }


}
#endif
diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am
new file mode 100644
index 0000000..2b7d21f
--- /dev/null
+++ b/src/lookup/Makefile.am
@@ -0,0 +1,30 @@
## Makefile.am -- Process this file with automake to produce Makefile.in
## Copyright (C) 2007 Peng Wu
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2, or (at your option)
## any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CPPFLAGS@ + +noinst_HEADERS = lookup.h winner_tree.h + +noinst_PROGRAMS = + +noinst_LTLIBRARIES = liblookup.la + +liblookup_la_SOURCES = pinyin_lookup.cpp winner_tree.cpp diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h new file mode 100644 index 0000000..676c6ea --- /dev/null +++ b/src/lookup/lookup.h @@ -0,0 +1,144 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LOOKUP_H +#define LOOKUP_H + +#include <float.h> +#include <glib.h> +#include "novel_types.h" +#include "pinyin_base.h" + +class WinnerTree; + +/** @file lookup.h + * @brief the definitions of lookup related classes and structs. + * Currently only contains pinyin lookup. 
+ */ + +typedef phrase_token_t lookup_key_t; + +struct lookup_value_t{ + phrase_token_t m_handles[2]; + gfloat m_poss; + gint32 m_last_step; + lookup_value_t(gfloat poss = FLT_MAX){ + m_handles[0] = NULL; m_handles[1] = NULL; + m_poss = poss; + m_last_step = -1; + } +}; + +enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH }; + +struct lookup_constraint_t{ + constraint_type m_type; + union{ + phrase_token_t m_token; + guint32 m_constraint_step; /* index of m_token */ + }; +}; + +typedef GArray * CandidateConstraints; /* Array of lookup_constraint_t */ +typedef GArray * MatchResults; /* Array of phrase_token_t */ + +namespace novel{ +class PinyinLargeTable; +class FacadePhraseIndex; +class Bigram; +}; + +typedef GHashTable * LookupStepIndex; +/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */ +typedef GArray * LookupStepContent; /* array of lookup_value_t */ + + +class IBranchIterator{ +public: + virtual ~IBranchIterator(){} + virtual bool has_next() = 0; + virtual lookup_value_t next() = 0; + virtual lookup_value_t max() = 0; +}; + +class PinyinLookup{ +private: + static const gfloat bigram_lambda = LAMBDA_PARAMETER; + static const gfloat unigram_lambda = 1 - LAMBDA_PARAMETER; + + PhraseItem m_cache_phrase_item; +protected: + //saved varibles + CandidateConstraints m_constraints; + PinyinKeyVector m_keys; + + novel::PinyinLargeTable * m_pinyin_table; + novel::FacadePhraseIndex * m_phrase_index; + novel::PinyinCustomSettings * m_custom; + novel::Bigram * m_bigram; + + //internal step data structure + GPtrArray * m_steps_index; + /* Array of LookupStepIndex */ + GPtrArray * m_steps_content; + /* Array of LookupStepContent */ + + GArray * m_table_cache; + /* Array of PhraseIndexRanges */ + + WinnerTree * m_winner_tree; + + size_t prepare_table_cache(int nstep, int total_pinyin); + + bool search_unigram(IBranchIterator * iter, int nstep, int npinyin); + bool search_bigram(IBranchIterator * iter, int nstep, int npinyin); + + 
bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token); + bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss); + + bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step); + + bool final_step(MatchResults & results); +public: + PinyinLookup( PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram); + + ~PinyinLookup(); + + bool get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results); + + bool train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results); + + bool convert_to_utf8(MatchResults results, /* out */ char * & result_string); + + bool add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token); + + bool clear_constraint(CandidateConstraints constraints, size_t index); + + bool validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys); + + /* init pinyin table lookup array */ + bool prepare_pinyin_lookup(PhraseIndexRanges ranges); + /* destroy pinyin table lookup array */ + bool destroy_pinyin_lookup(PhraseIndexRanges ranges); +}; + +#endif diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp new file mode 100644 index 0000000..c335453 --- /dev/null +++ b/src/lookup/pinyin_lookup.cpp @@ -0,0 +1,587 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <math.h> +#include <assert.h> +#include <iostream> +#include "stl_lite.h" +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "pinyin_large_table.h" +#include "phrase_index.h" +#include "ngram.h" +#include "lookup.h" +#include "winner_tree.h" + +const gfloat PinyinLookup::bigram_lambda; +const gfloat PinyinLookup::unigram_lambda; + +PinyinLookup::PinyinLookup(PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram){ + m_custom = custom; + m_pinyin_table = pinyin_table; + m_phrase_index = phrase_index; + m_bigram = bigram; + m_winner_tree = new WinnerTree; + m_steps_index = g_ptr_array_new(); + m_steps_content = g_ptr_array_new(); + m_table_cache = g_array_new(FALSE, TRUE, sizeof(PhraseIndexRanges)); + g_array_set_size(m_table_cache, 1); +} + +PinyinLookup::~PinyinLookup(){ + if ( m_winner_tree ) + delete m_winner_tree; + m_winner_tree = NULL; + //free resources + for ( size_t i = 0; i < m_table_cache->len; ++i){ + PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); + destroy_pinyin_lookup(*ranges); + } + //g_array_set_size(m_table_cache, 1); + g_array_free(m_table_cache, TRUE); + + //free m_steps_index + for ( size_t i = 0; i < m_steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(m_steps_index, i) = NULL; + } + g_ptr_array_free(m_steps_index, TRUE); + + //free 
m_steps_content + for ( size_t i = 0; i < m_steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(m_steps_content, i) = NULL; + } + g_ptr_array_free(m_steps_content, TRUE); + +} + +bool PinyinLookup::prepare_pinyin_lookup(PhraseIndexRanges ranges){ + //memset(ranges, 0, sizeof(ranges)); + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){ + GArray * & array = ranges[i]; + assert(NULL == array); + if (m_phrase_index->m_sub_phrase_indices[i]){ + array = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange)); + } + } + return true; +} + +bool PinyinLookup::destroy_pinyin_lookup(PhraseIndexRanges ranges){ + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ + GArray * & array = ranges[i]; + if ( array ) + g_array_free(array, TRUE); + array = NULL; + } + return true; +} + +size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){ + //free resources + for ( size_t i = 0; i < m_table_cache->len; ++i){ + PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); + destroy_pinyin_lookup(*ranges); + } + //g_array_set_size(m_table_cache, 1); + PinyinKey * pinyin_keys = (PinyinKey *)m_keys->data; + pinyin_keys += nstep; + //init resources + g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1); + size_t len; + for ( len = 1; len <= total_pinyin && len <= MAX_PHRASE_LENGTH; ++len){ + PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, len); + prepare_pinyin_lookup(*ranges); + int result = m_pinyin_table->search(len, pinyin_keys, *ranges); + if (!( result & SEARCH_CONTINUED)){ + ++len; + break; + } + } + g_array_set_size(m_table_cache, std_lite::min(len, (size_t) MAX_PHRASE_LENGTH + 1)); + return m_table_cache->len - 1; +} + +bool PinyinLookup::get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){ + //g_array_set_size(results, 0); + + m_constraints = constraints; + m_keys = 
keys; + int nstep = keys->len + 1; + + //free m_steps_index + for ( size_t i = 0; i < m_steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(m_steps_index, i) = NULL; + } + + //free m_steps_content + for ( size_t i = 0; i < m_steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(m_steps_content, i) = NULL; + } + + //add null start step + g_ptr_array_set_size(m_steps_index, nstep); + g_ptr_array_set_size(m_steps_content, nstep); + + for ( size_t i = 0 ; i < nstep; ++i ){ + //initialize m_steps_index + g_ptr_array_index(m_steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal); + //initialize m_steps_content + g_ptr_array_index(m_steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t)); + } + + lookup_key_t initial_key = sentence_start; + lookup_value_t initial_value(log(1)); + initial_value.m_handles[1] = sentence_start; + GArray * initial_step_content = (GArray *) g_ptr_array_index(m_steps_content, 0); + initial_step_content = g_array_append_val(initial_step_content, initial_value); + GHashTable * initial_step_index = (GHashTable *) g_ptr_array_index(m_steps_index, 0); + g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), GUINT_TO_POINTER(initial_step_content->len - 1)); + +#if 0 + LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, 0); + IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step); + size_t npinyin = prepare_table_cache(0, keys->len); + search_unigram(iter, 0, npinyin); + delete iter; +#endif + + for ( size_t i = 0 ; i < nstep - 1 ; ++i ){ + LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, i); + IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step); + size_t npinyin = prepare_table_cache(i, keys->len - i); + search_bigram(iter, i, npinyin), + 
search_unigram(iter, i, npinyin); + delete iter; + } + return final_step(results); +} + +bool PinyinLookup::search_unigram(IBranchIterator * iter, int nstep, int npinyin){ + lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep); + if ( CONSTRAINT_NOSEARCH == constraint->m_type ) + return false; + GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return false; + lookup_value_t max_step = iter->max(); + if ( CONSTRAINT_ONESTEP == constraint->m_type){ + return unigram_gen_next_step(nstep, &max_step, constraint->m_token); + } + if ( NO_CONSTRAINT == constraint->m_type ){ + bool found = false; + for ( size_t i = 1; i < m_table_cache->len && i <= MAX_PHRASE_LENGTH; ++i){ + lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1); + if ( constraint->m_type != NO_CONSTRAINT ) + continue; + PhraseIndexRanges * ranges = &g_array_index(m_table_cache,PhraseIndexRanges, i); + for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ + GArray * array = (*ranges)[m]; + if ( !array ) continue; + for ( size_t n = 0; n < array->len; ++n){ + PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n); + for ( phrase_token_t token = range->m_range_begin; + token != range->m_range_end; ++token){ + found = unigram_gen_next_step(nstep, &max_step, token)|| found; + } + } + } + } + return found; + } + return false; +} + + +bool PinyinLookup::search_bigram(IBranchIterator * iter, + int nstep, int npinyin){ + lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep); + if ( CONSTRAINT_NOSEARCH == constraint->m_type ) + return false; + GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep); + + bool found = false; + BigramPhraseArray bigram_phrase_items = g_array_new(FALSE, FALSE, + sizeof(BigramPhraseItem)); + while ( iter->has_next() ){ + lookup_value_t cur_step = iter->next(); 
+ //printf("token:%d\t%d\n", cur_step.m_handles[0], cur_step.m_handles[1]); + phrase_token_t index_token = cur_step.m_handles[1]; + SingleGram * system, * user; + m_bigram->load(index_token, system, user); + if ( system && user ){ + guint32 total_freq; + assert(user->get_total_freq(total_freq)); + assert(system->set_total_freq(total_freq)); + } + if ( CONSTRAINT_ONESTEP == constraint->m_type ){ + phrase_token_t token = constraint->m_token; + if ( system ){ + guint32 freq; + if( system->get_freq(token, freq) ){ + guint32 total_freq; + system->get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found; + } + } + if ( user ){ + guint32 freq; + if( user->get_freq(token, freq)){ + guint32 total_freq; + user->get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found; + } + } + } + + if ( NO_CONSTRAINT == constraint->m_type ){ + for ( size_t i = 1; i < m_table_cache->len + && i <= MAX_PHRASE_LENGTH;++i ){ + lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1); + if ( constraint->m_type != NO_CONSTRAINT ) + continue; + + PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); + for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ + GArray * array = (*ranges)[m]; + if ( !array ) continue; + for ( size_t n = 0; n < array->len; ++n){ + PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n); + if (system){ + g_array_set_size(bigram_phrase_items, 0); + system->search(range, bigram_phrase_items); + for( size_t k = 0; k < bigram_phrase_items->len; + ++k){ + BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k); + found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found; + } + } + if (user){ + g_array_set_size(bigram_phrase_items, 0); + 
user->search(range, bigram_phrase_items); + for( size_t k = 0; k < bigram_phrase_items->len; + ++k){ + BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k); + found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found; + } + } + } + } + } + } + if (system) + delete system; + if (user) + delete user; + } + g_array_free(bigram_phrase_items, TRUE); + return found; +} + + +bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){ + PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep; + if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gfloat elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat) + m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < FLT_EPSILON ) + return false; + gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys); + if (pinyin_poss < FLT_EPSILON ) + return false; + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){ + PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep; + if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gfloat unigram_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat) + m_phrase_index->get_phrase_index_total_freq(); + if ( bigram_poss < FLT_EPSILON && unigram_poss < FLT_EPSILON ) + return false; + gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, 
pinyinkeys); + if ( pinyin_poss < FLT_EPSILON ) + return false; + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + + log(( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) *pinyin_poss); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup::save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step){ + lookup_key_t next_key = next_step->m_handles[1]; + GHashTable * next_lookup_index = (GHashTable *) g_ptr_array_index(m_steps_index, next_step_pos); + GArray * next_lookup_content = (GArray *) g_ptr_array_index(m_steps_content, next_step_pos); + + gpointer key, value; + gboolean lookup_result = g_hash_table_lookup_extended(next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); + size_t step_index = GPOINTER_TO_UINT(value); + if ( !lookup_result ){ + g_array_append_val(next_lookup_content, *next_step); + g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1)); + return true; + }else{ + lookup_value_t * orig_next_value = &g_array_index(next_lookup_content, lookup_value_t,step_index); + if ( orig_next_value->m_poss < next_step->m_poss) { + orig_next_value->m_handles[0] = next_step->m_handles[0]; + assert(orig_next_value->m_handles[1] == next_step->m_handles[1]); + orig_next_value->m_poss = next_step->m_poss; + orig_next_value->m_last_step = next_step->m_last_step; + return true; + } + return false; + } +} + +bool PinyinLookup::final_step(MatchResults & results){ + //reset results + g_array_set_size(results, m_steps_content->len); + for ( size_t i = 0 ; i < m_steps_content->len ; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + *token = NULL; + } + //find max element + size_t last_step_pos = m_steps_content->len - 1; + + GArray * last_step_array = (GArray 
*)g_ptr_array_index(m_steps_content, last_step_pos); + if ( last_step_array->len == 0 ) + return false; + lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0); + for ( size_t i = 1; i < last_step_array->len; ++i){ + lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + //backtracing + while( true ){ + int cur_step_pos = max_value->m_last_step; + if ( -1 == cur_step_pos ) + break; + + phrase_token_t * token = &g_array_index(results, phrase_token_t, cur_step_pos); + *token = max_value->m_handles[1]; + + phrase_token_t last_token = max_value->m_handles[0]; + + + GHashTable * lookup_step_index = (GHashTable *)g_ptr_array_index(m_steps_index, cur_step_pos); + gpointer key, value; + gboolean result = g_hash_table_lookup_extended(lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); + if (!result) + return false; + GArray * lookup_step_content = (GArray *)g_ptr_array_index(m_steps_content, cur_step_pos); + + max_value = &g_array_index(lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); + } + + //no need to reverse the result + + return true; +} + +bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){ + bool train_next = false; + PinyinKey * pinyin_keys = (PinyinKey *)keys->data; + //TODO: verify the new training method. 
+ phrase_token_t last_token = sentence_start; + // constraints->len + 1 == results->len + guint32 train_factor = 23; + for ( size_t i = 0; i < constraints->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if ( *token == NULL ) + continue; + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + if (train_next || CONSTRAINT_ONESTEP == constraint->m_type ){ + if (CONSTRAINT_ONESTEP == constraint->m_type){ + assert(*token == constraint->m_token); + train_next = true; + }else{ + train_next = false; + } + //add pi-gram frequency + //std::cout<<"i:"<<i<<"last_token:"<<last_token<<"\ttoken:"<<*token<<std::endl; + m_phrase_index->get_phrase_item(*token, m_cache_phrase_item); + m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor); + m_phrase_index->add_unigram_frequency(*token, train_factor); + if ( last_token ){ + SingleGram * system, *user; + m_bigram->load(last_token, system, user); + guint32 total_freq; + if ( !user ){ + total_freq = 0; + if ( system ) + assert(system->get_total_freq(total_freq)); + user = new SingleGram; + user->set_total_freq(total_freq); + } + guint32 freq = 0; + if ( !user->get_freq(*token, freq)){ + if (system) system->get_freq(*token, freq); + user->set_freq(*token, freq); + } + assert(user->get_total_freq(total_freq)); + //protect against total_freq overflow. + if ( train_factor > 0 && total_freq > total_freq + train_factor) + goto next; + assert(user->set_total_freq(total_freq + train_factor)); + assert(user->get_freq(*token, freq)); + //if total_freq is not overflow, then freq won't overflow. 
+ assert(user->set_freq(*token, freq + train_factor)); + assert(m_bigram->store(last_token, user)); + next: + if (system) delete system; + if (user) delete user; + } + } + last_token = *token; + } + return true; +} + +bool PinyinLookup::convert_to_utf8(MatchResults results, /* out */ char * & result_string){ + result_string = g_strdup(""); + for ( size_t i = 0; i < results->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if ( NULL == *token ) + continue; + m_phrase_index->get_phrase_item(*token, m_cache_phrase_item); + utf16_t buffer[MAX_PHRASE_LENGTH]; + m_cache_phrase_item.get_phrase_string(buffer); + guint8 length = m_cache_phrase_item.get_phrase_length(); + gchar * phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL); + char * tmp = result_string; + result_string = g_strconcat(result_string, phrase, NULL); + g_free(tmp); g_free(phrase); + } + return true; +} + +bool PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){ + if ( !m_phrase_index->get_phrase_item(token, m_cache_phrase_item) ) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + if ( index + phrase_length > constraints->len ) + return false; + + for ( size_t i = index; i < index + phrase_length ; ++i ){ + clear_constraint(constraints, i); + } + + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index); + constraint->m_type = CONSTRAINT_ONESTEP; + constraint->m_token = token; + + for (size_t i = 1; i < phrase_length; ++i){ + constraint = &g_array_index(constraints, lookup_constraint_t, index + i); + constraint->m_type = CONSTRAINT_NOSEARCH; + constraint->m_constraint_step = index; + } + return true; +} + +bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t index){ + if ( index < 0 || index >= constraints->len ) + return false; + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index); + if 
(constraint->m_type == NO_CONSTRAINT) + return false; + if (constraint->m_type == CONSTRAINT_NOSEARCH){ + index = constraint->m_constraint_step; + constraint = &g_array_index(constraints, lookup_constraint_t, index); + } + + assert(constraint->m_type == CONSTRAINT_ONESTEP); + + phrase_token_t token = constraint->m_token; + if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + for ( size_t i = 0; i < phrase_length; ++i){ + if ( index + i >= constraints->len ) + continue; + constraint = &g_array_index(constraints, lookup_constraint_t, index + i); + constraint->m_type = NO_CONSTRAINT; + } + return true; +} + +bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys){ + //resize constraints array + size_t constraints_length = constraints->len; + if ( m_parsed_keys->len > constraints_length ){ + g_array_set_size(constraints, m_parsed_keys->len); + //initialize new element + for( size_t i = constraints_length; i < m_parsed_keys->len; ++i){ + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + }else if (m_parsed_keys->len < constraints_length ){ + g_array_set_size(constraints, m_parsed_keys->len); + } + + PinyinKey * pinyin_keys = (PinyinKey *)m_parsed_keys->data; + + for ( size_t i = 0; i < constraints->len; ++i){ + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + if ( constraint->m_type == CONSTRAINT_ONESTEP ){ + phrase_token_t token = constraint->m_token; + m_phrase_index->get_phrase_item(token, m_cache_phrase_item); + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + //clear too long constraint + if ( i + phrase_length > constraints->len ){ + clear_constraint(constraints, i); + continue; + } + //clear invalidated pinyin + gfloat pinyin_poss = 
m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyin_keys + i); + if ( pinyin_poss < FLT_EPSILON ){ + clear_constraint(constraints, i); + } + } + } + return true; +} diff --git a/src/lookup/winner_tree.cpp b/src/lookup/winner_tree.cpp new file mode 100644 index 0000000..248a749 --- /dev/null +++ b/src/lookup/winner_tree.cpp @@ -0,0 +1,141 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <float.h> +#include <limits.h> +#include <stdio.h> +#include "memory_chunk.h" +#include "phrase_index.h" +#include "lookup.h" +#include "winner_tree.h" + +WinnerTreeBranchIterator::WinnerTreeBranchIterator(WinnerTree & tree) + :m_tree(tree), m_counter(0){ + m_max_value = m_tree.m_items[m_tree.get_winner()]; + m_counter = 0; +} + +bool WinnerTreeBranchIterator::has_next(){ + if ( m_counter >= m_tree.m_tree_size) + return false; + return m_counter < nbranch; +} + +lookup_value_t WinnerTreeBranchIterator::next(){ + int winner = m_tree.get_winner(); + lookup_value_t tmp = m_tree.m_items[winner]; + m_tree.m_items[winner].m_poss = + - FLT_MAX; + m_tree.replay(winner); + ++m_counter; + return tmp; +} + +void WinnerTree::play(int p, int lc, int rc){ + m_tree[p] = winner(lc, rc); + //continue competition + while( p > 1 && p % 2) { + m_tree[p/2] = winner( m_tree[p - 1], m_tree[p]); + p/=2; + } +} + + +bool WinnerTree::initialize(LookupStepContent cur_step){ + size_t size = cur_step->len; + if ( size > m_max_tree_size ){ + init(size); + } + assert(size > nbranch); + m_tree_size = size; + + //initialize array tree + int nindex = 1; + + for( size_t i = 0; i < cur_step->len ; ++i){ + lookup_value_t * cur_value = &g_array_index(cur_step, lookup_value_t, i); + m_items[nindex++] = *cur_value; + } + + //compute s = 2 ^ log(n -1) + int i, s; + for( s = 1; 2 * s <= m_tree_size - 1; s += s); + + m_low_ext = 2 * (m_tree_size - s); + m_offset = 2 * s - 1; + + //compute outside nodes + for( i = 2; i <= m_low_ext; i += 2) + play((m_offset + i)/2, i - 1, i); + //compute other nodes + if ( m_tree_size % 2){ + play( m_tree_size / 2, m_tree[m_tree_size - 1], m_low_ext +1); + i = m_low_ext + 3; + }else i = m_low_ext + 2; + + //compute others + for( ; i <= m_tree_size; 
i += 2) + play( (i - m_low_ext + m_tree_size - 1) / 2, i - 1, i); + return true; +} + +void WinnerTree::replay(int i){ + assert( 1 <= i && i <= m_tree_size); + + int p; //compete node + int lc; //p's left child + int rc; //p's right child + + //first compete + if ( i <= m_low_ext){ + p = (m_offset + i) / 2; + lc = 2 * p - m_offset; + rc = lc + 1; + }else{ + p = (i - m_low_ext + m_tree_size -1) / 2; + if ( 2 * p == m_tree_size - 1 ){ + lc = m_tree[2*p]; + rc = i; + }else{ + lc = 2 * p - m_tree_size + 1 + m_low_ext; + rc = lc + 1; + } + } + + m_tree[p] = winner(lc, rc); + + //added by wupeng + if ( ( p | 0x01 ) == m_tree_size ){ + p /= 2; + m_tree[p] = winner( m_tree[2 * p], m_low_ext + 1 ); + } + + //compute others + p /= 2; + for( ; p >= 1 ; p /= 2) + m_tree[p] = winner( m_tree[2 * p], m_tree[2 * p + 1]); +} + +int WinnerTree::winner(int lc, int rc){ + return m_items[lc].m_poss > m_items[rc].m_poss ? + lc : rc; +} diff --git a/src/lookup/winner_tree.h b/src/lookup/winner_tree.h new file mode 100644 index 0000000..262f196 --- /dev/null +++ b/src/lookup/winner_tree.h @@ -0,0 +1,148 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LOOKUP_WINNER_TREE_H +#define LOOKUP_WINNER_TREE_H + +#include <assert.h> +#include "lookup.h" + +const int nbranch = 32; + +class DirectBranchIterator: public IBranchIterator{//for nitem <= nbranch + LookupStepContent m_step_content; + size_t m_iter_pos; +public: + //Constructor + DirectBranchIterator(LookupStepContent step_content) + :m_step_content(step_content) + { m_iter_pos = 0; } + + //Destructor + virtual ~DirectBranchIterator(){} + + //Member Function + bool has_next(){ + return m_iter_pos != m_step_content->len; + } + + lookup_value_t next(){ + lookup_value_t * tmp = &g_array_index(m_step_content, + lookup_value_t, m_iter_pos); + ++m_iter_pos; + return *tmp; + } + + lookup_value_t max(){ + lookup_value_t * max_value = &g_array_index(m_step_content, lookup_value_t, 0); + for ( size_t i = 1 ; i < m_step_content->len; ++i){ + lookup_value_t * cur_value = &g_array_index(m_step_content, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + return *max_value; + } +}; + +class WinnerTree; + +class WinnerTreeBranchIterator: public IBranchIterator{//for nitem <= nbranch + WinnerTree& m_tree; + int m_counter; + lookup_value_t m_max_value; +public: + //Constructor + WinnerTreeBranchIterator(WinnerTree & tree); + + //Destructor + virtual ~WinnerTreeBranchIterator(){} + + //Member Function + bool has_next(); + + lookup_value_t next(); + + lookup_value_t max(){ + return m_max_value; + } + +}; + +class WinnerTree{ + friend class WinnerTreeBranchIterator; +private: + size_t m_max_tree_size; // maxsize + int m_tree_size; // n + int m_low_ext; + int m_offset; + int * m_tree; + MemoryChunk m_buffer; + MemoryChunk m_tree_buffer; + lookup_value_t * m_items; + + int winner(int lc, int rc); + + void play(int 
p, int lc, int rc); + + void init(int tree_size){ + m_max_tree_size = tree_size; + //data buffer + m_buffer.set_size( sizeof(lookup_value_t) * (tree_size + 1) ); + m_items = (lookup_value_t *) m_buffer.begin(); + + //tree item buffer + m_tree_buffer.set_size( sizeof(int) * m_max_tree_size); + m_tree = (int * ) m_tree_buffer.begin(); + m_tree_size = 0; + } + +public: + + //Constructor + WinnerTree(int tree_size = 10){ + init(tree_size); + } + + //Destructor + ~WinnerTree() { } + + //need delete this + IBranchIterator* get_iterator(LookupStepContent step){ + if ( step->len <= nbranch ) + return new DirectBranchIterator(step); + //TODO:another situation > nbranch + assert(initialize(step)); + return new WinnerTreeBranchIterator(*this); + } + +protected: + + int get_winner() const { + return (m_tree_size)? m_tree[1] : 0; + } + + //Member Function + bool initialize(LookupStepContent cur_step); + void replay(int i); +}; + +#endif diff --git a/src/segment/Makefile.am b/src/segment/Makefile.am new file mode 100644 index 0000000..0e58ddf --- /dev/null +++ b/src/segment/Makefile.am @@ -0,0 +1,28 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CPPFLAGS@ + +noinst_PROGRAMS = mmseg + +mmseg_SOURCES = mmseg.cpp + +mmseg_LDADD = @GLIB2_LDFLAGS@ diff --git a/src/segment/mmseg.cpp b/src/segment/mmseg.cpp new file mode 100644 index 0000000..6a3d7f7 --- /dev/null +++ b/src/segment/mmseg.cpp @@ -0,0 +1,212 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <string.h> +#include <limits.h> +#include <locale.h> +#include <glib.h> +#include "novel_types.h" + +static GHashTable * g_phrases; + +struct SegmentStep{ + phrase_token_t m_handle; + char * m_phrase; + //use formula W = No. of words. Zero handle means one word. + size_t m_nword; + //backtracing information, -1 one step backward. 
+ gint8 m_backward_nstep; +}; + +//read gb_char.table and gbk_char.table +bool init_phrases(FILE * infile){ + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + while (!feof(infile)){ + fscanf(infile, "%s", pinyin); + fscanf(infile, "%s", phrase); + fscanf(infile, "%d", &token); + fscanf(infile, "%ld", &freq); + if ( feof(infile) ) + break; + g_hash_table_insert(g_phrases, g_strdup(phrase), + GUINT_TO_POINTER(token)); + } + return true; +} + +bool segment(GHashTable * phrases, // Lookup Phrases + const char * phrase, + GArray * strings /* Array of const char * */){ + GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); + GArray * offsets = g_array_new(TRUE, TRUE, sizeof(size_t)); + //construct dynamic programming. + size_t phrase_length = g_utf8_strlen(phrase, -1); + const char * p = phrase; + size_t offset = p - phrase; + g_array_append_val(offsets, offset); + g_array_set_size(steps, phrase_length + 1); + for ( size_t i = 0 ; i < phrase_length; ++i){ + p = g_utf8_next_char(p); + offset = p - phrase; + g_array_append_val(offsets, offset); + } + assert( *p == '\0' ); + + //initialize segment steps values. 
+ for ( size_t i = 0; i < phrase_length + 1; ++i){ + SegmentStep* step = &g_array_index(steps, SegmentStep, i); + step->m_nword = UINT_MAX; + } + + for ( size_t i = 0 ; i < phrase_length + 1; ++i){ + size_t* offset_begin = &g_array_index(offsets, size_t, i); + const char * phrase_begin = phrase + *offset_begin; + SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i); + size_t nword = step_begin->m_nword; + for ( size_t k = i + 1; k < phrase_length + 1; ++k){ + size_t* offset_end = &g_array_index(offsets, size_t, k); + size_t len = *offset_end - *offset_begin; + char * cur_phrase = g_strndup(phrase_begin, len); + phrase_token_t token; + gpointer orig_key, value; + gboolean result = g_hash_table_lookup_extended + (phrases, cur_phrase, &orig_key, &value); + if ( result ){ + token = GPOINTER_TO_UINT(value); + }else{ + token = 0; + if ( 1 != k - i ){ //skip non-phrase + g_free(cur_phrase); + continue; + } + } + ++nword; + SegmentStep * step_end = &g_array_index(steps, SegmentStep, k); + if ( nword < step_end->m_nword){ + if ( step_end->m_phrase ){ + g_free(step_end->m_phrase); + step_end->m_phrase = NULL; + } + step_end->m_handle = token; + step_end->m_phrase = cur_phrase; + step_end->m_nword = nword; + step_end->m_backward_nstep = k - i; + }else{ + g_free(cur_phrase); + } + } + } + //backtracing to get the result. 
+ size_t cur_step = phrase_length; + g_array_set_size(strings, 0); + while ( cur_step ){ + SegmentStep* step_end = &g_array_index(steps, SegmentStep, cur_step); + char * str_dup = g_strdup(step_end->m_phrase); + g_array_append_val(strings, str_dup); + cur_step = cur_step - step_end->m_backward_nstep; + } + + for ( size_t i = 0; i < strings->len / 2; ++i){ + char ** phrase_head = &g_array_index(strings, char * , i); + char ** phrase_tail = &g_array_index(strings, char * , strings->len -1 - i); + char * phrase_tmp; + phrase_tmp = * phrase_head; + * phrase_head = * phrase_tail; + * phrase_tail = phrase_tmp; + } + + //free strndup memory + for ( size_t i = 0; i < steps->len; ++i){ + SegmentStep* step = &g_array_index(steps, SegmentStep, i); + if ( step->m_phrase ){ + g_free(step->m_phrase); + step->m_phrase = NULL; + } + } + + g_array_free(offsets, TRUE); + g_array_free(steps, TRUE); + return true; +} + +void print_help(){ + printf("Usage: mmseg [--generate-extra-enter]\n"); + exit(1); +} + +int main(int argc, char * argv[]){ + int i = 1; + bool gen_extra_enter = false; + + setlocale(LC_ALL,""); + while ( i < argc ){ + if ( strcmp("--help", argv[i] ) == 0) { + print_help(); + }else if ( strcmp("--generate-extra-enter", argv[i]) == 0) { + gen_extra_enter = true; + } + ++i; + } + + g_phrases = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL); + //init phrase lookup + FILE * gb_file = fopen("../../data/gb_char.table", "r"); + if ( gb_file == NULL ){ + fprintf(stderr, "can't open gb_char.table!\n"); + exit(1); + } + init_phrases(gb_file); + fclose(gb_file); + FILE * gbk_file = fopen("../../data/gbk_char.table", "r"); + if ( gbk_file == NULL ){ + fprintf(stderr, "can't open gbk_char.table!\n"); + exit(1); + } + init_phrases(gbk_file); + fclose(gbk_file); + + char* linebuf = (char *)malloc ( 1024 * sizeof (char) ); + size_t size = 1024; + while( getline(&linebuf, &size, stdin) ){ + if ( feof(stdin) ) + break; + linebuf[strlen(linebuf)-1] = '\0'; + + GArray * 
phrases = g_array_new(TRUE, TRUE, sizeof( char *)); + segment(g_phrases, linebuf, phrases); + for ( size_t i = 0; i < phrases->len; ++i){ + char * phrase = g_array_index(phrases, char *, i); + printf("%s\n", phrase); + g_free(phrase); + } + if ( gen_extra_enter ) + printf("\n"); + g_array_free(phrases, TRUE); + } + free(linebuf); +} diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am new file mode 100644 index 0000000..adf2b5c --- /dev/null +++ b/src/storage/Makefile.am @@ -0,0 +1,35 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CPPFLAGS@ + +noinst_HEADERS = pinyin_large_table.h \ + pinyin_base.h \ + pinyin_phrase.h \ + phrase_index.h \ + pinyin_zhuyin_map_data.h \ + ngram.h + +noinst_LTLIBRARIES = libstorage.la + +libstorage_la_SOURCES = pinyin_base.cpp \ + pinyin_large_table.cpp \ + phrase_index.cpp \ + ngram.cpp + diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp new file mode 100644 index 0000000..7fdc58f --- /dev/null +++ b/src/storage/ngram.cpp @@ -0,0 +1,283 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. 
+ * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <stdio.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "ngram.h" + +struct SingleGramItem{ + phrase_token_t m_token; + guint32 m_freq; +}; + +SingleGram::SingleGram(){ + m_chunk.set_size(sizeof(guint32)); + memset(m_chunk.begin(), 0, sizeof(guint32)); +} + +SingleGram::SingleGram(void * buffer, size_t length){ + m_chunk.set_chunk(buffer, length, NULL); +} + +bool SingleGram::set_total_freq(guint32 m_total){ + char * buf_begin = (char *)m_chunk.begin(); + *((guint32 *)buf_begin) = m_total; + return true; +} + +bool SingleGram::get_total_freq(guint32 & m_total){ + char * buf_begin = (char *)m_chunk.begin(); + m_total = *((guint32 *)buf_begin); + return true; +} + +bool SingleGram::prune(){ +#if 1 + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *)m_chunk.end(); + + size_t nitem = 0; + for ( SingleGramItem * cur = begin; cur != end; ++cur){ + cur->m_freq--; + nitem++; + if ( cur->m_freq == 0 ){ + size_t offset = sizeof(guint32) + (cur - begin) + * sizeof(SingleGramItem) ; + m_chunk.remove_content(offset, sizeof(SingleGramItem)); + } + } + guint32 total_freq; + 
assert(get_total_freq(total_freq)); + assert(set_total_freq(total_freq - nitem)); +#endif + return true; +} + +bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){ + return lhs.m_token < rhs.m_token; +} + +bool SingleGram::search(/* in */ PhraseIndexRange * range, + /* out */ BigramPhraseArray array){ + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = range->m_range_begin; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + guint32 total_freq; + BigramPhraseItem bigram_item; + assert(get_total_freq(total_freq)); + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token >= range->m_range_end ) + break; + bigram_item.m_token = cur_item->m_token; + bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq; + g_array_append_val(array, bigram_item); + } + return true; +} + +bool SingleGram::get_freq(/* in */ phrase_token_t token, + /* out */ guint32 & freq){ + freq = 0; + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + freq = cur_item -> m_freq; + return true; + } + } + return false; +} + +bool SingleGram::set_freq(/* in */ phrase_token_t token, + guint32 freq){ + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; 
+ SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + SingleGramItem insert_item; + insert_item.m_token = token; + insert_item.m_freq = freq; + for ( ;cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ){ + size_t offset = sizeof(guint32) + + sizeof(SingleGramItem) * (cur_item - begin); + m_chunk.insert_content(offset, &insert_item, + sizeof(SingleGramItem)); + return true; + } + if ( cur_item->m_token == token ){ + cur_item -> m_freq = freq; + return true; + } + } + m_chunk.insert_content(m_chunk.size(), &insert_item, + sizeof(SingleGramItem)); + return true; +} + + +bool Bigram::attach(const char * systemfile, const char * userfile){ + reset(); + if ( systemfile ){ + int ret = db_create(&m_system, NULL, 0); + if ( ret != 0 ) + assert(false); + + m_system->open(m_system, NULL, systemfile, NULL, + DB_HASH, DB_RDONLY, 0664); + if ( ret != 0) + return false; + } + + if ( userfile ){ + int ret = db_create(&m_user, NULL, 0); + if ( ret != 0 ) + assert(false); + + m_user->open(m_user, NULL, userfile, NULL, DB_HASH, DB_CREATE, 0664); + if ( ret != 0) + return false; + } + return true; +} + +bool Bigram::load(phrase_token_t index, SingleGram * & system_gram, SingleGram * & user_gram){ + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + system_gram = NULL; user_gram = NULL; + if ( m_system ){ + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + int ret = m_system->get(m_system, NULL, &db_key, &db_data, 0); + if ( ret == 0 ) + system_gram = new SingleGram(db_data.data, db_data.size); + } + if ( m_user ){ + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + int ret = m_user->get(m_user, NULL, &db_key, &db_data, 0); + if ( ret == 0 ) + user_gram = new SingleGram(db_data.data, db_data.size); + } + return true; +} + +bool Bigram::store(phrase_token_t index, SingleGram * user_gram){ + if ( !m_user ) + return false; + DBT db_key; + memset(&db_key, 0, 
sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = user_gram->m_chunk.begin(); + db_data.size = user_gram->m_chunk.size(); + + int ret = m_user->put(m_user, NULL, &db_key, &db_data, 0); + return ret == 0; +} + +bool Bigram::get_all_items(GArray * system, GArray * user){ + bool retval = false; + g_array_set_size(system, 0); + g_array_set_size(user, 0); + if ( m_system ){ + DBC * cursorp; + DBT key, data; + int ret; + /* Get a cursor */ + m_system->cursor(m_system, NULL, &cursorp, 0); + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + assert(key.size == sizeof(phrase_token_t)); + phrase_token_t * token = (phrase_token_t *)key.data; + g_array_append_val(system, *token); + } + + if (ret != DB_NOTFOUND) { + fprintf(stderr, "system db error, exit!"); + exit(1); + } + + /* Cursors must be closed */ + if (cursorp != NULL) + cursorp->c_close(cursorp); + + retval = true; + } + if ( m_user ){ + DBC * cursorp; + DBT key, data; + int ret; + /* Get a cursor */ + m_user->cursor(m_user, NULL, &cursorp, 0); + + /* Initialize out DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. 
*/ + while((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + assert(key.size == sizeof(phrase_token_t)); + phrase_token_t * token = (phrase_token_t *) key.data; + g_array_append_val(user, *token); + } + + if (ret != DB_NOTFOUND){ + fprintf(stderr, "user db error, exit!"); + exit(1); + } + + /* Cursor must be closed */ + if ( cursorp != NULL) + cursorp->c_close(cursorp); + + retval = true; + } + return retval; +} diff --git a/src/storage/ngram.h b/src/storage/ngram.h new file mode 100644 index 0000000..39a9ecc --- /dev/null +++ b/src/storage/ngram.h @@ -0,0 +1,119 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef NGRAM_H +#define NGRAM_H + +#include <db.h> + +namespace novel{ + +class Bigram; + +/* Note: + * When transfer from system ngram to user ngram, + * if user ngram doesn't exist, + * copy total freq from system ngram to user ngram, + * so the total freq exists. + * if item freq don't exist, copy item freq from system to user ngram, + * so the item freq exists. + * if user ngram already exists(always true), increases the total freq, + * if item ngram already exists(always true), increases the freq. 
+ */ + +class SingleGram{ + friend class Bigram; +private: + MemoryChunk m_chunk; + SingleGram(void * buffer, size_t length); +public: + /* Null Constructor */ + SingleGram(); + /* search method */ + /* the array result contains many items */ + bool search(/* in */ PhraseIndexRange * range, + /* out */ BigramPhraseArray array); + + bool get_freq(/* in */ phrase_token_t token, + /* out */ guint32 & freq); + + /* set_freq method + */ + bool set_freq(/* in */ phrase_token_t token, + guint32 freq); + + /* set_total_freq method + * used in user bigram table + */ + bool set_total_freq(guint32 m_total); + + /* get_total_freq method + * used in user bigram table + */ + bool get_total_freq(guint32 & m_total); + + /* prune one method + * only used in training + */ + bool prune(); +}; + +class Bigram{ +private: + DB * m_system; + DB * m_user; +public: + Bigram(){ + m_system = NULL; m_user = NULL; + } + + ~Bigram(){ + reset(); + } + + void reset(){ + if ( m_system ){ + m_system->close(m_system, 0); + m_system = NULL; + } + if ( m_user ){ + m_user->close(m_user, 0); + m_user = NULL; + } + } + + /* attach system and user bi-gram */ + /* when with training systemdb is NULL, only user_gram */ + bool attach(const char * systemfile, const char * userfile); + + bool load(phrase_token_t index, SingleGram * & system_gram, SingleGram * & user_gram); + bool store(phrase_token_t index, SingleGram * user_gram); + /* array of phrase_token_t items, for parameter estimation. */ + bool get_all_items(GArray * system, GArray * user); +}; + +}; + +using namespace novel; + + +#endif diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp new file mode 100644 index 0000000..7dbecb3 --- /dev/null +++ b/src/storage/phrase_index.cpp @@ -0,0 +1,340 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. 
+ * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "phrase_index.h" + +bool PhraseItem::set_n_pronunciation(guint8 n_prouns){ + m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8)); + return true; +} + +bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){ + guint8 phrase_length = get_phrase_length(); + table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32)); + bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey)); + if ( !retval ) + return retval; + return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32)); +} + +void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){ + guint8 phrase_length = get_phrase_length(); + set_n_pronunciation(get_n_pronunciation() + 1); + m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey)); + m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32)); +} + +void PhraseItem::remove_nth_pronunciation(size_t index){ + guint8 phrase_length = get_phrase_length(); + set_n_pronunciation(get_n_pronunciation() - 1); + size_t offset = phrase_item_header + phrase_length * sizeof ( 
utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32)); + m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32)); +} + +bool PhraseItem::get_phrase_string(utf16_t * phrase){ + guint8 phrase_length = get_phrase_length(); + return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t)); +} + +bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){ + m_chunk.set_content(0, &phrase_length, sizeof(guint8)); + m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t)); + return true; +} + +void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom, + PinyinKey * pinyin_keys, + gint32 delta){ + guint8 phrase_length = get_phrase_length(); + guint8 npron = get_n_pronunciation(); + size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ); + char * buf_begin = (char *) m_chunk.begin(); + guint32 total_freq = 0; + for ( int i = 0 ; i < npron ; ++i){ + char * pinyin_begin = buf_begin + offset + + i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) ); + guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey)); + total_freq += *freq; + if ( 0 == pinyin_compare_with_ambiguities(custom, + (PinyinKey *)pinyin_begin, + pinyin_keys, + phrase_length)){ + //protect against total_freq overflow. 
+ if ( delta > 0 && total_freq > total_freq + delta ) + return; + *freq += delta; + total_freq += delta; + } + } +} + + +guint32 SubPhraseIndex::get_phrase_index_total_freq(){ + return m_total_freq; +} + +bool SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){ + table_offset_t offset; + guint32 freq; + bool result = m_phrase_index.get_content + ((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + + if ( !result) + return result; + + if ( 0 == offset ) + return false; + + result = m_phrase_content.get_content + (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32)); + //protect total_freq overflow + if ( delta > 0 && m_total_freq > m_total_freq + delta ) + return false; + freq += delta; + m_total_freq += delta; + return m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32)); +} + +bool SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){ + table_offset_t offset; + guint8 phrase_length; + guint8 n_prons; + + bool result = m_phrase_index.get_content + ((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + + if ( !result ) + return result; + + if ( 0 == offset ) + return false; + + result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8)); + if ( !result ) + return result; + + result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8)); + if ( !result ) + return result; + + size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) ); + item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL); + return true; +} + +bool SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){ + table_offset_t offset = m_phrase_content.size(); + if ( 0 == offset ) + offset = 8; + m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size()); + 
m_phrase_index.set_content((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + m_total_freq += item->get_unigram_frequency(); + return true; +} + +bool SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){ + table_offset_t offset; + guint8 phrase_length; + guint8 n_prons; + + bool result = m_phrase_index.get_content + ((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + + if ( !result ) + return result; + + if ( 0 == offset ) + return false; + + result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8)); + if ( !result ) + return result; + + result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8)); + if ( !result ) + return result; + + size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) ); + item = new PhraseItem; + //implictly copy data from m_chunk_content. + item->m_chunk.set_content(0, (char *) m_phrase_content.begin() + offset, length); + + const table_offset_t zero_const = 0; + m_phrase_index.set_content((token & PHRASE_MASK) + * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t)); + m_total_freq -= item->get_unigram_frequency(); + return true; +} + +bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ){ + sub_phrases = new SubPhraseIndex; + } + + bool retval = sub_phrases->load(chunk, 0, chunk->size()); + if ( !retval ) + return retval; + m_total_freq += sub_phrases->get_phrase_index_total_freq(); + return retval; +} + +bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){ + table_offset_t end; + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + + sub_phrases->store(new_chunk, 0, end); + return true; +} + +bool 
FacadePhraseIndex::unload(guint8 phrase_index){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); + delete sub_phrases; + sub_phrases = NULL; + return true; +} + +bool SubPhraseIndex::load(MemoryChunk * chunk, + table_offset_t offset, table_offset_t end){ + //save the memory chunk + if ( m_chunk ){ + delete m_chunk; + m_chunk = NULL; + } + m_chunk = chunk; + + char * buf_begin = (char *)chunk->begin(); + chunk->get_content(offset, &m_total_freq, sizeof(guint32)); + offset += sizeof(guint32); + table_offset_t index_one, index_two, index_three; + chunk->get_content(offset, &index_one, sizeof(table_offset_t)); + offset += sizeof(table_offset_t); + chunk->get_content(offset, &index_two, sizeof(table_offset_t)); + offset += sizeof(table_offset_t); + chunk->get_content(offset, &index_three, sizeof(table_offset_t)); + offset += sizeof(table_offset_t); + g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE); + g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE); + g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE); + m_phrase_index.set_chunk(buf_begin + index_one, + index_two - 1 - index_one, NULL); + m_phrase_content.set_chunk(buf_begin + index_two, + index_three - 1 - index_two, NULL); + g_return_val_if_fail( index_three <= end, FALSE); + return true; +} + +bool SubPhraseIndex::store(MemoryChunk * new_chunk, + table_offset_t offset, table_offset_t& end){ + new_chunk->set_content(offset, &m_total_freq, sizeof(guint32)); + table_offset_t index = offset + sizeof(guint32); + + offset = index + sizeof(table_offset_t) * 3 ; + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size()); + offset += 
m_phrase_index.size(); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + + new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size()); + offset += m_phrase_content.size(); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + return true; +} + +bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ){ + sub_phrases = new SubPhraseIndex; + } + + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + PhraseItem * item_ptr = new PhraseItem; + phrase_token_t cur_token = 0; + while ( !feof(infile)){ + fscanf(infile, "%s", pinyin); + fscanf(infile, "%s", phrase); + fscanf(infile, "%ld", &token); + fscanf(infile, "%ld", &freq); + if ( feof(infile) ) + break; + + glong written; + utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL, + &written, NULL); + + if ( 0 == cur_token ){ + cur_token = token; + item_ptr->set_phrase_string(written, phrase_utf16); + } + + if ( cur_token != token ){ + add_phrase_item( cur_token, item_ptr); + delete item_ptr; + item_ptr = new PhraseItem; + cur_token = token; + item_ptr->set_phrase_string(written, phrase_utf16); + } + + PinyinDefaultParser parser; + NullPinyinValidator validator; + PinyinKeyVector keys; + PinyinKeyPosVector poses; + + keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); + poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); + parser.parse(validator, keys, poses, pinyin); + + assert ( item_ptr->get_phrase_length() == keys->len ); + item_ptr->append_pronunciation((PinyinKey *)keys->data, freq); + + g_array_free(keys, TRUE); + g_array_free(poses, TRUE); + g_free(phrase_utf16); + } + + add_phrase_item( cur_token, item_ptr); + 
delete item_ptr; + m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq(); + return true; +} diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h new file mode 100755 index 0000000..e635453 --- /dev/null +++ b/src/storage/phrase_index.h @@ -0,0 +1,250 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef PHRASE_INDEX_H +#define PHRASE_INDEX_H + +#include <stdio.h> +#include <glib.h> +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "memory_chunk.h" + +class PinyinLookup; + +namespace novel{ + +/* Because this is not large, + * Store this in user home directory. 
+ */ + +const int phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32); + +class PhraseItem{ + friend class SubPhraseIndex; +private: + MemoryChunk m_chunk; + bool set_n_pronunciation(guint8 n_prouns); +public: + /* Null Constructor */ + PhraseItem(){ + m_chunk.set_size(phrase_item_header); + memset(m_chunk.begin(), 0, m_chunk.size()); + } + + PhraseItem(MemoryChunk chunk){ + m_chunk = chunk; + assert ( m_chunk.size() >= phrase_item_header); + } + + /* functions */ + guint8 get_phrase_length(){ + char * buf_begin = (char *)m_chunk.begin(); + return (*(guint8 *)buf_begin); + } + + guint8 get_n_pronunciation(){ + char * buf_begin = ( char *) m_chunk.begin(); + return (*(guint8 *)(buf_begin + sizeof(guint8))); + } + + guint32 get_unigram_frequency(){ + char * buf_begin = (char *)m_chunk.begin(); + return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8))); + } + + gfloat get_pinyin_possibility(PinyinCustomSettings & custom, + PinyinKey * pinyin_keys){ + guint8 phrase_length = get_phrase_length(); + guint8 npron = get_n_pronunciation(); + size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ); + char * buf_begin = (char *)m_chunk.begin(); + guint32 matched = 0, total_freq =0; + for ( int i = 0 ; i < npron ; ++i){ + char * pinyin_begin = buf_begin + offset + + i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) ); + guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey)); + total_freq += *freq; + if ( 0 == pinyin_compare_with_ambiguities(custom, + (PinyinKey *)pinyin_begin, + pinyin_keys, + phrase_length)){ + matched += *freq; + } + } + // use preprocessor to avoid zero freq, in gen_pinyin_table. 
+ /* + if ( 0 == total_freq ) + return 0.1; + */ + gfloat retval = matched / (gfloat) total_freq; + /* + if ( 0 == retval ) + return 0.03; + */ + return retval; + } + + void increase_pinyin_possibility(PinyinCustomSettings & custom, + PinyinKey * pinyin_keys, + gint32 delta); + + bool get_phrase_string(utf16_t * phrase); + bool set_phrase_string(guint8 phrase_length, utf16_t * phrase); + bool get_nth_pronunciation(size_t index, + /* out */ PinyinKey * pinyin, + /* out */ guint32 & freq); + /* Normally don't change the first pronunciation, + * which decides the token number. + */ + void append_pronunciation(PinyinKey * pinyin, guint32 freq); + void remove_nth_pronunciation(size_t index); +}; + +/* + * In Sub Phrase Index, token == (token & PHRASE_MASK). + */ + +class SubPhraseIndex{ +private: + guint32 m_total_freq; + MemoryChunk m_phrase_index; + MemoryChunk m_phrase_content; + MemoryChunk * m_chunk; +public: + SubPhraseIndex():m_total_freq(0){ + m_chunk = NULL; + } + + ~SubPhraseIndex(){ + reset(); + } + + void reset(){ + if ( m_chunk ){ + delete m_chunk; + m_chunk = NULL; + } + } + + bool load(MemoryChunk * chunk, + table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, + table_offset_t offset, table_offset_t & end); + + /* Zero-gram */ + guint32 get_phrase_index_total_freq(); + bool add_unigram_frequency(phrase_token_t token, guint32 delta); + /* get_phrase_item function can't modify the phrase item, + * but can increment the freq of the special pronunciation. 
+ */ + bool get_phrase_item(phrase_token_t token, PhraseItem & item); + bool add_phrase_item(phrase_token_t token, PhraseItem * item); + /* remove_phrase_item will substract item->get_unigram_frequency() + * from m_total_freq + */ + bool remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item); +}; + +class FacadePhraseIndex{ + friend class ::PinyinLookup; +private: + guint32 m_total_freq; + SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT]; +public: + FacadePhraseIndex(){ + m_total_freq = 0; + memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices)); + } + + ~FacadePhraseIndex(){ + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){ + if ( m_sub_phrase_indices[i] ){ + delete m_sub_phrase_indices[i]; + m_sub_phrase_indices[i] = NULL; + } + } + } + + /* load/store single sub phrase index, according to the config files. */ + bool load_text(guint8 phrase_index, FILE * infile); + bool load(guint8 phrase_index, MemoryChunk * chunk); + bool store(guint8 phrase_index, MemoryChunk * new_chunk); + bool unload(guint8 phrase_index); + + /* Zero-gram */ + guint32 get_phrase_index_total_freq(){ + return m_total_freq; + } + + bool add_unigram_frequency(phrase_token_t token, guint32 delta){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ) + return false; + m_total_freq += delta; + return sub_phrase->add_unigram_frequency(token, delta); + } + + /* get_phrase_item function can't modify the phrase item */ + bool get_phrase_item(phrase_token_t token, PhraseItem & item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ) + return false; + return sub_phrase->get_phrase_item(token, item); + } + + bool add_phrase_item(phrase_token_t token, PhraseItem * item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index]; + if ( 
!sub_phrase ){ + sub_phrase = new SubPhraseIndex; + } + m_total_freq += item->get_unigram_frequency(); + return sub_phrase->add_phrase_item(token, item); + } + + bool remove_phrase_item(phrase_token_t token, PhraseItem * & item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ){ + return false; + } + bool result = sub_phrase->remove_phrase_item(token, item); + if ( !result ) + return result; + m_total_freq -= item->get_unigram_frequency(); + return result; + } +}; + +}; + +using namespace novel; + + + + + +#endif diff --git a/src/storage/pinyin_base.cpp b/src/storage/pinyin_base.cpp new file mode 100644 index 0000000..cffee3c --- /dev/null +++ b/src/storage/pinyin_base.cpp @@ -0,0 +1,1425 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2002,2003,2006 James Su + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "stl_lite.h" +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "pinyin_large_table.h" + +// Internal data definition + +/** + * struct of pinyin token. 
+ * + * this struct store the informations of a pinyin token + * (an initial or final) + */ +struct PinyinToken +{ + const char *latin; /**< Latin name of the token. */ + const char *zhuyin; /**< Zhuyin name in UTF-8. */ + int latin_len; /**< length of Latin name. */ + int zhuyin_len; /**< length of Chinese name. */ +}; + +/** + * struct to index PinyinToken list. + */ +struct PinyinTokenIndex +{ + int start; + int num; +}; + +static const PinyinToken __pinyin_initials[] = +{ + {"", "", 0, 0}, + {"b", "ㄅ", 1, 1}, + {"c", "ㄘ", 1, 1}, + {"ch","ㄔ", 2, 1}, + {"d", "ㄉ", 1, 1}, + {"f", "ㄈ", 1, 1}, + {"h", "ㄏ", 1, 1}, + {"g", "ㄍ", 1, 1}, + {"j", "ㄐ", 1, 1}, + {"k", "ㄎ", 1, 1}, + {"m", "ㄇ", 1, 1}, + {"n", "ㄋ", 1, 1}, + {"l", "ㄌ", 1, 1}, + {"r", "ㄖ", 1, 1}, + {"p", "ㄆ", 1, 1}, + {"q", "ㄑ", 1, 1}, + {"s", "ㄙ", 1, 1}, + {"sh","ㄕ", 2, 1}, + {"t", "ㄊ", 1, 1}, + {"w", "ㄨ", 1, 1}, //Should be omitted in some case. + {"x", "ㄒ", 1, 1}, + {"y", "ㄧ", 1, 1}, //Should be omitted in some case. + {"z", "ㄗ", 1, 1}, + {"zh","ㄓ", 2, 1} +}; + +static const PinyinToken __pinyin_finals[] = +{ + {"", "", 0, 0}, + {"a", "ㄚ", 1, 1}, + {"ai", "ㄞ", 2, 1}, + {"an", "ㄢ", 2, 1}, + {"ang", "ㄤ", 3, 1}, + {"ao", "ㄠ", 2, 1}, + {"e", "ㄜ", 1, 1}, + {"ea", "ㄝ", 2, 1}, + {"ei", "ㄟ", 2, 1}, + {"en", "ㄣ", 2, 1}, + {"eng", "ㄥ", 3, 1}, + {"er", "ㄦ", 2, 1}, + {"i", "ㄧ", 1, 1}, + {"ia", "ㄧㄚ", 2, 2}, + {"ian", "ㄧㄢ", 3, 2}, + {"iang","ㄧㄤ", 4, 2}, + {"iao", "ㄧㄠ", 3, 2}, + {"ie", "ㄧㄝ", 2, 2}, + {"in", "ㄧㄣ", 2, 2}, + {"ing", "ㄧㄥ", 3, 2}, + {"iong","ㄩㄥ", 4, 2}, + {"iu", "ㄧㄡ", 2, 2}, + {"ng", "ㄣ", 2, 1}, + {"o", "ㄛ", 1, 1}, + {"ong", "ㄨㄥ", 3, 2}, + {"ou", "ㄡ", 2, 1}, + {"u", "ㄨ", 1, 1}, + {"ua", "ㄨㄚ", 2, 2}, + {"uai", "ㄨㄞ", 3, 2}, + {"uan", "ㄨㄢ", 3, 2}, + {"uang","ㄨㄤ", 4, 2}, + {"ue", "ㄩㄝ", 2, 2}, + {"ueng","ㄨㄥ", 4, 2}, + {"ui", "ㄨㄟ", 2, 2}, + {"un", "ㄨㄣ", 2, 2}, + {"uo", "ㄨㄛ", 2, 2}, + {"v", "ㄩ", 1, 1}, + {"van", "ㄩㄢ", 3, 2}, + {"ve", "ㄩㄝ", 2, 2}, + {"vn", "ㄩㄣ", 2, 2} +}; + +static const PinyinToken __pinyin_tones [] = 
+{ + {"", "", 0, 0}, + {"1", "ˉ", 1, 1}, + {"2", "ˊ", 1, 1}, + {"3", "ˇ", 1, 1}, + {"4", "ˋ", 1, 1}, + {"5", "˙", 1, 1} +}; + +static const PinyinTokenIndex __pinyin_initials_index[] = +{ + //a b c d e f g h i j k l m + {-1,0},{1,1}, {2,2}, {4,1}, {-1,0},{5,1}, {7,1}, {6,1}, {-1,0},{8,1}, {9,1}, {12,1},{10,1}, + //n o p q r s t u v w x y z + {11,1},{-1,0},{14,1},{15,1},{13,1},{16,2},{18,1},{-1,0},{-1,0},{19,1},{20,1},{21,1},{22,2} +}; + +static const PinyinTokenIndex __pinyin_finals_index[] = +{ + //a b c d e f g h i j k l m + {1,5}, {-1,0},{-1,0},{-1,0},{6,6},{-1,0},{-1,0},{-1,0},{12,10},{-1,0},{-1,0},{-1,0},{-1,0}, + //n o p q r s t u v w x y z + {22,1},{23,3},{-1,0},{-1,0},{-1,0},{-1,0},{-1,0},{26,10},{36,4},{-1,0},{-1,0},{-1,0},{-1,0} +}; + + + +static const PinyinInitial __shuang_pin_stone_initial_map [] = +{ + PINYIN_ZeroInitial, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Shi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Chi, // U + PINYIN_Zhi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_stone_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ia, PINYIN_Ua }, // B + { PINYIN_Uan, PINYIN_ZeroFinal }, // C + { PINYIN_Ao, PINYIN_ZeroFinal }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_An, PINYIN_ZeroFinal }, // F + { PINYIN_Ang, PINYIN_ZeroFinal }, // G + { PINYIN_Uang,PINYIN_Iang }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_Ian, PINYIN_ZeroFinal }, // J + { PINYIN_Iao, PINYIN_ZeroFinal }, // K + { PINYIN_In, PINYIN_ZeroFinal }, // L + { PINYIN_Ie, PINYIN_ZeroFinal }, // M + { PINYIN_Iu, PINYIN_ZeroFinal }, // N + { PINYIN_Uo, PINYIN_O }, 
// O + { PINYIN_Ou, PINYIN_ZeroFinal }, // P + { PINYIN_Ing, PINYIN_Er }, // Q + { PINYIN_En, PINYIN_ZeroFinal }, // R + { PINYIN_Ai, PINYIN_ZeroFinal }, // S + { PINYIN_Ng, PINYIN_Eng }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_Ui }, // V + { PINYIN_Ei, PINYIN_ZeroFinal }, // W + { PINYIN_Uai, PINYIN_Ue }, // X + { PINYIN_Ong, PINYIN_Iong }, // Y + { PINYIN_Un, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_zrm_initial_map [] = +{ + PINYIN_ZeroInitial, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Chi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Shi, // U + PINYIN_Zhi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_zrm_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ou, PINYIN_ZeroFinal }, // B + { PINYIN_Iao, PINYIN_ZeroFinal }, // C + { PINYIN_Uang,PINYIN_Iang }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_En, PINYIN_ZeroFinal }, // F + { PINYIN_Ng, PINYIN_Eng }, // G + { PINYIN_Ang, PINYIN_ZeroFinal }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_An, PINYIN_ZeroFinal }, // J + { PINYIN_Ao, PINYIN_ZeroFinal }, // K + { PINYIN_Ai, PINYIN_ZeroFinal }, // L + { PINYIN_Ian, PINYIN_ZeroFinal }, // M + { PINYIN_In, PINYIN_ZeroFinal }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Un, PINYIN_ZeroFinal }, // P + { PINYIN_Iu, PINYIN_ZeroFinal }, // Q + { PINYIN_Uan, PINYIN_Er }, // R + { PINYIN_Ong, PINYIN_Iong }, // S + { PINYIN_Ue, PINYIN_ZeroFinal }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_Ui }, // V + { PINYIN_Ia, PINYIN_Ua }, 
// W + { PINYIN_Ie, PINYIN_ZeroFinal }, // X + { PINYIN_Ing, PINYIN_Uai }, // Y + { PINYIN_Ei, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_ms_initial_map [] = +{ + PINYIN_ZeroInitial, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Chi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Shi, // U + PINYIN_Zhi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_ms_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ou, PINYIN_ZeroFinal }, // B + { PINYIN_Iao, PINYIN_ZeroFinal }, // C + { PINYIN_Uang,PINYIN_Iang }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_En, PINYIN_ZeroFinal }, // F + { PINYIN_Ng, PINYIN_Eng }, // G + { PINYIN_Ang, PINYIN_ZeroFinal }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_An, PINYIN_ZeroFinal }, // J + { PINYIN_Ao, PINYIN_ZeroFinal }, // K + { PINYIN_Ai, PINYIN_ZeroFinal }, // L + { PINYIN_Ian, PINYIN_ZeroFinal }, // M + { PINYIN_In, PINYIN_ZeroFinal }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Un, PINYIN_ZeroFinal }, // P + { PINYIN_Iu, PINYIN_ZeroFinal }, // Q + { PINYIN_Uan, PINYIN_Er }, // R + { PINYIN_Ong, PINYIN_Iong }, // S + { PINYIN_Ue, PINYIN_ZeroFinal }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_Ui }, // V + { PINYIN_Ia, PINYIN_Ua }, // W + { PINYIN_Ie, PINYIN_ZeroFinal }, // X + { PINYIN_Uai, PINYIN_V }, // Y + { PINYIN_Ei, PINYIN_ZeroFinal }, // Z + { PINYIN_Ing, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_ziguang_initial_map [] = +{ + PINYIN_Chi, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + 
PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Shi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Zhi, // U + PINYIN_ZeroInitial, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_ziguang_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Iao, PINYIN_ZeroFinal }, // B + { PINYIN_Ing, PINYIN_ZeroFinal }, // C + { PINYIN_Ie, PINYIN_ZeroFinal }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_Ian, PINYIN_ZeroFinal }, // F + { PINYIN_Uang,PINYIN_Iang }, // G + { PINYIN_Ong, PINYIN_Iong }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_Iu, PINYIN_Er }, // J + { PINYIN_Ei, PINYIN_ZeroFinal }, // K + { PINYIN_Uan, PINYIN_ZeroFinal }, // L + { PINYIN_Un, PINYIN_ZeroFinal }, // M + { PINYIN_Ui, PINYIN_Ue }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Ai, PINYIN_ZeroFinal }, // P + { PINYIN_Ao, PINYIN_ZeroFinal }, // Q + { PINYIN_An, PINYIN_ZeroFinal }, // R + { PINYIN_Ang, PINYIN_ZeroFinal }, // S + { PINYIN_Ng, PINYIN_Eng }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_ZeroFinal }, // V + { PINYIN_En, PINYIN_ZeroFinal }, // W + { PINYIN_Ia, PINYIN_Ua }, // X + { PINYIN_In, PINYIN_Uai }, // Y + { PINYIN_Ou, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_abc_initial_map [] = +{ + PINYIN_Zhi, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_Chi, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_ZeroInitial, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + 
PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_ZeroInitial, // U + PINYIN_Shi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_abc_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ou, PINYIN_ZeroFinal }, // B + { PINYIN_In, PINYIN_Uai }, // C + { PINYIN_Ia, PINYIN_Ua }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_En, PINYIN_ZeroFinal }, // F + { PINYIN_Ng, PINYIN_Eng }, // G + { PINYIN_Ang, PINYIN_ZeroFinal }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_An, PINYIN_ZeroFinal }, // J + { PINYIN_Ao, PINYIN_ZeroFinal }, // K + { PINYIN_Ai, PINYIN_ZeroFinal }, // L + { PINYIN_Ui, PINYIN_Ue }, // M + { PINYIN_Un, PINYIN_ZeroFinal }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Uan, PINYIN_ZeroFinal }, // P + { PINYIN_Ei, PINYIN_ZeroFinal }, // Q + { PINYIN_Iu, PINYIN_Er }, // R + { PINYIN_Ong, PINYIN_Iong }, // S + { PINYIN_Uang,PINYIN_Iang }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_ZeroFinal }, // V + { PINYIN_Ian, PINYIN_ZeroFinal }, // W + { PINYIN_Ie, PINYIN_ZeroFinal }, // X + { PINYIN_Ing, PINYIN_ZeroFinal }, // Y + { PINYIN_Iao, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_liushi_initial_map [] = +{ + PINYIN_ZeroInitial, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Chi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Shi, // U + PINYIN_Zhi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_liushi_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A 
+ { PINYIN_Ao, PINYIN_ZeroFinal }, // B + { PINYIN_Ang, PINYIN_ZeroFinal }, // C + { PINYIN_Uan, PINYIN_ZeroFinal }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_An, PINYIN_ZeroFinal }, // F + { PINYIN_Ong, PINYIN_Iong }, // G + { PINYIN_Ui, PINYIN_Ue }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_Ia, PINYIN_Ua }, // J + { PINYIN_Un, PINYIN_ZeroFinal }, // K + { PINYIN_Iu, PINYIN_ZeroFinal }, // L + { PINYIN_In, PINYIN_ZeroFinal }, // M + { PINYIN_Uang,PINYIN_Iang }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Ng, PINYIN_Eng }, // P + { PINYIN_Ing, PINYIN_ZeroFinal }, // Q + { PINYIN_Ou, PINYIN_Er }, // R + { PINYIN_Ai, PINYIN_ZeroFinal }, // S + { PINYIN_Ian, PINYIN_ZeroFinal }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_En }, // V + { PINYIN_Ei, PINYIN_ZeroFinal }, // W + { PINYIN_Ie, PINYIN_ZeroFinal }, // X + { PINYIN_Uai, PINYIN_ZeroFinal }, // Y + { PINYIN_Iao, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + +static const size_t __zhuyin_zhuyin_map_start_char = 0x3105; +static const size_t __zhuyin_zhuyin_map_tone_start_idx = 37; +static const PinyinKey __zhuyin_zhuyin_map [][3] = +{ + {PinyinKey(PINYIN_Bo),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Po),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Mo),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Fo),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_De),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Te),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ne),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Le),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ge),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ke),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_He),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ji),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Qi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Xi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Zhi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Chi),PinyinKey(),PinyinKey()}, + 
{PinyinKey(PINYIN_Shi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ri),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Zi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ci),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Si),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_A),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_O),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_E),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ea),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ai),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ei),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ao),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ou),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_An),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_En),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ang),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Eng),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Er),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_I),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_U),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_V),PinyinKey(),PinyinKey()}, +}; + +static const size_t __zhuyin_map_start_char = 0x20; +#include "pinyin_zhuyin_map_data.h" + +static const PinyinKey (*__zhuyin_maps []) [3] = { + __zhuyin_zhuyin_map, + __zhuyin_standard_map, + __zhuyin_hsu_map, + __zhuyin_ibm_map, + __zhuyin_gin_yieh_map, + __zhuyin_et_map, + __zhuyin_et26_map, + 0 +}; + + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinCustomSettings + +PinyinCustomSettings::PinyinCustomSettings () + : use_incomplete (true) +{ + for (size_t i=0; i<=PINYIN_AmbLast; ++i) + use_ambiguities [i] = false; +} + 
+////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinKey + +const guint16 PinyinKey::min_value = 0; +const guint16 PinyinKey::max_value = PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones - 1; + +const char* +PinyinKey::get_initial_string () const +{ + return __pinyin_initials [m_initial].latin; +} + +const char* +PinyinKey::get_initial_zhuyin_string () const +{ + if ((m_initial == PINYIN_Wu && m_final == PINYIN_U) || + (m_initial == PINYIN_Yi && + (m_final == PINYIN_I || m_final == PINYIN_In || m_final == PINYIN_Ing || m_final == PINYIN_Ong || + m_final == PINYIN_U || m_final == PINYIN_Ue || m_final == PINYIN_Uan || m_final == PINYIN_Un))) + return ""; + + return __pinyin_initials [m_initial].zhuyin; +} + +const char* +PinyinKey::get_final_string () const +{ + return __pinyin_finals [m_final].latin; +} + +const char* +PinyinKey::get_final_zhuyin_string () const +{ + if (m_initial == PINYIN_Yi && m_final == PINYIN_Ong) { + return __pinyin_finals [PINYIN_Iong].zhuyin; + } else if (m_initial == PINYIN_Yi || m_initial == PINYIN_Ji || m_initial == PINYIN_Qi || m_initial == PINYIN_Xi) { + switch (m_final) { + case PINYIN_U: + return __pinyin_finals [PINYIN_V].zhuyin; + case PINYIN_Ue: + return __pinyin_finals [PINYIN_Ve].zhuyin; + case PINYIN_Uan: + return __pinyin_finals [PINYIN_Van].zhuyin; + case PINYIN_Un: + return __pinyin_finals [PINYIN_Vn].zhuyin; + } + if (m_initial == PINYIN_Yi && m_final == PINYIN_E) + return __pinyin_finals [PINYIN_Ea].zhuyin; + } else if ((m_initial == PINYIN_Ne || m_initial == PINYIN_Le) && m_final == PINYIN_Ue) { + return __pinyin_finals [PINYIN_Ve].zhuyin; + } else if ((m_initial == PINYIN_Zhi || m_initial == PINYIN_Chi || m_initial == PINYIN_Shi || + m_initial == PINYIN_Zi || m_initial == PINYIN_Ci || m_initial == PINYIN_Si || + m_initial == PINYIN_Ri) && m_final == PINYIN_I) { + return ""; + } + + return __pinyin_finals [m_final].zhuyin; +} + +const char* 
+PinyinKey::get_tone_string () const +{ + return __pinyin_tones [m_tone].latin; +} + +const char* +PinyinKey::get_tone_zhuyin_string () const +{ + return __pinyin_tones [m_tone].zhuyin; +} + +const char * +PinyinKey::get_key_string () const +{ + char key [16]; + g_snprintf (key, 15, "%s%s%s", get_initial_string(), get_final_string(), get_tone_string ()); + + return g_strdup(key); +} + +const char * +PinyinKey::get_key_zhuyin_string () const +{ + char key [32]; + g_snprintf (key, 31, "%s%s%s", get_initial_zhuyin_string(), get_final_zhuyin_string(), get_tone_zhuyin_string ()); + + return g_strdup (key); +} + +int +PinyinKey::set (const PinyinValidator &validator, const char *str, int len) +{ + if (!str || ! (*str)) + return 0; + + PinyinDefaultParser parser; + + return parser.parse_one_key (validator, *this, str, len); +} + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinValidator +BitmapPinyinValidator::BitmapPinyinValidator (const PinyinLargeTable *table) +{ + initialize (table); +} + +void +BitmapPinyinValidator::initialize (const PinyinLargeTable *table) +{ + memset (m_bitmap, 0, sizeof (m_bitmap)); + + if (!table) return; + + for (guint16 val=0; val<=PinyinKey::max_value; ++val) + if (!table->has_key (PinyinKey (val))) + m_bitmap [val >> 3] |= (1 << (val % 8)); +} + +bool +BitmapPinyinValidator::operator () (PinyinKey key) const +{ + if (key.is_empty ()) return false; + + guint16 val = key.get_value (); + + return (m_bitmap [ val >> 3 ] & (1 << (val % 8))) == 0; +} + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinParser +PinyinParser::~PinyinParser () +{ +} + +struct PinyinReplaceRulePair +{ + PinyinInitial initial; + PinyinFinal final; + PinyinInitial new_initial; + PinyinFinal new_final; +}; + +class PinyinReplaceRulePairLessThan +{ +public: + bool operator () (const PinyinReplaceRulePair &lhs, const PinyinReplaceRulePair &rhs) const { + if 
(lhs.initial < rhs.initial) return true; + if (lhs.initial > rhs.initial) return false; + return lhs.final < rhs.final; + } +}; + +void +PinyinParser::normalize (PinyinKey &key) +{ + static const PinyinReplaceRulePair rules [] = + { +#if 0 + {PINYIN_ZeroInitial, PINYIN_I, PINYIN_Yi, PINYIN_I}, + {PINYIN_ZeroInitial, PINYIN_Ia, PINYIN_Yi, PINYIN_A}, + {PINYIN_ZeroInitial, PINYIN_Ian, PINYIN_Yi, PINYIN_An}, + {PINYIN_ZeroInitial, PINYIN_Iang, PINYIN_Yi, PINYIN_Ang}, + {PINYIN_ZeroInitial, PINYIN_Iao, PINYIN_Yi, PINYIN_Ao}, + {PINYIN_ZeroInitial, PINYIN_Ie, PINYIN_Yi, PINYIN_E}, + {PINYIN_ZeroInitial, PINYIN_In, PINYIN_Yi, PINYIN_In}, + {PINYIN_ZeroInitial, PINYIN_Ing, PINYIN_Yi, PINYIN_Ing}, + {PINYIN_ZeroInitial, PINYIN_Iong, PINYIN_Yi, PINYIN_Ong}, + {PINYIN_ZeroInitial, PINYIN_Iu, PINYIN_Yi, PINYIN_Ou}, + {PINYIN_ZeroInitial, PINYIN_U, PINYIN_Wu, PINYIN_U}, + {PINYIN_ZeroInitial, PINYIN_Ua, PINYIN_Wu, PINYIN_A}, + {PINYIN_ZeroInitial, PINYIN_Uai, PINYIN_Wu, PINYIN_Ai}, + {PINYIN_ZeroInitial, PINYIN_Uan, PINYIN_Wu, PINYIN_An}, + {PINYIN_ZeroInitial, PINYIN_Uang, PINYIN_Wu, PINYIN_Ang}, + {PINYIN_ZeroInitial, PINYIN_Ue, PINYIN_Wu, PINYIN_E}, + {PINYIN_ZeroInitial, PINYIN_Ueng, PINYIN_Wu, PINYIN_Eng}, + {PINYIN_ZeroInitial, PINYIN_Ui, PINYIN_Wu, PINYIN_Ei}, + {PINYIN_ZeroInitial, PINYIN_Un, PINYIN_Wu, PINYIN_En}, + {PINYIN_ZeroInitial, PINYIN_Uo, PINYIN_Wu, PINYIN_O}, + {PINYIN_ZeroInitial, PINYIN_V, PINYIN_Yi, PINYIN_U}, + {PINYIN_ZeroInitial, PINYIN_Van, PINYIN_Yi, PINYIN_Uan}, + {PINYIN_ZeroInitial, PINYIN_Ve, PINYIN_Yi, PINYIN_Ue}, + {PINYIN_ZeroInitial, PINYIN_Vn, PINYIN_Yi, PINYIN_Un}, +#endif + {PINYIN_Ji, PINYIN_V, PINYIN_Ji, PINYIN_U}, + {PINYIN_Ji, PINYIN_Van, PINYIN_Ji, PINYIN_Uan}, + {PINYIN_Ji, PINYIN_Ve, PINYIN_Ji, PINYIN_Ue}, + {PINYIN_Ji, PINYIN_Vn, PINYIN_Ji, PINYIN_Un}, + {PINYIN_Ne, PINYIN_Ve, PINYIN_Ne, PINYIN_Ue}, + {PINYIN_Le, PINYIN_Ve, PINYIN_Le, PINYIN_Ue}, + {PINYIN_Qi, PINYIN_V, PINYIN_Qi, PINYIN_U}, + {PINYIN_Qi, PINYIN_Van, PINYIN_Qi, 
PINYIN_Uan}, + {PINYIN_Qi, PINYIN_Ve, PINYIN_Qi, PINYIN_Ue}, + {PINYIN_Qi, PINYIN_Vn, PINYIN_Qi, PINYIN_Un}, + {PINYIN_Xi, PINYIN_V, PINYIN_Xi, PINYIN_U}, + {PINYIN_Xi, PINYIN_Van, PINYIN_Xi, PINYIN_Uan}, + {PINYIN_Xi, PINYIN_Ve, PINYIN_Xi, PINYIN_Ue}, + {PINYIN_Xi, PINYIN_Vn, PINYIN_Xi, PINYIN_Un} + }; + static const PinyinReplaceRulePair *rules_start = rules; + static const PinyinReplaceRulePair *rules_end = rules + sizeof(rules)/sizeof(PinyinReplaceRulePair); + + PinyinReplaceRulePair kp; + + kp.initial = key.get_initial (); + kp.final = key.get_final (); + + const PinyinReplaceRulePair *p = std_lite::lower_bound (rules_start, rules_end, kp, PinyinReplaceRulePairLessThan ()); + + if (p->initial == kp.initial && p->final == kp.final) { + key.set_initial (p->new_initial); + key.set_final (p->new_final); + } +} + +//============== Internal functions used by PinyinDefaultParser ============== +static int +__default_parser_parse_initial (PinyinInitial &initial, const char *str, int len) +{ + int lastlen = 0; + + initial = PINYIN_ZeroInitial; + + if (str && *str >= 'a' && *str <= 'z') { + int start = __pinyin_initials_index [*str - 'a'].start; + int end = __pinyin_initials_index [*str - 'a'].num + start; + + if (start > 0) { + for (int i = start; i < end; ++i) { + if ((len < 0 || len >= __pinyin_initials [i].latin_len) && __pinyin_initials [i].latin_len >= lastlen) { + int j; + for (j = 1; j < __pinyin_initials [i].latin_len; ++j) { + if (str [j] != __pinyin_initials [i].latin [j]) + break; + } + if (j == __pinyin_initials [i].latin_len) { + initial = static_cast<PinyinInitial>(i); + lastlen = __pinyin_initials [i].latin_len; + } + } + } + } + } + + return lastlen; +} +static int +__default_parser_parse_final (PinyinFinal &final, const char *str, int len) +{ + int lastlen = 0; + + final = PINYIN_ZeroFinal; + + if (str && *str >= 'a' && *str <= 'z') { + int start = __pinyin_finals_index [*str - 'a'].start; + int end = __pinyin_finals_index [*str - 'a'].num + start; + + 
if (start > 0) { + for (int i = start; i < end; ++i) { + if ((len < 0 || len >= __pinyin_finals [i].latin_len) && __pinyin_finals [i].latin_len >= lastlen) { + int j; + for (j = 1; j < __pinyin_finals [i].latin_len; ++j) { + if (str [j] != __pinyin_finals [i].latin [j]) + break; + } + if (j == __pinyin_finals [i].latin_len) { + final = static_cast<PinyinFinal>(i); + lastlen = __pinyin_finals [i].latin_len; + } + } + } + } + } + + return lastlen; +} +static int +__default_parser_parse_tone (PinyinTone &tone, const char *str, int len) +{ + tone = PINYIN_ZeroTone; + + if (str && (len >= 1 || len < 0)) { + int kt = (*str) - '0'; + if (kt >= PINYIN_First && kt <= PINYIN_LastTone) { + tone = static_cast<PinyinTone>(kt); + return 1; + } + } + return 0; +} + +static int +__default_parser_parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) +{ + int initial_len = 0; + int final_len = 0; + int tone_len = 0; + + const char *ptr; + + PinyinInitial initial; + PinyinFinal final; + PinyinTone tone; + + key.clear (); + + if (!str || !len) return 0; + + if (len < 0) len = strlen (str); + + while (len > 0) { + ptr = str; + + initial = PINYIN_ZeroInitial; + final = PINYIN_ZeroFinal; + tone = PINYIN_ZeroTone; + + final_len = __default_parser_parse_final (final, ptr, len); + ptr += final_len; + len -= final_len; + + // An initial is present + if (final == PINYIN_ZeroFinal) { + initial_len = __default_parser_parse_initial (initial, ptr, len); + ptr += initial_len; + len -= initial_len; + if (len){ + final_len = __default_parser_parse_final (final, ptr, len); + ptr += final_len; + len -= final_len; + } + } + + if (len) + tone_len = __default_parser_parse_tone (tone, ptr, len); + + key.set (initial, final, tone); + + PinyinParser::normalize (key); + + // A valid key was found, return. + if (validator (key)) break; + + // The key is invalid, reduce the len and find again. 
+ len = initial_len + final_len + tone_len - 1; + + initial_len = final_len = tone_len = 0; + + key.clear (); + } + + len = initial_len + final_len + tone_len; + + return len; +} + +struct DefaultParserCacheElement +{ + PinyinKey key; + PinyinKeyPos pos; + int num_keys; + int parsed_len; + int next_start; +}; + +typedef GArray* DefaultParserCache; /* Array of DefaultParserCacheElement */ + +static int +__default_parser_parse_recursive (const PinyinValidator &validator, + DefaultParserCache &cache, + int &real_start, + int &num_keys, + const char *str, + int len, + int start) +{ + if (*str == 0 || len == 0) return 0; + + int used_len = 0; + + real_start = 0; + num_keys = 0; + + if (*str == '\'' || *str == ' ') { + ++used_len; + ++str; + ++start; + --len; + } + + if (!isalpha (*str) || !len) + return 0; + + real_start = start; + + // The best keys start from this position have been found, just return the result. + DefaultParserCacheElement* element = &g_array_index + (cache, DefaultParserCacheElement, start); + + + if (element->num_keys >=0) { + num_keys = element->num_keys; + return element->parsed_len; + } + + PinyinKey first_key; + PinyinKey best_first_key; + PinyinKeyPos pos; + + int first_len = 0; + int best_first_len = 0; + + int remained_len = 0; + int best_remained_len = 0; + + int remained_keys = 0; + int best_remained_keys = 0; + + int remained_start = 0; + int best_remained_start = 0; + + first_len = __default_parser_parse_one_key (validator, first_key, str, len); + + if (!first_len) { + element = &g_array_index(cache, DefaultParserCacheElement, start); + + element->key = PinyinKey (); + element->num_keys = 0; + element->parsed_len = 0; + element->next_start = start; + return 0; + } + + best_first_key = first_key; + best_first_len = first_len; + + if (len > first_len) { + char ch1 = str [first_len -1]; + char ch2 = str [first_len]; + + best_remained_len = __default_parser_parse_recursive (validator, + cache, + best_remained_start, + best_remained_keys, + 
str + first_len, + len - first_len, + start + first_len); + + // For those keys which the last char is 'g' or 'n' or 'r', try put the end char into the next key. + if (first_len > 1 && + (((ch1=='g' || ch1=='n' || ch1=='r') && (ch2=='a' || ch2=='e' || ch2=='i' || ch2=='o' || ch2=='u' || ch2=='v')) || + ((ch1=='a' || ch1=='e' || ch1=='o') && (ch2=='i' || ch2=='n' || ch2=='o' || ch2=='r' || ch2=='u')))) { + + first_len = __default_parser_parse_one_key (validator, first_key, str, first_len - 1); + + if (first_len) { + remained_len = __default_parser_parse_recursive (validator, + cache, + remained_start, + remained_keys, + str + first_len, + len - first_len, + start + first_len); + + + DefaultParserCacheElement* best_remained_element = &g_array_index + (cache, DefaultParserCacheElement, best_remained_start); + + // A better seq was found. + if (remained_len != 0 && (remained_len + first_len) >= (best_remained_len + best_first_len) && + (remained_keys <= best_remained_keys || best_remained_keys == 0)) { +#if 0 + if ((remained_len + first_len) > (best_remained_len + best_first_len) || + remained_keys < best_remained_keys || + best_remained_element->key.get_final () == PINYIN_ZeroFinal || + best_remained_element->key.get_initial () == PINYIN_Wu || + best_remained_element->key.get_initial () == PINYIN_Yi) { +#endif + best_first_len = first_len; + best_first_key = first_key; + best_remained_len = remained_len; + best_remained_keys = remained_keys; + best_remained_start = remained_start; +#if 0 + } +#endif + } + } + } + } + + num_keys = best_remained_keys + 1; + + + element = &g_array_index + (cache, DefaultParserCacheElement, start); + + pos.set_pos(start); + pos.set_length(best_first_len); + + element->key = best_first_key; + element->pos = pos; + element->num_keys = num_keys; + element->parsed_len = used_len + best_first_len + best_remained_len; + element->next_start = best_remained_start; + + return element->parsed_len; +} 
+//============================================================================ + +PinyinDefaultParser::~PinyinDefaultParser () +{ +} + +int +PinyinDefaultParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const +{ + return __default_parser_parse_one_key (validator, key, str, len); +} + +int +PinyinDefaultParser::parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len) const +{ + g_array_set_size(keys, 0); + g_array_set_size(poses, 0); + + if (!str || !len) return 0; + + if (len < 0) len = strlen (str); + + DefaultParserCacheElement elm; + + elm.num_keys = -1L; + elm.parsed_len = 0; + elm.next_start = 0; + + DefaultParserCache cache = g_array_new (FALSE, TRUE, sizeof (DefaultParserCacheElement)); + g_array_set_size(cache, len); + for ( size_t index = 0 ; index < len ; index++){ + DefaultParserCacheElement * element = + &g_array_index(cache,DefaultParserCacheElement, index); + *element = elm; + } + int start = 0; + int num_keys = 0; + + len = __default_parser_parse_recursive (validator, cache, start, num_keys, str, len, 0); + + for (size_t i=0; i<(size_t)num_keys; ++i) { + DefaultParserCacheElement* element = &g_array_index + (cache, DefaultParserCacheElement, start); + g_array_append_val(keys, element->key); + g_array_append_val(poses, element->pos); + start = element->next_start; + } + + return len; +} + +PinyinShuangPinParser::PinyinShuangPinParser (PinyinShuangPinScheme scheme) +{ + set_scheme (scheme); +} + +PinyinShuangPinParser::PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]) +{ + set_scheme (initial_map, final_map); +} + +PinyinShuangPinParser::~PinyinShuangPinParser () +{ +} + +int +PinyinShuangPinParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const +{ + key.clear (); + + if (!str || !len || ! 
(*str)) return 0; + + if (len < 0) len = strlen (str); + + PinyinInitial initial = PINYIN_ZeroInitial; + PinyinFinal final = PINYIN_ZeroFinal; + PinyinFinal final_cands [4] = { PINYIN_ZeroFinal, PINYIN_ZeroFinal, PINYIN_ZeroFinal, PINYIN_ZeroFinal }; + + PinyinTone tone = PINYIN_ZeroTone; + + int idx [2] = {-1, -1}; + int used_len = 0; + + size_t i; + bool matched = false; + + for (i = 0; i < 2 && i < (size_t) len; ++i) { + if (str [i] >= 'a' && str [i] <= 'z') idx [i] = str [i] - 'a'; + else if (str [i] == ';') idx [i] = 26; + } + + // parse initial or final + if (idx [0] >= 0) { + initial = m_initial_map [idx[0]]; + final_cands [0] = m_final_map [idx[0]][0]; + final_cands [1] = m_final_map [idx[0]][1]; + } + + if (initial == PINYIN_ZeroInitial && final_cands [0] == PINYIN_ZeroFinal) + return 0; + + // parse final, if str [0] == 'o' (idx [0] == 14) then just skip to parse final. + if (idx [1] >= 0 && (initial != PINYIN_ZeroInitial || idx[0] == 14)) { + final_cands [2] = m_final_map [idx [1]][0]; + final_cands [3] = m_final_map [idx [1]][1]; + + for (i = 2; i < 4; ++i) { + if (final_cands [i] != PINYIN_ZeroFinal) { + key.set (initial, final_cands [i]); + PinyinParser::normalize (key); + + if (validator (key)) { + final = final_cands [i]; + matched = true; + used_len = 2; + str += 2; + len -= 2; + break; + } + } + } + } + + if (!matched) { + initial = PINYIN_ZeroInitial; + for (i = 0; i < 2; ++i) { + key.set (initial, final_cands [i]); + PinyinParser::normalize (key); + + if (validator (key)) { + final = final_cands [i]; + matched = true; + used_len = 1; + ++str; + --len; + break; + } + } + } + + if (!matched) return 0; + + // parse tone + if (len) { + int kt = (*str) - '0'; + if (kt >= PINYIN_First && kt <= PINYIN_LastTone) { + tone = static_cast<PinyinTone>(kt); + + key.set (initial, final, tone); + + if (validator (key)) { + return used_len + 1; + } + } + } + + return used_len; +} + +int +PinyinShuangPinParser::parse (const PinyinValidator &validator, 
PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len) const +{ + g_array_set_size(keys, 0); + g_array_set_size(poses, 0); + + if (!str || !len || ! (*str)) return 0; + + if (len < 0) len = strlen (str); + + int used_len = 0; + + PinyinKey key; + PinyinKeyPos pos; + + while (used_len < len) { + if (*str == '\'' || *str == ' ') { + ++str; + ++used_len; + continue; + } + + int one_len = parse_one_key (validator, key, str, len); + + if (one_len) { + pos.set_pos(used_len); + pos.set_length(one_len); + g_array_append_val(keys, key); + g_array_append_val(poses, pos); + } else { + break; + } + + str += one_len; + used_len += one_len; + } + + return used_len; +} + +void +PinyinShuangPinParser::set_scheme (PinyinShuangPinScheme scheme) +{ + switch (scheme) { + case SHUANG_PIN_STONE: + set_scheme (__shuang_pin_stone_initial_map, __shuang_pin_stone_final_map); + break; + case SHUANG_PIN_ZRM: + set_scheme (__shuang_pin_zrm_initial_map, __shuang_pin_zrm_final_map); + break; + case SHUANG_PIN_MS: + set_scheme (__shuang_pin_ms_initial_map, __shuang_pin_ms_final_map); + break; + case SHUANG_PIN_ZIGUANG: + set_scheme (__shuang_pin_ziguang_initial_map, __shuang_pin_ziguang_final_map); + break; + case SHUANG_PIN_ABC: + set_scheme (__shuang_pin_abc_initial_map, __shuang_pin_abc_final_map); + break; + case SHUANG_PIN_LIUSHI: + set_scheme (__shuang_pin_liushi_initial_map, __shuang_pin_liushi_final_map); + break; + default: + set_scheme (__shuang_pin_zrm_initial_map, __shuang_pin_zrm_final_map); + return; + } +} + +void +PinyinShuangPinParser::set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]) +{ + for (size_t i = 0; i < 27; ++i) { + m_initial_map [i] = initial_map [i]; + m_final_map [i][0] = final_map [i][0]; + m_final_map [i][1] = final_map [i][1]; + } +} + +void +PinyinShuangPinParser::get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2]) +{ + for (size_t i = 0; i < 27; ++i) { + initial_map [i] = m_initial_map 
[i]; + final_map [i][0] = m_final_map [i][0]; + final_map [i][1] = m_final_map [i][1]; + } +} + +namespace novel{ + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinKey comparision classe +int pinyin_compare_initial (const PinyinCustomSettings &custom, + PinyinInitial lhs, + PinyinInitial rhs) +{ + if ((lhs == rhs) || + (custom.use_ambiguities [PINYIN_AmbZhiZi] && + ((lhs == PINYIN_Zhi && rhs == PINYIN_Zi) || + (lhs == PINYIN_Zi && rhs == PINYIN_Zhi))) || + + (custom.use_ambiguities [PINYIN_AmbChiCi] && + ((lhs == PINYIN_Chi && rhs == PINYIN_Ci) || + (lhs == PINYIN_Ci && rhs == PINYIN_Chi))) || + + (custom.use_ambiguities [PINYIN_AmbShiSi] && + ((lhs == PINYIN_Shi && rhs == PINYIN_Si) || + (lhs == PINYIN_Si && rhs == PINYIN_Shi))) || + + (custom.use_ambiguities [PINYIN_AmbLeRi] && + ((lhs == PINYIN_Le && rhs == PINYIN_Ri) || + (lhs == PINYIN_Ri && rhs == PINYIN_Le))) || + + (custom.use_ambiguities [PINYIN_AmbNeLe] && + ((lhs == PINYIN_Ne && rhs == PINYIN_Le) || + (lhs == PINYIN_Le && rhs == PINYIN_Ne))) || + + (custom.use_ambiguities [PINYIN_AmbFoHe] && + ((lhs == PINYIN_Fo && rhs == PINYIN_He) || + (lhs == PINYIN_He && rhs == PINYIN_Fo))) + ) + return 0; + else if (lhs < rhs) return -1; + return 1; +} + +int pinyin_compare_final (const PinyinCustomSettings &custom, + PinyinFinal lhs, + PinyinFinal rhs) +{ + if(((lhs == rhs) || + (custom.use_ambiguities [PINYIN_AmbAnAng] && + ((lhs == PINYIN_An && rhs == PINYIN_Ang) || + (lhs == PINYIN_Ang && rhs == PINYIN_An))) || + + (custom.use_ambiguities [PINYIN_AmbEnEng] && + ((lhs == PINYIN_En && rhs == PINYIN_Eng) || + (lhs == PINYIN_Eng && rhs == PINYIN_En))) || + + (custom.use_ambiguities [PINYIN_AmbInIng] && + ((lhs == PINYIN_In && rhs == PINYIN_Ing) || + (lhs == PINYIN_Ing && rhs == PINYIN_In))))) + return 0; + else if (custom.use_incomplete && (lhs == PINYIN_ZeroFinal || rhs == PINYIN_ZeroFinal)) + return 0; + else if (lhs < rhs) return -1; + return 1; +} + 
+int pinyin_compare_tone (const PinyinCustomSettings &custom, + PinyinTone lhs, + PinyinTone rhs) +{ + if(lhs == rhs || !lhs || !rhs) + return 0; + else if (lhs < rhs) return -1; + return 1; +} + +}; diff --git a/src/storage/pinyin_base.h b/src/storage/pinyin_base.h new file mode 100644 index 0000000..374cc53 --- /dev/null +++ b/src/storage/pinyin_base.h @@ -0,0 +1,728 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2002,2003,2006 James Su + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/** @file pinyin_base.h + * @brief the definitions of pinyin related classes and structs. 
+ */ + +#ifndef PINYIN_BASE_H +#define PINYIN_BASE_H + +#include <glib.h> + +namespace novel{ + +// Predefinition of some classes and structs +struct PinyinKey; + +class PinyinValidator; +class PinyinParser; + +struct PinyinKeyPos{ + int m_pos; + size_t m_len; + PinyinKeyPos(){ + m_pos = 0; + m_len = 0; + } + void set_pos(int pos){ + m_pos = pos; + } + void set_length(size_t len){ + m_len = len; + } + int get_pos(){ + return m_pos; + } + int get_end_pos(){ + return m_pos + m_len; + } + size_t get_length(){ + return m_len; + } +}; + +typedef GArray* PinyinKeyVector; /* Array of PinyinKey */ +typedef GArray* PinyinKeyPosVector; /* Array of PinyinKeyPos */ + + +struct PinyinCustomSettings; + +/** + * @brief enums of pinyin initial element. + * + * A pinyin key can be divided into three tokens: + * Initial -- such as B P M F D T N L etc. + * Final -- such as A O E I U V etc. + * Tone -- can be 1, 2, 3, 4 and 5. + */ +enum PinyinInitial +{ + PINYIN_ZeroInitial = 0, /**< zero initial. indicates invaild initial */ + PINYIN_Bo = 1, + PINYIN_Ci = 2, + PINYIN_Chi = 3, + PINYIN_De = 4, + PINYIN_Fo = 5, + PINYIN_He = 6, + PINYIN_Ge = 7, + PINYIN_Ji = 8, + PINYIN_Ke = 9, + PINYIN_Mo =10, + PINYIN_Ne =11, + PINYIN_Le =12, + PINYIN_Ri =13, + PINYIN_Po =14, + PINYIN_Qi =15, + PINYIN_Si =16, + PINYIN_Shi =17, + PINYIN_Te =18, + PINYIN_Wu =19, + PINYIN_Xi =20, + PINYIN_Yi =21, + PINYIN_Zi =22, + PINYIN_Zhi =23, + PINYIN_LastInitial = PINYIN_Zhi, /**< the last initial */ + PINYIN_Number_Of_Initials = PINYIN_LastInitial + 1 +}; + +/** + * @brief enums of pinyin final element. + */ +enum PinyinFinal +{ + PINYIN_ZeroFinal = 0, /**< zero final. 
indicates invalid final */ + PINYIN_A = 1, + PINYIN_Ai = 2, + PINYIN_An = 3, + PINYIN_Ang = 4, + PINYIN_Ao = 5, + PINYIN_E = 6, + PINYIN_Ea = 7, + PINYIN_Ei = 8, + PINYIN_En = 9, + PINYIN_Eng =10, + PINYIN_Er =11, + PINYIN_I =12, + PINYIN_Ia =13, + PINYIN_Ian =14, + PINYIN_Iang =15, + PINYIN_Iao =16, + PINYIN_Ie =17, + PINYIN_In =18, + PINYIN_Ing =19, + PINYIN_Iong =20, + PINYIN_Iu =21, + PINYIN_Ng =22, + PINYIN_O =23, + PINYIN_Ong =24, + PINYIN_Ou =25, + PINYIN_U =26, + PINYIN_Ua =27, + PINYIN_Uai =28, + PINYIN_Uan =29, + PINYIN_Uang =30, + PINYIN_Ue =31, + PINYIN_Ueng =32, + PINYIN_Ui =33, + PINYIN_Un =34, + PINYIN_Uo =35, + PINYIN_V =36, + PINYIN_Van =37, + PINYIN_Ve =38, + PINYIN_Vn =39, + PINYIN_LastFinal = PINYIN_Vn, /**< the last final */ + PINYIN_Number_Of_Finals = PINYIN_LastFinal + 1 +}; + +/** + * @brief enums of pinyin tone element. + */ +enum PinyinTone +{ + PINYIN_ZeroTone = 0, /**< zero tone. this will be matched with all other tones. */ + PINYIN_First = 1, + PINYIN_Second = 2, + PINYIN_Third = 3, + PINYIN_Fourth = 4, + PINYIN_Fifth = 5, + PINYIN_LastTone = PINYIN_Fifth, /**< the last tone */ + PINYIN_Number_Of_Tones = PINYIN_LastTone + 1 +}; + +/** + * @brief enums of Shuang Pin Schemes. + */ +enum PinyinShuangPinScheme +{ + SHUANG_PIN_STONE = 0, + SHUANG_PIN_ZRM = 1, + SHUANG_PIN_MS = 2, + SHUANG_PIN_ZIGUANG = 3, + SHUANG_PIN_ABC = 4, + SHUANG_PIN_LIUSHI = 5, + SHUANG_PIN_CUSTOMIZED = 6, + SHUANG_PIN_DEFAULT = SHUANG_PIN_ZRM +}; + +/** + * @brief enums of ZhuYin Schemes. + */ +enum PinyinZhuYinScheme +{ + ZHUYIN_ZHUYIN = 0, + ZHUYIN_STANDARD = 1, + ZHUYIN_HSU = 2, + ZHUYIN_IBM = 3, + ZHUYIN_GIN_YIEH = 4, + ZHUYIN_ET = 5, + ZHUYIN_ET26 = 6, + ZHUYIN_DEFAULT = ZHUYIN_STANDARD +}; + +/** + * @brief enums of pinyin ambiguities. + * + * Some pinyin element maybe confused by somebody, + * We allow these ambiguities. 
+ */ +enum PinyinAmbiguity +{ + PINYIN_AmbAny= 0, + PINYIN_AmbZhiZi, + PINYIN_AmbChiCi, + PINYIN_AmbShiSi, + PINYIN_AmbNeLe, + PINYIN_AmbLeRi, + PINYIN_AmbFoHe, + PINYIN_AmbAnAng, + PINYIN_AmbEnEng, + PINYIN_AmbInIng, + PINYIN_AmbLast = PINYIN_AmbInIng +}; + +/** + * @brief Structure to hold pinyin custom settings. + * + * user can custom the behavor of libpinyin by these settings. + */ +struct PinyinCustomSettings +{ + bool use_incomplete; + /**< allow incomplete pinyin key which only has inital. */ + + bool use_ambiguities [PINYIN_AmbLast + 1]; + /**< allow ambiguous pinyin elements or not. */ + + PinyinCustomSettings (); + + void set_use_incomplete (bool use) { use_incomplete = use; } + void set_use_ambiguities (PinyinAmbiguity amb, bool use) + { + if (amb == PINYIN_AmbAny) + for (size_t i=0; i<=PINYIN_AmbLast; ++i) use_ambiguities [i] = use; + else { + use_ambiguities [0] = false; + use_ambiguities [static_cast<size_t>(amb)] = use; + for (size_t i=1; i<=PINYIN_AmbLast; ++i) + if (use_ambiguities [i]) { + use_ambiguities [0] = true; + break; + } + } + } + + bool operator == (const PinyinCustomSettings &rhs) const + { + if (use_incomplete != rhs.use_incomplete) + return false; + + for (size_t i=0; i <= PINYIN_AmbLast; ++i) + if (use_ambiguities [i] != rhs.use_ambiguities [i]) + return false; + + return true; + } + + bool operator != (const PinyinCustomSettings &rhs) const + { + return !(*this == rhs); + } + + guint32 to_value () const + { + guint32 val = 0; + + if (use_incomplete) val |= 1; + + for (size_t i=0; i <= PINYIN_AmbLast; ++i) + if (use_ambiguities [i]) + val |= (1 << (i+1)); + + return val; + } + + void from_value (guint32 val) + { + use_incomplete = ((val & 1) != 0); + + for (size_t i=0; i <= PINYIN_AmbLast; ++i) + use_ambiguities [i] = ((val & (1 << (i+1))) != 0); + } +}; + +/** + * @brief Pinyin key class. 
+ * + * A pinyin key is a composed element of an initial, a final and a tone, + * which represents one or several Chinese ideographs + * + * The position and length information for the portion of string, from which + * the PinyinKey is parsed, are also stored in this structure. + */ +struct PinyinKey +{ + friend class PinyinBitmapIndexLevel; + friend inline int pinyin_exact_compare(const PinyinKey key_lhs[], + const PinyinKey key_rhs[], + int word_length); + friend inline int pinyin_compare_with_ambiguities + (const PinyinCustomSettings &custom, + const PinyinKey* key_lhs, + const PinyinKey* key_rhs, + int word_length); + friend inline void compute_lower_value(const PinyinCustomSettings &custom, + PinyinKey in_keys[], + PinyinKey out_keys[], + int word_length); + friend inline void compute_upper_value(const PinyinCustomSettings &custom, + PinyinKey in_keys[], + PinyinKey out_keys[], + int word_length); + +private: + guint16 m_initial : 5; /**< pinyin initial */ + guint16 m_final : 6; /**< pinyin final */ + guint16 m_tone : 3; /**< pinyin tone */ +public: + /** + * @brief Minimal numerical value of a PinyinKey + * @sa get_value(); + */ + static const guint16 min_value; + + /** + * @brief Maximal numerical value of a PinyinKey + * @sa get_value(); + */ + static const guint16 max_value; + +public: + /** + * Constructor. + * + * The default constructor of class PinyinKey. + */ + PinyinKey (PinyinInitial initial = PINYIN_ZeroInitial, + PinyinFinal final = PINYIN_ZeroFinal, + PinyinTone tone = PINYIN_ZeroTone) + : m_initial (initial), m_final (final), m_tone (tone) + { + } + + /** + * Constructor. + * + * Construct a PinyinKey object from a key string, with + * specified validator. + * + * @sa PinyinValidator + */ + PinyinKey (const PinyinValidator &validator, const char *str, int len = -1) + { + set (validator, str, len); + } + + PinyinKey (guint16 value) + { + set (value); + } + /** + * Clear the PinyinKey object. 
+ */ + + void clear () + { + m_initial = PINYIN_ZeroInitial; + m_final = PINYIN_ZeroFinal; + m_tone = PINYIN_ZeroTone; + } + + /** + * Read PinyinKey value from a key string. + * + * @param validator a PinyinValidator object to validate the key. + * @param key a Latin string including one or more pinyin keys. + * @return the number of characters used by this pinyin key. + */ + int set (const PinyinValidator &validator, const char *str, int len = -1); + + /** + * Set PinyinKey's value to initial, final and tone. + */ + void set (PinyinInitial initial = PINYIN_ZeroInitial, + PinyinFinal final = PINYIN_ZeroFinal, + PinyinTone tone = PINYIN_ZeroTone) + { + m_initial = initial; + m_final = final; + m_tone = tone; + } + + /** + * @brief Set this PinyinKey from its numerical value. + */ + void set (guint16 value) + { + m_tone = value % PINYIN_Number_Of_Tones; + value /= PINYIN_Number_Of_Tones; + m_final = value % PINYIN_Number_Of_Finals; + m_initial = value / PINYIN_Number_Of_Finals; + } + + /** + * @brief Get numerical value of this PinyinKey + */ + guint16 get_value () const + { + return (m_initial * PINYIN_Number_Of_Finals + m_final) * PINYIN_Number_Of_Tones + m_tone; + } + + /** + * Set PinyinKey's initial value to initial. + */ + void set_initial (PinyinInitial initial = PINYIN_ZeroInitial) + { + m_initial = initial; + } + + /** + * Set PinyinKey's final value to final. + */ + void set_final (PinyinFinal final = PINYIN_ZeroFinal) + { + m_final = final; + } + + /** + * Set PinyinKey's tone value to tone. + */ + void set_tone (PinyinTone tone = PINYIN_ZeroTone) + { + m_tone = tone; + } + + /** + * Get initial value of this key. + */ + PinyinInitial get_initial () const + { + return static_cast<PinyinInitial>(m_initial); + } + + /** + * Get final value of this key. + */ + PinyinFinal get_final () const + { + return static_cast<PinyinFinal>(m_final); + } + + /** + * Get tone value of this key. 
+ */ + PinyinTone get_tone () const + { + return static_cast<PinyinTone>(m_tone); + } + + /** + * Get Latin name of this key's initial. + */ + const char* get_initial_string () const; + + /** + * Get Chinese ZhuYin name of this key's initial, in UTF-8 encoding. + */ + const char* get_initial_zhuyin_string () const; + + /** + * Get Latin name of this key's final. + */ + const char* get_final_string () const; + + /** + * Get Chinese ZhuYin name of this key's final, in UTF-8 encoding. + */ + const char* get_final_zhuyin_string () const; + + /** + * Get Latin name of this key's tone. + */ + const char* get_tone_string () const; + + /** + * Get Chinese ZhuYin name of this key's tone, in UTF-8 encoding. + */ + const char* get_tone_zhuyin_string () const; + + /** + * Get Latin name of this key. + */ + const char * get_key_string () const; + + /** + * Get Chinese ZhuYin name of this key, in UTF-8 encoding. + */ + const char * get_key_zhuyin_string () const; + + /** + * Check if this key is empty. + */ + bool is_empty () const + { + return m_initial == PINYIN_ZeroInitial && m_final == PINYIN_ZeroFinal && m_tone == PINYIN_ZeroTone; + } + + /** + * Check if this key has both initial, final and tone. 
+ */ + bool is_complete () const + { + return m_initial != PINYIN_ZeroInitial && m_final != PINYIN_ZeroFinal && m_tone != PINYIN_ZeroTone; + } + + bool operator == (PinyinKey rhs) const + { + return m_initial == rhs.m_initial && m_final == rhs.m_final && m_tone == rhs.m_tone; + } + + bool operator != (PinyinKey rhs) const + { + return m_initial != rhs.m_initial || m_final != rhs.m_final || m_tone != rhs.m_tone; + } + + bool operator < (PinyinKey rhs) const + { + if (m_initial < rhs.m_initial) return true; + if (m_initial > rhs.m_initial) return false; + if (m_final < rhs.m_final) return true; + if (m_final > rhs.m_final) return false; + return m_tone < rhs.m_tone; + } + + bool operator > (PinyinKey rhs) const + { + if (m_initial > rhs.m_initial) return true; + if (m_initial < rhs.m_initial) return false; + if (m_final > rhs.m_final) return true; + if (m_final < rhs.m_final) return false; + return m_tone > rhs.m_tone; + } +}; + +/** + * NULL Validator of PinyinKey object. + * + * This class is for validating a PinyinKey object. + */ +class PinyinValidator +{ +public: + /** + * Overloaded operator () function to validate a pinyin key. + * + * @param key The key to be validated. + * @return true if the key is valid. + */ + virtual bool operator () (PinyinKey key) const = 0; +}; + +class PinyinLargeTable; +/** + * Validator of PinyinKey object. + * + * This class is for validating a PinyinKey object. + */ +class BitmapPinyinValidator:public PinyinValidator +{ + char m_bitmap [(PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 7) / 8]; + +public: + BitmapPinyinValidator (const PinyinLargeTable *table = 0); + + /** + * initialize the validator with specified custom settings + * and PinyinLargeTable. + */ + void initialize (const PinyinLargeTable *table = 0); + + /** + * Overloaded operator () function to validate a pinyin key. + * + * @param key The key to be validated. + * @return true if the key is valid. 
+ */ + virtual bool operator () (PinyinKey key) const; +}; + +/** + * NULL Validator of PinyinKey object. + * + * This class is for validating a PinyinKey object. + */ +class NullPinyinValidator:public PinyinValidator +{ +public: + /** + * Overloaded operator () function to validate a pinyin key. + * + * @param key The key to be validated. + * @return true if the key is valid. + */ + virtual bool operator () (PinyinKey key) const{ + return true; + } +}; + +/** + * @brief Class to translate string into PinyinKey. + */ +class PinyinParser +{ +public: + virtual ~PinyinParser (); + + /** + * @brief Translate only one PinyinKey from a string. + * + * @param validator PinyinValidator object to valid result. + * @param key Stores result PinyinKey. + * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string, + * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme, + * it's an UTF-8 string which contains ZhuYin chars. + * @param len The length of str, in number of chars rather than bytes. + * + * @return the number of chars were actually used. + */ + virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const = 0; + + /** + * @brief Handy wrapper function of parse_one_key(), which accept a String object instead of char *. + */ + int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char * &str) const + { + return parse_one_key (validator, key, str, g_utf8_strlen (str, -1)); + } + + /** + * @brief Translate the source string into a set of PinyinKeys. + * + * @param validator PinyinValidator object to valid result. + * @param keys Stores result PinyinKeys. + * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string, + * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme, + * it's an UTF-8 string which contains ZhuYin chars. + * @param len The length of str, in number of chars rather than bytes. 
+ * + * @return the number of chars were actually used. + */ + virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys,PinyinKeyPosVector & poses, const char *str, int len = -1) const = 0; + +public: + static void normalize (PinyinKey &key); +}; + +/** + * The default Pinyin Parser which parses full pinyin string into PinyinKeys. + */ +class PinyinDefaultParser : public PinyinParser +{ +public: + virtual ~PinyinDefaultParser (); + + virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const; + virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const; + +public: + using PinyinParser::parse_one_key; + using PinyinParser::parse; +}; + +/* The valid input chars of ShuangPin is a-z and ';' + */ +class PinyinShuangPinParser : public PinyinParser +{ + PinyinInitial m_initial_map [27]; + PinyinFinal m_final_map [27][2]; + +public: + /** + * Constructor + * + * @param scheme the predefined ShuangPin scheme to be used. 
+ */ + PinyinShuangPinParser (PinyinShuangPinScheme scheme = SHUANG_PIN_DEFAULT); + PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]); + + virtual ~PinyinShuangPinParser (); + + virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const; + virtual int parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const; + +public: + void set_scheme (PinyinShuangPinScheme scheme); + void set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]); + + void get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2]); + +public: + using PinyinParser::parse_one_key; + using PinyinParser::parse; +}; + +int pinyin_compare_initial (const PinyinCustomSettings &custom, + PinyinInitial lhs, + PinyinInitial rhs); + +int pinyin_compare_final (const PinyinCustomSettings &custom, + PinyinFinal lhs, + PinyinFinal rhs); + +int pinyin_compare_tone (const PinyinCustomSettings &custom, + PinyinTone lhs, + PinyinTone rhs); +}; + +using namespace novel; + +#endif diff --git a/src/storage/pinyin_large_table.cpp b/src/storage/pinyin_large_table.cpp new file mode 100644 index 0000000..794cca5 --- /dev/null +++ b/src/storage/pinyin_large_table.cpp @@ -0,0 +1,690 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <assert.h> +#include <string.h> +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "pinyin_large_table.h" + + +PinyinBitmapIndexLevel::PinyinBitmapIndexLevel(PinyinCustomSettings * custom) + :m_custom(custom){ + memset(m_pinyin_length_indexes, 0 , sizeof(m_pinyin_length_indexes)); +} + +void PinyinBitmapIndexLevel::reset(){ + for ( int k = PINYIN_ZeroInitial; k < PINYIN_Number_Of_Initials; k++) + for ( int m = PINYIN_ZeroFinal; m < PINYIN_Number_Of_Finals; m++) + for ( int n = PINYIN_ZeroTone; n < PINYIN_Number_Of_Tones; n++){ + PinyinLengthIndexLevel * length_array = + m_pinyin_length_indexes[k][m][n]; + if ( length_array ) + delete length_array; + } +} + +int PinyinBitmapIndexLevel::search( int phrase_length, /* in */ PinyinKey keys[], + /* out */ PhraseIndexRanges ranges) const{ + return initial_level_search(phrase_length, keys, ranges); +} + +int PinyinBitmapIndexLevel::initial_level_search(int phrase_length, + /* in */PinyinKey keys[], + /* out */ PhraseIndexRanges ranges) const{ + +#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \ + { \ + result |= final_level_search((PinyinInitial)first_key.m_initial,\ + phrase_length, keys, ranges); \ + if ( custom.use_ambiguities [AMBIGUITY] ){ \ + result |= final_level_search(ANOTHER, \ + phrase_length, keys, ranges); \ + } \ + return result; \ + } + + //deal with the ambiguities + + int result = 0; + PinyinKey& first_key = keys[0]; + PinyinCustomSettings & custom= *m_custom; + + switch(first_key.m_initial){ + + MATCH(PINYIN_AmbZhiZi, PINYIN_Zi, PINYIN_Zhi); + MATCH(PINYIN_AmbZhiZi, PINYIN_Zhi, PINYIN_Zi); + MATCH(PINYIN_AmbChiCi, PINYIN_Ci, PINYIN_Chi); + MATCH(PINYIN_AmbChiCi, PINYIN_Chi, 
PINYIN_Ci); + MATCH(PINYIN_AmbShiSi, PINYIN_Si, PINYIN_Shi); + MATCH(PINYIN_AmbShiSi, PINYIN_Shi, PINYIN_Si); + MATCH(PINYIN_AmbLeRi, PINYIN_Ri, PINYIN_Le); + MATCH(PINYIN_AmbNeLe, PINYIN_Ne, PINYIN_Le); + MATCH(PINYIN_AmbFoHe, PINYIN_Fo, PINYIN_He); + MATCH(PINYIN_AmbFoHe, PINYIN_He, PINYIN_Fo); + + case PINYIN_Le: + { + result |= final_level_search((PinyinInitial)first_key.m_initial, + phrase_length, keys, ranges); + if ( custom.use_ambiguities [PINYIN_AmbLeRi] ) + result |= final_level_search(PINYIN_Ri, phrase_length, + keys, ranges); + if ( custom.use_ambiguities [PINYIN_AmbNeLe] ) + result |= final_level_search(PINYIN_Ne, phrase_length, + keys, ranges); + return result; + } + default: + { + return final_level_search((PinyinInitial)first_key.m_initial, + phrase_length, + keys, ranges); + } + } +#undef MATCH +} + +int PinyinBitmapIndexLevel::final_level_search(PinyinInitial initial, + int phrase_length, + /* in */PinyinKey keys[], + /* out */ PhraseIndexRanges ranges) const{ +#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \ + { \ + result = tone_level_search(initial,(PinyinFinal) first_key.m_final,\ + phrase_length, keys, ranges); \ + if ( custom.use_ambiguities [AMBIGUITY] ){ \ + result |= tone_level_search(initial, ANOTHER, \ + phrase_length, keys, ranges); \ + } \ + return result; \ + } + + int result = 0; + PinyinKey& first_key = keys[0]; + PinyinCustomSettings & custom= *m_custom; + + switch(first_key.m_final){ + case PINYIN_ZeroFinal: + { + if (!custom.use_incomplete ) + return result; + for ( int i = PINYIN_A; i < PINYIN_Number_Of_Finals; ++i){ + result |= tone_level_search(initial,(PinyinFinal)i , + phrase_length, keys, ranges); + } + return result; + } + + MATCH(PINYIN_AmbAnAng, PINYIN_An, PINYIN_Ang); + MATCH(PINYIN_AmbAnAng, PINYIN_Ang, PINYIN_An); + MATCH(PINYIN_AmbEnEng, PINYIN_En, PINYIN_Eng); + MATCH(PINYIN_AmbEnEng, PINYIN_Eng, PINYIN_En); + MATCH(PINYIN_AmbInIng, PINYIN_In, PINYIN_Ing); + MATCH(PINYIN_AmbInIng, PINYIN_Ing, PINYIN_In); + 
+ default: + { + return tone_level_search(initial,(PinyinFinal)first_key.m_final, + phrase_length, keys, ranges); + } + } +#undef MATCH +} + +int PinyinBitmapIndexLevel::tone_level_search(PinyinInitial initial, + PinyinFinal final, + int phrase_length, + /* in */PinyinKey keys[], + /* out */ PhraseIndexRanges ranges) const{ + int result = 0; + PinyinKey& first_key = keys[0]; + PinyinCustomSettings & custom= *m_custom; + + switch ( first_key.m_tone ){ + case PINYIN_ZeroTone: + { + //deal with ZeroTone in pinyin table files. + for ( int i = PINYIN_ZeroTone; i < PINYIN_Number_Of_Tones; ++i){ + PinyinLengthIndexLevel * phrases = + m_pinyin_length_indexes[initial][final][(PinyinTone)i]; + if ( phrases ) + result |= phrases->search(phrase_length - 1, &custom, + keys + 1, ranges); + } + return result; + } + default: + { + PinyinLengthIndexLevel * phrases = + m_pinyin_length_indexes[initial][final] + [PINYIN_ZeroTone]; + if ( phrases ) + result = phrases->search(phrase_length - 1, &custom, + keys + 1, ranges); + phrases = m_pinyin_length_indexes[initial][final] + [(PinyinTone) first_key.m_tone]; + if ( phrases ) + result |= phrases->search(phrase_length - 1, &custom, + keys + 1, ranges); + return result; + } + } + return result; +} + +PinyinLengthIndexLevel::PinyinLengthIndexLevel(){ + m_pinyin_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *)); +} + +PinyinLengthIndexLevel::~PinyinLengthIndexLevel(){ +#define CASE(x) case x: \ + { \ + PinyinArrayIndexLevel<x> * array = g_array_index \ + (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x); \ + if (array) \ + delete array; \ + break; \ + } + for ( int i = 0 ; i < m_pinyin_array_indexes->len; ++i){ + switch (i){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } + } + g_array_free(m_pinyin_array_indexes, TRUE); +#undef CASE +} + +int 
PinyinLengthIndexLevel::search( int phrase_length, + /* in */ PinyinCustomSettings * custom, + /* in */ PinyinKey keys[], + /* out */ PhraseIndexRanges ranges){ + int result = SEARCH_NONE; + if(m_pinyin_array_indexes->len < phrase_length + 1) + return result; + if (m_pinyin_array_indexes->len > phrase_length + 1) + result |= SEARCH_CONTINUED; + +#define CASE(len) case len: \ + { \ + PinyinArrayIndexLevel<len> * array = g_array_index \ + (m_pinyin_array_indexes, PinyinArrayIndexLevel<len> *, len); \ + if ( !array ) \ + return result; \ + result |= array->search(custom, keys, ranges); \ + return result; \ + } + + switch ( phrase_length ){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } +#undef CASE +} + +template<size_t phrase_length> +int PinyinArrayIndexLevel<phrase_length>::search(/* in */ PinyinCustomSettings * custom, /* in */ PinyinKey keys[], /* out */ PhraseIndexRanges ranges){ + PhraseExactLessThan<phrase_length> m_lessthan; + PinyinIndexItem<phrase_length> * chunk_begin, * chunk_end; + chunk_begin = (PinyinIndexItem<phrase_length> *)m_chunk.begin(); + chunk_end = (PinyinIndexItem<phrase_length> *)m_chunk.end(); + //do the search + PinyinKey left_keys[phrase_length], right_keys[phrase_length]; + compute_lower_value(*custom, keys, left_keys, phrase_length); + compute_upper_value(*custom, keys, right_keys, phrase_length); + PinyinIndexItem<phrase_length> left(left_keys, -1), right(right_keys, -1); + + PinyinIndexItem<phrase_length> * begin = std_lite::lower_bound(chunk_begin, chunk_end, left, m_lessthan); + PinyinIndexItem<phrase_length> * end = std_lite::upper_bound(chunk_begin, chunk_end, right, m_lessthan); + + return convert(custom, keys, begin, end, ranges); +} + +template<size_t phrase_length> +int PinyinArrayIndexLevel<phrase_length>::convert(PinyinCustomSettings * custom, PinyinKey keys[], 
PinyinIndexItem<phrase_length> * begin, PinyinIndexItem<phrase_length> * end, PhraseIndexRanges ranges){ + PinyinIndexItem<phrase_length> * iter; + PhraseIndexRange cursor; + GArray * head, *cursor_head = NULL; + int result = SEARCH_NONE; + cursor.m_range_begin = -1; cursor.m_range_end = -1; + for ( iter = begin; iter != end; ++iter){ + if ( ! 0 == + pinyin_compare_with_ambiguities + (*custom, keys, iter->m_keys, phrase_length)) + continue; + phrase_token_t token = iter->m_token; + head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)]; + if ( NULL == head ) + continue; + + result |= SEARCH_OK; + + if ( cursor.m_range_begin == -1 ){ + cursor.m_range_begin = token; + cursor.m_range_end = token + 1; + cursor_head = head; + }else if (cursor.m_range_end == token && + PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_end) == + PHRASE_INDEX_LIBRARY_INDEX(token) ){ + cursor.m_range_end++; + }else { + g_array_append_val(cursor_head, cursor); + cursor.m_range_begin = token; cursor.m_range_end = token + 1; + cursor_head = head; + } + } + if ( cursor.m_range_begin == -1 ) + return result; + + g_array_append_val(cursor_head, cursor); + return result; +} + +int PinyinBitmapIndexLevel::add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){ + PinyinKey firstkey = keys[0]; + PinyinLengthIndexLevel * &length_array = + m_pinyin_length_indexes[firstkey.m_initial][firstkey.m_final][firstkey.m_tone]; + if ( ! 
length_array ){ + length_array = new PinyinLengthIndexLevel(); + } + return length_array->add_index(phrase_length - 1, keys + 1, token); +} + +int PinyinBitmapIndexLevel::remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){ + PinyinKey firstkey = keys[0]; + PinyinLengthIndexLevel * &length_array = + m_pinyin_length_indexes[firstkey.m_initial][firstkey.m_final][firstkey.m_tone]; + if ( length_array ) + return length_array->add_index(phrase_length - 1, keys + 1, token); + return REMOVE_ITEM_DONOT_EXISTS; +} + +int PinyinLengthIndexLevel::add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){ + assert(phrase_length + 1 < MAX_PHRASE_LENGTH); + if ( m_pinyin_array_indexes -> len <= phrase_length ) + g_array_set_size(m_pinyin_array_indexes, phrase_length + 1); +#define CASE(x) case x: \ + { \ + PinyinArrayIndexLevel<x> * &array = g_array_index \ + (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x); \ + if ( !array ) \ + array = new PinyinArrayIndexLevel<x>; \ + return array->add_index(keys, token); \ + } + switch(phrase_length){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } +#undef CASE +} + +int PinyinLengthIndexLevel::remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){ + assert(phrase_length + 1 < MAX_PHRASE_LENGTH); + if ( m_pinyin_array_indexes -> len <= phrase_length ) + return false; +#define CASE(x) case x: \ + { \ + PinyinArrayIndexLevel<x> * &array = g_array_index \ + (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x); \ + if ( !array ) \ + return false; \ + return array->remove_index(keys, token); \ + } + switch(phrase_length){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + 
CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } +#undef CASE +} + +template<size_t phrase_length> +int PinyinArrayIndexLevel<phrase_length>::add_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){ + PhraseExactLessThan<phrase_length> m_lessthan; + PinyinIndexItem<phrase_length> * buf_begin, * buf_end; + + PinyinIndexItem<phrase_length> new_elem(keys, token); + buf_begin = (PinyinIndexItem<phrase_length> *) m_chunk.begin(); + buf_end = (PinyinIndexItem<phrase_length> *) m_chunk.end(); + + std_lite::pair<PinyinIndexItem<phrase_length> *, PinyinIndexItem<phrase_length> *> range; + range = std_lite::equal_range(buf_begin, buf_end, new_elem, m_lessthan); + + PinyinIndexItem<phrase_length> * cur_elem; + for ( cur_elem = range.first; + cur_elem != range.second; ++cur_elem){ + if ( cur_elem->m_token == token ) + return INSERT_ITEM_EXISTS; + if ( cur_elem->m_token > token ) + break; + } + + int offset = (cur_elem - buf_begin) * + sizeof(PinyinIndexItem<phrase_length>); + m_chunk.insert_content(offset, &new_elem, + sizeof ( PinyinIndexItem<phrase_length> )); + return INSERT_OK; +} + +template<size_t phrase_length> +int PinyinArrayIndexLevel<phrase_length>::remove_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){ + PhraseExactLessThan<phrase_length> m_lessthan; + PinyinIndexItem<phrase_length> * buf_begin, * buf_end; + + PinyinIndexItem<phrase_length> new_elem(keys, token); + buf_begin = (PinyinIndexItem<phrase_length> *) m_chunk.begin(); + buf_end = (PinyinIndexItem<phrase_length> *) m_chunk.end(); + + std_lite::pair<PinyinIndexItem<phrase_length> *, PinyinIndexItem<phrase_length> *> range; + range = std_lite::equal_range(buf_begin, buf_end, new_elem, m_lessthan); + + PinyinIndexItem<phrase_length> * cur_elem; + for ( cur_elem = range.first; + cur_elem != range.second; ++cur_elem){ + if ( cur_elem->m_token == token ) + break; + } + if (cur_elem->m_token != token ) + return REMOVE_ITEM_DONOT_EXISTS; + + int offset = (cur_elem - 
buf_begin) * + sizeof(PinyinIndexItem<phrase_length>); + m_chunk.remove_content(offset, sizeof (PinyinIndexItem<phrase_length>)); + return REMOVE_OK; +} + +bool PinyinLargeTable::load_text(FILE * infile){ + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + while ( !feof(infile)){ + fscanf(infile, "%s", pinyin); + fscanf(infile, "%s", phrase); + fscanf(infile, "%ld", &token); + fscanf(infile, "%ld", &freq); + + PinyinDefaultParser parser; + NullPinyinValidator validator; + PinyinKeyVector keys; + PinyinKeyPosVector poses; + + keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); + poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); + parser.parse(validator, keys, poses, pinyin); + + add_index( keys->len, (PinyinKey *)keys->data, token); + + g_array_free(keys, true); + g_array_free(poses, true); + } + return true; +} + +bool PinyinBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, + table_offset_t end){ + reset(); + char * buf_begin = (char *) chunk->begin(); + table_offset_t phrase_begin, phrase_end; + table_offset_t * index = (table_offset_t *) (buf_begin + offset); + phrase_end = *index; + for ( int m = 0; m < PINYIN_Number_Of_Initials; ++m ) + for ( int n = 0; n < PINYIN_Number_Of_Finals; ++n) + for ( int k = 0; k < PINYIN_Number_Of_Tones; ++k){ + phrase_begin = phrase_end; + index++; + phrase_end = *index; + if ( phrase_begin == phrase_end ) //null pointer + continue; + PinyinLengthIndexLevel * phrases = new PinyinLengthIndexLevel; + m_pinyin_length_indexes[m][n][k] = phrases; + phrases->load(chunk, phrase_begin, phrase_end - 1); + assert( phrase_end <= end ); + assert( *(buf_begin + phrase_end - 1) == c_separate); + } + offset += (PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 1) * sizeof ( table_offset_t); + assert( c_separate == *(buf_begin + offset)); + return true; +} + +bool PinyinBitmapIndexLevel::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & 
end){ + table_offset_t phrase_end; + table_offset_t index = offset; + offset += (PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 1) * sizeof ( table_offset_t); + //add '#' + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + for ( int m = 0; m < PINYIN_Number_Of_Initials; ++m ) + for ( int n = 0; n < PINYIN_Number_Of_Finals; ++n) + for ( int k = 0; k < PINYIN_Number_Of_Tones; ++k){ + PinyinLengthIndexLevel * phrases = m_pinyin_length_indexes[m][n][k]; + if ( !phrases ){ //null pointer + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + continue; + } + phrases->store(new_chunk, offset, phrase_end); //has a end '#' + offset = phrase_end; + //add '#' + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + } + end = offset; + return true; +} + +bool PinyinLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){ + char * buf_begin = (char *) chunk->begin(); + guint32 nindex = *((guint32 *)(buf_begin + offset)); + table_offset_t * index = (table_offset_t *) + (buf_begin + offset + sizeof(guint32)); + + table_offset_t phrase_begin, phrase_end = *index; + m_pinyin_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *)); + for ( size_t i = 1; i <= nindex; i++){ + phrase_begin = phrase_end; + index++; + phrase_end = *index; + if ( phrase_begin == phrase_end ){ + void * null = NULL; + g_array_append_val(m_pinyin_array_indexes , null); + continue; + } + +#define CASE(x) case x - 1: \ + { \ + PinyinArrayIndexLevel<x> * phrase = new PinyinArrayIndexLevel<x>; \ + phrase->load(chunk, phrase_begin, phrase_end - 1); \ + assert( *(buf_begin + phrase_end - 1) == c_separate); \ + assert( phrase_end <= end ); 
\ + g_array_append_val(m_pinyin_array_indexes, phrase); \ + break; \ + } + switch ( i ){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } +#undef CASE + } + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + assert ( c_separate == * (buf_begin + offset) ); + return true; +} + +bool PinyinLengthIndexLevel::store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end){ + guint32 nindex = m_pinyin_array_indexes->len; + new_chunk->set_content(offset, &nindex, sizeof(guint32)); + table_offset_t index = offset + sizeof(guint32); + + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + table_offset_t phrase_end; + for ( size_t i = 0 ; i < m_pinyin_array_indexes->len; ++i){ +#define CASE(x) case x: { \ + PinyinArrayIndexLevel<x> * phrase = g_array_index \ + (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> * , i); \ + if ( !phrase ){ \ + new_chunk->set_content \ + (index, &offset, sizeof(table_offset_t)); \ + index += sizeof(table_offset_t); \ + continue; \ + } \ + phrase->store(new_chunk, offset, phrase_end); \ + offset = phrase_end; \ + /*add '#'*/ \ + new_chunk->set_content(offset, &c_separate, sizeof(char)); \ + offset += sizeof(char); \ + new_chunk->set_content(index, &offset, sizeof(table_offset_t));\ + index += sizeof(table_offset_t); \ + break; \ + } + switch ( i ){ + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + default: + assert(false); + } +#undef CASE + } + end = offset; + return true; +} + +template<size_t phrase_length> +bool 
PinyinArrayIndexLevel<phrase_length>:: +load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){ + char * buf_begin = (char *) chunk->begin(); + m_chunk.set_chunk(buf_begin + offset, end - offset, NULL); + return true; +} + +template<size_t phrase_length> +bool PinyinArrayIndexLevel<phrase_length>:: +store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end){ + new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size()); + end = offset + m_chunk.size(); + return true; +} diff --git a/src/storage/pinyin_large_table.h b/src/storage/pinyin_large_table.h new file mode 100755 index 0000000..71b3640 --- /dev/null +++ b/src/storage/pinyin_large_table.h @@ -0,0 +1,178 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef PINYIN_LARGE_TABLE_H +#define PINYIN_LARGE_TABLE_H + +#include <stdio.h> +#include "novel_types.h" +#include "memory_chunk.h" + +namespace novel{ + +/* Because this is not large, + * Store this in user home directory. 
+ */
+/* Forward declaration: PinyinBitmapIndexLevel below keeps pointers to this
+ * class before its definition appears.
+ */
+class PinyinLengthIndexLevel;
+/* First level of the pinyin index: a three-dimensional table of
+ * PinyinLengthIndexLevel pointers whose dimensions are the numbers of
+ * initials, finals and tones.  Every member function except the destructor
+ * is only declared here; the definitions are not part of this header.
+ * NOTE(review): the class stores raw pointers and has a hand-written
+ * destructor but declares no copy constructor or assignment operator.  If
+ * reset() frees the table entries, copying an instance would be unsafe --
+ * confirm that it is never copied.
+ */
+class PinyinBitmapIndexLevel{
+    friend class PinyinLargeTable;
+    PinyinCustomSettings * m_custom; // private: declared before any access specifier
+protected:
+    PinyinLengthIndexLevel * m_pinyin_length_indexes[PINYIN_Number_Of_Initials]
+    [PINYIN_Number_Of_Finals]
+    [PINYIN_Number_Of_Tones];
+    /* search helpers: final_level_search additionally receives the initial,
+     * tone_level_search the initial and the final.  The inline annotations
+     * mark keys as input and ranges as output.
+     */
+    int initial_level_search(int word_length, /* in */PinyinKey keys[],
+                             /* out */ PhraseIndexRanges ranges) const;
+    int final_level_search(PinyinInitial initial, int word_length, /* in */PinyinKey keys[], /* out */ PhraseIndexRanges ranges) const;
+    int tone_level_search(PinyinInitial initial, PinyinFinal final, int word_length, /* in */PinyinKey keys[], /* out */ PhraseIndexRanges ranges) const;
+    void reset();
+public:
+    PinyinBitmapIndexLevel(PinyinCustomSettings * custom);
+    ~PinyinBitmapIndexLevel(){
+        reset(); // declared above; its body is not visible in this header
+    }
+    /* load/store pair, declared only.  Both take a MemoryChunk and a start
+     * offset; store() takes end by reference, presumably to report where
+     * it stopped -- TODO confirm against the implementation.
+     */
+    bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+    bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t &end);
+
+    /*bool load_text(FILE * file);*/
+    /*bool save_text(FILE * file);*/
+
+    /* search/add_index/remove_index methods.  PinyinLargeTable::has_key
+     * calls search() with phrase_length 1 and a single key, so
+     * phrase_length presumably counts the entries of keys[] -- TODO confirm.
+     */
+    int search( int phrase_length, /* in */ PinyinKey keys[],
+                /* out */ PhraseIndexRanges ranges) const;
+    int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+    int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+/* Second level of the index; PinyinBitmapIndexLevel's table points at
+ * objects of this class.  Its only data member is a GArray pointer.
+ * NOTE(review): the array presumably holds one per-length index
+ * (PinyinArrayIndexLevel) per phrase length -- confirm in the
+ * implementation.  All member functions are declared only.
+ */
+class PinyinLengthIndexLevel{
+protected:
+    GArray* m_pinyin_array_indexes;
+public:
+    PinyinLengthIndexLevel();
+    ~PinyinLengthIndexLevel();
+    bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+    bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end);
+
+    /* search/add_index/remove_index methods; unlike the bitmap level,
+     * search() here is non-const and receives the custom settings as an
+     * argument. */
+    int search( int phrase_length, /* in */ PinyinCustomSettings * custom,
+                /* in */ PinyinKey keys[],
+                /* out */ PhraseIndexRanges ranges);
+    int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+    int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+/* Index for phrases of one fixed length, given as a template argument.
+ * Holds a MemoryChunk by value; convert() and search() work on
+ * PinyinIndexItem<phrase_length> records.  All members are declared only.
+ * NOTE(review): this template takes size_t while PinyinIndexItem in
+ * pinyin_phrase.h is declared with template<int phrase_length>.
+ */
+template<size_t phrase_length>
+class PinyinArrayIndexLevel{
+protected:
+    MemoryChunk m_chunk;
+    int convert(PinyinCustomSettings * custom,
+                PinyinKey keys[],
+                PinyinIndexItem<phrase_length> * begin,
+                PinyinIndexItem<phrase_length> * end,
+                PhraseIndexRanges ranges);
+public:
+    bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+    bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end);
+
+    /* search/add_index/remove_index methods; no phrase_length parameter,
+     * the length is the template argument. */
+    int search(/* in */ PinyinCustomSettings * custom,
+               /* in */ PinyinKey keys[],
+               /* out */ PhraseIndexRanges ranges);
+    int add_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+    int remove_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+
+
+/* TODO: add file version check
+ *
+ * Wrapper around a PinyinBitmapIndexLevel plus the MemoryChunk passed to
+ * load().  The chunk pointer is deleted by reset(), which runs on every
+ * load() and in the destructor, so the table takes ownership of it.
+ */
+class PinyinLargeTable{
+protected:
+    PinyinBitmapIndexLevel m_bitmap_table;
+    MemoryChunk * m_chunk;
+
+    void reset(){
+        if ( m_chunk ){
+            delete m_chunk; // chunk stored by the last load()
+            m_chunk = NULL;
+        }
+    }
+
+public:
+    PinyinLargeTable(PinyinCustomSettings * custom):
+        m_bitmap_table(custom){
+        m_chunk = NULL;
+    }
+
+    ~PinyinLargeTable(){
+        reset();
+    }
+
+    /* load/save methods.
+     * load() deletes the chunk kept from an earlier load, stores the new
+     * pointer and forwards to m_bitmap_table.load() with the range
+     * [0, chunk->size()].  chunk is dereferenced unconditionally, so it
+     * must not be NULL.
+     * NOTE(review): passing the chunk that is already held would delete
+     * it before it is used -- confirm callers never do that.
+     * store() forwards to m_bitmap_table.store() starting at offset 0;
+     * the end offset it receives back is discarded.
+     */
+    bool load(MemoryChunk * chunk){
+        reset();
+        m_chunk = chunk;
+        return m_bitmap_table.load(chunk, 0 , chunk->size());
+    }
+
+    bool store(MemoryChunk * new_chunk){
+        table_offset_t end;
+        return m_bitmap_table.store(new_chunk, 0, end);
+    }
+
+    bool load_text(FILE * file); // declared only
+/*
+    bool save_text(FILE * file){
+        return m_bitmap_table.save_text(file);
+    }
+*/
+
+    /* search/add_index/remove_index methods: each forwards its arguments
+     * unchanged to m_bitmap_table and returns its result. */
+    int search( int phrase_length, /* in */ PinyinKey keys[],
+                /* out */ PhraseIndexRanges ranges){
+        return m_bitmap_table.search(phrase_length, keys, ranges);
+    }
+
+    int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+        return m_bitmap_table.add_index(phrase_length, keys, token);
+    }
+    int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+        return m_bitmap_table.remove_index(phrase_length, keys, token);
+    }
+    /* Single-key lookup.  Only ranges[1] is allocated; the bitmap table is
+     * searched with phrase_length 1, the temporary array is freed again
+     * and the search result masked with SEARCH_OK is returned as bool.
+     * The matches themselves are discarded.
+     */
+    bool has_key(PinyinKey key) const {
+        PhraseIndexRanges ranges;
+        memset(ranges, 0, sizeof(ranges));
+        ranges[1] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
+        int result = m_bitmap_table.search(1, &key, ranges);
+        g_array_free(ranges[1], TRUE);
+        ranges[1] = NULL;
+        return result & SEARCH_OK;
+    }
+};
+
+}; // closes a scope opened before this chunk (presumably namespace novel)
+
+using namespace novel; // NOTE(review): using-directive at header scope reaches every includer
+#endif
diff --git a/src/storage/pinyin_phrase.h b/src/storage/pinyin_phrase.h
new file mode 100644
index 0000000..07ee0de
--- /dev/null
+++ b/src/storage/pinyin_phrase.h
@@ -0,0 +1,298 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef PINYIN_PHRASE_H
+#define PINYIN_PHRASE_H
+/* NOTE(review): the code below uses PinyinKey, PinyinCustomSettings,
+ * phrase_token_t and the pinyin_compare_* helpers, but only the two
+ * headers below are included.  Unless stl_lite.h provides them, the
+ * includer must declare them first -- confirm.
+ */
+#include <string.h>
+#include "stl_lite.h"
+
+namespace novel{
+/* Sign of value: 1 if positive, -1 if negative, 0 otherwise. */
+static inline int pinyin_utility_sign(int value){
+    if(value > 0)
+        return 1;
+    else if (value < 0)
+        return -1;
+    else return 0;
+}
+/* Exact three-way comparison of two key arrays of phrase_length entries.
+ * The initials of the whole phrase are compared first, then all finals,
+ * then all tones; the first difference decides.  Returns -1, 0 or 1 (the
+ * sign of lhs - rhs for the differing component), 0 when everything
+ * matches or phrase_length is not positive.
+ */
+inline int pinyin_exact_compare(const PinyinKey key_lhs[],
+                                const PinyinKey key_rhs[],
+                                int phrase_length){
+    int i;
+    int result;
+    for ( i = 0 ; i < phrase_length ; i++){
+        result = key_lhs[i].m_initial - key_rhs[i].m_initial;
+        if ( result != 0 )
+            return pinyin_utility_sign(result);
+    }
+    for( i = 0 ; i < phrase_length ; i++){
+        result = key_lhs[i].m_final - key_rhs[i].m_final;
+        if ( result != 0 )
+            return pinyin_utility_sign(result);
+    }
+    for( i = 0 ; i < phrase_length ; i++){
+        result = key_lhs[i].m_tone - key_rhs[i].m_tone;
+        if ( result != 0 )
+            return pinyin_utility_sign(result);
+    }
+    return 0;
+}
+
+/* Same traversal order as pinyin_exact_compare (all initials, then all
+ * finals, then all tones), but each component pair is compared with
+ * pinyin_compare_initial / _final / _tone under the given settings.
+ * The first non-zero helper result is returned as is; it is not passed
+ * through pinyin_utility_sign.
+ */
+inline int pinyin_compare_with_ambiguities(const PinyinCustomSettings &custom,
+                                           const PinyinKey* key_lhs,
+                                           const PinyinKey* key_rhs,
+                                           int phrase_length){
+    int i;
+    int result;
+    for ( i = 0 ; i < phrase_length ; i++){
+        result = pinyin_compare_initial
+            (custom,
+             (PinyinInitial)key_lhs[i].m_initial,
+             (PinyinInitial)key_rhs[i].m_initial);
+        if ( result != 0 )
+            return result;
+    }
+    for( i = 0 ; i < phrase_length ; i++){
+        result = pinyin_compare_final
+            (custom,
+             (PinyinFinal)key_lhs[i].m_final,
+             (PinyinFinal)key_rhs[i].m_final);
+        if ( result != 0 )
+            return result;
+    }
+    for( i = 0 ; i < phrase_length ; i++){
+        result = pinyin_compare_tone
+            (custom,
+             (PinyinTone)key_lhs[i].m_tone,
+             (PinyinTone)key_rhs[i].m_tone);
+        if ( result != 0 )
+            return result;
+    }
+    return 0;
+}
+/* For every key, each component (initial, final, tone) is walked downwards
+ * one value at a time for as long as the matching pinyin_compare_* helper
+ * reports 0 against the original component; the smallest value reached is
+ * stored.  The walk stops at the first value that compares non-zero or
+ * after PINYIN_ZeroInitial / PINYIN_ZeroFinal / PINYIN_ZeroTone.
+ * in_keys is only read; the adjusted keys go to out_keys[0..phrase_length).
+ */
+//compute pinyin lower bound
+//maybe replace by table lookup
+inline void compute_lower_value(const PinyinCustomSettings &custom,
+                                PinyinKey in_keys[],
+                                PinyinKey out_keys[],
+                                int phrase_length){
+    PinyinKey aKey = in_keys[0]; // NOTE(review): reads in_keys[0] even when phrase_length is 0
+
+    for ( int i = 0; i < phrase_length; i++){
+        int k; int sel;
+        aKey = in_keys[i];
+        //deal with initial
+        sel = aKey.m_initial;
+        for( k = aKey.m_initial - 1; k >= PINYIN_ZeroInitial; k--){
+            if ( 0 != pinyin_compare_initial(custom,
+                                             (PinyinInitial)k,
+                                             (PinyinInitial)aKey.m_initial) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_initial = (PinyinInitial)sel; // written only after the walk, so every k was compared with the original value
+        //deal with final
+        sel = aKey.m_final;
+        for( k = aKey.m_final - 1; k >= PINYIN_ZeroFinal; k--){
+            if ( 0 != pinyin_compare_final(custom,
+                                           (PinyinFinal)k,
+                                           (PinyinFinal)aKey.m_final) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_final = (PinyinFinal)sel;
+        //deal with tone
+        sel = aKey.m_tone;
+        for( k = aKey.m_tone - 1; k >= PINYIN_ZeroTone; k--){
+            if ( 0 != pinyin_compare_tone(custom,
+                                          (PinyinTone)k,
+                                          (PinyinTone)aKey.m_tone) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_tone = (PinyinTone)sel;
+        //save the result
+        out_keys[i] = aKey;
+    }
+}
+/* Mirror image of compute_lower_value: each component is walked upwards
+ * while the pinyin_compare_* helper still reports 0 against the original
+ * component, up to PINYIN_LastInitial / PINYIN_LastFinal / PINYIN_LastTone,
+ * and the largest value reached is stored in out_keys.
+ */
+//compute pinyin upper bound
+//maybe replace by table lookup
+inline void compute_upper_value(const PinyinCustomSettings &custom,
+                                PinyinKey in_keys[],
+                                PinyinKey out_keys[],
+                                int phrase_length){
+    PinyinKey aKey = in_keys[0]; // NOTE(review): reads in_keys[0] even when phrase_length is 0
+
+    for ( int i = 0; i < phrase_length; i++){
+        int k; int sel;
+        aKey = in_keys[i];
+        //deal with initial
+        sel = aKey.m_initial;
+        for( k = aKey.m_initial + 1; k <= PINYIN_LastInitial; k++){
+            if ( 0 != pinyin_compare_initial(custom, (PinyinInitial)k, (PinyinInitial)aKey.m_initial) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_initial = (PinyinInitial)sel;
+        //deal with final
+        sel = aKey.m_final;
+        for( k = aKey.m_final + 1; k <= PINYIN_LastFinal; k++){
+            if ( 0 != pinyin_compare_final(custom, (PinyinFinal)k, (PinyinFinal)aKey.m_final) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_final = (PinyinFinal)sel;
+        //deal with tone
+        sel = aKey.m_tone;
+        for( k = aKey.m_tone + 1; k <= PINYIN_LastTone; k++){
+            if ( 0 != pinyin_compare_tone(custom, (PinyinTone)k, (PinyinTone)aKey.m_tone) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_tone = (PinyinTone)sel;
+        //save the result
+        out_keys[i] = aKey;
+    }
+}
+/* One index record: a phrase token followed by the phrase_length keys of
+ * the phrase.  The constructor copies phrase_length keys from keys and
+ * stores token.
+ * NOTE(review): the constructor is named with a template-id
+ * (PinyinIndexItem<phrase_length>); C++20 no longer accepts that spelling
+ * -- confirm the language standard this header is built with.
+ */
+template<int phrase_length>
+struct PinyinIndexItem{
+    phrase_token_t m_token;
+    PinyinKey m_keys[phrase_length];
+public:
+    PinyinIndexItem<phrase_length>(PinyinKey * keys, phrase_token_t token){
+        memmove(m_keys, keys, sizeof(PinyinKey) * phrase_length);
+        m_token = token;
+    }
+};
+
+/*
+//just need less than mode
+//this method mainly used in pinyin lookup
+template<int phrase_length>
+class PhraseCompareWithAmbiguities
+    : public std_lite::binary_function <const PinyinIndexItem <phrase_length>,
+                                        const PinyinIndexItem <phrase_length>, int>
+{
+    const PinyinCustomSettings & m_custom;
+public:
+    PhraseCompareWithAmbiguities<phrase_length>
+    (const PinyinCustomSettings & custom):m_custom(custom){}
+
+    int operator () (const PinyinIndexItem<phrase_length> &lhs,
+                     const PinyinIndexItem<phrase_length> &rhs) const{
+        PinyinKey * key_lhs = (PinyinKey *) lhs.m_keys;
+        PinyinKey * key_rhs = (PinyinKey *) rhs.m_keys;
+        return pinyin_compare_with_ambiguities(m_custom,
+                                               key_lhs, key_rhs, phrase_length);
+    }
+};
+*/
+
+//three-way comparison functor for finding an element in the phrase array;
+//it returns pinyin_exact_compare of the two items' keys and ignores m_token
+template<int phrase_length>
+class PhraseExactCompare
+    : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                        ,const PinyinIndexItem<phrase_length>, int>
+{
+public:
+    int operator () (const PinyinIndexItem<phrase_length> &lhs,
+                     const PinyinIndexItem<phrase_length> &rhs) const{
+        PinyinKey * key_lhs = (PinyinKey *) lhs.m_keys; // cast drops the const of lhs
+        PinyinKey * key_rhs = (PinyinKey *) rhs.m_keys; // cast drops the const of rhs
+
+        return pinyin_exact_compare(key_lhs, key_rhs, phrase_length);
+    }
+};
+
+/*
+//for find the element in the phrase array
+//NOTE(review): "if ( !result ) return result;" below returns 0 when the keys
+//are equal, so the token comparison only runs when the keys already differ;
+//the test looks inverted -- check before re-enabling this class.
+template<int phrase_length>
+class PhraseExactCompareWithToken
+    : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                        ,const PinyinIndexItem<phrase_length>, int>
+{
+public:
+    int operator () (const PinyinIndexItem<phrase_length> &lhs,
+                     const PinyinIndexItem<phrase_length> &rhs) const{
+        PinyinKey * key_lhs = (PinyinKey *) lhs.m_keys;
+        PinyinKey * key_rhs = (PinyinKey *) rhs.m_keys;
+
+        phrase_token_t token_lhs = lhs.m_token;
+        phrase_token_t token_rhs = rhs.m_token;
+
+        int result = pinyin_exact_compare(key_lhs, key_rhs, phrase_length);
+        if ( !result )
+            return result;
+        return pinyin_utility_sign(token_lhs - token_rhs);
+    }
+};
+*/
+/* Less-than predicate built on PhraseExactCompare: true exactly when the
+ * three-way comparison of the two items yields -1.
+ */
+template<int phrase_length>
+class PhraseExactLessThan
+    : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                        ,const PinyinIndexItem<phrase_length>,
+                                        bool>
+{
+ private:
+    PhraseExactCompare<phrase_length> m_compare;
+ public:
+    bool operator () (const PinyinIndexItem<phrase_length> &lhs,
+                      const PinyinIndexItem<phrase_length> &rhs) const{
+        return -1 == m_compare(lhs, rhs);
+    }
+};
+
+/*
+template<int phrase_length>
+class PhraseExactLessThanWithToken
+    : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                        ,const PinyinIndexItem<phrase_length>,
+                                        bool>
+{
+ private:
+    PhraseExactCompareWithToken<phrase_length> m_compare;
+ public:
+    bool operator () (const PinyinIndexItem<phrase_length> &lhs,
+                      const PinyinIndexItem<phrase_length> &rhs) const{
+        return -1 == m_compare(lhs, rhs);
+    }
+};
+*/
+
+}; // namespace novel
+
+using namespace novel; // NOTE(review): using-directive at header scope reaches every includer
+
+#endif
diff --git a/src/storage/pinyin_zhuyin_map_data.h b/src/storage/pinyin_zhuyin_map_data.h
new file mode 100644
index 0000000..7557c5e
--- /dev/null
+++ b/src/storage/pinyin_zhuyin_map_data.h
@@ -0,0 +1,582 @@
+static const PinyinKey __zhuyin_standard_map [][3] =
+{
+/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ! 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ' */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* , */{PinyinKey(42) /* eh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* - */{PinyinKey(66) /* er */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* . */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* / */{PinyinKey(60) /* eng */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 0 */{PinyinKey(18) /* an */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 1 */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 2 */{PinyinKey(960) /* d */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 3 */{PinyinKey(3) /* 3 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 4 */{PinyinKey(4) /* 4 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 5 */{PinyinKey(5520) /* zh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 6 */{PinyinKey(2) /* 2 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 7 */{PinyinKey(5) /* 5 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 8 */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 9 */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ; */{PinyinKey(24) /* ang */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, 
PinyinKey(0) /* */}, +/* = */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ? */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Y 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* a */{PinyinKey(2640) /* m */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* b */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* c */{PinyinKey(1680) /* h */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* d */{PinyinKey(2160) /* k */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* e */{PinyinKey(1440) /* g */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* f */{PinyinKey(3360) /* q */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* g */{PinyinKey(4080) /* sh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* h */{PinyinKey(480) /* c */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* i */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* j */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* k */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* l */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* m */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* n */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* o */{PinyinKey(48) /* ei */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* p */{PinyinKey(54) /* en */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* q */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* r */{PinyinKey(1920) /* j */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* s */{PinyinKey(2880) /* n */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* t */{PinyinKey(720) /* 
ch */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* u */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* v */{PinyinKey(4800) /* x */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* w */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* x */{PinyinKey(2400) /* l */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* y */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* z */{PinyinKey(1200) /* f */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +}; + +static const PinyinKey __zhuyin_hsu_map [][3] = +{ +/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ' */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* , */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* - */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* . 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* / */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 0 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 1 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 2 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 3 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 4 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 5 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 6 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 7 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 8 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 9 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ; */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* = */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ? 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* 
*/, PinyinKey(0) /* */}, +/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* a */{PinyinKey(480) /* c */, PinyinKey(48) /* ei */, PinyinKey(0) /* */}, +/* b */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* c */{PinyinKey(4800) /* x */, PinyinKey(4080) /* sh */, PinyinKey(0) /* */}, +/* d */{PinyinKey(960) /* d */, PinyinKey(2) /* 2 */, PinyinKey(0) /* */}, +/* e */{PinyinKey(72) /* i */, PinyinKey(42) /* eh */, PinyinKey(0) /* */}, +/* f */{PinyinKey(1200) /* f */, PinyinKey(3) /* 3 */, PinyinKey(0) /* */}, +/* g */{PinyinKey(1440) /* g */, PinyinKey(36) /* e */, PinyinKey(0) /* */}, +/* h */{PinyinKey(1680) /* h */, PinyinKey(138) /* o */, PinyinKey(0) /* */}, +/* i */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* j */{PinyinKey(1920) /* j */, PinyinKey(5520) /* zh */, PinyinKey(4) /* 4 */}, +/* k */{PinyinKey(2160) /* k */, PinyinKey(24) /* ang */, PinyinKey(0) /* */}, +/* l */{PinyinKey(2400) /* l */, PinyinKey(60) /* eng */, PinyinKey(66) /* er */}, +/* m */{PinyinKey(2640) /* m */, PinyinKey(18) /* an */, PinyinKey(0) /* */}, +/* n */{PinyinKey(2880) /* n */, PinyinKey(54) /* en */, PinyinKey(0) /* */}, +/* o */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* p */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* r */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* s */{PinyinKey(3840) /* s */, PinyinKey(5) /* 5 */, PinyinKey(0) /* */}, +/* t */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* u */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 
v */{PinyinKey(3360) /* q */, PinyinKey(720) /* ch */, PinyinKey(0) /* */}, +/* w */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* x */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* y */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* z */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +}; + +static const PinyinKey __zhuyin_ibm_map [][3] = +{ +/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ' */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* , */{PinyinKey(3) /* 3 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* - */{PinyinKey(1680) /* h */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* . 
*/{PinyinKey(4) /* 4 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* / */{PinyinKey(5) /* 5 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 0 */{PinyinKey(2160) /* k */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 1 */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 2 */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 3 */{PinyinKey(2640) /* m */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 4 */{PinyinKey(1200) /* f */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 5 */{PinyinKey(960) /* d */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 6 */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 7 */{PinyinKey(2880) /* n */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 8 */{PinyinKey(2400) /* l */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 9 */{PinyinKey(1440) /* g */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ; */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* = */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ? 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* 
*/, PinyinKey(0) /* */}, +/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* a */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* b */{PinyinKey(60) /* eng */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* c */{PinyinKey(54) /* en */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* d */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* e */{PinyinKey(4800) /* x */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* f */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* g */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* h */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* i */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* j */{PinyinKey(42) /* eh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* k */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* l */{PinyinKey(48) /* ei */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* m */{PinyinKey(2) /* 2 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* n */{PinyinKey(66) /* er */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* o */{PinyinKey(480) /* c */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* p */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* q */{PinyinKey(1920) /* j */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* r */{PinyinKey(5520) /* zh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* s */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* t */{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* u */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* v */{PinyinKey(24) /* ang */, PinyinKey(0) /* */, PinyinKey(0) 
/* */}, +/* w */{PinyinKey(3360) /* q */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* x */{PinyinKey(18) /* an */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* y */{PinyinKey(4080) /* sh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* z */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +}; + +static const PinyinKey __zhuyin_gin_yieh_map [][3] = +{ +/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ' */{PinyinKey(60) /* eng */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* , */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* - */{PinyinKey(54) /* en */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* . 
*/{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* / */{PinyinKey(18) /* an */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 0 */{PinyinKey(48) /* ei */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 1 */{PinyinKey(5) /* 5 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 2 */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 3 */{PinyinKey(960) /* d */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 4 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 5 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 6 */{PinyinKey(5520) /* zh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 7 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 8 */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 9 */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ; */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* = */{PinyinKey(66) /* er */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ? 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* [ */{PinyinKey(24) /* ang */, 
PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* a */{PinyinKey(3) /* 3 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* b */{PinyinKey(4800) /* x */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* c */{PinyinKey(2400) /* l */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* d */{PinyinKey(2880) /* n */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* e */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* f */{PinyinKey(2160) /* k */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* g */{PinyinKey(3360) /* q */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* h */{PinyinKey(4080) /* sh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* i */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* j */{PinyinKey(480) /* c */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* k */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* l */{PinyinKey(42) /* eh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* m */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* n */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* o */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* p */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* q */{PinyinKey(2) /* 2 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* r */{PinyinKey(1440) /* g */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* s */{PinyinKey(2640) /* m */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* t */{PinyinKey(1920) /* j */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* u */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* v */{PinyinKey(1680) /* h */, 
PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* w */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* x */{PinyinKey(1200) /* f */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* y */{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* z */{PinyinKey(4) /* 4 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +}; + +static const PinyinKey __zhuyin_et_map [][3] = +{ +/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ' */{PinyinKey(480) /* c */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* , */{PinyinKey(5520) /* zh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* - */{PinyinKey(60) /* eng */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* . 
*/{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* / */{PinyinKey(4080) /* sh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 0 */{PinyinKey(24) /* ang */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 1 */{PinyinKey(5) /* 5 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 2 */{PinyinKey(2) /* 2 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 3 */{PinyinKey(3) /* 3 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 4 */{PinyinKey(4) /* 4 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 5 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 6 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 7 */{PinyinKey(3360) /* q */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 8 */{PinyinKey(18) /* an */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 9 */{PinyinKey(54) /* en */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ; */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* = */{PinyinKey(66) /* er */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ? 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* 
*/, PinyinKey(0) /* */}, +/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* a */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* b */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* c */{PinyinKey(4800) /* x */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* d */{PinyinKey(960) /* d */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* e */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* f */{PinyinKey(1200) /* f */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* g */{PinyinKey(1920) /* j */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* h */{PinyinKey(1680) /* h */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* i */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* j */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* k */{PinyinKey(2160) /* k */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* l */{PinyinKey(2400) /* l */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* m */{PinyinKey(2640) /* m */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* n */{PinyinKey(2880) /* n */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* o */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* p */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* q */{PinyinKey(48) /* ei */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* r */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* s */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* t */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* u */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* v */{PinyinKey(1440) /* g */, PinyinKey(0) /* */, 
PinyinKey(0) /* */}, +/* w */{PinyinKey(42) /* eh */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* x */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* y */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* z */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +}; + +static const PinyinKey __zhuyin_et26_map [][3] = +{ +/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ' */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* , */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* - */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* . 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* / */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 0 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 1 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 2 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 3 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 4 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 5 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 6 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 7 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 8 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* 9 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ; */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* = */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ? 
*/{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* 
*/, PinyinKey(0) /* */}, +/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* a */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* b */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* c */{PinyinKey(4800) /* x */, PinyinKey(4080) /* sh */, PinyinKey(0) /* */}, +/* d */{PinyinKey(960) /* d */, PinyinKey(5) /* 5 */, PinyinKey(0) /* */}, +/* e */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* f */{PinyinKey(1200) /* f */, PinyinKey(2) /* 2 */, PinyinKey(0) /* */}, +/* g */{PinyinKey(1920) /* j */, PinyinKey(5520) /* zh */, PinyinKey(0) /* */}, +/* h */{PinyinKey(1680) /* h */, PinyinKey(66) /* er */, PinyinKey(0) /* */}, +/* i */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* j */{PinyinKey(3600) /* r */, PinyinKey(3) /* 3 */, PinyinKey(0) /* */}, +/* k */{PinyinKey(2160) /* k */, PinyinKey(4) /* 4 */, PinyinKey(0) /* */}, +/* l */{PinyinKey(2400) /* l */, PinyinKey(60) /* eng */, PinyinKey(0) /* */}, +/* m */{PinyinKey(2640) /* m */, PinyinKey(18) /* an */, PinyinKey(0) /* */}, +/* n */{PinyinKey(2880) /* n */, PinyinKey(54) /* en */, PinyinKey(0) /* */}, +/* o */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* p */{PinyinKey(3120) /* p */, PinyinKey(150) /* ou */, PinyinKey(0) /* */}, +/* q */{PinyinKey(5280) /* z */, PinyinKey(48) /* ei */, PinyinKey(0) /* */}, +/* r */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* s */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* t */{PinyinKey(4320) /* t */, PinyinKey(24) /* ang */, PinyinKey(0) /* */}, +/* u */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* v 
*/{PinyinKey(1440) /* g */, PinyinKey(3360) /* q */, PinyinKey(0) /* */}, +/* w */{PinyinKey(480) /* c */, PinyinKey(42) /* eh */, PinyinKey(0) /* */}, +/* x */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* y */{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* z */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */}, +}; + diff --git a/src/training/Makefile.am b/src/training/Makefile.am new file mode 100644 index 0000000..520e4e1 --- /dev/null +++ b/src/training/Makefile.am @@ -0,0 +1,36 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CPPFLAGS@ + +noinst_PROGRAMS = gen_ngram gen_unigram estimate_interpolation + +gen_ngram_SOURCES = gen_ngram.cpp + +gen_ngram_LDADD = ../storage/libstorage.la @GLIB2_LDFLAGS@ + +gen_unigram_SOURCES = gen_unigram.cpp + +gen_unigram_LDADD = ../storage/libstorage.la @GLIB2_LDFLAGS@ + +estimate_interpolation_SOURCES = estimate_interpolation.cpp + +estimate_interpolation_LDADD = ../storage/libstorage.la @GLIB2_LDFLAGS@ diff --git a/src/training/estimate_interpolation.cpp b/src/training/estimate_interpolation.cpp new file mode 100644 index 0000000..1a547bc --- /dev/null +++ b/src/training/estimate_interpolation.cpp @@ -0,0 +1,151 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2008 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> +#include <math.h> +#include <glib.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "phrase_index.h" +#include "ngram.h" + +parameter_t compute_interpolation(SingleGram * deleted_bigram, + FacadePhraseIndex * unigram, + SingleGram * bigram){ + bool success; + parameter_t lambda = 0, next_lambda = 0.6; + parameter_t epsilon = 0.001; + + while ( fabs(lambda - next_lambda) > epsilon){ + lambda = next_lambda; + next_lambda = 0; + guint32 table_num = 0; + parameter_t numerator = 0; + parameter_t part_of_denominator = 0; + + PhraseIndexRange range; + range.m_range_begin = token_min; + range.m_range_end = token_max; + + BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem)); + deleted_bigram->search(&range, array); + + for ( int i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + //get the phrase token + phrase_token_t token = item->m_token; + guint32 deleted_freq = 0; + assert(deleted_bigram->get_freq(token, deleted_freq)); + { + guint32 freq = 0; + parameter_t elem_poss = 0; + if ( bigram && bigram->get_freq(token, freq)){ + guint32 total_freq; + assert(bigram->get_total_freq(total_freq)); + assert(0 != total_freq); + elem_poss = freq / (parameter_t) total_freq; + } + numerator = lambda * elem_poss; + } + + { + guint32 freq = 0; + parameter_t elem_poss = 0; + PhraseItem item; + if (unigram->get_phrase_item(token, item)){ + guint32 freq = item.get_unigram_frequency(); + guint32 total_freq = unigram->get_phrase_index_total_freq(); + elem_poss = freq / (parameter_t)total_freq; + } + part_of_denominator = ( 1 - lambda) * elem_poss; + } + + if ( 0 == (numerator + part_of_denominator)) + 
continue; + + next_lambda += deleted_freq * (numerator / (numerator + part_of_denominator)); + } + assert(deleted_bigram->get_total_freq(table_num)); + next_lambda /= table_num; + + g_array_free(array, TRUE); + } + lambda = next_lambda; + return lambda; +} + +int main(int argc, char * argv[]){ + FacadePhraseIndex phrase_index; + + //gb_char binary file + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + + //gbk_char binary file + chunk = new MemoryChunk; + chunk->load("../../data/gbk_char.bin"); + phrase_index.load(2, chunk); + + Bigram bigram; + bigram.attach("../../data/bigram.db", NULL); + + Bigram deleted_bigram; + deleted_bigram.attach("../../data/deleted_bigram.db", NULL); + + GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + deleted_bigram.get_all_items(system_items, user_items); + assert(0 == user_items->len); + g_array_free(user_items, TRUE); + + parameter_t lambda_sum = 0; + int lambda_count = 0; + + for ( int i = 0; i < system_items->len; ++i ){ + phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i); + SingleGram * system = NULL, * user = NULL; + bigram.load(*token, system, user); + assert(NULL == user); + SingleGram * deleted_system = NULL, * deleted_user = NULL; + deleted_bigram.load(*token, deleted_system, deleted_user); + assert(NULL == deleted_user); + + parameter_t lambda = compute_interpolation(deleted_system, &phrase_index, system); + + printf("lambda:%f\n", lambda); + + lambda_sum += lambda; + lambda_count ++; + + if (system) delete system; + delete deleted_system; + } + + printf("average lambda:%f\n", (lambda_sum/lambda_count)); + g_array_free(system_items, TRUE); +} + diff --git a/src/training/gen_ngram.cpp b/src/training/gen_ngram.cpp new file mode 100644 index 0000000..4dfea78 --- /dev/null +++ b/src/training/gen_ngram.cpp @@ -0,0 +1,179 @@ +/* + * 
novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> +#include <glib.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "phrase_index.h" +#include "ngram.h" + + +static GHashTable * g_phrases; + +//read gb_char.table and gbk_char.table +bool init_phrases(FILE * infile){ + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + while (!feof(infile)){ + fscanf(infile, "%s", pinyin); + fscanf(infile, "%s", phrase); + fscanf(infile, "%d", &token); + fscanf(infile, "%ld", &freq); + if ( feof(infile) ) + break; + g_hash_table_insert(g_phrases, g_strdup(phrase), + GUINT_TO_POINTER(token)); + } + return true; +} + +void print_help(){ + printf("gen_ngram [--skip-pi-gram-training] [--skip-unigram-training]\n"); + printf(" [--bigram-file <FILENAME>]\n"); + exit(1); +} + +int main(int argc, char * argv[]){ + int i = 1; + bool train_pi_gram = true; + bool train_unigram = true; + const char * bigram_filename = "../../data/bigram.db"; + + setlocale(LC_ALL,""); + while ( i < argc ){ + if ( strcmp("--help", argv[i] ) == 0){ + print_help(); + }else if 
( strcmp("--skip-pi-gram-training", argv[i] ) == 0) { + train_pi_gram = false; + }else if ( strcmp("--skip-unigram-training", argv[i] ) == 0) { + train_unigram = false; + }else if ( strcmp("--bigram-file", argv[i] ) == 0){ + if ( ++i >= argc ) + print_help(); + bigram_filename = argv[i]; + } + ++i; + } + + g_phrases = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL); + //init phrase lookup + FILE * gb_file = fopen("../../data/gb_char.table", "r"); + if ( gb_file == NULL ){ + fprintf(stderr, "can't open gb_char.table!\n"); + exit(1); + } + init_phrases(gb_file); + fclose(gb_file); + FILE * gbk_file = fopen("../../data/gbk_char.table", "r"); + if ( gbk_file == NULL ){ + fprintf(stderr, "can't open gbk_char.table!\n"); + exit(1); + } + init_phrases(gbk_file); + fclose(gbk_file); + + FacadePhraseIndex phrase_index; + + //gb_char binary file + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + + //gbk_char binary file + chunk = new MemoryChunk; + chunk->load("../../data/gbk_char.bin"); + phrase_index.load(2, chunk); + + Bigram bigram; + bigram.attach(NULL, bigram_filename); + + + char* linebuf = (char *)malloc ( 1024 * sizeof (char) ); + size_t size = 1024; + phrase_token_t last_token, cur_token = last_token = 0; + while( getline(&linebuf, &size, stdin) ){ + if ( feof(stdin) ) + break; + linebuf[strlen(linebuf)-1] = '\0'; + + phrase_token_t token; + gpointer orig_key, value; + gboolean result = g_hash_table_lookup_extended + (g_phrases, linebuf, &orig_key, &value); + if (result){ + token = GPOINTER_TO_UINT(value); + }else{ + token = 0; + } + + last_token = cur_token; + cur_token = token; + if ( cur_token ){ + //training uni-gram + if ( train_unigram ) + phrase_index.add_unigram_frequency(cur_token, 1); + } + if ( cur_token ){ + SingleGram * system = NULL, * user = NULL; + if ( 0 == last_token ){ + if (train_pi_gram) + bigram.load(sentence_start, system, user); + } else + bigram.load(last_token, 
system, user); + assert(NULL == system); + if ( NULL == user ){ + user = new SingleGram; + } + guint32 freq, total_freq; + //increase freq + user->get_freq(cur_token, freq); + user->set_freq(cur_token, freq + 1); + //increase total freq + user->get_total_freq(total_freq); + user->set_total_freq(total_freq + 1); + if ( 0 == last_token ){ + if ( train_pi_gram ) + bigram.store(sentence_start, user); + }else + bigram.store(last_token, user); + delete user; + } + } + + MemoryChunk * new_chunk = new MemoryChunk; + phrase_index.store(1, new_chunk); + new_chunk->save("../../data/gb_char.bin"); + phrase_index.load(1, new_chunk); + + new_chunk = new MemoryChunk; + phrase_index.store(2, new_chunk); + new_chunk->save("../../data/gbk_char.bin"); + phrase_index.load(2, new_chunk); + + return 0; +} diff --git a/src/training/gen_unigram.cpp b/src/training/gen_unigram.cpp new file mode 100644 index 0000000..7e76693 --- /dev/null +++ b/src/training/gen_unigram.cpp @@ -0,0 +1,65 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <stdio.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "phrase_index.h" + +//increase all unigram frequency by one. + +int main(int argc, char * argv[]){ + + FacadePhraseIndex phrase_index; + + //gb_char binary file + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + + //gbk_char binary file + chunk = new MemoryChunk; + chunk->load("../../data/gbk_char.bin"); + phrase_index.load(2, chunk); + + for ( size_t i = 16777217; i <= 16870566; ++i){ + phrase_index.add_unigram_frequency(i, 1); + } + +#if 0 + for ( size_t i = 33554433; i <= 33570193 ; ++i){ + phrase_index.add_unigram_frequency(i, 1); + } +#endif + + MemoryChunk * new_chunk = new MemoryChunk; + phrase_index.store(1, new_chunk); + new_chunk->save("../../data/gb_char.bin"); + phrase_index.load(1, new_chunk); + + new_chunk = new MemoryChunk; + phrase_index.store(2, new_chunk); + new_chunk->save("../../data/gbk_char.bin"); + phrase_index.load(2, new_chunk); + + return 0; +} diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..f36e5f9 --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,25 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +AUTOMAKE_OPTIONS = gnu +SUBDIRS = include storage lookup + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I $(ac_aux_dir) diff --git a/tests/include/Makefile.am b/tests/include/Makefile.am new file mode 100644 index 0000000..53bc089 --- /dev/null +++ b/tests/include/Makefile.am @@ -0,0 +1,25 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +INCLUDES = -I$(top_srcdir)/src/include + +noinst_PROGRAMS = test_memory_chunk + +test_memory_chunk_SOURCES = test_memory_chunk.cpp + +test_memory_chunk_LDADD = ../../src/storage/libstorage.la @GLIB2_LIBS@ + diff --git a/tests/include/test_memory_chunk.cpp b/tests/include/test_memory_chunk.cpp new file mode 100755 index 0000000..6282d93 --- /dev/null +++ b/tests/include/test_memory_chunk.cpp @@ -0,0 +1,90 @@ +#include <stdio.h> +#include <iostream> +#include "memory_chunk.h" +// Test Memory Chunk Functionality + +int main(int argc, char * argv[]){ + MemoryChunk* chunk; + chunk = new MemoryChunk(); + int i = 12; + chunk->set_content(0, &i, sizeof(int)); + + int * p = (int *)chunk->begin(); + assert(chunk->size() == sizeof(int)); + std::cout<<*p<<std::endl; + std::cout<<chunk->capacity()<<std::endl; + p = & i; + chunk->set_chunk(p, sizeof(int), NULL); + short t = 5; + chunk->set_content(sizeof(int), &t, sizeof(short)); + assert( sizeof(int) + sizeof(short) == chunk->size()); + std::cout<<chunk->capacity()<<std::endl; + + p = (int *)chunk->begin(); + short * p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); + std::cout<<*p<<'\t'<<*p2<<std::endl; + + chunk->set_content(sizeof(int) + sizeof(short), &t, sizeof(short)); + + assert( sizeof(int) + (sizeof(short) << 1) == chunk->size()); + std::cout<<chunk->capacity()<<std::endl; + p = (int *)chunk->begin(); + p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); + std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<std::endl; + + chunk->set_size(sizeof(int) + sizeof(short) *3); + p = (int *)chunk->begin(); + p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); + + chunk->set_content(0, &i, sizeof(int)); + + *(p2+2) = 3; + std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<'\t'<<*(p2+2)<<std::endl; + + int m = 10; + chunk->set_chunk(&m, sizeof(int), NULL); + int n = 12; + chunk->insert_content(sizeof(int), &n, sizeof(int)); + n = 11; + chunk->insert_content(sizeof(int), &n, sizeof(int)); + + int * p3 = (int 
*)chunk->begin(); + std::cout<<*p3<<'\t'<<*(p3+1)<<'\t'<<*(p3+2)<<std::endl; + + chunk->remove_content(sizeof(int), sizeof(int)); + std::cout<<*p3<<'\t'<<*(p3+1)<<std::endl; + + int tmp; + assert(chunk->get_content(sizeof(int), &tmp, sizeof(int))); + std::cout<<tmp<<std::endl; + + + delete chunk; + + const char * filename = "/tmp/version"; + const char * version = "0.2.0"; + + chunk = new MemoryChunk; + bool retval = chunk->load(filename); + if ( !retval ){ + std::cerr<<"can't find chunk"<<std::endl; + }else{ + if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){ + std::cout<<"match"<<std::endl; + } + + } + + chunk->set_content(0, version, strlen(version) + 1); + chunk->save(filename); + + retval = chunk->load(filename); + if ( !retval ){ + std::cerr<<"can't find chunk"<<std::endl; + } + if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){ + std::cout<<"match"<<std::endl; + } + + return 0; +} diff --git a/tests/lookup/Makefile.am b/tests/lookup/Makefile.am new file mode 100644 index 0000000..ca863ce --- /dev/null +++ b/tests/lookup/Makefile.am @@ -0,0 +1,27 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + -I$(top_srcdir)/src/lookup \ + @GLIB2_CPPFLAGS@ + +noinst_PROGRAMS = test_simple_lookup + +test_simple_lookup_SOURCES = test_simple_lookup.cpp + +test_simple_lookup_LDADD = ../../src/storage/libstorage.la ../../src/lookup/liblookup.la @GLIB2_LDFLAGS@ diff --git a/tests/lookup/test_simple_lookup.cpp b/tests/lookup/test_simple_lookup.cpp new file mode 100644 index 0000000..04f4dce --- /dev/null +++ b/tests/lookup/test_simple_lookup.cpp @@ -0,0 +1,108 @@ +#include <string.h> +#include <stdio.h> +#include <sys/time.h> +#include <glib.h> +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "pinyin_large_table.h" +#include "phrase_index.h" +#include "ngram.h" +#include "lookup.h" + +size_t bench_times = 1000; + +guint32 record_time () +{ + timeval tv; + gettimeofday (&tv, NULL); + return (guint32) tv.tv_sec * 1000000 + tv.tv_usec; +} + +void print_time (guint32 old_time, guint32 times) +{ + timeval tv; + gettimeofday (&tv, NULL); + + guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time; + + printf("Spent %d us for %d operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted ); +} + + +int main( int argc, char * argv[]){ + + PinyinCustomSettings custom; + PinyinLargeTable largetable(&custom); + + BitmapPinyinValidator validator; + validator.initialize(&largetable); + + MemoryChunk * new_chunk = new MemoryChunk; + new_chunk->load("../../data/pinyin_index.bin"); + largetable.load(new_chunk); + + FacadePhraseIndex phrase_index; + new_chunk = new MemoryChunk; + new_chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, new_chunk); + new_chunk = new MemoryChunk; + new_chunk->load("../../data/gbk_char.bin"); + phrase_index.load(2, new_chunk); + + Bigram bigram; + bigram.attach("../../data/bigram.db", "/tmp/bigram.db"); + + PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index, 
&bigram); + + char* linebuf = (char *)malloc ( 1024 * sizeof (char) ); + size_t size = 1024; + while( getline(&linebuf, &size, stdin) ){ + linebuf[strlen(linebuf)-1] = '\0'; + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + PinyinDefaultParser parser; + PinyinKeyVector keys; + PinyinKeyPosVector poses; + + validator.initialize(&largetable); + + keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); + poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); + parser.parse(validator, keys, poses,linebuf); + + if ( 0 == keys->len ) + continue; + CandidateConstraints constraints = g_array_new(FALSE, FALSE, sizeof(lookup_constraint_t)); + + g_array_set_size(constraints, keys->len); + for ( size_t i = 0; i < constraints->len; ++i){ + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + guint32 start_time = record_time(); + size_t times = 100; + for ( size_t i = 0; i < times; ++i) + pinyin_lookup.get_best_match(keys, constraints, results); + print_time(start_time, times); + for ( size_t i = 0; i < results->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if ( NULL == *token) + continue; + printf("pos:%d,token:%d\t", i, *token); + } + printf("\n"); + char * sentence = NULL; + pinyin_lookup.convert_to_utf8(results, sentence); + printf("%s\n", sentence); + + g_array_free(keys, true); + g_array_free(poses, true); + g_free(sentence); + } + free(linebuf); +} diff --git a/tests/storage/Makefile.am b/tests/storage/Makefile.am new file mode 100644 index 0000000..e38c690 --- /dev/null +++ b/tests/storage/Makefile.am @@ -0,0 +1,41 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## 
the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CPPFLAGS@ + +noinst_PROGRAMS = test_parser \ + test_pinyin_index \ + test_phrase_index \ + test_ngram + +test_parser_SOURCES = test_parser.cpp + +test_parser_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@ + +test_pinyin_index_SOURCES = test_pinyin_index.cpp + +test_pinyin_index_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@ + +test_phrase_index_SOURCES = test_phrase_index.cpp + +test_phrase_index_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@ + +test_ngram_SOURCES = test_ngram.cpp + +test_ngram_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@ diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp new file mode 100644 index 0000000..7bdb141 --- /dev/null +++ b/tests/storage/test_ngram.cpp @@ -0,0 +1,126 @@ +#include <stdio.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "ngram.h" + + +int main(int argc, char * argv[]){ + SingleGram single_gram; + + const guint32 total_freq = 16; + assert(single_gram.set_total_freq(total_freq)); + + + phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3}; + guint32 freqs[6] = { 1, 2, 4, 8, 16, 32}; + + for(int i = 0; i < 6 ;++i){ + single_gram.set_freq(tokens[i], freqs[i]); + } + + guint32 freq; + single_gram.get_freq(3, freq); + assert(freq == 32); + + printf("--------------------------------------------------------\n"); + PhraseIndexRange range; + 
BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem)); + range.m_range_begin = 0; range.m_range_end = 8; + single_gram.search(&range,array); + for ( int i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + + + assert(single_gram.get_total_freq(freq)); + assert(freq == total_freq); + + + Bigram bigram; + assert(bigram.attach(NULL, "/tmp/system.db")); + bigram.store(1, &single_gram); + single_gram.set_freq(5, 8); + single_gram.set_total_freq(32); + + bigram.store(2, &single_gram); + + printf("--------------------------------------------------------\n"); + SingleGram * system, * user; + bigram.load(1, system, user); + assert(NULL == system); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + user->search(&range,array); + for ( int i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + delete user; + + printf("--------------------------------------------------------\n"); + bigram.load(2, system, user); + assert(NULL == system); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + user->search(&range,array); + for ( int i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + delete user; + + bigram.attach("/tmp/system.db", NULL); + printf("--------------------------------------------------------\n"); + bigram.load(1, system, user); + assert(NULL == user); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + system->search(&range,array); + for ( int i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + delete system; + + 
printf("--------------------------------------------------------\n"); + bigram.load(2, system, user); + assert(NULL == user); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + system->search(&range,array); + for ( int i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + delete system; + + printf("--------------------------------------------------------\n"); + single_gram.prune(); + g_array_set_size(array, 0); + range.m_range_begin = 0; range.m_range_end = 8; + single_gram.search(&range,array); + for ( int i = 0; i < array->len; ++i){ + BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); + printf("item:%d:%f\n", item->m_token, item->m_freq); + } + assert(single_gram.get_total_freq(freq)); + printf("total_freq:%d\n", freq); + + g_array_free(array, TRUE); + + GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(system_items, user_items); + + printf("----------------------system----------------------------\n"); + for ( int i = 0; i < system_items->len; ++i){ + phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i); + printf("item:%d\n", *token); + } + printf("-----------------------user-----------------------------\n"); + for ( int i = 0; i < user_items->len; ++i){ + phrase_token_t * token = &g_array_index(user_items, phrase_token_t, i); + printf("item:%d\n", *token); + } +} diff --git a/tests/storage/test_parser.cpp b/tests/storage/test_parser.cpp new file mode 100644 index 0000000..ba5bfb8 --- /dev/null +++ b/tests/storage/test_parser.cpp @@ -0,0 +1,165 @@ +/* + * libpinyin + * + * Copyright (c) 2006 James Su <suzhe@tsinghua.org.cn> + * + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * 
License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA + * + * $Id$ + * + */ + +#include <string.h> +#include <iostream> +#include "pinyin_base.h" + +typedef std::string String; + + +static const char *help_msg = + "Too few argument!\n" + "Usage:\n" + " test-parser [options]\n\n" + " -i Use incomplete pinyin.\n" + " -f table Use specified pinyin table file.\n" + " -p parser Use specified parser instead of Default.\n" + " parser could be:\n" + " sp-stone\n" + " sp-zrm\n" + " sp-ms\n" + " sp-ziguang\n" + " sp-abc\n" + " sp-liushi\n" + " zy-zhuyin\n" + " zy-standard\n" + " zy-hsu\n" + " zy-ibm\n" + " zy-gin-yieh\n" + " zy-et\n" + " zy-et26\n"; + +int main (int argc, char * argv []) +{ + NullPinyinValidator validator; + PinyinKeyVector keys; + PinyinKeyPosVector poses; + PinyinCustomSettings custom; + PinyinParser *parser = 0; + //PinyinTable table; + const char *tablefile = "../data/pinyin-table.txt"; + + keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); + poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); + + int i = 0; + while (i<argc) { + if (++i >= argc) break; + + if (String ("-h") == argv [i] || String ("--help") == argv [i]) { + std::cout << help_msg; + return 0; + } + + if (String ("-i") == argv [i]) { + custom.set_use_incomplete (true); + continue; + } + + if (String ("-p") == argv [i]) { + if (++i >= argc) { + std::cerr << "No argument for option " << argv [i-1] << "\n"; + return -1; + } + if 
(!strcmp (argv[i], "sp") || !strcmp (argv[i], "sp-default")) + parser = new PinyinShuangPinParser (); + else if (!strcmp (argv[i], "sp-stone")) + parser = new PinyinShuangPinParser (SHUANG_PIN_STONE); + else if (!strcmp (argv[i], "sp-zrm")) + parser = new PinyinShuangPinParser (SHUANG_PIN_ZRM); + else if (!strcmp (argv[i], "sp-ms")) + parser = new PinyinShuangPinParser (SHUANG_PIN_MS); + else if (!strcmp (argv[i], "sp-ziguang")) + parser = new PinyinShuangPinParser (SHUANG_PIN_ZIGUANG); + else if (!strcmp (argv[i], "sp-abc")) + parser = new PinyinShuangPinParser (SHUANG_PIN_ABC); + else if (!strcmp (argv[i], "sp-liushi")) + parser = new PinyinShuangPinParser (SHUANG_PIN_LIUSHI); + continue; + } + + if (String ("-f") == argv [i]) { + if (++i >= argc) { + std::cerr << "No argument for option " << argv [i-1] << "\n"; + return -1; + } + tablefile = argv [i]; + continue; + } + + std::cerr << "Invalid option: " << argv [i] << "\n"; + return -1; + }; + + if (!parser) parser = new PinyinDefaultParser (); + +/* + if (!table.load (tablefile)) { + std::cerr << "Failed to load tablefile: " << tablefile << "\n"; + return -1; + } +*/ + //table.update_custom_settings (custom); + + + char buf[1024]; + + while (1) { + std::cout << "Input:" << std::flush; + std::cin.getline (buf, 1023, '\n'); + + if (strncmp (buf, "quit", 4) == 0) break; + + int len = parser->parse (validator, keys, poses,(const char *) buf); + + std::cout << "Parsed " << len << " chars, " << keys->len << " keys:\n"; + + for (size_t i=0; i < keys->len; ++i){ + PinyinKey * key = &g_array_index(keys, PinyinKey, i); + std::cout << key->get_key_string () << " "; + } + + std::cout << std::endl; + + for ( size_t i=0; i < poses->len; ++i){ + PinyinKeyPos * pos = &g_array_index(poses, PinyinKeyPos, i); + std::cout << pos->get_pos() << " " << pos->get_length()<<" "; + } + + std::cout << std::endl; + + for (size_t i=0; i < keys->len; ++i){ + PinyinKey * key = &g_array_index(keys, PinyinKey, i); + std::cout << 
key->get_key_zhuyin_string () << " "; + } + + std::cout << std::endl; + } +} + +/* +vi:ts=4:nowrap:ai:expandtab +*/ diff --git a/tests/storage/test_phrase_index.cpp b/tests/storage/test_phrase_index.cpp new file mode 100644 index 0000000..d858ae2 --- /dev/null +++ b/tests/storage/test_phrase_index.cpp @@ -0,0 +1,141 @@ +#include <stdio.h> +#include <sys/time.h> +#include <glib.h> +#include "memory_chunk.h" +#include "pinyin_base.h" +#include "phrase_index.h" + +size_t bench_times = 100000; + +guint32 record_time () +{ + timeval tv; + gettimeofday (&tv, NULL); + return (guint32) tv.tv_sec * 1000000 + tv.tv_usec; +} + +void print_time (guint32 old_time, guint32 times) +{ + timeval tv; + gettimeofday (&tv, NULL); + + guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time; + + printf("Spent %d us for %d operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted ); +} + + +int main(int argc, char * argv[]){ + PhraseItem phrase_item; + utf16_t string1 = 2; + PinyinKey key1 = PinyinKey((PinyinInitial)3,(PinyinFinal)3,(PinyinTone)3); + PinyinKey key2 = PinyinKey((PinyinInitial)4,(PinyinFinal)4,(PinyinTone)4); + + + phrase_item.set_phrase_string(1, &string1); + phrase_item.append_pronunciation(&key1, 100); + phrase_item.append_pronunciation(&key2, 300); + + assert(phrase_item.get_phrase_length() == 1); + + PinyinKey key3; + guint32 freq; + phrase_item.get_nth_pronunciation(0, &key3, freq); + assert(key3 == key1); + assert(freq == 100); + phrase_item.get_nth_pronunciation(1, &key3, freq); + assert(key3 == key2); + assert(freq == 300); + + PinyinCustomSettings custom; + gfloat poss = phrase_item.get_pinyin_possibility(custom, &key1); + printf("pinyin possiblitiy:%f\n", poss); + + assert(phrase_item.get_unigram_frequency() == 0); + + utf16_t string2; + phrase_item.get_phrase_string(&string2); + assert(string1 == string2); + + FacadePhraseIndex phrase_index; + assert(phrase_index.add_phrase_item(1, 
&phrase_item)); + + MemoryChunk* chunk = new MemoryChunk; + assert(phrase_index.store(0, chunk)); + assert(phrase_index.load(0, chunk)); + + PhraseItem item2; + guint32 time = record_time(); + for ( int i = 0; i < bench_times; ++i){ + phrase_index.get_phrase_item(1, item2); + assert(item2.get_unigram_frequency() == 0); + assert(item2.get_n_pronunciation() == 2); + assert(item2.get_phrase_length() == 1); + assert(item2.get_pinyin_possibility(custom, &key2) == 0.75); + } + print_time(time, bench_times); + + { + PhraseItem item3; + phrase_index.get_phrase_item(1, item3); + item3.increase_pinyin_possibility(custom, &key1, 200); + assert(item3.get_pinyin_possibility(custom, &key1) == 0.5) ; + } + + { + PhraseItem item5; + phrase_index.get_phrase_item(1, item5); + gfloat poss = item5.get_pinyin_possibility(custom, &key1); + printf("pinyin poss:%f\n", poss); + assert(poss == 0.5); + } + + FacadePhraseIndex phrase_index_load; + + FILE* infile = fopen("../../data/gb_char.table", "r"); + if ( NULL == infile ){ + printf("open gb_char.table failed!\n"); + exit(1); + } + + phrase_index_load.load_text(1, infile); + fclose(infile); + + infile = fopen("../../data/gbk_char.table", "r"); + if ( NULL == infile ){ + printf("open gbk_char.table failed!\n"); + exit(1); + } + + phrase_index_load.load_text(2, infile); + fclose(infile); + + MemoryChunk* store1 = new MemoryChunk; + phrase_index_load.store(1, store1); + phrase_index_load.load(1, store1); + + MemoryChunk* store2 = new MemoryChunk; + phrase_index_load.store(2, store2); + phrase_index_load.load(2, store2); + + phrase_index_load.get_phrase_item(16870555, item2); + assert( item2.get_phrase_length() == 14); + assert( item2.get_n_pronunciation() == 1); + + gunichar2 buf[1024]; + item2.get_phrase_string(buf); + char * string = g_utf16_to_utf8( buf, 14, NULL, NULL, NULL); + printf("%s\n", string); + g_free(string); + + guint32 delta = 3; + phrase_index_load.add_unigram_frequency(16870555, delta); + 
phrase_index_load.get_phrase_item(16870555, item2); + assert( item2.get_unigram_frequency() == 3); + + phrase_index_load.get_phrase_item(16777222, item2); + assert(item2.get_phrase_length() == 1); + assert(item2.get_n_pronunciation() == 5); + + return 0; +} diff --git a/tests/storage/test_pinyin_index.cpp b/tests/storage/test_pinyin_index.cpp new file mode 100644 index 0000000..e79eb3b --- /dev/null +++ b/tests/storage/test_pinyin_index.cpp @@ -0,0 +1,148 @@ +#include <string.h> +#include <stdio.h> +#include <sys/time.h> +#include <glib.h> +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "pinyin_large_table.h" +#include "phrase_index.h" + +size_t bench_times = 1000; + +guint32 record_time () +{ + timeval tv; + gettimeofday (&tv, NULL); + return (guint32) tv.tv_sec * 1000000 + tv.tv_usec; +} + +void print_time (guint32 old_time, guint32 times) +{ + timeval tv; + gettimeofday (&tv, NULL); + + guint32 wasted = (guint32) tv.tv_sec * 1000000 + tv.tv_usec - old_time; + + printf("Spent %d us for %d operations, %f us/op, %f times/s.\n\n" , wasted , times , ((double) wasted)/times , times * 1000000.0/wasted ); +} + + +int main( int argc, char * argv[]){ + + PinyinCustomSettings custom; + PinyinLargeTable largetable(&custom); + + FILE * gbfile = fopen("../../data/gb_char.table", "r"); + if ( gbfile == NULL) { + printf("open gb_char.table failed!"); + return 1; + } + FILE * gbkfile = fopen("../../data/gbk_char.table","r"); + if ( gbkfile == NULL) { + printf("open gb_char.table failed!"); + return 1; + } + + largetable.load_text(gbfile); + fclose(gbfile); + largetable.load_text(gbkfile); + fclose(gbkfile); + + FacadePhraseIndex phrase_index; + + FILE* infile = fopen("../../data/gb_char.table", "r"); + if ( NULL == infile ){ + printf("open gb_char.table failed!\n"); + exit(1); + } + + phrase_index.load_text(1, infile); + fclose(infile); + + infile = fopen("../../data/gbk_char.table", "r"); + if ( NULL == infile ){ + printf("open 
gbk_char.table failed!\n"); + exit(1); + } + + phrase_index.load_text(2, infile); + fclose(infile); + + MemoryChunk* new_chunk = new MemoryChunk; + largetable.store(new_chunk); + largetable.load(new_chunk); + + char* linebuf = (char *)malloc ( 1024 * sizeof (char) ); + size_t size = 1024; + while( getline(&linebuf, &size, stdin) ){ + linebuf[strlen(linebuf)-1] = '\0'; + if ( strcmp ( linebuf, "quit" ) == 0) + break; + + PinyinDefaultParser parser; + NullPinyinValidator validator; + PinyinKeyVector keys; + PinyinKeyPosVector poses; + + keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); + poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); + parser.parse(validator, keys, poses, linebuf); + + guint32 start = record_time(); + + PhraseIndexRanges ranges; + for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ + ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange)); + } + for ( int i = 0 ; i < bench_times; ++i){ + largetable.search(keys->len, (PinyinKey *)keys->data, ranges); + } + + for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ + GArray * range = ranges[i]; + g_array_set_size( range, 0); + } + print_time(start, bench_times); + + largetable.search(keys->len, (PinyinKey *)keys->data, ranges); + for( int i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ + GArray * range = ranges[i]; + if ( range ){ + for (int k = 0; k < range->len; ++k){ + PhraseIndexRange* onerange = &g_array_index(range, PhraseIndexRange, k); + printf("start:%ld\tend:%ld\n", onerange->m_range_begin, onerange->m_range_end); + PhraseItem item; + for ( phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){ + phrase_index.get_phrase_item( token, item); + gunichar2 bufstr[1024]; + item.get_phrase_string(bufstr); + char * string = g_utf16_to_utf8 + ( bufstr, item.get_phrase_length(), + NULL, NULL, NULL); + printf("%s\t", string); + g_free(string); + PinyinKey pinyin_buffer[1024]; + size_t npron = item.get_n_pronunciation(); + guint32 freq; + for ( 
size_t n = 0; n < npron; ++n){ + item.get_nth_pronunciation(n, pinyin_buffer, freq); + for ( size_t o = 0; o < item.get_phrase_length(); ++o){ + printf("%s'", pinyin_buffer[o].get_key_string()); + } + printf("\b \t %d", freq); + } + printf("\n"); + } + } + if ( range->len) + printf("range items number:%d\n", range->len); + } + g_array_set_size( range, 0); + } + + g_array_free(keys, TRUE); + g_array_free(poses, TRUE); + } + free(linebuf); +} diff --git a/utils/Makefile.am b/utils/Makefile.am new file mode 100644 index 0000000..1f0d85d --- /dev/null +++ b/utils/Makefile.am @@ -0,0 +1,25 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +AUTOMAKE_OPTIONS = gnu +SUBDIRS = storage + +MAINTAINERCLEANFILES = Makefile.in + +CLEANFILES = *.bak + +ACLOCAL = aclocal -I $(ac_aux_dir) diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am new file mode 100644 index 0000000..9328174 --- /dev/null +++ b/utils/storage/Makefile.am @@ -0,0 +1,30 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CPPFLAGS@ + +noinst_PROGRAMS = gen_pinyin_table gen_binary_files + +gen_pinyin_table_SOURCES = gen_pinyin_table.cpp + +gen_pinyin_table_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@ + +gen_binary_files_SOURCES = gen_binary_files.cpp + +gen_binary_files_LDADD = ../../src/storage/libstorage.la @GLIB2_LDFLAGS@ diff --git a/utils/storage/gen_binary_files.cpp b/utils/storage/gen_binary_files.cpp new file mode 100644 index 0000000..7386106 --- /dev/null +++ b/utils/storage/gen_binary_files.cpp @@ -0,0 +1,68 @@ +#include <stdio.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "pinyin_large_table.h" +#include "phrase_index.h" + +int main(int argc, char * argv[]){ + /* generate pinyin index*/ + PinyinCustomSettings custom; + PinyinLargeTable largetable(&custom); + + FILE * gbfile = fopen("../../data/gb_char.table", "r"); + if ( gbfile == NULL) { + printf("open gb_char.table failed!"); + return 1; + } + FILE * gbkfile = fopen("../../data/gbk_char.table","r"); + if ( gbkfile == NULL) { + printf("open gb_char.table failed!"); + return 1; + } + + largetable.load_text(gbfile); + fclose(gbfile); + largetable.load_text(gbkfile); + fclose(gbkfile); + + MemoryChunk * new_chunk = new MemoryChunk; + largetable.store(new_chunk); + new_chunk->save("../../data/pinyin_index.bin"); + largetable.load(new_chunk); + + + /* generate phrase index*/ + FacadePhraseIndex phrase_index; + + FILE* infile = fopen("../../data/gb_char.table", "r"); + if ( NULL == infile ){ + printf("open gb_char.table failed!\n"); + exit(1); + } + + phrase_index.load_text(1, infile); + fclose(infile); + + infile = fopen("../../data/gbk_char.table", "r"); + if ( NULL == infile ){ + printf("open gbk_char.table failed!\n"); + exit(1); + } + + phrase_index.load_text(2, infile); + fclose(infile); + + new_chunk = new MemoryChunk; + phrase_index.store(1, 
new_chunk); + new_chunk->save("../../data/gb_char.bin"); + phrase_index.load(1, new_chunk); + + new_chunk = new MemoryChunk; + phrase_index.store(2, new_chunk); + new_chunk->save("../../data/gbk_char.bin"); + phrase_index.load(2, new_chunk); + + return 0; +} diff --git a/utils/storage/gen_pinyin_table.cpp b/utils/storage/gen_pinyin_table.cpp new file mode 100644 index 0000000..38e6a27 --- /dev/null +++ b/utils/storage/gen_pinyin_table.cpp @@ -0,0 +1,248 @@ +#include <stdio.h> +#include <locale.h> +#include <glib.h> +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" + + +GTree * g_pinyin_tree; +GArray * g_item_array[MAX_PHRASE_LENGTH + 1]; + +struct phrase_item{ + size_t length; + gunichar * uniphrase; +}; + +struct pinyin_and_freq_item{ + GArray * pinyin; + guint32 freq; +}; + +struct item{ + phrase_item * phrase; + GArray * pinyin_and_freq_array; /* Array of pinyin_and_freq_item. */ +}; + +void feed_file(const char * filename); + +void feed_line(const char * phrase, const char * pinyin, const guint32 freq); + +void store_in_item_array(); + +void sort_item_array(); + +void gen_phrase_file(const char * outfilename, int phrase_index); + +void print_help(){ + printf("Usage: gen_pinyin_table -t <PHRASE_INDEX> " + "-o <OUTPUTFILE> <FILE1> <FILE2> .. 
<FILEn>\n"); + printf("<OUTPUTFILE> the result output file\n"); + printf("<FILEi> input pinyin files\n"); + printf("<PHRASE_INDEX> phrase index identifier\n"); + exit(1); +} + +gint phrase_item_compare(gconstpointer a, gconstpointer b){ + phrase_item * itema = (phrase_item *) a; + phrase_item * itemb = (phrase_item *) b; + if ( itema->length != itemb->length ) + return itema->length - itemb->length; + else + return memcmp(itema->uniphrase, itemb->uniphrase, + sizeof(gunichar) * itema->length); +} + +int main(int argc, char * argv[]){ + char outfilename[1024]="temp.out"; + int phrase_index = 0; + int i = 1; + + g_pinyin_tree = g_tree_new(phrase_item_compare); + + setlocale(LC_ALL,""); + while ( i < argc ){ + if ( strcmp("--help", argv[i] ) == 0) { + print_help(); + }else if ( strcmp("-t", argv[i] ) == 0){ + if ( ++i >= argc ) + print_help(); + phrase_index = atoi(argv[i]); + }else if ( strcmp("-o", argv[i] ) == 0 ){ + if ( ++i >= argc ) + print_help(); + strcpy( outfilename, argv[i]); + } else { + feed_file(argv[i]); + } + ++i; + } + + printf("nnodes: %d\n", g_tree_nnodes(g_pinyin_tree)); + + store_in_item_array(); + sort_item_array(); + gen_phrase_file(outfilename, phrase_index); + + return 0; +} + + +void feed_file ( const char * filename){ + char phrase[1024], pinyin[1024]; + guint32 n_freq; + FILE * infile = fopen(filename, "r"); + if ( NULL == infile ){ + fprintf(stderr, "Can't open file %s.\n", filename); + exit(1); + } + while ( !feof(infile)){ + fscanf(infile, "%s", phrase); + fscanf(infile, "%s", pinyin); + fscanf(infile, "%u", &n_freq); + if (feof(infile)) + break; + feed_line(phrase, pinyin, n_freq); + } + fclose(infile); +} + +void feed_line (const char * phrase, const char * pinyin, const guint32 freq){ + phrase_item * new_phrase_ptr = (phrase_item *) + malloc( sizeof(phrase_item)); + new_phrase_ptr->length = g_utf8_strlen(phrase, -1); + /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp + * where is the code which I don't want to touch. 
:-) + */ + if (new_phrase_ptr->length >= MAX_PHRASE_LENGTH ) { + printf("too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq); + free(new_phrase_ptr); + return; + } + new_phrase_ptr->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); + + PinyinDefaultParser parser; + NullPinyinValidator validator; + PinyinKeyVector keys; + PinyinKeyPosVector poses; + + keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); + poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); + parser.parse(validator, keys, poses, pinyin); + + GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, new_phrase_ptr); + + pinyin_and_freq_item value_item; + value_item.pinyin = keys; + value_item.freq = freq; + + if(new_phrase_ptr->length != value_item.pinyin->len){ + printf("error:phrase:%s\tpinyin:%s\n", phrase, pinyin); + return; + } + + if ( array == NULL){ + array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item)); + g_array_append_val(array, value_item); + g_tree_insert(g_pinyin_tree, new_phrase_ptr, array); + return; + } + bool found = false; + for ( int i = 0; i < array->len ; ++i){ + pinyin_and_freq_item * old_value_item = &g_array_index(array, pinyin_and_freq_item, i); + int result = pinyin_exact_compare((PinyinKey *)value_item.pinyin->data, + (PinyinKey *)old_value_item->pinyin->data , value_item.pinyin->len); + if ( result == 0 ){ + printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", + phrase, pinyin, freq); + old_value_item->freq += freq; + found = true; + } + } + + g_array_free(poses, TRUE); + + if ( !found ){ + g_array_append_val(array, value_item); + g_tree_insert(g_pinyin_tree, new_phrase_ptr, array); + }else + g_array_free(keys, TRUE); + + free(new_phrase_ptr); + //g_array_free(keys, TRUE); +} + +gboolean store_one_item (gpointer key, gpointer value, gpointer data){ + item oneitem; + oneitem.phrase = (phrase_item *)key; + oneitem.pinyin_and_freq_array = (GArray *)value; + int length = oneitem.phrase->length; + g_array_append_val(g_item_array[length], oneitem); + 
return FALSE; +} + +void store_in_item_array(){ + for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + g_item_array[i] = g_array_new(FALSE, TRUE, sizeof(item)); + } + g_tree_foreach(g_pinyin_tree, store_one_item, NULL); +} + +gint phrase_array_compare ( gconstpointer a, gconstpointer b, gpointer user_data){ + int phrase_length = *((int *) user_data); + GArray * arraya = + g_array_index(((item *)a)->pinyin_and_freq_array, pinyin_and_freq_item, 0).pinyin; + GArray * arrayb = + g_array_index(((item *)b)->pinyin_and_freq_array, pinyin_and_freq_item, 0).pinyin; + return pinyin_exact_compare((PinyinKey *)arraya->data, (PinyinKey*)arrayb->data, phrase_length); +} + +void sort_item_array(){ + for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i); + } +} + +void gen_phrase_file(const char * outfilename, int phrase_index){ + FILE * outfile = fopen(outfilename, "w"); + if (NULL == outfile ) { + fprintf(stderr, "Can't write file %s.\n", outfilename); + exit(1); + } + phrase_token_t token = 1; + char pinyin_buffer[4096]; + //phrase length + for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){ + GArray * item_array = g_item_array[i]; + //item array + for( int m = 0; m < item_array->len; ++m){ + item* oneitem = & g_array_index(item_array, item, m); + phrase_item * phrase = oneitem->phrase; + GArray * pinyin_and_freqs = oneitem->pinyin_and_freq_array; + const char * phrase_buffer = g_ucs4_to_utf8(phrase->uniphrase, + phrase->length, + NULL, NULL, NULL); + //each pinyin + for( int n = 0 ; n < pinyin_and_freqs->len; ++n){ + pinyin_and_freq_item * pinyin_and_freq = &g_array_index(pinyin_and_freqs, pinyin_and_freq_item, n); + GArray * pinyin = pinyin_and_freq->pinyin; + PinyinKey * key = &g_array_index(pinyin, PinyinKey, 0); + strcpy(pinyin_buffer,key->get_key_string()); + for (size_t k = 1; k < pinyin->len; ++k){ + strcat(pinyin_buffer, "'"); + PinyinKey * key = &g_array_index(pinyin, PinyinKey, k); + strcat(pinyin_buffer, 
key->get_key_string ()); + } + guint32 freq = pinyin_and_freq -> freq; + if ( freq < 3 ) + freq = 3; + fprintf( outfile, "%s\t%s\t%d\t%d\n", + pinyin_buffer, phrase_buffer, + PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), + freq); + } + token++; + } + } + fclose(outfile); +} |