summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2010-08-03 10:42:47 +0800
committerPeng Wu <alexepico@gmail.com>2010-08-03 10:42:47 +0800
commitf41d1fdf83408e042ab07925710a8913bad0c27c (patch)
tree1757833ac4cdd0830834d2f9ef92be07c0bc1a5b /src
parent34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff)
downloadlibpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz
libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz
libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip
import from pinyin.
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.am25
-rw-r--r--src/include/Makefile.am22
-rwxr-xr-xsrc/include/memory_chunk.h264
-rwxr-xr-xsrc/include/novel_types.h117
-rw-r--r--src/include/stl_lite.h285
-rw-r--r--src/lookup/Makefile.am30
-rw-r--r--src/lookup/lookup.h144
-rw-r--r--src/lookup/pinyin_lookup.cpp587
-rw-r--r--src/lookup/winner_tree.cpp141
-rw-r--r--src/lookup/winner_tree.h148
-rw-r--r--src/segment/Makefile.am28
-rw-r--r--src/segment/mmseg.cpp212
-rw-r--r--src/storage/Makefile.am35
-rw-r--r--src/storage/ngram.cpp283
-rw-r--r--src/storage/ngram.h119
-rw-r--r--src/storage/phrase_index.cpp340
-rwxr-xr-xsrc/storage/phrase_index.h250
-rw-r--r--src/storage/pinyin_base.cpp1425
-rw-r--r--src/storage/pinyin_base.h728
-rw-r--r--src/storage/pinyin_large_table.cpp690
-rwxr-xr-xsrc/storage/pinyin_large_table.h178
-rw-r--r--src/storage/pinyin_phrase.h298
-rw-r--r--src/storage/pinyin_zhuyin_map_data.h582
-rw-r--r--src/training/Makefile.am36
-rw-r--r--src/training/estimate_interpolation.cpp151
-rw-r--r--src/training/gen_ngram.cpp179
-rw-r--r--src/training/gen_unigram.cpp65
27 files changed, 7362 insertions, 0 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..59e009f
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,25 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+## Top-level automake input for src/.
+AUTOMAKE_OPTIONS = gnu
+## Sub-directories that automake recurses into, in the order listed.
+SUBDIRS = include storage segment training lookup
+
+## Generated file removed by "make maintainer-clean".
+MAINTAINERCLEANFILES = Makefile.in
+
+## Backup files removed by "make clean".
+CLEANFILES = *.bak
+
+## Run aclocal with the auxiliary directory added to its macro search path.
+ACLOCAL = aclocal -I $(ac_aux_dir)
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
new file mode 100644
index 0000000..bb605ee
--- /dev/null
+++ b/src/include/Makefile.am
@@ -0,0 +1,22 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+## Generated file removed by "make maintainer-clean".
+MAINTAINERCLEANFILES = Makefile.in
+
+## Headers shipped in the source distribution but never installed.
+noinst_HEADERS = memory_chunk.h \
+ novel_types.h \
+ stl_lite.h
diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h
new file mode 100755
index 0000000..3571256
--- /dev/null
+++ b/src/include/memory_chunk.h
@@ -0,0 +1,264 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef MEMORY_CHUNK_H
+#define MEMORY_CHUNK_H
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "stl_lite.h"
+
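+/* Ownership modes, selected by m_free_func:
+ *   m_free_func == free : the buffer was allocated by malloc and is
+ *                         managed (grown with realloc) by the chunk.
+ *   m_free_func == NULL : the buffer is a small portion of an area that
+ *                         is allocated elsewhere; it is never freed here.
+ *   m_free_func == other: the buffer is released with that function; when
+ *                         the chunk grows, the data is first copied into a
+ *                         malloc'ed buffer and the original is freed.
+ */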
+
+/**
+ * MemoryChunk: a growable byte buffer that either manages malloc'ed storage
+ * itself or wraps storage handed over through set_chunk().
+ *
+ * [m_data_begin, m_data_end) are the bytes in use and
+ * [m_data_begin, m_allocated) is the whole buffer.  m_free_func records how
+ * the buffer is released:
+ *   free  - malloc'ed storage, grown in place with realloc;
+ *   NULL  - storage that is never released by this object;
+ *   other - released with that function; the first growth copies the data
+ *           into a malloc'ed buffer.
+ *
+ * NOTE(review): copy construction and assignment are compiler generated, so
+ * copying a chunk that owns its storage would release that storage twice.
+ * Pass chunks by pointer or reference.
+ */
+class MemoryChunk{
+    typedef void (* free_func_t)(void *);
+private:
+    char * m_data_begin;
+    char * m_data_end; //one data pass the end.
+    char * m_allocated; //one data pass the end.
+    free_func_t m_free_func;
+
+private:
+    /* Release the buffer (when a free function is set) and become empty. */
+    void reset(){
+        if ( m_free_func )
+            (*m_free_func)(m_data_begin);
+        m_data_begin = NULL;
+        m_data_end = NULL;
+        m_allocated = NULL;
+        m_free_func = NULL;
+    }
+
+    /* Make sure at least new_size bytes, counted from m_data_begin, are
+     * backed by memory.  Sizes stay in size_t so that large chunks are not
+     * truncated. */
+    void ensure_has_space(size_t new_size){
+        size_t cursize = size();
+        if ( new_size <= cursize ) return;
+        ensure_has_more_space ( new_size - cursize );
+    }
+
+    /* enlarge function
+     * Make sure extra_size more bytes are available after m_data_end.
+     * Newly obtained memory is zero filled; m_data_end is not advanced.
+     */
+    void ensure_has_more_space(size_t extra_size){
+        if ( 0 == extra_size ) return;
+        size_t newsize;
+        size_t cursize = size();
+        if ( m_free_func != free ) {
+            /* copy on resize: the current buffer is not ours to realloc */
+            newsize = cursize + extra_size;
+            char * tmp = (char *) malloc(newsize);
+            assert(tmp);
+            memset(tmp, 0, newsize);
+            /* an empty chunk may still have a NULL m_data_begin */
+            if ( cursize )
+                memmove(tmp, m_data_begin, cursize);
+            /* free the origin memory */
+            if ( m_free_func ){
+                (*m_free_func)(m_data_begin);
+            }
+
+            /* change variables */
+            m_data_begin = tmp;
+            m_data_end = m_data_begin + cursize;
+            m_allocated = m_data_begin + newsize;
+            m_free_func = free;
+            return;
+        }
+        /* the memory area is managed by this memory chunk */
+        if ( extra_size <= (size_t) (m_allocated - m_data_end))
+            return;
+        newsize = std_lite::max( capacity()<<1, cursize + extra_size);
+        /* keep the old pointer until realloc is known to have succeeded */
+        char * grown = (char *) realloc(m_data_begin, newsize);
+        assert(grown);
+        m_data_begin = grown;
+        memset(m_data_begin + cursize, 0, newsize - cursize);
+        m_data_end = m_data_begin + cursize;
+        m_allocated = m_data_begin + newsize;
+        return;
+    }
+
+public:
+    /* constructors */
+    MemoryChunk(){
+        m_data_begin = NULL;
+        m_data_end = NULL;
+        m_allocated = NULL;
+        m_free_func = NULL;
+    }
+
+    /* destructors */
+    ~MemoryChunk(){
+        reset();
+    }
+
+    /* read access method */
+    void* begin() const{
+        return m_data_begin;
+    }
+
+    void* end() const{
+        return m_data_end;
+    }
+
+    /* number of bytes in use */
+    size_t size() const{
+        return m_data_end - m_data_begin;
+    }
+
+    /* Set the number of bytes in use, growing the buffer when needed. */
+    void set_size(size_t newsize){
+        ensure_has_space(newsize);
+        m_data_end = m_data_begin + newsize;
+    }
+
+    /* number of bytes in the whole buffer */
+    size_t capacity() const{
+        return m_allocated - m_data_begin;
+    }
+
+    /*
+     * Transfer management of a memory chunk allocated by other part system
+     * to the memory chunk.  The previous buffer is released first; free_func
+     * (may be NULL) is what will later release the new buffer.
+     */
+    void set_chunk(void* begin, size_t length, free_func_t free_func){
+        if ( m_free_func )
+            m_free_func( m_data_begin );
+
+        m_data_begin = (char *) begin;
+        m_data_end = (char *) m_data_begin + length;
+        m_allocated = (char *) m_data_begin + length;
+        m_free_func = free_func;
+    }
+
+    /* subchunk
+     * use set_chunk internally with a NULL free function, so the returned
+     * chunk points into this chunk's buffer without owning it.
+     * new chunk need to be deleted.
+     */
+    MemoryChunk * get_sub_chunk(size_t offset, size_t length){
+        MemoryChunk * retval = new MemoryChunk();
+        char * begin_pos = m_data_begin + offset;
+        retval->set_chunk(begin_pos, length, NULL);
+        return retval;
+    }
+    /* write function
+     * Data are written directly to the memory area; the chunk grows when
+     * offset + len is beyond the current size.
+     */
+    bool set_content(size_t offset, const void * data, size_t len){
+        size_t cursize = std_lite::max(size(), offset + len);
+        ensure_has_space(offset + len);
+        if ( len )
+            memmove(m_data_begin + offset, data, len);
+        m_data_end = m_data_begin + cursize;
+        return true;
+    }
+    /* insert function
+     * Data are written to the memory area,
+     * the original content are moved towards the rear.
+     * parameter offset start from zero.
+     * Returns false when offset lies beyond the current size.
+     */
+    bool insert_content(size_t offset, const void * data, size_t length){
+        /* size() - offset would wrap around for an out-of-range offset */
+        if ( offset > size() )
+            return false;
+        if ( 0 == length )
+            return true;
+        ensure_has_more_space(length);
+        size_t move_size = size() - offset;
+        memmove(m_data_begin + offset + length, m_data_begin + offset, move_size);
+        memmove(m_data_begin + offset, data, length);
+        m_data_end += length;
+        return true;
+    }
+    /* remove function
+     * Data are removed directly,
+     * the following content are moved towards the front.
+     * Returns false when [offset, offset + length) is not inside the data.
+     */
+    bool remove_content(size_t offset, size_t length){
+        size_t cursize = size();
+        /* written so that neither the check nor move_size can wrap */
+        if ( offset > cursize || length > cursize - offset )
+            return false;
+        if ( 0 == length )
+            return true;
+        size_t move_size = cursize - offset - length;
+        memmove(m_data_begin + offset, m_data_begin + offset + length, move_size);
+        m_data_end -= length;
+        return true;
+    }
+
+    /* get_content function
+     * Get the binary data.
+     * Returns false when [offset, offset + length) is not inside the data.
+     */
+    bool get_content(size_t offset, void * buffer, size_t length){
+        size_t cursize = size();
+        /* overflow-safe form of "size() < offset + length" */
+        if ( offset > cursize || length > cursize - offset )
+            return false;
+        if ( length )
+            memcpy( buffer, m_data_begin + offset, length);
+        return true;
+    }
+
+    /* compact memory, reduce the size
+     * Only buffers managed by this chunk (m_free_func == free) are shrunk.
+     */
+    void compact_memory(){
+        if ( m_free_func != free )
+            return;
+        size_t newsize = size();
+        if ( 0 == newsize ){
+            /* realloc(ptr, 0) is not portable; simply release the buffer */
+            reset();
+            return;
+        }
+        char * shrunk = (char *) realloc(m_data_begin, newsize);
+        if ( !shrunk )
+            return; /* the old buffer is untouched and still valid */
+        /* realloc may move the block, so every pointer is recomputed */
+        m_data_begin = shrunk;
+        m_data_end = m_data_begin + newsize;
+        m_allocated = m_data_begin + newsize;
+    }
+
+    /* file storage functions */
+    /* Replace the content with the bytes of filename.  The old content is
+     * released even when loading fails. */
+    bool load(const char * filename){
+        /* free old data */
+        reset();
+
+        struct stat stat_buf;
+
+        int retval = stat(filename, &stat_buf);
+
+        if ( retval )
+            return false;
+
+        FILE* file = fopen(filename, "r");
+        if ( !file )
+            return false;
+        size_t file_len = (size_t) stat_buf.st_size;
+        /* malloc(0) may return NULL, which must not look like a failure */
+        void* data = malloc(file_len ? file_len : 1);
+        if ( !data ){
+            fclose(file);
+            return false;
+        }
+
+        size_t data_len = fread(data, 1, file_len, file);
+        set_chunk(data, data_len, free);
+        //Fixes memory chunk end.
+        if ( file_len > data_len )
+            m_allocated = (char *) m_data_begin + file_len;
+        fclose(file);
+        return true;
+    }
+
+    /* Write the bytes in use to filename.  Returns false when the file
+     * cannot be opened, written, flushed or closed. */
+    bool save(const char * filename){
+        FILE* file = fopen(filename, "w");
+        if ( !file )
+            return false;
+
+        size_t data_len = fwrite(begin(), 1, size(), file);
+        if ( data_len != size()){
+            fclose(file);
+            return false;
+        }
+
+        /* fsync only sees what stdio has already handed to the kernel */
+        if ( fflush(file) != 0 ){
+            fclose(file);
+            return false;
+        }
+        fsync(fileno(file));
+        return 0 == fclose(file);
+    }
+};
+
+#endif
diff --git a/src/include/novel_types.h b/src/include/novel_types.h
new file mode 100755
index 0000000..a992e8e
--- /dev/null
+++ b/src/include/novel_types.h
@@ -0,0 +1,117 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef NOVEL_TYPES_H
+#define NOVEL_TYPES_H
+
+#include <limits.h>
+#include <glib.h>
+
+typedef guint32 phrase_token_t;
+typedef gunichar2 utf16_t;
+
+/*
+ * Phrase Index Library Definition
+ *
+ * A phrase_token_t keeps the phrase number in its low 24 bits (PHRASE_MASK)
+ * and the index of the phrase library in the next 4 bits
+ * (PHRASE_INDEX_LIBRARY_MASK).  The top 4 bits are reserved for future usage.
+ *
+ * Every macro argument is parenthesized so that expressions such as
+ * "a | b" or "i + 1" can be passed safely.
+ */
+
+#define PHRASE_MASK 0x00FFFFFF
+#define PHRASE_INDEX_LIBRARY_MASK 0x0F000000
+#define PHRASE_INDEX_LIBRARY_COUNT (1<<4)
+/* Extract the library index (0 .. PHRASE_INDEX_LIBRARY_COUNT-1) of a token. */
+#define PHRASE_INDEX_LIBRARY_INDEX(token) (((token)&PHRASE_INDEX_LIBRARY_MASK)>>24)
+/* Combine a library index and a phrase number into one token. */
+#define PHRASE_INDEX_MAKE_TOKEN(phrase_index, token) \
+    ( ( ((phrase_index)<<24) & PHRASE_INDEX_LIBRARY_MASK)|((token) & PHRASE_MASK))
+
+
+/*
+ * PhraseIndexRanges definitions
+ */
+
+/* Half-open range [m_range_begin, m_range_end) of phrase tokens. */
+struct PhraseIndexRange{
+ phrase_token_t m_range_begin;
+ phrase_token_t m_range_end; /* one past the last token, like an STL end() */
+};
+
+/*Array of PhraseIndexRange*/
+typedef GArray * PhraseIndexRanges[PHRASE_INDEX_LIBRARY_COUNT];
+
+/*
+ * PinYin Table Definition
+ */
+class MemoryChunk;
+
+
+/* For both PinYin Table and Phrase Table */
+/* Result of a table search.
+ * NOTE(review): the values are distinct bits, so they are presumably meant
+ * to be OR-ed together - confirm against the search functions. */
+enum SearchResult{
+ SEARCH_NONE = 0x00, /* found nothing */
+ SEARCH_OK = 0x01 , /* found items */
+ SEARCH_CONTINUED = 0x02 /* the storage holds longer words that can still be searched */
+};
+
+/* Result of adding an item to an index. */
+enum AddIndexResult{
+ INSERT_OK = 0 , /* the item was inserted */
+ INSERT_ITEM_EXISTS /* the item already exists */
+};
+
+/* Result of removing an item from an index. */
+enum RemoveIndexResult{
+ REMOVE_OK = 0, /* the item was removed */
+ REMOVE_ITEM_DONOT_EXISTS /* the item does not exist */
+};
+/*
+ * n-gram Definition
+ * no B parameter(there are duplicated items in uni-gram and bi-gram)
+ * used in system n-gram and user n-gram.
+ * using delta technique.
+ */
+
+/* One bi-gram entry: a following token and its conditional frequency. */
+struct BigramPhraseItem{
+ phrase_token_t m_token;
+ gfloat m_freq; /* P(W2|W1) */
+};
+
+typedef GArray * BigramPhraseArray; /* presumably an array of BigramPhraseItem; no HighLevelPhraseItem type is visible here - TODO confirm */
+
+/*
+ * n-gram Definition
+ * n-gram library
+ */
+
+/* How an n-gram library file is attached.
+ * NOTE(review): ATTACH_READ_WRITE (3) equals ATTACH_NEW_FILE | ATTACH_READ;
+ * confirm with the users of this enum whether that is intentional. */
+enum AttachOption{
+ ATTACH_NEW_FILE = 1,
+ ATTACH_READ = 2,
+ ATTACH_READ_WRITE = 3
+};
+
+/* Upper bound on phrase length (presumably counted in characters - TODO confirm). */
+#define MAX_PHRASE_LENGTH 16
+
+/* Token value used as the sentence-start marker. */
+const phrase_token_t sentence_start = 1;
+/* Smallest and largest values a phrase_token_t can take. */
+const phrase_token_t token_min = 0;
+const phrase_token_t token_max = UINT_MAX;
+
+/* Separator character; its users are outside this header. */
+const char c_separate = '#';
+typedef guint32 table_offset_t;
+
+typedef double parameter_t;
+
+/* Interpolation weight: lookup.h uses it as the bi-gram lambda and
+ * 1 - LAMBDA_PARAMETER as the uni-gram lambda. */
+#define LAMBDA_PARAMETER 0.588792
+
+#endif
diff --git a/src/include/stl_lite.h b/src/include/stl_lite.h
new file mode 100644
index 0000000..0612782
--- /dev/null
+++ b/src/include/stl_lite.h
@@ -0,0 +1,285 @@
+#ifndef STL_LITE_H
+#define STL_LITE_H
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace std_lite{
+
+  /**
+   * @brief Selects the smaller of two values.
+   * @param lhs First value.
+   * @param rhs Second value.
+   * @return A reference to @a rhs when it compares less than @a lhs,
+   *         otherwise a reference to @a lhs (also when they are equivalent).
+   *
+   * Unlike a preprocessor macro, each argument is evaluated exactly once,
+   * so temporary expressions can be passed.
+   */
+  template<typename T>
+    inline const T&
+    min(const T& lhs, const T& rhs)
+    {
+      return (rhs < lhs) ? rhs : lhs;
+    }
+
+
+  /**
+   * @brief Selects the greater of two values.
+   * @param lhs First value.
+   * @param rhs Second value.
+   * @return A reference to @a rhs when @a lhs compares less than it,
+   *         otherwise a reference to @a lhs (also when they are equivalent).
+   *
+   * Unlike a preprocessor macro, each argument is evaluated exactly once,
+   * so temporary expressions can be passed.
+   */
+  template<typename T>
+    inline const T&
+    max(const T& lhs, const T& rhs)
+    {
+      return (lhs < rhs) ? rhs : lhs;
+    }
+
+  /**
+   * Base class for two-argument functors: it only publishes the argument
+   * and result types under standard names.
+   */
+  template <class Arg1, class Arg2, class Result>
+    struct binary_function
+    {
+      /// type of the first argument
+      typedef Arg1 first_argument_type;
+      /// type of the second argument
+      typedef Arg2 second_argument_type;
+      /// type of the returned value
+      typedef Result result_type;
+    };
+
+  /// Holds two objects of arbitrary, possibly different, types.
+  template<class First, class Second>
+    struct pair
+    {
+      typedef First first_type;    ///< type of the member @c first
+      typedef Second second_type;  ///< type of the member @c second
+
+      First first;                 ///< copy of the first object
+      Second second;               ///< copy of the second object
+
+      /** Value-initializes both members. */
+      pair()
+      : first(), second() { }
+
+      /** Copies @a a into @c first and @a b into @c second. */
+      pair(const First& a, const Second& b)
+      : first(a), second(b) { }
+
+      /** Converting copy from a pair with other member types. */
+      template<class OtherFirst, class OtherSecond>
+        pair(const pair<OtherFirst, OtherSecond>& other)
+        : first(other.first), second(other.second) { }
+    };
+
+  /// Equal when both members compare equal; @c second is only compared
+  /// when @c first matches.
+  template<class First, class Second>
+    inline bool
+    operator==(const pair<First, Second>& lhs, const pair<First, Second>& rhs)
+    {
+      if (!(lhs.first == rhs.first))
+        return false;
+      return lhs.second == rhs.second;
+    }
+
+  /// Lexicographic order: @c first decides, @c second breaks ties.
+  /// Only @c operator< of the member types is used.
+  template<class First, class Second>
+    inline bool
+    operator<(const pair<First, Second>& lhs, const pair<First, Second>& rhs)
+    {
+      if (lhs.first < rhs.first)
+        return true;
+      if (rhs.first < lhs.first)
+        return false;
+      return lhs.second < rhs.second;
+    }
+
+  /// Negation of @c operator==.
+  template<class First, class Second>
+    inline bool
+    operator!=(const pair<First, Second>& lhs, const pair<First, Second>& rhs)
+    {
+      const bool same = (lhs == rhs);
+      return !same;
+    }
+
+  /// @c operator< with the operands swapped.
+  template<class First, class Second>
+    inline bool
+    operator>(const pair<First, Second>& lhs, const pair<First, Second>& rhs)
+    {
+      return rhs < lhs;
+    }
+
+  /// True unless @a rhs is less than @a lhs; built on @c operator<.
+  template<class First, class Second>
+    inline bool
+    operator<=(const pair<First, Second>& lhs, const pair<First, Second>& rhs)
+    {
+      const bool greater = (rhs < lhs);
+      return !greater;
+    }
+
+  /// True unless @a lhs is less than @a rhs; built on @c operator<.
+  template<class First, class Second>
+    inline bool
+    operator>=(const pair<First, Second>& lhs, const pair<First, Second>& rhs)
+    {
+      const bool less = (lhs < rhs);
+      return !less;
+    }
+
+  /**
+   * @brief Builds a pair whose member types are deduced from the arguments.
+   * @param x Value copied into @c first.
+   * @param y Value copied into @c second.
+   * @return A pair<> holding copies of @a x and @a y.
+   *
+   * The arguments are taken by value rather than by reference-to-const
+   * (the resolution of LWG issue 181).
+   */
+  template<class First, class Second>
+    inline pair<First, Second>
+    make_pair(First x, Second y)
+    {
+      return pair<First, Second>(x, y);
+    }
+
+  /**
+   * @brief Binary search for the first position where @a val could be
+   *        inserted without breaking the ordering.
+   * @param first Start of the sorted range.
+   * @param last  One past the end of the sorted range.
+   * @param val   The search term.
+   * @param comp  Ordering functor, called as comp(element, val).
+   * @return Iterator to the first element for which comp(element, val) is
+   *         false, or @a last when it is true for every element.
+   *
+   * The iterator type must support <code>last - first</code> and
+   * <code>+=</code>.  @a comp must order elements the same way as the
+   * function used for the initial sort.
+   */
+  template<typename Iter, typename Value, typename Compare>
+    Iter
+    lower_bound(Iter first, Iter last,
+                const Value& val, Compare comp)
+    {
+      size_t remaining = last - first;
+
+      while (remaining > 0)
+        {
+          const size_t half = remaining >> 1;
+          Iter probe = first;
+          probe += half;
+          if (comp(*probe, val))
+            {
+              /* everything up to and including probe is too small */
+              first = probe;
+              ++first;
+              remaining -= half + 1;
+            }
+          else
+            {
+              remaining = half;
+            }
+        }
+      return first;
+    }
+
+  /**
+   * @brief Binary search for the last position where @a val could be
+   *        inserted without breaking the ordering.
+   * @param first Start of the sorted range.
+   * @param last  One past the end of the sorted range.
+   * @param val   The search term.
+   * @param comp  Ordering functor, called as comp(val, element).
+   * @return Iterator to the first element for which comp(val, element) is
+   *         true, or @a last when there is no such element.
+   *
+   * The iterator type must support <code>last - first</code> and
+   * <code>+=</code>.  @a comp must order elements the same way as the
+   * function used for the initial sort.
+   */
+  template<typename Iter, typename Value, typename Compare>
+    Iter
+    upper_bound(Iter first, Iter last,
+                const Value& val, Compare comp)
+    {
+      size_t remaining = last - first;
+
+      while (remaining > 0)
+        {
+          const size_t half = remaining >> 1;
+          Iter probe = first;
+          probe += half;
+          if (comp(val, *probe))
+            {
+              remaining = half;
+            }
+          else
+            {
+              /* probe is not greater than val: continue to its right */
+              first = probe;
+              ++first;
+              remaining -= half + 1;
+            }
+        }
+      return first;
+    }
+
+  /**
+   * @brief Finds the largest sub-range in which @a val could be inserted
+   *        at any place without breaking the ordering.
+   * @param first Start of the sorted range.
+   * @param last  One past the end of the sorted range.
+   * @param val   The search term.
+   * @param comp  Ordering functor, called with @a val on either side.
+   * @return A pair of iterators delimiting the sub-range; both are the same
+   *         position when no element is equivalent to @a val.
+   *
+   * The result is the same as
+   * @code
+   *   make_pair(lower_bound(first, last, val, comp),
+   *             upper_bound(first, last, val, comp))
+   * @endcode
+   * but the range is first narrowed until an equivalent element is found;
+   * only then are lower_bound and upper_bound run on the two halves around it.
+   */
+  template<typename Iter, typename Value, typename Compare>
+    pair<Iter, Iter>
+    equal_range(Iter first, Iter last,
+                const Value& val,
+                Compare comp)
+    {
+      size_t remaining = last - first;
+
+      while (remaining > 0)
+        {
+          const size_t half = remaining >> 1;
+          Iter probe = first;
+          probe += half;
+          if (comp(*probe, val))
+            {
+              first = probe;
+              ++first;
+              remaining -= half + 1;
+            }
+          else if (comp(val, *probe))
+            {
+              remaining = half;
+            }
+          else
+            {
+              /* probe is equivalent to val: search both sides of it */
+              Iter lower = lower_bound(first, probe, val, comp);
+              first += remaining;
+              Iter upper = upper_bound(++probe, first, val, comp);
+              return pair<Iter, Iter>(lower, upper);
+            }
+        }
+      return pair<Iter, Iter>(first, first);
+    }
+
+
+}
+#endif
diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am
new file mode 100644
index 0000000..2b7d21f
--- /dev/null
+++ b/src/lookup/Makefile.am
@@ -0,0 +1,30 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+## Generated file removed by "make maintainer-clean".
+MAINTAINERCLEANFILES = Makefile.in
+
+## Preprocessor flags for every target in this directory.
+## AM_CPPFLAGS replaces the deprecated INCLUDES variable.
+AM_CPPFLAGS = -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ @GLIB2_CPPFLAGS@
+
+noinst_HEADERS = lookup.h winner_tree.h
+
+noinst_PROGRAMS =
+
+## Convenience library, built but not installed.
+noinst_LTLIBRARIES = liblookup.la
+
+liblookup_la_SOURCES = pinyin_lookup.cpp winner_tree.cpp
diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h
new file mode 100644
index 0000000..676c6ea
--- /dev/null
+++ b/src/lookup/lookup.h
@@ -0,0 +1,144 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LOOKUP_H
+#define LOOKUP_H
+
+#include <float.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+
+class WinnerTree;
+
+/** @file lookup.h
+ * @brief the definitions of lookup related classes and structs.
+ * Currently only contains pinyin lookup.
+ */
+
+typedef phrase_token_t lookup_key_t;
+
+/* One entry of a lookup step.
+ * NOTE(review): m_handles presumably holds the previous and the current
+ * phrase token of the path, and m_last_step the step this entry was
+ * reached from - confirm against pinyin_lookup.cpp. */
+struct lookup_value_t{
+    phrase_token_t m_handles[2];
+    gfloat m_poss;
+    gint32 m_last_step;
+    /* poss defaults to FLT_MAX; both handles start at 0 and the last step
+     * at -1. */
+    lookup_value_t(gfloat poss = FLT_MAX){
+        /* phrase_token_t is an integer type, so the handles are cleared
+         * with 0 rather than with the pointer constant NULL */
+        m_handles[0] = 0; m_handles[1] = 0;
+        m_poss = poss;
+        m_last_step = -1;
+    }
+};
+
+/* Kind of constraint attached to one position of the input. */
+enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH };
+
+/* A constraint: m_type selects which member of the anonymous union is
+ * meaningful.
+ * NOTE(review): presumably m_token belongs to CONSTRAINT_ONESTEP and
+ * m_constraint_step to CONSTRAINT_NOSEARCH - confirm in pinyin_lookup.cpp. */
+struct lookup_constraint_t{
+ constraint_type m_type;
+ union{
+ phrase_token_t m_token;
+ guint32 m_constraint_step; /* index of m_token */
+ };
+};
+
+typedef GArray * CandidateConstraints; /* Array of lookup_constraint_t */
+typedef GArray * MatchResults; /* Array of phrase_token_t */
+
+namespace novel{
+class PinyinLargeTable;
+class FacadePhraseIndex;
+class Bigram;
+};
+
+typedef GHashTable * LookupStepIndex;
+/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */
+typedef GArray * LookupStepContent; /* array of lookup_value_t */
+
+
+/* Abstract iterator that yields lookup_value_t entries. */
+class IBranchIterator{
+public:
+ virtual ~IBranchIterator(){}
+ /* true while next() has another entry to return */
+ virtual bool has_next() = 0;
+ /* returns the next entry */
+ virtual lookup_value_t next() = 0;
+ /* NOTE(review): presumably the best entry of the branch - confirm with
+  * the implementations */
+ virtual lookup_value_t max() = 0;
+};
+
+/* Pinyin lookup: finds the best phrase sequence for a vector of pinyin keys
+ * using the pinyin table, phrase index and bi-gram passed to the constructor. */
+class PinyinLookup{
+private:
+ /* Interpolation weights of the bi-gram and uni-gram probabilities.
+  * NOTE(review): in-class initializers for static const floating-point
+  * members are a compiler extension, not standard C++. */
+ static const gfloat bigram_lambda = LAMBDA_PARAMETER;
+ static const gfloat unigram_lambda = 1 - LAMBDA_PARAMETER;
+
+ PhraseItem m_cache_phrase_item;
+protected:
+ //saved variables
+ CandidateConstraints m_constraints;
+ PinyinKeyVector m_keys;
+
+ /* collaborators handed to the constructor */
+ novel::PinyinLargeTable * m_pinyin_table;
+ novel::FacadePhraseIndex * m_phrase_index;
+ novel::PinyinCustomSettings * m_custom;
+ novel::Bigram * m_bigram;
+
+ //internal step data structure
+ GPtrArray * m_steps_index;
+ /* Array of LookupStepIndex */
+ GPtrArray * m_steps_content;
+ /* Array of LookupStepContent */
+
+ GArray * m_table_cache;
+ /* Array of PhraseIndexRanges */
+
+ WinnerTree * m_winner_tree;
+
+ size_t prepare_table_cache(int nstep, int total_pinyin);
+
+ bool search_unigram(IBranchIterator * iter, int nstep, int npinyin);
+ bool search_bigram(IBranchIterator * iter, int nstep, int npinyin);
+
+ bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
+ bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss);
+
+ bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step);
+
+ bool final_step(MatchResults & results);
+public:
+ /* Stores the four pointers and allocates the internal step arrays,
+  * table cache and winner tree. */
+ PinyinLookup( PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram);
+
+ ~PinyinLookup();
+
+ bool get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+
+ bool train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+
+ bool convert_to_utf8(MatchResults results, /* out */ char * & result_string);
+
+ bool add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token);
+
+ bool clear_constraint(CandidateConstraints constraints, size_t index);
+
+ bool validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys);
+
+ /* init pinyin table lookup array */
+ bool prepare_pinyin_lookup(PhraseIndexRanges ranges);
+ /* destroy pinyin table lookup array */
+ bool destroy_pinyin_lookup(PhraseIndexRanges ranges);
+};
+
+#endif
diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
new file mode 100644
index 0000000..c335453
--- /dev/null
+++ b/src/lookup/pinyin_lookup.cpp
@@ -0,0 +1,587 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <math.h>
+#include <assert.h>
+#include <iostream>
+#include "stl_lite.h"
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+#include "ngram.h"
+#include "lookup.h"
+#include "winner_tree.h"
+
+const gfloat PinyinLookup::bigram_lambda; // out-of-class definition of the static constant; no initializer here, so the value presumably comes from the in-class declaration (not visible in this file) -- weight of the bigram term in bigram_gen_next_step()
+const gfloat PinyinLookup::unigram_lambda; // same pattern; weight of the unigram term in unigram_gen_next_step() and bigram_gen_next_step()
+
+/* Bind the lookup engine to its data sources and allocate the working
+ * structures used by each query. The four pointers are stored as given;
+ * this constructor does not copy the objects they point to. */
+PinyinLookup::PinyinLookup(PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram){
+    //externally supplied collaborators
+    m_pinyin_table = pinyin_table;
+    m_phrase_index = phrase_index;
+    m_bigram = bigram;
+    m_custom = custom;
+
+    //per-step search state, (re)filled by get_best_match()
+    m_steps_content = g_ptr_array_new();
+    m_steps_index = g_ptr_array_new();
+
+    //zero-filled cache of PhraseIndexRanges, starting with a single slot
+    m_table_cache = g_array_new(FALSE, TRUE, sizeof(PhraseIndexRanges));
+    g_array_set_size(m_table_cache, 1);
+
+    m_winner_tree = new WinnerTree;
+}
+
+/* Release everything allocated by the constructor and by the last query:
+ * the winner tree, the cached phrase ranges, and the per-step hash tables
+ * and value arrays. */
+PinyinLookup::~PinyinLookup(){
+    delete m_winner_tree;
+    m_winner_tree = NULL;
+
+    //drop the range arrays held by every cache slot, then the cache itself
+    for ( size_t slot = 0; slot < m_table_cache->len; ++slot ){
+        PhraseIndexRanges * slot_ranges = &g_array_index(m_table_cache, PhraseIndexRanges, slot);
+        destroy_pinyin_lookup(*slot_ranges);
+    }
+    g_array_free(m_table_cache, TRUE);
+
+    //one hash table per step
+    for ( size_t step = 0; step < m_steps_index->len; ++step ){
+        g_hash_table_destroy((GHashTable *) g_ptr_array_index(m_steps_index, step));
+        g_ptr_array_index(m_steps_index, step) = NULL;
+    }
+    g_ptr_array_free(m_steps_index, TRUE);
+
+    //one value array per step
+    for ( size_t step = 0; step < m_steps_content->len; ++step ){
+        g_array_free((GArray *) g_ptr_array_index(m_steps_content, step), TRUE);
+        g_ptr_array_index(m_steps_content, step) = NULL;
+    }
+    g_ptr_array_free(m_steps_content, TRUE);
+}
+
+/* Allocate an empty PhraseIndexRange array in every slot of `ranges` whose
+ * entry in m_phrase_index->m_sub_phrase_indices is non-NULL. Every slot must
+ * be NULL on entry (asserted). Always returns true. */
+bool PinyinLookup::prepare_pinyin_lookup(PhraseIndexRanges ranges){
+    size_t library = 0;
+    while ( library < PHRASE_INDEX_LIBRARY_COUNT ){
+        GArray * & slot = ranges[library];
+        assert(NULL == slot);
+        if ( m_phrase_index->m_sub_phrase_indices[library] )
+            slot = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange));
+        ++library;
+    }
+    return true;
+}
+
+/* Free every array stored in `ranges` and leave all slots NULL, so the
+ * structure can be handed to prepare_pinyin_lookup() again.
+ * Always returns true. */
+bool PinyinLookup::destroy_pinyin_lookup(PhraseIndexRanges ranges){
+    for ( size_t library = 0; library != PHRASE_INDEX_LIBRARY_COUNT; ++library ){
+        GArray * & slot = ranges[library];
+        if ( NULL != slot ){
+            g_array_free(slot, TRUE);
+            slot = NULL;
+        }
+    }
+    return true;
+}
+
+/* Refill m_table_cache for the pinyin keys starting at position `nstep`.
+ * Slot `len` receives the result of searching the pinyin table with the next
+ * `len` keys; slot 0 is never searched. Searching stops after `total_pinyin`
+ * keys, after MAX_PHRASE_LENGTH keys, or as soon as a search result lacks
+ * SEARCH_CONTINUED. Returns the index of the last slot kept in the cache. */
+size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){
+    //release whatever the previous call left in the cache
+    for ( size_t slot = 0; slot < m_table_cache->len; ++slot ){
+        PhraseIndexRanges * stale = &g_array_index(m_table_cache, PhraseIndexRanges, slot);
+        destroy_pinyin_lookup(*stale);
+    }
+
+    PinyinKey * first_key = ((PinyinKey *) m_keys->data) + nstep;
+
+    //make room for one slot per possible phrase length
+    g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1);
+
+    size_t len = 1;
+    while ( len <= total_pinyin && len <= MAX_PHRASE_LENGTH ){
+        PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, len);
+        prepare_pinyin_lookup(*ranges);
+        int result = m_pinyin_table->search(len, first_key, *ranges);
+        //len ends up one past the last searched slot on every exit path
+        ++len;
+        if ( !(result & SEARCH_CONTINUED) )
+            break;
+    }
+
+    //trim the cache to the slots that were actually searched (plus slot 0)
+    g_array_set_size(m_table_cache, std_lite::min(len, (size_t) MAX_PHRASE_LENGTH + 1));
+    return m_table_cache->len - 1;
+}
+
+/* Search for the most probable phrase sequence for `keys`.
+ *
+ * keys        - parsed pinyin keys of the sentence; remembered in m_keys.
+ * constraints - per-key constraints; remembered in m_constraints and indexed
+ *               by key position by the search helpers.
+ * results     - resized and filled by final_step().
+ *
+ * One search step is created per key boundary (keys->len + 1 steps). Step 0
+ * is seeded with the sentence_start token, then each step is expanded with
+ * the bigram search followed by the unigram search.
+ * Returns whatever final_step() returns. */
+bool PinyinLookup::get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){
+    m_constraints = constraints;
+    m_keys = keys;
+    const int nstep = keys->len + 1;
+
+    //free the hash tables left over from the previous query
+    for ( size_t i = 0; i < m_steps_index->len; ++i ){
+        GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i);
+        g_hash_table_destroy(table);
+        g_ptr_array_index(m_steps_index, i) = NULL;
+    }
+
+    //free the value arrays left over from the previous query
+    for ( size_t i = 0; i < m_steps_content->len; ++i ){
+        GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i);
+        g_array_free(array, TRUE);
+        g_ptr_array_index(m_steps_content, i) = NULL;
+    }
+
+    //one slot per step, including the null start step
+    g_ptr_array_set_size(m_steps_index, nstep);
+    g_ptr_array_set_size(m_steps_content, nstep);
+
+    for ( int i = 0; i < nstep; ++i ){
+        g_ptr_array_index(m_steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal);
+        g_ptr_array_index(m_steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t));
+    }
+
+    //seed step 0 with the sentence start token
+    lookup_key_t initial_key = sentence_start;
+    lookup_value_t initial_value(log(1));
+    initial_value.m_handles[1] = sentence_start;
+    GArray * initial_step_content = (GArray *) g_ptr_array_index(m_steps_content, 0);
+    initial_step_content = g_array_append_val(initial_step_content, initial_value);
+    GHashTable * initial_step_index = (GHashTable *) g_ptr_array_index(m_steps_index, 0);
+    g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), GUINT_TO_POINTER(initial_step_content->len - 1));
+
+    //expand every step except the last one
+    for ( int i = 0; i < nstep - 1; ++i ){
+        LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, i);
+        IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step);
+        size_t npinyin = prepare_table_cache(i, keys->len - i);
+        //search_bigram() drains the iterator; search_unigram() only uses iter->max()
+        search_bigram(iter, i, npinyin);
+        search_unigram(iter, i, npinyin);
+        delete iter;
+    }
+    return final_step(results);
+}
+
+/* Extend the best value of step `nstep` (iter->max()) with unigram
+ * probabilities.
+ *  - CONSTRAINT_NOSEARCH at `nstep`: nothing is searched, returns false.
+ *  - CONSTRAINT_ONESTEP at `nstep`: only the constrained token is tried.
+ *  - NO_CONSTRAINT at `nstep`: every token in the table cache is tried, for
+ *    each phrase length whose last covered position is also unconstrained.
+ * Returns true when at least one next-step value was stored or improved.
+ * `npinyin` is not used by this function. */
+bool PinyinLookup::search_unigram(IBranchIterator * iter, int nstep, int npinyin){
+    lookup_constraint_t * step_constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep);
+    if ( CONSTRAINT_NOSEARCH == step_constraint->m_type )
+        return false;
+
+    GArray * step_content = (GArray *) g_ptr_array_index(m_steps_content, nstep);
+    if ( 0 == step_content->len )
+        return false;
+
+    lookup_value_t best = iter->max();
+
+    if ( CONSTRAINT_ONESTEP == step_constraint->m_type )
+        return unigram_gen_next_step(nstep, &best, step_constraint->m_token);
+
+    if ( NO_CONSTRAINT != step_constraint->m_type )
+        return false;
+
+    bool found = false;
+    for ( size_t len = 1; len < m_table_cache->len && len <= MAX_PHRASE_LENGTH; ++len ){
+        //skip this phrase length when its last position carries a constraint
+        lookup_constraint_t * tail = &g_array_index(m_constraints, lookup_constraint_t, nstep + len - 1);
+        if ( NO_CONSTRAINT != tail->m_type )
+            continue;
+
+        PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, len);
+        for ( size_t library = 0; library < PHRASE_INDEX_LIBRARY_COUNT; ++library ){
+            GArray * array = (*ranges)[library];
+            if ( NULL == array )
+                continue;
+            for ( size_t n = 0; n < array->len; ++n ){
+                PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+                phrase_token_t token = range->m_range_begin;
+                while ( token != range->m_range_end ){
+                    if ( unigram_gen_next_step(nstep, &best, token) )
+                        found = true;
+                    ++token;
+                }
+            }
+        }
+    }
+    return found;
+}
+
+
+/* Extend every value yielded by `iter` for step `nstep` with bigram
+ * probabilities.
+ *
+ * For each value, the system and user SingleGram of its last token are
+ * loaded from m_bigram. When both exist, the user's total frequency is copied
+ * into the system gram. Then, depending on the constraint at `nstep`:
+ *  - CONSTRAINT_NOSEARCH: nothing is searched, returns false.
+ *  - CONSTRAINT_ONESTEP: only the constrained token is looked up.
+ *  - NO_CONSTRAINT: every cached phrase range is searched in both grams, for
+ *    each phrase length whose last covered position is also unconstrained.
+ *
+ * The frequency calls are made outside assert() so that they still run when
+ * the code is compiled with NDEBUG. A gram whose total frequency is zero is
+ * skipped instead of being divided by.
+ * Returns true when at least one next-step value was stored or improved.
+ * `npinyin` is not used by this function. */
+bool PinyinLookup::search_bigram(IBranchIterator * iter,
+                                 int nstep, int npinyin){
+    lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep);
+    if ( CONSTRAINT_NOSEARCH == constraint->m_type )
+        return false;
+
+    bool found = false;
+    BigramPhraseArray bigram_phrase_items = g_array_new(FALSE, FALSE,
+                                                        sizeof(BigramPhraseItem));
+    while ( iter->has_next() ){
+        lookup_value_t cur_step = iter->next();
+        phrase_token_t index_token = cur_step.m_handles[1];
+        SingleGram * system = NULL, * user = NULL;
+        m_bigram->load(index_token, system, user);
+        if ( system && user ){
+            guint32 total_freq = 0;
+            bool ok = user->get_total_freq(total_freq);
+            assert(ok);
+            ok = system->set_total_freq(total_freq);
+            assert(ok);
+            (void) ok;
+        }
+
+        //searched in this order: system gram first, then user gram
+        SingleGram * const grams[2] = { system, user };
+
+        if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+            phrase_token_t token = constraint->m_token;
+            for ( size_t g = 0; g < 2; ++g ){
+                if ( !grams[g] )
+                    continue;
+                guint32 freq = 0;
+                if ( !grams[g]->get_freq(token, freq) )
+                    continue;
+                guint32 total_freq = 0;
+                grams[g]->get_total_freq(total_freq);
+                if ( 0 == total_freq )
+                    continue;
+                gfloat bigram_poss = freq / (gfloat) total_freq;
+                found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found;
+            }
+        }
+
+        if ( NO_CONSTRAINT == constraint->m_type ){
+            for ( size_t i = 1; i < m_table_cache->len
+                      && i <= MAX_PHRASE_LENGTH; ++i ){
+                //skip this phrase length when its last position carries a constraint
+                lookup_constraint_t * tail = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1);
+                if ( tail->m_type != NO_CONSTRAINT )
+                    continue;
+
+                PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
+                for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m ){
+                    GArray * array = (*ranges)[m];
+                    if ( !array ) continue;
+                    for ( size_t n = 0; n < array->len; ++n ){
+                        PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+                        for ( size_t g = 0; g < 2; ++g ){
+                            if ( !grams[g] )
+                                continue;
+                            g_array_set_size(bigram_phrase_items, 0);
+                            grams[g]->search(range, bigram_phrase_items);
+                            for ( size_t k = 0; k < bigram_phrase_items->len; ++k ){
+                                BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+                                found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        delete system;
+        delete user;
+    }
+    g_array_free(bigram_phrase_items, TRUE);
+    return found;
+}
+
+
+/* Try to extend `cur_step` (a value of step `nstep`) with phrase `token`,
+ * scored with its unigram probability only.
+ * Returns false when the token cannot be loaded, when its unigram or pinyin
+ * probability is below FLT_EPSILON, or when save_next_step() rejects the
+ * candidate. */
+bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){
+    if ( !m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
+        return false;
+
+    size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+
+    gfloat elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat)
+        m_phrase_index->get_phrase_index_total_freq();
+    if ( elem_poss < FLT_EPSILON )
+        return false;
+
+    //the keys this phrase has to match start at position nstep
+    PinyinKey * pinyinkeys = ((PinyinKey *) m_keys->data) + nstep;
+    gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys);
+    if ( pinyin_poss < FLT_EPSILON )
+        return false;
+
+    lookup_value_t next_step;
+    next_step.m_handles[0] = cur_step->m_handles[1];
+    next_step.m_handles[1] = token;
+    next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
+    next_step.m_last_step = nstep;
+
+    //the candidate lands phrase_length steps further on
+    return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+/* Try to extend `cur_step` (a value of step `nstep`) with phrase `token`,
+ * scored with the interpolation
+ *     bigram_lambda * bigram_poss + unigram_lambda * unigram_poss
+ * multiplied by the pinyin probability.
+ * Returns false when the token cannot be loaded, when both the bigram and
+ * the unigram probability are below FLT_EPSILON, when the pinyin probability
+ * is below FLT_EPSILON, or when save_next_step() rejects the candidate. */
+bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){
+    if ( !m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
+        return false;
+
+    size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+
+    gfloat unigram_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat)
+        m_phrase_index->get_phrase_index_total_freq();
+    if ( bigram_poss < FLT_EPSILON && unigram_poss < FLT_EPSILON )
+        return false;
+
+    //the keys this phrase has to match start at position nstep
+    PinyinKey * pinyinkeys = ((PinyinKey *) m_keys->data) + nstep;
+    gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys);
+    if ( pinyin_poss < FLT_EPSILON )
+        return false;
+
+    lookup_value_t next_step;
+    next_step.m_handles[0] = cur_step->m_handles[1];
+    next_step.m_handles[1] = token;
+    next_step.m_poss = cur_step->m_poss +
+        log(( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) *pinyin_poss);
+    next_step.m_last_step = nstep;
+
+    //the candidate lands phrase_length steps further on
+    return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+/* Record `next_step` in step `next_step_pos`, keeping at most one value per
+ * ending token (m_handles[1]).
+ *  - No value for that token yet: append it and index it; returns true.
+ *  - Existing value with a lower m_poss: overwrite it; returns true.
+ *  - Otherwise: keep the existing value; returns false.
+ * `cur_step` is not used by this function.
+ * The stored index is read from the hash table only after the lookup
+ * succeeded, so an unset `value` is never interpreted. */
+bool PinyinLookup::save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step){
+    lookup_key_t next_key = next_step->m_handles[1];
+    GHashTable * next_lookup_index = (GHashTable *) g_ptr_array_index(m_steps_index, next_step_pos);
+    GArray * next_lookup_content = (GArray *) g_ptr_array_index(m_steps_content, next_step_pos);
+
+    gpointer key = NULL, value = NULL;
+    if ( !g_hash_table_lookup_extended(next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value) ){
+        //first candidate ending in this token: the hash maps token -> array position
+        g_array_append_val(next_lookup_content, *next_step);
+        g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1));
+        return true;
+    }
+
+    size_t step_index = GPOINTER_TO_UINT(value);
+    lookup_value_t * orig_next_value = &g_array_index(next_lookup_content, lookup_value_t, step_index);
+    if ( orig_next_value->m_poss < next_step->m_poss ){
+        orig_next_value->m_handles[0] = next_step->m_handles[0];
+        assert(orig_next_value->m_handles[1] == next_step->m_handles[1]);
+        orig_next_value->m_poss = next_step->m_poss;
+        orig_next_value->m_last_step = next_step->m_last_step;
+        return true;
+    }
+    return false;
+}
+
+/* Pick the best value of the last step and walk the path backwards, writing
+ * each phrase token into `results` at the step where the phrase starts.
+ *
+ * `results` is resized to one slot per step and cleared to 0 first; slots
+ * that no phrase starts at stay 0. phrase_token_t is an integer type, so the
+ * slots are cleared with 0 rather than the pointer constant NULL.
+ * Returns false when the last step holds no value or when the back-pointer
+ * chain cannot be followed. */
+bool PinyinLookup::final_step(MatchResults & results){
+    //reset results
+    g_array_set_size(results, m_steps_content->len);
+    for ( size_t i = 0; i < m_steps_content->len; ++i ){
+        g_array_index(results, phrase_token_t, i) = 0;
+    }
+
+    //find max element of the last step
+    size_t last_step_pos = m_steps_content->len - 1;
+    GArray * last_step_array = (GArray *) g_ptr_array_index(m_steps_content, last_step_pos);
+    if ( 0 == last_step_array->len )
+        return false;
+
+    lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0);
+    for ( size_t i = 1; i < last_step_array->len; ++i ){
+        lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i);
+        if ( cur_value->m_poss > max_value->m_poss )
+            max_value = cur_value;
+    }
+
+    //backtracing: m_last_step is the step the value was generated from,
+    //-1 ends the chain
+    for ( ;; ){
+        int cur_step_pos = max_value->m_last_step;
+        if ( -1 == cur_step_pos )
+            break;
+
+        g_array_index(results, phrase_token_t, cur_step_pos) = max_value->m_handles[1];
+
+        //the predecessor is the value of step cur_step_pos ending in m_handles[0]
+        phrase_token_t last_token = max_value->m_handles[0];
+        GHashTable * lookup_step_index = (GHashTable *) g_ptr_array_index(m_steps_index, cur_step_pos);
+        gpointer key = NULL, value = NULL;
+        if ( !g_hash_table_lookup_extended(lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value) )
+            return false;
+
+        GArray * lookup_step_content = (GArray *) g_ptr_array_index(m_steps_content, cur_step_pos);
+        max_value = &g_array_index(lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+    }
+
+    //no need to reverse the result
+
+    return true;
+}
+
+/* Reinforce the model with a sentence the user confirmed.
+ *
+ * A token of `results` is trained when its position carries a
+ * CONSTRAINT_ONESTEP constraint, or when the previously trained position did.
+ * Training adds `train_factor` to the phrase's pinyin possibility and unigram
+ * frequency and, when there is a preceding token, to the user bigram
+ * (preceding token -> token), which is then stored back into m_bigram.
+ *
+ * All state-changing calls are made outside assert(), so training also takes
+ * effect when the code is compiled with NDEBUG; the asserts only check the
+ * results. The bigram update is skipped when adding train_factor would wrap
+ * the 32-bit total frequency.
+ * Always returns true. */
+bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){
+    bool train_next = false;
+    PinyinKey * pinyin_keys = (PinyinKey *)keys->data;
+    //TODO: verify the new training method.
+    phrase_token_t last_token = sentence_start;
+    // constraints->len + 1 == results->len
+    guint32 train_factor = 23;
+    for ( size_t i = 0; i < constraints->len; ++i ){
+        phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+        if ( 0 == *token )
+            continue;
+        lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+        if ( train_next || CONSTRAINT_ONESTEP == constraint->m_type ){
+            if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+                assert(*token == constraint->m_token);
+                train_next = true;
+            }else{
+                train_next = false;
+            }
+            //add pi-gram frequency
+            m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+            m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor);
+            m_phrase_index->add_unigram_frequency(*token, train_factor);
+            if ( last_token ){
+                SingleGram * system = NULL, * user = NULL;
+                m_bigram->load(last_token, system, user);
+                guint32 total_freq = 0;
+                bool ok = true;
+                if ( !user ){
+                    //no user gram yet: start one from the system total
+                    if ( system ){
+                        ok = system->get_total_freq(total_freq);
+                        assert(ok);
+                    }
+                    user = new SingleGram;
+                    user->set_total_freq(total_freq);
+                }
+                guint32 freq = 0;
+                if ( !user->get_freq(*token, freq) ){
+                    //seed the user entry with the system frequency, if any
+                    if ( system ) system->get_freq(*token, freq);
+                    user->set_freq(*token, freq);
+                }
+                ok = user->get_total_freq(total_freq);
+                assert(ok);
+                //protect against total_freq overflow (unsigned wrap-around).
+                if ( ok && total_freq <= total_freq + train_factor ){
+                    ok = user->set_total_freq(total_freq + train_factor);
+                    assert(ok);
+                    ok = user->get_freq(*token, freq);
+                    assert(ok);
+                    //if total_freq is not overflow, then freq won't overflow.
+                    ok = user->set_freq(*token, freq + train_factor);
+                    assert(ok);
+                    ok = m_bigram->store(last_token, user);
+                    assert(ok);
+                }
+                (void) ok;
+                delete system;
+                delete user;
+            }
+        }
+        last_token = *token;
+    }
+    return true;
+}
+
+/* Concatenate the phrases named by `results` into one UTF-8 string.
+ *
+ * results       - array of phrase_token_t; zero entries are skipped.
+ * result_string - out: newly allocated, NUL-terminated string (empty when
+ *                 nothing was appended); release it with g_free().
+ *
+ * A token that cannot be loaded from the phrase index is skipped, so the
+ * previously cached phrase item is never appended in its place. A phrase
+ * whose UTF-16 to UTF-8 conversion fails contributes nothing.
+ * Always returns true. */
+bool PinyinLookup::convert_to_utf8(MatchResults results, /* out */ char * & result_string){
+    GString * sentence = g_string_new("");
+    for ( size_t i = 0; i < results->len; ++i ){
+        phrase_token_t token = g_array_index(results, phrase_token_t, i);
+        if ( 0 == token )
+            continue;
+        if ( !m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
+            continue;
+        utf16_t buffer[MAX_PHRASE_LENGTH];
+        m_cache_phrase_item.get_phrase_string(buffer);
+        guint8 length = m_cache_phrase_item.get_phrase_length();
+        gchar * phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
+        if ( phrase ){
+            g_string_append(sentence, phrase);
+            g_free(phrase);
+        }
+    }
+    //hand the character data over to the caller, free only the GString wrapper
+    result_string = g_string_free(sentence, FALSE);
+    return true;
+}
+
+/* Force phrase `token` to start at position `index` of `constraints`.
+ * Existing constraints overlapping the phrase are cleared first; then the
+ * first position becomes CONSTRAINT_ONESTEP and the remaining positions
+ * become CONSTRAINT_NOSEARCH pointing back at `index`.
+ * Returns false when the token cannot be loaded or the phrase would run past
+ * the end of `constraints`. */
+bool PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){
+    if ( !m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
+        return false;
+
+    //remember the length now: clear_constraint() reuses m_cache_phrase_item
+    const size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+    const size_t end = index + phrase_length;
+    if ( end > constraints->len )
+        return false;
+
+    for ( size_t pos = index; pos < end; ++pos )
+        clear_constraint(constraints, pos);
+
+    lookup_constraint_t * head = &g_array_index(constraints, lookup_constraint_t, index);
+    head->m_type = CONSTRAINT_ONESTEP;
+    head->m_token = token;
+
+    for ( size_t offset = 1; offset < phrase_length; ++offset ){
+        lookup_constraint_t * tail = &g_array_index(constraints, lookup_constraint_t, index + offset);
+        tail->m_type = CONSTRAINT_NOSEARCH;
+        tail->m_constraint_step = index;
+    }
+    return true;
+}
+
+/* Remove the phrase constraint covering position `index`.
+ * A CONSTRAINT_NOSEARCH position is first redirected to the position of its
+ * owning CONSTRAINT_ONESTEP entry; then every position of that phrase that
+ * lies inside the array is reset to NO_CONSTRAINT.
+ * Returns false when `index` is out of range, when the position is already
+ * unconstrained, or when the constrained token cannot be loaded. */
+bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t index){
+    //index is unsigned, so only the upper bound needs checking
+    if ( index >= constraints->len )
+        return false;
+
+    lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index);
+    if ( NO_CONSTRAINT == constraint->m_type )
+        return false;
+
+    if ( CONSTRAINT_NOSEARCH == constraint->m_type ){
+        //jump to the head of the phrase this position belongs to
+        index = constraint->m_constraint_step;
+        constraint = &g_array_index(constraints, lookup_constraint_t, index);
+    }
+
+    assert(constraint->m_type == CONSTRAINT_ONESTEP);
+
+    if ( !m_phrase_index->get_phrase_item(constraint->m_token, m_cache_phrase_item) )
+        return false;
+
+    const size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+    for ( size_t offset = 0; offset < phrase_length; ++offset ){
+        const size_t pos = index + offset;
+        if ( pos < constraints->len ){
+            lookup_constraint_t * covered = &g_array_index(constraints, lookup_constraint_t, pos);
+            covered->m_type = NO_CONSTRAINT;
+        }
+    }
+    return true;
+}
+
+/* Bring `constraints` in line with the current keys.
+ * The array is resized to m_parsed_keys->len (new positions start as
+ * NO_CONSTRAINT). Then every CONSTRAINT_ONESTEP entry is cleared when its
+ * phrase no longer fits into the array or when its pinyin possibility for
+ * the keys at that position is below FLT_EPSILON.
+ * Always returns true. */
+bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys){
+    //resize constraints array
+    const size_t old_length = constraints->len;
+    const size_t new_length = m_parsed_keys->len;
+    if ( new_length != old_length )
+        g_array_set_size(constraints, new_length);
+    //initialize new elements (runs only when the array grew)
+    for ( size_t i = old_length; i < new_length; ++i ){
+        lookup_constraint_t * added = &g_array_index(constraints, lookup_constraint_t, i);
+        added->m_type = NO_CONSTRAINT;
+    }
+
+    PinyinKey * pinyin_keys = (PinyinKey *) m_parsed_keys->data;
+
+    for ( size_t i = 0; i < constraints->len; ++i ){
+        lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+        if ( CONSTRAINT_ONESTEP != constraint->m_type )
+            continue;
+
+        m_phrase_index->get_phrase_item(constraint->m_token, m_cache_phrase_item);
+        const size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+
+        //clear too long constraint
+        if ( i + phrase_length > constraints->len ){
+            clear_constraint(constraints, i);
+            continue;
+        }
+
+        //clear invalidated pinyin
+        gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyin_keys + i);
+        if ( pinyin_poss < FLT_EPSILON )
+            clear_constraint(constraints, i);
+    }
+    return true;
+}
diff --git a/src/lookup/winner_tree.cpp b/src/lookup/winner_tree.cpp
new file mode 100644
index 0000000..248a749
--- /dev/null
+++ b/src/lookup/winner_tree.cpp
@@ -0,0 +1,141 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <float.h>
+#include <limits.h>
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "phrase_index.h"
+#include "lookup.h"
+#include "winner_tree.h"
+
+/* Attach to an initialized tree and remember its current winner, which
+ * max() keeps returning even after next() has started replacing items. */
+WinnerTreeBranchIterator::WinnerTreeBranchIterator(WinnerTree & tree)
+    :m_tree(tree), m_counter(0){
+    const int best = m_tree.get_winner();
+    m_max_value = m_tree.m_items[best];
+}
+
+/* True while fewer than nbranch values have been returned and the tree
+ * still has values that were not returned yet. */
+bool WinnerTreeBranchIterator::has_next(){
+    const bool items_left = m_counter < m_tree.m_tree_size;
+    const bool under_cap = m_counter < nbranch;
+    return items_left && under_cap;
+}
+
+/* Return the current winner and take it out of the competition by setting
+ * its stored probability to -FLT_MAX and replaying its matches. */
+lookup_value_t WinnerTreeBranchIterator::next(){
+    const int best = m_tree.get_winner();
+    lookup_value_t result = m_tree.m_items[best];
+
+    m_tree.m_items[best].m_poss = - FLT_MAX;
+    m_tree.replay(best);
+
+    m_counter += 1;
+    return result;
+}
+
+/* Play the match at node p between contenders lc and rc, then keep playing
+ * upwards for as long as the node just decided is a right child (odd index
+ * greater than 1). */
+void WinnerTree::play(int p, int lc, int rc){
+    m_tree[p] = winner(lc, rc);
+    //continue competition
+    for ( ; p > 1 && (p % 2) != 0; p /= 2 ){
+        const int parent = p / 2;
+        m_tree[parent] = winner(m_tree[p - 1], m_tree[p]);
+    }
+}
+
+
+bool WinnerTree::initialize(LookupStepContent cur_step){ // copy the values of cur_step into m_items and play every match; afterwards m_tree[1] holds the index of the winning item. Always returns true.
+ size_t size = cur_step->len;
+ if ( size > m_max_tree_size ){
+ init(size); // re-size both buffers when the step is larger than the current capacity
+ }
+ assert(size > nbranch); // get_iterator() only calls this for steps with more than nbranch values
+ m_tree_size = size;
+
+ //copy the step values into the item buffer; items are stored from index 1 on
+ int nindex = 1;
+
+ for( size_t i = 0; i < cur_step->len ; ++i){
+ lookup_value_t * cur_value = &g_array_index(cur_step, lookup_value_t, i);
+ m_items[nindex++] = *cur_value;
+ }
+
+ //compute s = the largest power of two not exceeding n - 1 (n = m_tree_size)
+ int i, s;
+ for( s = 1; 2 * s <= m_tree_size - 1; s += s);
+
+ m_low_ext = 2 * (m_tree_size - s);
+ m_offset = 2 * s - 1;
+
+ //play the matches among items 1 .. m_low_ext, two items per match
+ for( i = 2; i <= m_low_ext; i += 2)
+ play((m_offset + i)/2, i - 1, i);
+ //odd n: item m_low_ext + 1 plays against the winner recorded in m_tree[n - 1]
+ if ( m_tree_size % 2){
+ play( m_tree_size / 2, m_tree[m_tree_size - 1], m_low_ext +1);
+ i = m_low_ext + 3;
+ }else i = m_low_ext + 2;
+
+ //play the matches among the remaining items, two items per match
+ for( ; i <= m_tree_size; i += 2)
+ play( (i - m_low_ext + m_tree_size - 1) / 2, i - 1, i);
+ return true;
+}
+
+void WinnerTree::replay(int i){ // re-play the matches from item i up to the root; called by WinnerTreeBranchIterator::next() after it changed m_items[i]
+ assert( 1 <= i && i <= m_tree_size);
+
+ int p; //node at which the match is played
+ int lc; //left contender of the match at p
+ int rc; //right contender of the match at p
+
+ //first match: find the node above item i and its two contenders
+ if ( i <= m_low_ext){
+ p = (m_offset + i) / 2;
+ lc = 2 * p - m_offset;
+ rc = lc + 1;
+ }else{
+ p = (i - m_low_ext + m_tree_size -1) / 2;
+ if ( 2 * p == m_tree_size - 1 ){
+ lc = m_tree[2*p]; // left contender is the winner recorded at node 2*p, not an item
+ rc = i;
+ }else{
+ lc = 2 * p - m_tree_size + 1 + m_low_ext;
+ rc = lc + 1;
+ }
+ }
+
+ m_tree[p] = winner(lc, rc);
+
+ //added by wupeng: when (p | 1) equals n, the parent match is played between node 2*p and item m_low_ext + 1 -- NOTE(review): presumably the odd-n case handled separately in initialize(); confirm
+ if ( ( p | 0x01 ) == m_tree_size ){
+ p /= 2;
+ m_tree[p] = winner( m_tree[2 * p], m_low_ext + 1 );
+ }
+
+ //re-play every remaining match on the way up to the root
+ p /= 2;
+ for( ; p >= 1 ; p /= 2)
+ m_tree[p] = winner( m_tree[2 * p], m_tree[2 * p + 1]);
+}
+
+/* Return the index (lc or rc) of the item with the larger m_poss;
+ * rc wins when the two are equal. */
+int WinnerTree::winner(int lc, int rc){
+    if ( m_items[lc].m_poss > m_items[rc].m_poss )
+        return lc;
+    return rc;
+}
diff --git a/src/lookup/winner_tree.h b/src/lookup/winner_tree.h
new file mode 100644
index 0000000..262f196
--- /dev/null
+++ b/src/lookup/winner_tree.h
@@ -0,0 +1,148 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LOOKUP_WINNER_TREE_H
+#define LOOKUP_WINNER_TREE_H
+
+#include <assert.h>
+#include "lookup.h"
+
+const int nbranch = 32; // steps with at most nbranch values are iterated directly (WinnerTree::get_iterator); WinnerTreeBranchIterator returns at most nbranch values
+
+/* Iterator for steps holding at most nbranch values: it walks the step
+ * array in storage order and finds the maximum with a linear scan. */
+class DirectBranchIterator: public IBranchIterator{
+    LookupStepContent m_step_content;
+    size_t m_iter_pos;
+public:
+    //Constructor
+    DirectBranchIterator(LookupStepContent step_content)
+        :m_step_content(step_content), m_iter_pos(0)
+    {}
+
+    //Destructor
+    virtual ~DirectBranchIterator(){}
+
+    //true until every value of the step has been returned
+    bool has_next(){
+        return m_iter_pos != m_step_content->len;
+    }
+
+    //return the value at the current position and advance
+    lookup_value_t next(){
+        const size_t pos = m_iter_pos;
+        m_iter_pos = pos + 1;
+        return g_array_index(m_step_content, lookup_value_t, pos);
+    }
+
+    //value with the largest m_poss; the first one wins ties.
+    //Reads element 0 unconditionally, so the step must not be empty.
+    lookup_value_t max(){
+        size_t best = 0;
+        for ( size_t i = 1; i < m_step_content->len; ++i ){
+            if ( g_array_index(m_step_content, lookup_value_t, i).m_poss >
+                 g_array_index(m_step_content, lookup_value_t, best).m_poss )
+                best = i;
+        }
+        return g_array_index(m_step_content, lookup_value_t, best);
+    }
+};
+
+class WinnerTree;
+
+/* Iterator for steps holding more than nbranch values: it returns the
+ * values of an initialized WinnerTree in winner order, at most nbranch of
+ * them. */
+class WinnerTreeBranchIterator: public IBranchIterator{
+    WinnerTree& m_tree;
+    int m_counter; //number of values returned so far
+    lookup_value_t m_max_value; //winner at construction time
+public:
+    //Constructor
+    WinnerTreeBranchIterator(WinnerTree & tree);
+
+    //Destructor
+    virtual ~WinnerTreeBranchIterator(){}
+
+    //Member Function
+    bool has_next();
+
+    lookup_value_t next();
+
+    //the winner captured by the constructor, unaffected by next()
+    lookup_value_t max(){
+        return m_max_value;
+    }
+
+};
+
+/* Winner (tournament) tree over the values of one lookup step.
+ * m_items holds the values starting at index 1; m_tree holds, per internal
+ * node, the index of the item that won the match played at that node, with
+ * the overall winner at m_tree[1]. */
+class WinnerTree{
+    friend class WinnerTreeBranchIterator;
+private:
+    size_t m_max_tree_size; // maxsize
+    int m_tree_size; // n
+    int m_low_ext;
+    int m_offset;
+    int * m_tree;
+    MemoryChunk m_buffer;
+    MemoryChunk m_tree_buffer;
+    lookup_value_t * m_items;
+
+    int winner(int lc, int rc);
+
+    void play(int p, int lc, int rc);
+
+    //size both buffers for tree_size items and mark the tree as empty
+    void init(int tree_size){
+        m_max_tree_size = tree_size;
+        //data buffer (one extra slot because items start at index 1)
+        m_buffer.set_size( sizeof(lookup_value_t) * (tree_size + 1) );
+        m_items = (lookup_value_t *) m_buffer.begin();
+
+        //tree item buffer
+        m_tree_buffer.set_size( sizeof(int) * m_max_tree_size);
+        m_tree = (int * ) m_tree_buffer.begin();
+        m_tree_size = 0;
+    }
+
+public:
+
+    //Constructor
+    WinnerTree(int tree_size = 10){
+        init(tree_size);
+    }
+
+    //Destructor
+    ~WinnerTree() { }
+
+    /* Create an iterator over `step`; the caller must delete it.
+     * Steps with at most nbranch values get a DirectBranchIterator. Larger
+     * steps are loaded into this tree first; initialize() is called outside
+     * assert() so the tree is also built when compiled with NDEBUG. */
+    IBranchIterator* get_iterator(LookupStepContent step){
+        if ( step->len <= nbranch )
+            return new DirectBranchIterator(step);
+        bool ok = initialize(step);
+        assert(ok);
+        (void) ok;
+        return new WinnerTreeBranchIterator(*this);
+    }
+
+protected:
+
+    //index of the winning item, or 0 while the tree is empty
+    int get_winner() const {
+        return (m_tree_size)? m_tree[1] : 0;
+    }
+
+    //Member Function
+    bool initialize(LookupStepContent cur_step);
+    void replay(int i);
+};
+
+#endif
diff --git a/src/segment/Makefile.am b/src/segment/Makefile.am
new file mode 100644
index 0000000..0e58ddf
--- /dev/null
+++ b/src/segment/Makefile.am
@@ -0,0 +1,28 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+INCLUDES = -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ @GLIB2_CPPFLAGS@
+
+noinst_PROGRAMS = mmseg
+
+mmseg_SOURCES = mmseg.cpp
+
+mmseg_LDADD = @GLIB2_LDFLAGS@
diff --git a/src/segment/mmseg.cpp b/src/segment/mmseg.cpp
new file mode 100644
index 0000000..6a3d7f7
--- /dev/null
+++ b/src/segment/mmseg.cpp
@@ -0,0 +1,212 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <limits.h>
+#include <locale.h>
+#include <glib.h>
+#include "novel_types.h"
+
+static GHashTable * g_phrases;
+
+struct SegmentStep{ /* one dynamic-programming cell per character boundary, filled in by segment(). */
+ phrase_token_t m_handle; /* token of the word ending here; 0 when that word is not in the phrase table. */
+ char * m_phrase; /* heap copy (g_strndup) of the word ending here, or NULL when none was recorded. */
+ //word count W of the best segmentation found so far that ends here. A zero handle still counts as one word.
+ size_t m_nword;
+ //backtracing information: length in characters of the word ending here; segment() stores k - i and subtracts it to step back.
+ gint8 m_backward_nstep;
+};
+
+//read gb_char.table and gbk_char.table
+/* Loads whitespace separated "pinyin phrase token freq" records from
+ * infile and maps every phrase string to its token in g_phrases.
+ * The pinyin and freq columns are parsed but not used.
+ * Returns true when the file was consumed up to EOF, false when a
+ * malformed or truncated record stopped the parsing early.
+ */
+bool init_phrases(FILE * infile){
+    char pinyin[256];
+    char phrase[256];
+    unsigned int token_value = 0;
+    unsigned long freq = 0;
+    int nfields;
+    //the field widths keep an over-long column from overflowing the
+    //buffers, and the specifiers match the variables they fill.
+    while ( 4 == (nfields = fscanf(infile, "%255s %255s %u %lu",
+                                   pinyin, phrase, &token_value, &freq)) ){
+        phrase_token_t token = token_value;
+        g_hash_table_insert(g_phrases, g_strdup(phrase),
+                            GUINT_TO_POINTER(token));
+    }
+    //EOF before the first field of a record is the regular end of file.
+    return EOF == nfields;
+}
+
+/* Splits the UTF-8 string phrase into words by dynamic programming,
+ * minimising the number of words (W).
+ * A substring is a word when it is a key of phrases; a single character
+ * is always accepted (with token 0), so a segmentation always exists.
+ * On return strings holds the words in reading order as newly allocated
+ * char * values; the caller must g_free() each of them.
+ * Always returns true.
+ */
+bool segment(GHashTable * phrases, // Lookup Phrases
+             const char * phrase,
+             GArray * strings /* Array of const char * */){
+    //marks a step that no segmentation has reached yet.
+    const size_t unreached = (size_t) -1;
+
+    GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    GArray * offsets = g_array_new(TRUE, TRUE, sizeof(size_t));
+    //construct dynamic programming.
+    size_t phrase_length = g_utf8_strlen(phrase, -1);
+    const char * p = phrase;
+    size_t offset = p - phrase;
+    g_array_append_val(offsets, offset);
+    g_array_set_size(steps, phrase_length + 1);
+    for ( size_t i = 0 ; i < phrase_length; ++i){
+        p = g_utf8_next_char(p);
+        offset = p - phrase;
+        g_array_append_val(offsets, offset);
+    }
+    assert( *p == '\0' );
+
+    //initialize segment steps values.
+    for ( size_t i = 1; i < phrase_length + 1; ++i){
+        SegmentStep* step = &g_array_index(steps, SegmentStep, i);
+        step->m_nword = unreached;
+    }
+    //the empty prefix costs zero words. The former UINT_MAX sentinel only
+    //worked where size_t is 32 bits wide and the increment wrapped to 0;
+    //with a 64-bit size_t no step was ever filled and backtracing looped.
+    g_array_index(steps, SegmentStep, 0).m_nword = 0;
+
+    for ( size_t i = 0 ; i < phrase_length + 1; ++i){
+        size_t* offset_begin = &g_array_index(offsets, size_t, i);
+        const char * phrase_begin = phrase + *offset_begin;
+        SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+        //every word starting at i extends the best segmentation of the
+        //prefix by exactly one word, whatever other candidates were seen.
+        //Step i is always reached: the single character fallback links
+        //each step to its predecessor.
+        const size_t nword = step_begin->m_nword + 1;
+        for ( size_t k = i + 1; k < phrase_length + 1; ++k){
+            //m_backward_nstep is a gint8 and cannot describe longer words.
+            if ( k - i > (size_t) G_MAXINT8 )
+                break;
+            size_t* offset_end = &g_array_index(offsets, size_t, k);
+            size_t len = *offset_end - *offset_begin;
+            char * cur_phrase = g_strndup(phrase_begin, len);
+            phrase_token_t token;
+            gpointer orig_key, value;
+            gboolean result = g_hash_table_lookup_extended
+                (phrases, cur_phrase, &orig_key, &value);
+            if ( result ){
+                token = GPOINTER_TO_UINT(value);
+            }else{
+                token = 0;
+                if ( 1 != k - i ){ //skip non-phrase
+                    g_free(cur_phrase);
+                    continue;
+                }
+            }
+            SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+            if ( nword < step_end->m_nword){
+                //the step takes ownership of cur_phrase.
+                g_free(step_end->m_phrase);
+                step_end->m_handle = token;
+                step_end->m_phrase = cur_phrase;
+                step_end->m_nword = nword;
+                step_end->m_backward_nstep = k - i;
+            }else{
+                g_free(cur_phrase);
+            }
+        }
+    }
+    //backtracing to get the result.
+    size_t cur_step = phrase_length;
+    g_array_set_size(strings, 0);
+    while ( cur_step ){
+        SegmentStep* step_end = &g_array_index(steps, SegmentStep, cur_step);
+        char * str_dup = g_strdup(step_end->m_phrase);
+        g_array_append_val(strings, str_dup);
+        cur_step = cur_step - step_end->m_backward_nstep;
+    }
+
+    //the words were collected last to first: reverse them in place.
+    for ( size_t i = 0; i < strings->len / 2; ++i){
+        char ** phrase_head = &g_array_index(strings, char * , i);
+        char ** phrase_tail = &g_array_index(strings, char * , strings->len -1 - i);
+        char * phrase_tmp;
+        phrase_tmp = * phrase_head;
+        * phrase_head = * phrase_tail;
+        * phrase_tail = phrase_tmp;
+    }
+
+    //free strndup memory
+    for ( size_t i = 0; i < steps->len; ++i){
+        SegmentStep* step = &g_array_index(steps, SegmentStep, i);
+        g_free(step->m_phrase);
+        step->m_phrase = NULL;
+    }
+
+    g_array_free(offsets, TRUE);
+    g_array_free(steps, TRUE);
+    return true;
+}
+
+/* Writes the usage line to stdout and terminates with exit status 1. */
+void print_help(){
+    fputs("Usage: mmseg [--generate-extra-enter]\n", stdout);
+    exit(1);
+}
+
+/* Reads lines from stdin, segments each one with the phrases loaded from
+ * gb_char.table and gbk_char.table, and prints one word per line.
+ * --generate-extra-enter prints an empty line after every input line;
+ * --help prints the usage and exits.
+ */
+int main(int argc, char * argv[]){
+    bool gen_extra_enter = false;
+
+    setlocale(LC_ALL,"");
+    for ( int i = 1; i < argc; ++i ){
+        if ( strcmp("--help", argv[i] ) == 0) {
+            print_help();
+        }else if ( strcmp("--generate-extra-enter", argv[i]) == 0) {
+            gen_extra_enter = true;
+        }
+    }
+
+    g_phrases = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+    //init phrase lookup
+    FILE * gb_file = fopen("../../data/gb_char.table", "r");
+    if ( gb_file == NULL ){
+        fprintf(stderr, "can't open gb_char.table!\n");
+        exit(1);
+    }
+    init_phrases(gb_file);
+    fclose(gb_file);
+    FILE * gbk_file = fopen("../../data/gbk_char.table", "r");
+    if ( gbk_file == NULL ){
+        fprintf(stderr, "can't open gbk_char.table!\n");
+        exit(1);
+    }
+    init_phrases(gbk_file);
+    fclose(gbk_file);
+
+    //getline() allocates and grows the buffer itself.
+    char * linebuf = NULL;
+    size_t size = 0;
+    ssize_t nread;
+    //getline() reports both EOF and read errors as -1, which is "true"
+    //in a boolean test; compare explicitly so that an error ends the loop.
+    while ( (nread = getline(&linebuf, &size, stdin)) != -1 ){
+        //drop the line terminator only when there is one, so a final
+        //line without '\n' is segmented intact instead of being lost.
+        if ( nread > 0 && '\n' == linebuf[nread - 1] )
+            linebuf[nread - 1] = '\0';
+
+        GArray * phrases = g_array_new(TRUE, TRUE, sizeof( char *));
+        segment(g_phrases, linebuf, phrases);
+        for ( size_t i = 0; i < phrases->len; ++i){
+            char * phrase = g_array_index(phrases, char *, i);
+            printf("%s\n", phrase);
+            g_free(phrase);
+        }
+        if ( gen_extra_enter )
+            printf("\n");
+        g_array_free(phrases, TRUE);
+    }
+    free(linebuf);
+    return 0;
+}
diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am
new file mode 100644
index 0000000..adf2b5c
--- /dev/null
+++ b/src/storage/Makefile.am
@@ -0,0 +1,35 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+INCLUDES = -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ @GLIB2_CPPFLAGS@
+
+noinst_HEADERS = pinyin_large_table.h \
+ pinyin_base.h \
+ pinyin_phrase.h \
+ phrase_index.h \
+ pinyin_zhuyin_map_data.h \
+ ngram.h
+
+noinst_LTLIBRARIES = libstorage.la
+
+libstorage_la_SOURCES = pinyin_base.cpp \
+ pinyin_large_table.cpp \
+ phrase_index.cpp \
+ ngram.cpp
+
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
new file mode 100644
index 0000000..7fdc58f
--- /dev/null
+++ b/src/storage/ngram.cpp
@@ -0,0 +1,283 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "ngram.h"
+
+struct SingleGramItem{ /* one (token, frequency) record; SingleGram keeps an array of these behind a guint32 total. */
+ phrase_token_t m_token; /* key the records are compared by, see token_less_than(). */
+ guint32 m_freq; /* frequency stored for m_token. */
+};
+
+/* Builds an empty gram: the chunk holds only the guint32 total
+ * frequency, set to zero.
+ */
+SingleGram::SingleGram(){
+    const size_t header_size = sizeof(guint32);
+    m_chunk.set_size(header_size);
+    memset(m_chunk.begin(), 0, header_size);
+}
+
+SingleGram::SingleGram(void * buffer, size_t length){ /* wraps an existing buffer of length bytes; Bigram::load passes Berkeley DB record data here. */
+ m_chunk.set_chunk(buffer, length, NULL); /* NOTE(review): the NULL third argument presumably means "no free function", i.e. the buffer is borrowed -- confirm in memory_chunk.h. */
+}
+
+/* Overwrites the guint32 total frequency stored at the very beginning
+ * of the chunk. Always returns true.
+ */
+bool SingleGram::set_total_freq(guint32 m_total){
+    guint32 * total_slot = (guint32 *) m_chunk.begin();
+    *total_slot = m_total;
+    return true;
+}
+
+/* Copies the guint32 total frequency stored at the very beginning of
+ * the chunk into m_total. Always returns true.
+ */
+bool SingleGram::get_total_freq(guint32 & m_total){
+    const guint32 * total_slot = (const guint32 *) m_chunk.begin();
+    m_total = *total_slot;
+    return true;
+}
+
+/* Lowers the frequency of every item by one, drops the items that
+ * reach zero, and lowers the total frequency by the number of items
+ * that were decremented. Always returns true.
+ */
+bool SingleGram::prune(){
+#if 1
+    const size_t header_size = sizeof(guint32);
+    size_t nitem = 0;
+    size_t offset = header_size;
+    //walk by byte offset and re-read begin()/size() every round:
+    //remove_content() shrinks the chunk, so cached begin/end pointers
+    //would go stale after the first removal.
+    while ( offset + sizeof(SingleGramItem) <= m_chunk.size() ){
+        SingleGramItem * cur = (SingleGramItem *)
+            ((char *)(m_chunk.begin()) + offset);
+        //an item that is already at zero must not wrap around.
+        if ( cur->m_freq > 0 ){
+            cur->m_freq--;
+            nitem++;
+        }
+        if ( cur->m_freq == 0 ){
+            //the following item moves into this slot: do not advance.
+            m_chunk.remove_content(offset, sizeof(SingleGramItem));
+            continue;
+        }
+        offset += sizeof(SingleGramItem);
+    }
+    //keep the calls outside assert() so that they survive NDEBUG builds.
+    guint32 total_freq = 0;
+    bool ok = get_total_freq(total_freq);
+    assert(ok);
+    ok = set_total_freq(total_freq - nitem);
+    assert(ok);
+    (void) ok;
+#endif
+    return true;
+}
+
+/* Strict ordering of items by token; the comparator handed to
+ * std_lite::lower_bound.
+ */
+bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
+    return rhs.m_token > lhs.m_token;
+}
+
+/* Appends to array one BigramPhraseItem for every stored item whose
+ * token lies in [range->m_range_begin, range->m_range_end).
+ * Each appended m_freq is the item frequency divided by the total
+ * frequency, as a gfloat. Always returns true.
+ */
+bool SingleGram::search(/* in */ PhraseIndexRange * range,
+                        /* out */ BigramPhraseArray array){
+    const SingleGramItem * begin = (const SingleGramItem *)
+        ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+    SingleGramItem compare_item;
+    compare_item.m_token = range->m_range_begin;
+    compare_item.m_freq = 0;
+    const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+    //read the total outside assert(): with NDEBUG the call would vanish
+    //and total_freq would be used uninitialized.
+    guint32 total_freq = 0;
+    bool ok = get_total_freq(total_freq);
+    assert(ok);
+    (void) ok;
+
+    BigramPhraseItem bigram_item;
+    for ( ; cur_item != end; ++cur_item){
+        if ( cur_item->m_token >= range->m_range_end )
+            break;
+        bigram_item.m_token = cur_item->m_token;
+        bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
+        g_array_append_val(array, bigram_item);
+    }
+    return true;
+}
+
+/* Looks token up among the stored items.
+ * Returns true and sets freq to the stored frequency when found;
+ * returns false and leaves freq at 0 otherwise.
+ */
+bool SingleGram::get_freq(/* in */ phrase_token_t token,
+                          /* out */ guint32 & freq){
+    freq = 0;
+    const char * items_start = (const char *)(m_chunk.begin()) + sizeof(guint32);
+    const SingleGramItem * first = (const SingleGramItem *) items_start;
+    const SingleGramItem * last = (const SingleGramItem *) m_chunk.end();
+
+    SingleGramItem key;
+    key.m_token = token;
+    const SingleGramItem * found = std_lite::lower_bound(first, last, key, token_less_than);
+
+    while ( found != last ){
+        if ( found->m_token == token ){
+            freq = found->m_freq;
+            return true;
+        }
+        //a larger token means the searched one is absent.
+        if ( found->m_token > token )
+            return false;
+        ++found;
+    }
+    return false;
+}
+
+/* Stores freq for token: overwrites the frequency of an existing item,
+ * otherwise inserts a new item in front of the first item with a larger
+ * token, or appends it at the end of the chunk. Always returns true.
+ */
+bool SingleGram::set_freq(/* in */ phrase_token_t token,
+                          guint32 freq){
+    SingleGramItem * first = (SingleGramItem *)
+        ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    SingleGramItem * last = (SingleGramItem *)m_chunk.end();
+
+    SingleGramItem key;
+    key.m_token = token;
+    SingleGramItem * pos = std_lite::lower_bound(first, last, key, token_less_than);
+
+    SingleGramItem new_item;
+    new_item.m_token = token;
+    new_item.m_freq = freq;
+
+    while ( pos != last ){
+        if ( pos->m_token == token ){
+            pos->m_freq = freq;
+            return true;
+        }
+        if ( pos->m_token > token ){
+            //byte offset of pos, counted from the start of the chunk.
+            size_t offset = sizeof(guint32) +
+                sizeof(SingleGramItem) * (pos - first);
+            m_chunk.insert_content(offset, &new_item,
+                                   sizeof(SingleGramItem));
+            return true;
+        }
+        ++pos;
+    }
+    //no item with an equal or larger token: append.
+    m_chunk.insert_content(m_chunk.size(), &new_item,
+                           sizeof(SingleGramItem));
+    return true;
+}
+
+
+/* Closes any database that is currently attached, then opens
+ * systemfile read-only and userfile (created when missing) as
+ * Berkeley DB hash databases. Either name may be NULL to skip it.
+ * Returns false when a handle cannot be created or a file cannot be
+ * opened; in that case no database stays attached.
+ */
+bool Bigram::attach(const char * systemfile, const char * userfile){
+    reset();
+    if ( systemfile ){
+        int ret = db_create(&m_system, NULL, 0);
+        if ( ret != 0 ){
+            m_system = NULL;
+            return false;
+        }
+
+        //keep the result of open(); previously the stale db_create()
+        //result was tested and open failures went unnoticed.
+        ret = m_system->open(m_system, NULL, systemfile, NULL,
+                             DB_HASH, DB_RDONLY, 0664);
+        if ( ret != 0 ){
+            //a handle whose open() failed must still be closed.
+            reset();
+            return false;
+        }
+    }
+
+    if ( userfile ){
+        int ret = db_create(&m_user, NULL, 0);
+        if ( ret != 0 ){
+            m_user = NULL;
+            reset();
+            return false;
+        }
+
+        ret = m_user->open(m_user, NULL, userfile, NULL, DB_HASH, DB_CREATE, 0664);
+        if ( ret != 0 ){
+            reset();
+            return false;
+        }
+    }
+    return true;
+}
+
+/* Fetches the record keyed by index from each attached database.
+ * system_gram / user_gram receive a new SingleGram wrapping the record
+ * data, or NULL when the database is not attached or the lookup fails.
+ * Always returns true.
+ * NOTE(review): the grams wrap the pointer returned by DB->get without
+ * copying -- confirm how long Berkeley DB keeps that memory valid.
+ */
+bool Bigram::load(phrase_token_t index, SingleGram * & system_gram, SingleGram * & user_gram){
+    system_gram = NULL;
+    user_gram = NULL;
+
+    DBT db_key;
+    memset(&db_key, 0, sizeof(DBT));
+    db_key.data = &index;
+    db_key.size = sizeof(phrase_token_t);
+
+    if ( m_system ){
+        DBT system_data;
+        memset(&system_data, 0, sizeof(DBT));
+        if ( 0 == m_system->get(m_system, NULL, &db_key, &system_data, 0) )
+            system_gram = new SingleGram(system_data.data, system_data.size);
+    }
+    if ( m_user ){
+        DBT user_data;
+        memset(&user_data, 0, sizeof(DBT));
+        if ( 0 == m_user->get(m_user, NULL, &db_key, &user_data, 0) )
+            user_gram = new SingleGram(user_data.data, user_data.size);
+    }
+    return true;
+}
+
+/* Writes the chunk of user_gram into the user database under the key
+ * index. Returns false when no user database is attached or when the
+ * put fails.
+ */
+bool Bigram::store(phrase_token_t index, SingleGram * user_gram){
+    if ( NULL == m_user )
+        return false;
+
+    DBT db_key;
+    DBT db_data;
+    memset(&db_key, 0, sizeof(DBT));
+    memset(&db_data, 0, sizeof(DBT));
+
+    db_key.data = &index;
+    db_key.size = sizeof(phrase_token_t);
+    db_data.data = user_gram->m_chunk.begin();
+    db_data.size = user_gram->m_chunk.size();
+
+    return 0 == m_user->put(m_user, NULL, &db_key, &db_data, 0);
+}
+
+/* Appends the key of every record of db to tokens (phrase_token_t
+ * values). Prints error_message to stderr and exits with status 1 when
+ * the cursor cannot be created or the iteration ends on an error.
+ */
+static void collect_db_tokens(DB * db, GArray * tokens,
+                              const char * error_message){
+    DBC * cursorp = NULL;
+    DBT key, data;
+
+    /* Get a cursor; without this check a failure would leave cursorp
+     * unusable and the loop below would dereference it. */
+    int ret = db->cursor(db, NULL, &cursorp, 0);
+    if ( ret != 0 || NULL == cursorp ){
+        fprintf(stderr, "%s", error_message);
+        exit(1);
+    }
+
+    /* Initialize our DBTs. */
+    memset(&key, 0, sizeof(DBT));
+    memset(&data, 0, sizeof(DBT));
+
+    /* Iterate over the database, retrieving each record in turn. */
+    while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+        assert(key.size == sizeof(phrase_token_t));
+        phrase_token_t * token = (phrase_token_t *)key.data;
+        g_array_append_val(tokens, *token);
+    }
+
+    if (ret != DB_NOTFOUND) {
+        fprintf(stderr, "%s", error_message);
+        exit(1);
+    }
+
+    /* Cursors must be closed */
+    cursorp->c_close(cursorp);
+}
+
+/* Empties system and user, then fills them with the keys
+ * (phrase_token_t) of all records of the attached system and user
+ * databases. Returns true when at least one database is attached.
+ * A database error terminates the process with exit status 1.
+ */
+bool Bigram::get_all_items(GArray * system, GArray * user){
+    bool retval = false;
+    g_array_set_size(system, 0);
+    g_array_set_size(user, 0);
+    if ( m_system ){
+        collect_db_tokens(m_system, system, "system db error, exit!");
+        retval = true;
+    }
+    if ( m_user ){
+        collect_db_tokens(m_user, user, "user db error, exit!");
+        retval = true;
+    }
+    return retval;
+}
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
new file mode 100644
index 0000000..39a9ecc
--- /dev/null
+++ b/src/storage/ngram.h
@@ -0,0 +1,119 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef NGRAM_H
+#define NGRAM_H
+
+#include <db.h>
+
+namespace novel{
+
+class Bigram;
+
+/* Note:
+ * When an entry is transferred from the system ngram to the user ngram:
+ * if the user ngram doesn't exist yet,
+ * copy the total freq from the system ngram to the user ngram,
+ * so that the total freq exists;
+ * if the item freq doesn't exist, copy it from the system to the user ngram,
+ * so that the item freq exists.
+ * Once the user ngram exists (always true by then), increase the total freq;
+ * once the item exists (always true by then), increase its freq.
+ */
+
+class SingleGram{ /* frequency table for one leading token: a guint32 total
+ * followed by SingleGramItem records, all stored in m_chunk (see ngram.cpp). */
+ friend class Bigram;
+private:
+ MemoryChunk m_chunk; /* raw storage: the total frequency first, then the items. */
+ SingleGram(void * buffer, size_t length); /* wraps an existing buffer; private, called by Bigram::load. */
+public:
+ /* Default constructor: the chunk holds only a zeroed total frequency. */
+ SingleGram();
+ /* search method: collects the items whose token lies in
+ * [range->m_range_begin, range->m_range_end). */
+ /* the array result contains many items; each m_freq is the item frequency divided by the total */
+ bool search(/* in */ PhraseIndexRange * range,
+ /* out */ BigramPhraseArray array);
+
+ bool get_freq(/* in */ phrase_token_t token,
+ /* out */ guint32 & freq); /* true when token is stored; otherwise false with freq set to 0. */
+
+ /* set_freq method
+ * overwrites the frequency of token, or inserts a new item for it
+ * at its ordered position; always returns true.
+ */
+ bool set_freq(/* in */ phrase_token_t token,
+ guint32 freq);
+
+ /* set_total_freq method
+ * used in user bigram table
+ * writes the guint32 at the start of the chunk.
+ */
+ bool set_total_freq(guint32 m_total);
+
+ /* get_total_freq method
+ * used in user bigram table
+ * reads the guint32 at the start of the chunk.
+ */
+ bool get_total_freq(guint32 & m_total);
+
+ /* prune one method
+ * only used in training
+ * decrements the item frequencies and removes items that reach zero.
+ */
+ bool prune();
+};
+
+/* Pair of Berkeley DB handles -- a system and a user bi-gram database --
+ * whose records are SingleGram chunks keyed by phrase_token_t.
+ */
+class Bigram{
+private:
+    DB * m_system;
+    DB * m_user;
+public:
+    /* starts with no database attached. */
+    Bigram() : m_system(NULL), m_user(NULL) { }
+
+    /* closes whatever is still attached. */
+    ~Bigram(){ reset(); }
+
+    /* closes the system handle, then the user handle, and forgets both. */
+    void reset(){
+        if ( NULL != m_system ){
+            m_system->close(m_system, 0);
+            m_system = NULL;
+        }
+        if ( NULL != m_user ){
+            m_user->close(m_user, 0);
+            m_user = NULL;
+        }
+    }
+
+    /* attach system and user bi-gram */
+    /* when with training systemdb is NULL, only user_gram */
+    bool attach(const char * systemfile, const char * userfile);
+
+    /* load/store one SingleGram by its leading token. */
+    bool load(phrase_token_t index, SingleGram * & system_gram, SingleGram * & user_gram);
+    bool store(phrase_token_t index, SingleGram * user_gram);
+    /* array of phrase_token_t items, for parameter estimation. */
+    bool get_all_items(GArray * system, GArray * user);
+};
+
+};
+
+using namespace novel;
+
+
+#endif
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp
new file mode 100644
index 0000000..7dbecb3
--- /dev/null
+++ b/src/storage/phrase_index.cpp
@@ -0,0 +1,340 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "phrase_index.h"
+
+/* Writes the pronunciation count into the second header byte of the
+ * chunk. Always returns true.
+ */
+bool PhraseItem::set_n_pronunciation(guint8 n_prouns){
+    //the count sits right behind the one byte phrase length.
+    const size_t count_offset = sizeof(guint8);
+    m_chunk.set_content(count_offset, &n_prouns, sizeof(guint8));
+    return true;
+}
+
+/* Copies the index-th pronunciation record out of the chunk:
+ * phrase_length PinyinKeys into pinyin, then its guint32 into freq.
+ * Returns the result of the chunk reads.
+ */
+bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){
+    const guint8 phrase_length = get_phrase_length();
+    const size_t keys_size = phrase_length * sizeof(PinyinKey);
+    const size_t record_size = keys_size + sizeof(guint32);
+    //records follow the header and the utf16 phrase string.
+    table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * record_size;
+    if ( !m_chunk.get_content(offset, pinyin, keys_size) )
+        return false;
+    return m_chunk.get_content(offset + keys_size, &freq , sizeof(guint32));
+}
+
+/* Adds one pronunciation record at the end of the chunk --
+ * phrase_length PinyinKeys followed by freq -- and raises the
+ * pronunciation count by one.
+ */
+void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){
+    const size_t keys_size = get_phrase_length() * sizeof(PinyinKey);
+    set_n_pronunciation(get_n_pronunciation() + 1);
+    //each write targets the end of the chunk as it is at that moment.
+    m_chunk.set_content(m_chunk.size(), pinyin, keys_size);
+    m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32));
+}
+
+/* Removes the index-th pronunciation record (PinyinKeys plus guint32
+ * frequency) from the chunk and lowers the pronunciation count.
+ * An index that does not name a stored pronunciation is ignored.
+ */
+void PhraseItem::remove_nth_pronunciation(size_t index){
+    const guint8 n_prons = get_n_pronunciation();
+    //without this guard an empty item would wrap its count byte from
+    //0 to 255 and a range outside the records would be removed.
+    if ( index >= n_prons )
+        return;
+    guint8 phrase_length = get_phrase_length();
+    set_n_pronunciation(n_prons - 1);
+    size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32));
+    m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32));
+}
+
+/* Copies the stored phrase (phrase_length utf16_t units, located right
+ * behind the header) into phrase. Returns the result of the chunk read.
+ */
+bool PhraseItem::get_phrase_string(utf16_t * phrase){
+    const size_t string_size = get_phrase_length() * sizeof(utf16_t);
+    return m_chunk.get_content(phrase_item_header, phrase, string_size);
+}
+
+/* Stores phrase_length in the first header byte and the utf16_t units
+ * of phrase right behind the header. Always returns true.
+ */
+bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){
+    const size_t string_size = phrase_length * sizeof(utf16_t);
+    m_chunk.set_content(0, &phrase_length, sizeof(guint8));
+    m_chunk.set_content(phrase_item_header, phrase, string_size);
+    return true;
+}
+
+/* Adds delta to the frequency of every stored pronunciation that
+ * compares equal to pinyin_keys under the ambiguity settings in custom.
+ * Stops as soon as a positive delta would overflow the running sum of
+ * the frequencies visited so far.
+ */
+void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom,
+                                             PinyinKey * pinyin_keys,
+                                             gint32 delta){
+    const guint8 phrase_length = get_phrase_length();
+    const guint8 npron = get_n_pronunciation();
+    const size_t keys_size = phrase_length * sizeof(PinyinKey);
+    const size_t record_size = keys_size + sizeof(guint32);
+    //first record: right behind the header and the utf16 phrase string.
+    char * record = (char *) m_chunk.begin()
+        + phrase_item_header + phrase_length * sizeof ( utf16_t );
+    guint32 total_freq = 0;
+    for ( int i = 0 ; i < npron ; ++i, record += record_size ){
+        guint32 * freq = (guint32 *)(record + keys_size);
+        total_freq += *freq;
+        if ( 0 != pinyin_compare_with_ambiguities(custom,
+                                                  (PinyinKey *)record,
+                                                  pinyin_keys,
+                                                  phrase_length))
+            continue;
+        //protect against total_freq overflow.
+        if ( delta > 0 && total_freq > total_freq + delta )
+            return;
+        *freq += delta;
+        total_freq += delta;
+    }
+}
+
+
+guint32 SubPhraseIndex::get_phrase_index_total_freq(){ /* plain accessor for the running total kept by add/remove_phrase_item and add_unigram_frequency. */
+ return m_total_freq;
+}
+
+/* Adds delta to the unigram frequency of the phrase identified by token
+ * and to the total frequency of this sub index.
+ * Returns false when the token has no item, when a chunk access fails,
+ * or when the total frequency would overflow.
+ */
+bool SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){
+    table_offset_t offset;
+    guint32 freq;
+    bool result = m_phrase_index.get_content
+        ((token & PHRASE_MASK)
+         * sizeof(table_offset_t), &offset, sizeof(table_offset_t));
+
+    if ( !result)
+        return result;
+
+    //offset 0 marks an empty index slot.
+    if ( 0 == offset )
+        return false;
+
+    //the frequency follows the length and pronunciation count bytes.
+    const size_t freq_offset = offset + sizeof(guint8) + sizeof(guint8);
+    result = m_phrase_content.get_content
+        (freq_offset, &freq, sizeof(guint32));
+    //a failed read leaves freq undefined: do not write it back.
+    if ( !result )
+        return result;
+    //protect total_freq overflow
+    if ( delta > 0 && m_total_freq > m_total_freq + delta )
+        return false;
+    freq += delta;
+    m_total_freq += delta;
+    return m_phrase_content.set_content(freq_offset, &freq, sizeof(guint32));
+}
+
+/* Points item at the stored record of token: item.m_chunk is set to the
+ * bytes inside m_phrase_content, not to a copy.
+ * Returns false when the index slot is empty (offset 0) or a chunk
+ * read fails.
+ */
+bool SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){
+    const size_t index_pos = (token & PHRASE_MASK) * sizeof(table_offset_t);
+
+    table_offset_t offset;
+    if ( !m_phrase_index.get_content(index_pos, &offset, sizeof(table_offset_t)) )
+        return false;
+    if ( 0 == offset )
+        return false;
+
+    guint8 phrase_length;
+    if ( !m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8)) )
+        return false;
+
+    guint8 n_prons;
+    if ( !m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8)) )
+        return false;
+
+    //header + utf16 string + n_prons records of keys and frequency.
+    size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
+    item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL);
+    return true;
+}
+
+/* Appends the bytes of item to m_phrase_content, records their offset
+ * in the index slot of token and adds the item's unigram frequency to
+ * the total. Always returns true.
+ */
+bool SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){
+    table_offset_t offset = m_phrase_content.size();
+    //offset 0 means "no item" in the index, so content starts at 8.
+    offset = ( 0 == offset ) ? 8 : offset;
+
+    const size_t index_pos = (token & PHRASE_MASK) * sizeof(table_offset_t);
+    m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size());
+    m_phrase_index.set_content(index_pos, &offset, sizeof(table_offset_t));
+    m_total_freq += item->get_unigram_frequency();
+    return true;
+}
+
+/* Detaches the record of token: item receives a newly allocated
+ * PhraseItem holding a copy of the record, the index slot is zeroed and
+ * the item's unigram frequency is subtracted from the total.
+ * The bytes in m_phrase_content are left in place.
+ * Returns false when the slot is empty or a chunk read fails.
+ */
+bool SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){
+    const size_t index_pos = (token & PHRASE_MASK) * sizeof(table_offset_t);
+
+    table_offset_t offset;
+    if ( !m_phrase_index.get_content(index_pos, &offset, sizeof(table_offset_t)) )
+        return false;
+    if ( 0 == offset )
+        return false;
+
+    guint8 phrase_length;
+    if ( !m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8)) )
+        return false;
+
+    guint8 n_prons;
+    if ( !m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8)) )
+        return false;
+
+    size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) );
+    item = new PhraseItem;
+    //implictly copy data from m_chunk_content.
+    item->m_chunk.set_content(0, (char *) m_phrase_content.begin() + offset, length);
+
+    const table_offset_t zero_const = 0;
+    m_phrase_index.set_content(index_pos, &zero_const, sizeof(table_offset_t));
+    m_total_freq -= item->get_unigram_frequency();
+    return true;
+}
+
+/* Loads chunk into the sub index at slot phrase_index, creating the sub
+ * index when the slot is empty, and keeps m_total_freq in step.
+ * Returns the result of SubPhraseIndex::load.
+ */
+bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
+    SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+    if ( !sub_phrases ){
+        sub_phrases = new SubPhraseIndex;
+    }else{
+        //reloading an occupied slot: retire its old contribution first,
+        //otherwise the slot would be counted twice in the total.
+        m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+    }
+
+    bool retval = sub_phrases->load(chunk, 0, chunk->size());
+    if ( !retval )
+        return retval;
+    m_total_freq += sub_phrases->get_phrase_index_total_freq();
+    return retval;
+}
+
+/* Serialises the sub index at slot phrase_index into new_chunk,
+ * starting at offset 0. Returns false when the slot is empty.
+ */
+bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){
+    SubPhraseIndex * sub_phrases = m_sub_phrase_indices[phrase_index];
+    if ( NULL == sub_phrases )
+        return false;
+
+    //the end offset handed to store() is not needed here.
+    table_offset_t end;
+    sub_phrases->store(new_chunk, 0, end);
+    return true;
+}
+
+/* Deletes the sub index at slot phrase_index and removes its
+ * frequencies from m_total_freq. Returns false when the slot is empty.
+ */
+bool FacadePhraseIndex::unload(guint8 phrase_index){
+    SubPhraseIndex * sub_phrases = m_sub_phrase_indices[phrase_index];
+    if ( NULL == sub_phrases )
+        return false;
+
+    m_total_freq -= sub_phrases->get_phrase_index_total_freq();
+    delete sub_phrases;
+    m_sub_phrase_indices[phrase_index] = NULL;
+    return true;
+}
+
+/* Takes ownership of chunk and maps the index and content sections
+ * stored between offset and end onto m_phrase_index / m_phrase_content.
+ * Layout, as written by store(): guint32 total frequency, three
+ * table_offset_t section offsets, then the sections, each one preceded
+ * and followed by a c_separate byte.
+ * Returns FALSE (after a glib critical warning) when the header cannot
+ * be read or is inconsistent; the previous chunk is released either way.
+ */
+bool SubPhraseIndex::load(MemoryChunk * chunk,
+                          table_offset_t offset, table_offset_t end){
+    //save the memory chunk
+    //loading the chunk that is already held must not delete it first.
+    if ( m_chunk != chunk ){
+        delete m_chunk;
+        m_chunk = chunk;
+    }
+
+    char * buf_begin = (char *)chunk->begin();
+    guint32 total_freq = 0;
+    table_offset_t index_one, index_two, index_three;
+
+    bool ok = chunk->get_content(offset, &total_freq, sizeof(guint32));
+    g_return_val_if_fail(ok, FALSE);
+    offset += sizeof(guint32);
+    ok = chunk->get_content(offset, &index_one, sizeof(table_offset_t));
+    g_return_val_if_fail(ok, FALSE);
+    offset += sizeof(table_offset_t);
+    ok = chunk->get_content(offset, &index_two, sizeof(table_offset_t));
+    g_return_val_if_fail(ok, FALSE);
+    offset += sizeof(table_offset_t);
+    ok = chunk->get_content(offset, &index_three, sizeof(table_offset_t));
+    g_return_val_if_fail(ok, FALSE);
+    offset += sizeof(table_offset_t);
+
+    //validate the offsets before any of them is used to address the
+    //buffer: a damaged header must not cause reads outside the chunk.
+    g_return_val_if_fail( index_three <= end, FALSE);
+    g_return_val_if_fail( offset < index_one && index_one < index_two
+                          && index_two < index_three, FALSE);
+
+    g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE);
+    g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE);
+    g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE);
+
+    //commit only after the header has proven consistent.
+    m_total_freq = total_freq;
+    m_phrase_index.set_chunk(buf_begin + index_one,
+                             index_two - 1 - index_one, NULL);
+    m_phrase_content.set_chunk(buf_begin + index_two,
+                               index_three - 1 - index_two, NULL);
+    return true;
+}
+
+/* Serialises this sub index into new_chunk starting at offset:
+ * guint32 total frequency, three table_offset_t section offsets, a
+ * c_separate byte, the index section, a c_separate byte, the content
+ * section and a final c_separate byte.
+ * end receives the offset just past the last byte written.
+ * Always returns true.
+ */
+bool SubPhraseIndex::store(MemoryChunk * new_chunk,
+                           table_offset_t offset, table_offset_t& end){
+    new_chunk->set_content(offset, &m_total_freq, sizeof(guint32));
+    //index walks over the three offset slots of the header.
+    table_offset_t index = offset + sizeof(guint32);
+
+    offset = index + sizeof(table_offset_t) * 3 ;
+    new_chunk->set_content(offset, &c_separate, sizeof(char));
+    offset += sizeof(char);
+
+    //slot one: start of the index section.
+    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+    index += sizeof(table_offset_t);
+    new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size());
+    offset += m_phrase_index.size();
+    new_chunk->set_content(offset, &c_separate, sizeof(char));
+    offset += sizeof(char);
+
+    //slot two: start of the content section.
+    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+    index += sizeof(table_offset_t);
+
+    new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size());
+    offset += m_phrase_content.size();
+    new_chunk->set_content(offset, &c_separate, sizeof(char));
+    offset += sizeof(char);
+    //slot three: end of the serialised data.
+    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+    //the out parameter was never assigned before; report the end offset.
+    end = offset;
+    return true;
+}
+
+/* Builds phrase items from whitespace separated
+ * "pinyin phrase token freq" records read from infile.
+ * Consecutive records carrying the same token become the pronunciations
+ * of a single PhraseItem, which is added once the token changes or the
+ * input ends. Token 0 serves as the "nothing read yet" marker.
+ * Returns true when the file was consumed up to EOF, false when a
+ * malformed record or an invalid UTF-8 phrase stopped the parsing.
+ */
+bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
+    SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
+    if ( !sub_phrases ){
+        sub_phrases = new SubPhraseIndex;
+    }
+
+    char pinyin[256];
+    char phrase[256];
+    unsigned long token_value = 0;
+    unsigned long freq = 0;
+    PhraseItem * item_ptr = new PhraseItem;
+    phrase_token_t cur_token = 0;
+    bool well_formed = true;
+    int nfields;
+    //the field widths keep an over-long column inside the buffers and
+    //the numbers are read into variables that match their specifiers.
+    while ( 4 == (nfields = fscanf(infile, "%255s %255s %lu %lu",
+                                   pinyin, phrase, &token_value, &freq)) ){
+        const phrase_token_t token = (phrase_token_t) token_value;
+
+        glong written = 0;
+        utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
+                                                 &written, NULL);
+        //NULL signals an invalid UTF-8 phrase.
+        if ( NULL == phrase_utf16 ){
+            well_formed = false;
+            break;
+        }
+
+        if ( 0 == cur_token ){
+            cur_token = token;
+            item_ptr->set_phrase_string(written, phrase_utf16);
+        }
+
+        if ( cur_token != token ){
+            add_phrase_item( cur_token, item_ptr);
+            delete item_ptr;
+            item_ptr = new PhraseItem;
+            cur_token = token;
+            item_ptr->set_phrase_string(written, phrase_utf16);
+        }
+
+        PinyinDefaultParser parser;
+        NullPinyinValidator validator;
+        PinyinKeyVector keys;
+        PinyinKeyPosVector poses;
+
+        keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+        poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+        parser.parse(validator, keys, poses, pinyin);
+
+        assert ( item_ptr->get_phrase_length() == keys->len );
+        item_ptr->append_pronunciation((PinyinKey *)keys->data, freq);
+
+        g_array_free(keys, TRUE);
+        g_array_free(poses, TRUE);
+        g_free(phrase_utf16);
+    }
+    if ( EOF != nfields )
+        well_formed = false;
+
+    //an input without any record leaves nothing to add.
+    if ( 0 != cur_token )
+        add_phrase_item( cur_token, item_ptr);
+    delete item_ptr;
+    m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
+    return well_formed;
+}
diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h
new file mode 100755
index 0000000..e635453
--- /dev/null
+++ b/src/storage/phrase_index.h
@@ -0,0 +1,250 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef PHRASE_INDEX_H
+#define PHRASE_INDEX_H
+
+#include <stdio.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "memory_chunk.h"
+
+class PinyinLookup;
+
+namespace novel{
+
+/* Because this is not large,
+ * Store this in user home directory.
+ */
+
+const int phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32); /* bytes of the fixed PhraseItem header: phrase length (guint8), pronunciation count (guint8), unigram frequency (guint32) */
+
+/* One phrase together with its pronunciations, kept in a MemoryChunk.
+ *
+ * Layout of the chunk, as read by the accessors below:
+ *   guint8   phrase length
+ *   guint8   number of pronunciations
+ *   guint32  unigram frequency
+ *   utf16_t  phrase string [phrase length]
+ *   then, for every pronunciation:
+ *     PinyinKey keys [phrase length]
+ *     guint32   frequency
+ *
+ * The guint32 fields do not start on 4-byte boundaries (the first one is
+ * at offset 2), so they are copied out with memcpy instead of being read
+ * through a casted pointer.
+ */
+class PhraseItem{
+    friend class SubPhraseIndex;
+private:
+    MemoryChunk m_chunk;
+    bool set_n_pronunciation(guint8 n_prouns);
+public:
+    /* Null Constructor: a zero-filled item of header size. */
+    PhraseItem(){
+        m_chunk.set_size(phrase_item_header);
+        memset(m_chunk.begin(), 0, m_chunk.size());
+    }
+
+    /* Take over the content of chunk; it must hold at least a header. */
+    PhraseItem(MemoryChunk chunk){
+        m_chunk = chunk;
+        assert ( m_chunk.size() >= phrase_item_header);
+    }
+
+    /* functions */
+    /* Phrase length: first byte of the header. */
+    guint8 get_phrase_length(){
+        const char * buf_begin = (const char *) m_chunk.begin();
+        return (*(const guint8 *) buf_begin);
+    }
+
+    /* Number of pronunciations: second byte of the header. */
+    guint8 get_n_pronunciation(){
+        const char * buf_begin = (const char *) m_chunk.begin();
+        return (*(const guint8 *)(buf_begin + sizeof(guint8)));
+    }
+
+    /* Unigram frequency: the guint32 following the two length bytes. */
+    guint32 get_unigram_frequency(){
+        const char * buf_begin = (const char *) m_chunk.begin();
+        guint32 freq;
+        memcpy(&freq, buf_begin + sizeof(guint8) + sizeof(guint8), sizeof(freq));
+        return freq;
+    }
+
+    /* Share of the pronunciation frequencies that belongs to the
+     * pronunciations matching pinyin_keys under the settings in custom.
+     *
+     * There is no guard for a zero frequency total: according to the
+     * original author such data is avoided by preprocessing in
+     * gen_pinyin_table.  With a zero total the division below is 0/0
+     * (NaN on IEEE-754 platforms).
+     */
+    gfloat get_pinyin_possibility(PinyinCustomSettings & custom,
+                                  PinyinKey * pinyin_keys){
+        guint8 phrase_length = get_phrase_length();
+        guint8 npron = get_n_pronunciation();
+        /* Pronunciations start after the header and the phrase string. */
+        size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t );
+        /* Size of one pronunciation record: the keys plus a frequency. */
+        size_t record_size = phrase_length * sizeof(PinyinKey) + sizeof(guint32);
+        char * buf_begin = (char *) m_chunk.begin();
+        guint32 matched = 0, total_freq = 0;
+        for ( int i = 0 ; i < npron ; ++i){
+            char * pinyin_begin = buf_begin + offset + i * record_size;
+            guint32 freq;
+            memcpy(&freq, pinyin_begin + phrase_length * sizeof(PinyinKey),
+                   sizeof(freq));
+            total_freq += freq;
+            if ( 0 == pinyin_compare_with_ambiguities(custom,
+                                                      (PinyinKey *)pinyin_begin,
+                                                      pinyin_keys,
+                                                      phrase_length)){
+                matched += freq;
+            }
+        }
+        gfloat retval = matched / (gfloat) total_freq;
+        return retval;
+    }
+
+    void increase_pinyin_possibility(PinyinCustomSettings & custom,
+                                     PinyinKey * pinyin_keys,
+                                     gint32 delta);
+
+    bool get_phrase_string(utf16_t * phrase);
+    bool set_phrase_string(guint8 phrase_length, utf16_t * phrase);
+    bool get_nth_pronunciation(size_t index,
+                               /* out */ PinyinKey * pinyin,
+                               /* out */ guint32 & freq);
+    /* Normally don't change the first pronunciation,
+     * which decides the token number.
+     */
+    void append_pronunciation(PinyinKey * pinyin, guint32 freq);
+    void remove_nth_pronunciation(size_t index);
+};
+
+/*
+ * In Sub Phrase Index, token == (token & PHRASE_MASK).
+ */
+
+/* One phrase library: a token index plus the phrase content it refers to. */
+class SubPhraseIndex{
+private:
+    guint32 m_total_freq;
+    MemoryChunk m_phrase_index;
+    MemoryChunk m_phrase_content;
+    MemoryChunk * m_chunk;
+public:
+    SubPhraseIndex() : m_total_freq(0), m_chunk(NULL) {}
+
+    ~SubPhraseIndex(){
+        reset();
+    }
+
+    /* Release the chunk held in m_chunk, if any. */
+    void reset(){
+        /* deleting a null pointer does nothing, so no test is needed */
+        delete m_chunk;
+        m_chunk = NULL;
+    }
+
+    bool load(MemoryChunk * chunk,
+              table_offset_t offset, table_offset_t end);
+    bool store(MemoryChunk * new_chunk,
+               table_offset_t offset, table_offset_t & end);
+
+    /* Zero-gram */
+    guint32 get_phrase_index_total_freq();
+    bool add_unigram_frequency(phrase_token_t token, guint32 delta);
+    /* get_phrase_item function can't modify the phrase item,
+     * but can increment the freq of the special pronunciation.
+     */
+    bool get_phrase_item(phrase_token_t token, PhraseItem & item);
+    bool add_phrase_item(phrase_token_t token, PhraseItem * item);
+    /* remove_phrase_item will subtract item->get_unigram_frequency()
+     * from m_total_freq
+     */
+    bool remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
+};
+
+/* Front end over up to PHRASE_INDEX_LIBRARY_COUNT sub phrase indices.
+ *
+ * The sub index responsible for a token is selected with
+ * PHRASE_INDEX_LIBRARY_INDEX(token).  m_total_freq is the frequency total
+ * kept by this class across the sub indices; it is only changed once the
+ * sub index reports that the operation succeeded.
+ */
+class FacadePhraseIndex{
+    friend class ::PinyinLookup;
+private:
+    guint32 m_total_freq;
+    SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
+public:
+    FacadePhraseIndex(){
+        m_total_freq = 0;
+        memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
+    }
+
+    ~FacadePhraseIndex(){
+        for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
+            /* deleting a null pointer does nothing */
+            delete m_sub_phrase_indices[i];
+            m_sub_phrase_indices[i] = NULL;
+        }
+    }
+
+    /* load/store single sub phrase index, according to the config files. */
+    bool load_text(guint8 phrase_index, FILE * infile);
+    bool load(guint8 phrase_index, MemoryChunk * chunk);
+    bool store(guint8 phrase_index, MemoryChunk * new_chunk);
+    bool unload(guint8 phrase_index);
+
+    /* Zero-gram */
+    guint32 get_phrase_index_total_freq(){
+        return m_total_freq;
+    }
+
+    /* Add delta to the unigram frequency of token.
+     * Returns false when the sub index is not loaded or rejects the update;
+     * m_total_freq is left untouched in both cases.
+     */
+    bool add_unigram_frequency(phrase_token_t token, guint32 delta){
+        guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+        SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
+        if ( !sub_phrase )
+            return false;
+        bool result = sub_phrase->add_unigram_frequency(token, delta);
+        if ( result )
+            m_total_freq += delta;
+        return result;
+    }
+
+    /* get_phrase_item function can't modify the phrase item */
+    bool get_phrase_item(phrase_token_t token, PhraseItem & item){
+        guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+        SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
+        if ( !sub_phrase )
+            return false;
+        return sub_phrase->get_phrase_item(token, item);
+    }
+
+    /* Store item under token, creating the sub index on first use.
+     * The unigram frequency of item is added to m_total_freq only when
+     * the sub index reports success.
+     */
+    bool add_phrase_item(phrase_token_t token, PhraseItem * item){
+        guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+        SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
+        if ( !sub_phrase ){
+            sub_phrase = new SubPhraseIndex;
+        }
+        /* read the frequency before the item is handed to the sub index */
+        guint32 freq = item->get_unigram_frequency();
+        bool result = sub_phrase->add_phrase_item(token, item);
+        if ( result )
+            m_total_freq += freq;
+        return result;
+    }
+
+    /* Remove the item stored under token and hand it back through item.
+     * On success its unigram frequency is subtracted from m_total_freq.
+     */
+    bool remove_phrase_item(phrase_token_t token, PhraseItem * & item){
+        guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
+        SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
+        if ( !sub_phrase ){
+            return false;
+        }
+        bool result = sub_phrase->remove_phrase_item(token, item);
+        if ( !result )
+            return result;
+        m_total_freq -= item->get_unigram_frequency();
+        return result;
+    }
+};
+
+};
+
+using namespace novel;
+
+
+
+
+
+#endif
diff --git a/src/storage/pinyin_base.cpp b/src/storage/pinyin_base.cpp
new file mode 100644
index 0000000..cffee3c
--- /dev/null
+++ b/src/storage/pinyin_base.cpp
@@ -0,0 +1,1425 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2002,2003,2006 James Su
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "stl_lite.h"
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+
+// Internal data definition
+
+/**
+ * struct of pinyin token.
+ *
+ * this struct stores the information of a pinyin token
+ * (an initial, a final or a tone)
+ */
+struct PinyinToken
+{
+ const char *latin; /**< Latin name of the token. */
+ const char *zhuyin; /**< Zhuyin name in UTF-8. */
+ int latin_len; /**< length of Latin name. */
+ int zhuyin_len; /**< length of Zhuyin name, counted in Zhuyin symbols (not UTF-8 bytes). */
+};
+
+/**
+ * struct to index PinyinToken list.
+ *
+ * Describes the run of tokens that begin with one letter.
+ */
+struct PinyinTokenIndex
+{
+ int start; /**< index of the first token of the run; -1 when there is none. */
+ int num; /**< number of tokens in the run. */
+};
+
+static const PinyinToken __pinyin_initials[] = /* Spellings of the initials.
+ * Rows are looked up by initial value (see PinyinKey::get_initial_string)
+ * and the parser casts a row index back to PinyinInitial, so the row order
+ * has to follow the PinyinInitial enumeration.  Rows sharing a first
+ * letter are adjacent, which __pinyin_initials_index relies on. */
+{
+ {"", "", 0, 0},
+ {"b", "ㄅ", 1, 1},
+ {"c", "ㄘ", 1, 1},
+ {"ch","ㄔ", 2, 1},
+ {"d", "ㄉ", 1, 1},
+ {"f", "ㄈ", 1, 1},
+ {"h", "ㄏ", 1, 1},
+ {"g", "ㄍ", 1, 1},
+ {"j", "ㄐ", 1, 1},
+ {"k", "ㄎ", 1, 1},
+ {"m", "ㄇ", 1, 1},
+ {"n", "ㄋ", 1, 1},
+ {"l", "ㄌ", 1, 1},
+ {"r", "ㄖ", 1, 1},
+ {"p", "ㄆ", 1, 1},
+ {"q", "ㄑ", 1, 1},
+ {"s", "ㄙ", 1, 1},
+ {"sh","ㄕ", 2, 1},
+ {"t", "ㄊ", 1, 1},
+ {"w", "ㄨ", 1, 1}, //Should be omitted in some case.
+ {"x", "ㄒ", 1, 1},
+ {"y", "ㄧ", 1, 1}, //Should be omitted in some case.
+ {"z", "ㄗ", 1, 1},
+ {"zh","ㄓ", 2, 1}
+};
+
+static const PinyinToken __pinyin_finals[] = /* Spellings of the finals.
+ * Rows are looked up by final value (see PinyinKey::get_final_string) and
+ * the parser casts a row index back to PinyinFinal, so the row order has
+ * to follow the PinyinFinal enumeration.  Rows sharing a first letter are
+ * adjacent, which __pinyin_finals_index relies on. */
+{
+ {"", "", 0, 0},
+ {"a", "ㄚ", 1, 1},
+ {"ai", "ㄞ", 2, 1},
+ {"an", "ㄢ", 2, 1},
+ {"ang", "ㄤ", 3, 1},
+ {"ao", "ㄠ", 2, 1},
+ {"e", "ㄜ", 1, 1},
+ {"ea", "ㄝ", 2, 1},
+ {"ei", "ㄟ", 2, 1},
+ {"en", "ㄣ", 2, 1},
+ {"eng", "ㄥ", 3, 1},
+ {"er", "ㄦ", 2, 1},
+ {"i", "ㄧ", 1, 1},
+ {"ia", "ㄧㄚ", 2, 2},
+ {"ian", "ㄧㄢ", 3, 2},
+ {"iang","ㄧㄤ", 4, 2},
+ {"iao", "ㄧㄠ", 3, 2},
+ {"ie", "ㄧㄝ", 2, 2},
+ {"in", "ㄧㄣ", 2, 2},
+ {"ing", "ㄧㄥ", 3, 2},
+ {"iong","ㄩㄥ", 4, 2},
+ {"iu", "ㄧㄡ", 2, 2},
+ {"ng", "ㄣ", 2, 1},
+ {"o", "ㄛ", 1, 1},
+ {"ong", "ㄨㄥ", 3, 2},
+ {"ou", "ㄡ", 2, 1},
+ {"u", "ㄨ", 1, 1},
+ {"ua", "ㄨㄚ", 2, 2},
+ {"uai", "ㄨㄞ", 3, 2},
+ {"uan", "ㄨㄢ", 3, 2},
+ {"uang","ㄨㄤ", 4, 2},
+ {"ue", "ㄩㄝ", 2, 2},
+ {"ueng","ㄨㄥ", 4, 2},
+ {"ui", "ㄨㄟ", 2, 2},
+ {"un", "ㄨㄣ", 2, 2},
+ {"uo", "ㄨㄛ", 2, 2},
+ {"v", "ㄩ", 1, 1},
+ {"van", "ㄩㄢ", 3, 2},
+ {"ve", "ㄩㄝ", 2, 2},
+ {"vn", "ㄩㄣ", 2, 2}
+};
+
+static const PinyinToken __pinyin_tones [] = /* Spellings of the tones: row 0 is the empty tone, rows 1..5 are written as the digits "1".."5"; looked up by tone value in PinyinKey::get_tone_string. */
+{
+ {"", "", 0, 0},
+ {"1", "ˉ", 1, 1},
+ {"2", "ˊ", 1, 1},
+ {"3", "ˇ", 1, 1},
+ {"4", "ˋ", 1, 1},
+ {"5", "˙", 1, 1}
+};
+
+static const PinyinTokenIndex __pinyin_initials_index[] = /* One {start, num} entry per letter 'a'..'z' (indexed by letter - 'a'): the rows of __pinyin_initials beginning with that letter; {-1,0} when no initial does. */
+{
+ //a b c d e f g h i j k l m
+ {-1,0},{1,1}, {2,2}, {4,1}, {-1,0},{5,1}, {7,1}, {6,1}, {-1,0},{8,1}, {9,1}, {12,1},{10,1},
+ //n o p q r s t u v w x y z
+ {11,1},{-1,0},{14,1},{15,1},{13,1},{16,2},{18,1},{-1,0},{-1,0},{19,1},{20,1},{21,1},{22,2}
+};
+
+static const PinyinTokenIndex __pinyin_finals_index[] = /* One {start, num} entry per letter 'a'..'z' (indexed by letter - 'a'): the rows of __pinyin_finals beginning with that letter; {-1,0} when no final does. */
+{
+ //a b c d e f g h i j k l m
+ {1,5}, {-1,0},{-1,0},{-1,0},{6,6},{-1,0},{-1,0},{-1,0},{12,10},{-1,0},{-1,0},{-1,0},{-1,0},
+ //n o p q r s t u v w x y z
+ {22,1},{23,3},{-1,0},{-1,0},{-1,0},{-1,0},{-1,0},{26,10},{36,4},{-1,0},{-1,0},{-1,0},{-1,0}
+};
+
+
+
+static const PinyinInitial __shuang_pin_stone_initial_map [] = /* "stone" scheme: the initial assigned to each key, 27 rows in the order A..Z then ';' (see the row comments). Presumably consumed by a shuang pin parser outside this chunk - TODO confirm. */
+{
+ PINYIN_ZeroInitial, // A
+ PINYIN_Bo, // B
+ PINYIN_Ci, // C
+ PINYIN_De, // D
+ PINYIN_ZeroInitial, // E
+ PINYIN_Fo, // F
+ PINYIN_Ge, // G
+ PINYIN_He, // H
+ PINYIN_Shi, // I
+ PINYIN_Ji, // J
+ PINYIN_Ke, // K
+ PINYIN_Le, // L
+ PINYIN_Mo, // M
+ PINYIN_Ne, // N
+ PINYIN_ZeroInitial, // O
+ PINYIN_Po, // P
+ PINYIN_Qi, // Q
+ PINYIN_Ri, // R
+ PINYIN_Si, // S
+ PINYIN_Te, // T
+ PINYIN_Chi, // U
+ PINYIN_Zhi, // V
+ PINYIN_Wu, // W
+ PINYIN_Xi, // X
+ PINYIN_Yi, // Y
+ PINYIN_Zi, // Z
+ PINYIN_ZeroInitial, // ;
+};
+
+static const PinyinFinal __shuang_pin_stone_final_map [][2] = /* "stone" scheme: up to two candidate finals per key, 27 rows in the order A..Z then ';'; PINYIN_ZeroFinal marks an unused slot. How the candidates are chosen is not visible in this chunk. */
+{
+ { PINYIN_A, PINYIN_ZeroFinal }, // A
+ { PINYIN_Ia, PINYIN_Ua }, // B
+ { PINYIN_Uan, PINYIN_ZeroFinal }, // C
+ { PINYIN_Ao, PINYIN_ZeroFinal }, // D
+ { PINYIN_E, PINYIN_ZeroFinal }, // E
+ { PINYIN_An, PINYIN_ZeroFinal }, // F
+ { PINYIN_Ang, PINYIN_ZeroFinal }, // G
+ { PINYIN_Uang,PINYIN_Iang }, // H
+ { PINYIN_I, PINYIN_ZeroFinal }, // I
+ { PINYIN_Ian, PINYIN_ZeroFinal }, // J
+ { PINYIN_Iao, PINYIN_ZeroFinal }, // K
+ { PINYIN_In, PINYIN_ZeroFinal }, // L
+ { PINYIN_Ie, PINYIN_ZeroFinal }, // M
+ { PINYIN_Iu, PINYIN_ZeroFinal }, // N
+ { PINYIN_Uo, PINYIN_O }, // O
+ { PINYIN_Ou, PINYIN_ZeroFinal }, // P
+ { PINYIN_Ing, PINYIN_Er }, // Q
+ { PINYIN_En, PINYIN_ZeroFinal }, // R
+ { PINYIN_Ai, PINYIN_ZeroFinal }, // S
+ { PINYIN_Ng, PINYIN_Eng }, // T
+ { PINYIN_U, PINYIN_ZeroFinal }, // U
+ { PINYIN_V, PINYIN_Ui }, // V
+ { PINYIN_Ei, PINYIN_ZeroFinal }, // W
+ { PINYIN_Uai, PINYIN_Ue }, // X
+ { PINYIN_Ong, PINYIN_Iong }, // Y
+ { PINYIN_Un, PINYIN_ZeroFinal }, // Z
+ { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ;
+};
+
+
+static const PinyinInitial __shuang_pin_zrm_initial_map [] = /* "zrm" scheme: the initial assigned to each key, 27 rows in the order A..Z then ';' (see the row comments). */
+{
+ PINYIN_ZeroInitial, // A
+ PINYIN_Bo, // B
+ PINYIN_Ci, // C
+ PINYIN_De, // D
+ PINYIN_ZeroInitial, // E
+ PINYIN_Fo, // F
+ PINYIN_Ge, // G
+ PINYIN_He, // H
+ PINYIN_Chi, // I
+ PINYIN_Ji, // J
+ PINYIN_Ke, // K
+ PINYIN_Le, // L
+ PINYIN_Mo, // M
+ PINYIN_Ne, // N
+ PINYIN_ZeroInitial, // O
+ PINYIN_Po, // P
+ PINYIN_Qi, // Q
+ PINYIN_Ri, // R
+ PINYIN_Si, // S
+ PINYIN_Te, // T
+ PINYIN_Shi, // U
+ PINYIN_Zhi, // V
+ PINYIN_Wu, // W
+ PINYIN_Xi, // X
+ PINYIN_Yi, // Y
+ PINYIN_Zi, // Z
+ PINYIN_ZeroInitial, // ;
+};
+
+static const PinyinFinal __shuang_pin_zrm_final_map [][2] = /* "zrm" scheme: up to two candidate finals per key, 27 rows in the order A..Z then ';'; PINYIN_ZeroFinal marks an unused slot. */
+{
+ { PINYIN_A, PINYIN_ZeroFinal }, // A
+ { PINYIN_Ou, PINYIN_ZeroFinal }, // B
+ { PINYIN_Iao, PINYIN_ZeroFinal }, // C
+ { PINYIN_Uang,PINYIN_Iang }, // D
+ { PINYIN_E, PINYIN_ZeroFinal }, // E
+ { PINYIN_En, PINYIN_ZeroFinal }, // F
+ { PINYIN_Ng, PINYIN_Eng }, // G
+ { PINYIN_Ang, PINYIN_ZeroFinal }, // H
+ { PINYIN_I, PINYIN_ZeroFinal }, // I
+ { PINYIN_An, PINYIN_ZeroFinal }, // J
+ { PINYIN_Ao, PINYIN_ZeroFinal }, // K
+ { PINYIN_Ai, PINYIN_ZeroFinal }, // L
+ { PINYIN_Ian, PINYIN_ZeroFinal }, // M
+ { PINYIN_In, PINYIN_ZeroFinal }, // N
+ { PINYIN_Uo, PINYIN_O }, // O
+ { PINYIN_Un, PINYIN_ZeroFinal }, // P
+ { PINYIN_Iu, PINYIN_ZeroFinal }, // Q
+ { PINYIN_Uan, PINYIN_Er }, // R
+ { PINYIN_Ong, PINYIN_Iong }, // S
+ { PINYIN_Ue, PINYIN_ZeroFinal }, // T
+ { PINYIN_U, PINYIN_ZeroFinal }, // U
+ { PINYIN_V, PINYIN_Ui }, // V
+ { PINYIN_Ia, PINYIN_Ua }, // W
+ { PINYIN_Ie, PINYIN_ZeroFinal }, // X
+ { PINYIN_Ing, PINYIN_Uai }, // Y
+ { PINYIN_Ei, PINYIN_ZeroFinal }, // Z
+ { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ;
+};
+
+
+static const PinyinInitial __shuang_pin_ms_initial_map [] = /* "ms" scheme: the initial assigned to each key, 27 rows in the order A..Z then ';' (see the row comments). */
+{
+ PINYIN_ZeroInitial, // A
+ PINYIN_Bo, // B
+ PINYIN_Ci, // C
+ PINYIN_De, // D
+ PINYIN_ZeroInitial, // E
+ PINYIN_Fo, // F
+ PINYIN_Ge, // G
+ PINYIN_He, // H
+ PINYIN_Chi, // I
+ PINYIN_Ji, // J
+ PINYIN_Ke, // K
+ PINYIN_Le, // L
+ PINYIN_Mo, // M
+ PINYIN_Ne, // N
+ PINYIN_ZeroInitial, // O
+ PINYIN_Po, // P
+ PINYIN_Qi, // Q
+ PINYIN_Ri, // R
+ PINYIN_Si, // S
+ PINYIN_Te, // T
+ PINYIN_Shi, // U
+ PINYIN_Zhi, // V
+ PINYIN_Wu, // W
+ PINYIN_Xi, // X
+ PINYIN_Yi, // Y
+ PINYIN_Zi, // Z
+ PINYIN_ZeroInitial, // ;
+};
+
+static const PinyinFinal __shuang_pin_ms_final_map [][2] = /* "ms" scheme: up to two candidate finals per key, 27 rows in the order A..Z then ';'; PINYIN_ZeroFinal marks an unused slot. Unlike the other schemes in this file, the ';' row carries a final (PINYIN_Ing). */
+{
+ { PINYIN_A, PINYIN_ZeroFinal }, // A
+ { PINYIN_Ou, PINYIN_ZeroFinal }, // B
+ { PINYIN_Iao, PINYIN_ZeroFinal }, // C
+ { PINYIN_Uang,PINYIN_Iang }, // D
+ { PINYIN_E, PINYIN_ZeroFinal }, // E
+ { PINYIN_En, PINYIN_ZeroFinal }, // F
+ { PINYIN_Ng, PINYIN_Eng }, // G
+ { PINYIN_Ang, PINYIN_ZeroFinal }, // H
+ { PINYIN_I, PINYIN_ZeroFinal }, // I
+ { PINYIN_An, PINYIN_ZeroFinal }, // J
+ { PINYIN_Ao, PINYIN_ZeroFinal }, // K
+ { PINYIN_Ai, PINYIN_ZeroFinal }, // L
+ { PINYIN_Ian, PINYIN_ZeroFinal }, // M
+ { PINYIN_In, PINYIN_ZeroFinal }, // N
+ { PINYIN_Uo, PINYIN_O }, // O
+ { PINYIN_Un, PINYIN_ZeroFinal }, // P
+ { PINYIN_Iu, PINYIN_ZeroFinal }, // Q
+ { PINYIN_Uan, PINYIN_Er }, // R
+ { PINYIN_Ong, PINYIN_Iong }, // S
+ { PINYIN_Ue, PINYIN_ZeroFinal }, // T
+ { PINYIN_U, PINYIN_ZeroFinal }, // U
+ { PINYIN_V, PINYIN_Ui }, // V
+ { PINYIN_Ia, PINYIN_Ua }, // W
+ { PINYIN_Ie, PINYIN_ZeroFinal }, // X
+ { PINYIN_Uai, PINYIN_V }, // Y
+ { PINYIN_Ei, PINYIN_ZeroFinal }, // Z
+ { PINYIN_Ing, PINYIN_ZeroFinal }, // ;
+};
+
+
+static const PinyinInitial __shuang_pin_ziguang_initial_map [] = /* "ziguang" scheme: the initial assigned to each key, 27 rows in the order A..Z then ';' (see the row comments). */
+{
+ PINYIN_Chi, // A
+ PINYIN_Bo, // B
+ PINYIN_Ci, // C
+ PINYIN_De, // D
+ PINYIN_ZeroInitial, // E
+ PINYIN_Fo, // F
+ PINYIN_Ge, // G
+ PINYIN_He, // H
+ PINYIN_Shi, // I
+ PINYIN_Ji, // J
+ PINYIN_Ke, // K
+ PINYIN_Le, // L
+ PINYIN_Mo, // M
+ PINYIN_Ne, // N
+ PINYIN_ZeroInitial, // O
+ PINYIN_Po, // P
+ PINYIN_Qi, // Q
+ PINYIN_Ri, // R
+ PINYIN_Si, // S
+ PINYIN_Te, // T
+ PINYIN_Zhi, // U
+ PINYIN_ZeroInitial, // V
+ PINYIN_Wu, // W
+ PINYIN_Xi, // X
+ PINYIN_Yi, // Y
+ PINYIN_Zi, // Z
+ PINYIN_ZeroInitial, // ;
+};
+
+static const PinyinFinal __shuang_pin_ziguang_final_map [][2] = /* "ziguang" scheme: up to two candidate finals per key, 27 rows in the order A..Z then ';'; PINYIN_ZeroFinal marks an unused slot. */
+{
+ { PINYIN_A, PINYIN_ZeroFinal }, // A
+ { PINYIN_Iao, PINYIN_ZeroFinal }, // B
+ { PINYIN_Ing, PINYIN_ZeroFinal }, // C
+ { PINYIN_Ie, PINYIN_ZeroFinal }, // D
+ { PINYIN_E, PINYIN_ZeroFinal }, // E
+ { PINYIN_Ian, PINYIN_ZeroFinal }, // F
+ { PINYIN_Uang,PINYIN_Iang }, // G
+ { PINYIN_Ong, PINYIN_Iong }, // H
+ { PINYIN_I, PINYIN_ZeroFinal }, // I
+ { PINYIN_Iu, PINYIN_Er }, // J
+ { PINYIN_Ei, PINYIN_ZeroFinal }, // K
+ { PINYIN_Uan, PINYIN_ZeroFinal }, // L
+ { PINYIN_Un, PINYIN_ZeroFinal }, // M
+ { PINYIN_Ui, PINYIN_Ue }, // N
+ { PINYIN_Uo, PINYIN_O }, // O
+ { PINYIN_Ai, PINYIN_ZeroFinal }, // P
+ { PINYIN_Ao, PINYIN_ZeroFinal }, // Q
+ { PINYIN_An, PINYIN_ZeroFinal }, // R
+ { PINYIN_Ang, PINYIN_ZeroFinal }, // S
+ { PINYIN_Ng, PINYIN_Eng }, // T
+ { PINYIN_U, PINYIN_ZeroFinal }, // U
+ { PINYIN_V, PINYIN_ZeroFinal }, // V
+ { PINYIN_En, PINYIN_ZeroFinal }, // W
+ { PINYIN_Ia, PINYIN_Ua }, // X
+ { PINYIN_In, PINYIN_Uai }, // Y
+ { PINYIN_Ou, PINYIN_ZeroFinal }, // Z
+ { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ;
+};
+
+
+static const PinyinInitial __shuang_pin_abc_initial_map [] = /* "abc" scheme: the initial assigned to each key, 27 rows in the order A..Z then ';' (see the row comments). */
+{
+ PINYIN_Zhi, // A
+ PINYIN_Bo, // B
+ PINYIN_Ci, // C
+ PINYIN_De, // D
+ PINYIN_Chi, // E
+ PINYIN_Fo, // F
+ PINYIN_Ge, // G
+ PINYIN_He, // H
+ PINYIN_ZeroInitial, // I
+ PINYIN_Ji, // J
+ PINYIN_Ke, // K
+ PINYIN_Le, // L
+ PINYIN_Mo, // M
+ PINYIN_Ne, // N
+ PINYIN_ZeroInitial, // O
+ PINYIN_Po, // P
+ PINYIN_Qi, // Q
+ PINYIN_Ri, // R
+ PINYIN_Si, // S
+ PINYIN_Te, // T
+ PINYIN_ZeroInitial, // U
+ PINYIN_Shi, // V
+ PINYIN_Wu, // W
+ PINYIN_Xi, // X
+ PINYIN_Yi, // Y
+ PINYIN_Zi, // Z
+ PINYIN_ZeroInitial, // ;
+};
+
+static const PinyinFinal __shuang_pin_abc_final_map [][2] = /* "abc" scheme: up to two candidate finals per key, 27 rows in the order A..Z then ';'; PINYIN_ZeroFinal marks an unused slot. */
+{
+ { PINYIN_A, PINYIN_ZeroFinal }, // A
+ { PINYIN_Ou, PINYIN_ZeroFinal }, // B
+ { PINYIN_In, PINYIN_Uai }, // C
+ { PINYIN_Ia, PINYIN_Ua }, // D
+ { PINYIN_E, PINYIN_ZeroFinal }, // E
+ { PINYIN_En, PINYIN_ZeroFinal }, // F
+ { PINYIN_Ng, PINYIN_Eng }, // G
+ { PINYIN_Ang, PINYIN_ZeroFinal }, // H
+ { PINYIN_I, PINYIN_ZeroFinal }, // I
+ { PINYIN_An, PINYIN_ZeroFinal }, // J
+ { PINYIN_Ao, PINYIN_ZeroFinal }, // K
+ { PINYIN_Ai, PINYIN_ZeroFinal }, // L
+ { PINYIN_Ui, PINYIN_Ue }, // M
+ { PINYIN_Un, PINYIN_ZeroFinal }, // N
+ { PINYIN_Uo, PINYIN_O }, // O
+ { PINYIN_Uan, PINYIN_ZeroFinal }, // P
+ { PINYIN_Ei, PINYIN_ZeroFinal }, // Q
+ { PINYIN_Iu, PINYIN_Er }, // R
+ { PINYIN_Ong, PINYIN_Iong }, // S
+ { PINYIN_Uang,PINYIN_Iang }, // T
+ { PINYIN_U, PINYIN_ZeroFinal }, // U
+ { PINYIN_V, PINYIN_ZeroFinal }, // V
+ { PINYIN_Ian, PINYIN_ZeroFinal }, // W
+ { PINYIN_Ie, PINYIN_ZeroFinal }, // X
+ { PINYIN_Ing, PINYIN_ZeroFinal }, // Y
+ { PINYIN_Iao, PINYIN_ZeroFinal }, // Z
+ { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ;
+};
+
+
+static const PinyinInitial __shuang_pin_liushi_initial_map [] = /* "liushi" scheme: the initial assigned to each key, 27 rows in the order A..Z then ';' (see the row comments). */
+{
+ PINYIN_ZeroInitial, // A
+ PINYIN_Bo, // B
+ PINYIN_Ci, // C
+ PINYIN_De, // D
+ PINYIN_ZeroInitial, // E
+ PINYIN_Fo, // F
+ PINYIN_Ge, // G
+ PINYIN_He, // H
+ PINYIN_Chi, // I
+ PINYIN_Ji, // J
+ PINYIN_Ke, // K
+ PINYIN_Le, // L
+ PINYIN_Mo, // M
+ PINYIN_Ne, // N
+ PINYIN_ZeroInitial, // O
+ PINYIN_Po, // P
+ PINYIN_Qi, // Q
+ PINYIN_Ri, // R
+ PINYIN_Si, // S
+ PINYIN_Te, // T
+ PINYIN_Shi, // U
+ PINYIN_Zhi, // V
+ PINYIN_Wu, // W
+ PINYIN_Xi, // X
+ PINYIN_Yi, // Y
+ PINYIN_Zi, // Z
+ PINYIN_ZeroInitial, // ;
+};
+
+static const PinyinFinal __shuang_pin_liushi_final_map [][2] = /* "liushi" scheme: up to two candidate finals per key, 27 rows in the order A..Z then ';'; PINYIN_ZeroFinal marks an unused slot. */
+{
+ { PINYIN_A, PINYIN_ZeroFinal }, // A
+ { PINYIN_Ao, PINYIN_ZeroFinal }, // B
+ { PINYIN_Ang, PINYIN_ZeroFinal }, // C
+ { PINYIN_Uan, PINYIN_ZeroFinal }, // D
+ { PINYIN_E, PINYIN_ZeroFinal }, // E
+ { PINYIN_An, PINYIN_ZeroFinal }, // F
+ { PINYIN_Ong, PINYIN_Iong }, // G
+ { PINYIN_Ui, PINYIN_Ue }, // H
+ { PINYIN_I, PINYIN_ZeroFinal }, // I
+ { PINYIN_Ia, PINYIN_Ua }, // J
+ { PINYIN_Un, PINYIN_ZeroFinal }, // K
+ { PINYIN_Iu, PINYIN_ZeroFinal }, // L
+ { PINYIN_In, PINYIN_ZeroFinal }, // M
+ { PINYIN_Uang,PINYIN_Iang }, // N
+ { PINYIN_Uo, PINYIN_O }, // O
+ { PINYIN_Ng, PINYIN_Eng }, // P
+ { PINYIN_Ing, PINYIN_ZeroFinal }, // Q
+ { PINYIN_Ou, PINYIN_Er }, // R
+ { PINYIN_Ai, PINYIN_ZeroFinal }, // S
+ { PINYIN_Ian, PINYIN_ZeroFinal }, // T
+ { PINYIN_U, PINYIN_ZeroFinal }, // U
+ { PINYIN_V, PINYIN_En }, // V
+ { PINYIN_Ei, PINYIN_ZeroFinal }, // W
+ { PINYIN_Ie, PINYIN_ZeroFinal }, // X
+ { PINYIN_Uai, PINYIN_ZeroFinal }, // Y
+ { PINYIN_Iao, PINYIN_ZeroFinal }, // Z
+ { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ;
+};
+
+static const size_t __zhuyin_zhuyin_map_start_char = 0x3105; /* U+3105 is the Bopomofo letter B; presumably the character that row 0 of __zhuyin_zhuyin_map stands for - TODO confirm against the zhuyin parser */
+static const size_t __zhuyin_zhuyin_map_tone_start_idx = 37; /* equals the number of rows in __zhuyin_zhuyin_map; presumably the index at which tone entries start - TODO confirm */
+static const PinyinKey __zhuyin_zhuyin_map [][3] = /* 37 rows of three keys each: 21 keys built from an initial only, then 16 built from PINYIN_ZeroInitial plus a final; only the first key of every row is set, the other two are default-constructed. */
+{
+ {PinyinKey(PINYIN_Bo),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Po),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Mo),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Fo),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_De),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Te),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Ne),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Le),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Ge),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Ke),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_He),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Ji),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Qi),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Xi),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Zhi),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Chi),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Shi),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Ri),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Zi),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Ci),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_Si),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_A),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_O),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_E),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ea),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ai),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ei),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ao),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ou),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_An),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_En),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ang),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_Eng),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_Er),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_I),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_U),PinyinKey(),PinyinKey()},
+ {PinyinKey(PINYIN_ZeroInitial,PINYIN_V),PinyinKey(),PinyinKey()},
+};
+
+static const size_t __zhuyin_map_start_char = 0x20; /* 0x20 is the ASCII space; presumably the first character covered by the keyboard maps listed below - TODO confirm */
+#include "pinyin_zhuyin_map_data.h"
+
+static const PinyinKey (*__zhuyin_maps []) [3] = { /* 0-terminated list of zhuyin maps; all but __zhuyin_zhuyin_map are defined outside this chunk (presumably in pinyin_zhuyin_map_data.h) */
+ __zhuyin_zhuyin_map,
+ __zhuyin_standard_map,
+ __zhuyin_hsu_map,
+ __zhuyin_ibm_map,
+ __zhuyin_gin_yieh_map,
+ __zhuyin_et_map,
+ __zhuyin_et26_map,
+ 0
+};
+
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of PinyinCustomSettings
+
+/* Default settings: incomplete pinyin enabled, every ambiguity disabled. */
+PinyinCustomSettings::PinyinCustomSettings ()
+    : use_incomplete (true)
+{
+    size_t amb = 0;
+    while (amb <= PINYIN_AmbLast) {
+        use_ambiguities [amb] = false;
+        ++amb;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of PinyinKey
+
+const guint16 PinyinKey::min_value = 0; /* smallest key value */
+const guint16 PinyinKey::max_value = PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones - 1; /* largest key value: number of (initial, final, tone) combinations minus one */
+
+/* Latin spelling of the initial of this key. */
+const char*
+PinyinKey::get_initial_string () const
+{
+    const PinyinToken &token = __pinyin_initials [m_initial];
+    return token.latin;
+}
+
+/* Zhuyin spelling of the initial of this key.
+ * "w" before the final u, and "y" before i, in, ing, ong, u, ue, uan and
+ * un, are not written: the empty string is returned for them.
+ */
+const char*
+PinyinKey::get_initial_zhuyin_string () const
+{
+    if (m_initial == PINYIN_Wu) {
+        if (m_final == PINYIN_U)
+            return "";
+    } else if (m_initial == PINYIN_Yi) {
+        const bool silent =
+            m_final == PINYIN_I   || m_final == PINYIN_In  ||
+            m_final == PINYIN_Ing || m_final == PINYIN_Ong ||
+            m_final == PINYIN_U   || m_final == PINYIN_Ue  ||
+            m_final == PINYIN_Uan || m_final == PINYIN_Un;
+        if (silent)
+            return "";
+    }
+
+    return __pinyin_initials [m_initial].zhuyin;
+}
+
+/* Latin spelling of the final of this key. */
+const char*
+PinyinKey::get_final_string () const
+{
+    const PinyinToken &token = __pinyin_finals [m_final];
+    return token.latin;
+}
+
+/* Zhuyin spelling of the final of this key.
+ * Depending on the initial, some finals are spelled as another final:
+ *   y + ong                    -> iong
+ *   y/j/q/x + u, ue, uan, un   -> v, ve, van, vn
+ *   y + e                      -> ea
+ *   n/l + ue                   -> ve
+ *   zh/ch/sh/z/c/s/r + i       -> "" (nothing is written)
+ * Every other combination uses the final's own spelling.
+ */
+const char*
+PinyinKey::get_final_zhuyin_string () const
+{
+    const bool after_yi = (m_initial == PINYIN_Yi);
+
+    if (after_yi && m_final == PINYIN_Ong)
+        return __pinyin_finals [PINYIN_Iong].zhuyin;
+
+    if (after_yi || m_initial == PINYIN_Ji ||
+        m_initial == PINYIN_Qi || m_initial == PINYIN_Xi) {
+        if (m_final == PINYIN_U)
+            return __pinyin_finals [PINYIN_V].zhuyin;
+        if (m_final == PINYIN_Ue)
+            return __pinyin_finals [PINYIN_Ve].zhuyin;
+        if (m_final == PINYIN_Uan)
+            return __pinyin_finals [PINYIN_Van].zhuyin;
+        if (m_final == PINYIN_Un)
+            return __pinyin_finals [PINYIN_Vn].zhuyin;
+        if (after_yi && m_final == PINYIN_E)
+            return __pinyin_finals [PINYIN_Ea].zhuyin;
+        return __pinyin_finals [m_final].zhuyin;
+    }
+
+    if (m_final == PINYIN_Ue &&
+        (m_initial == PINYIN_Ne || m_initial == PINYIN_Le))
+        return __pinyin_finals [PINYIN_Ve].zhuyin;
+
+    if (m_final == PINYIN_I &&
+        (m_initial == PINYIN_Zhi || m_initial == PINYIN_Chi ||
+         m_initial == PINYIN_Shi || m_initial == PINYIN_Zi ||
+         m_initial == PINYIN_Ci || m_initial == PINYIN_Si ||
+         m_initial == PINYIN_Ri))
+        return "";
+
+    return __pinyin_finals [m_final].zhuyin;
+}
+
+/* Latin (digit) spelling of the tone of this key. */
+const char*
+PinyinKey::get_tone_string () const
+{
+    const PinyinToken &token = __pinyin_tones [m_tone];
+    return token.latin;
+}
+
+/* Zhuyin tone mark of this key. */
+const char*
+PinyinKey::get_tone_zhuyin_string () const
+{
+    const PinyinToken &token = __pinyin_tones [m_tone];
+    return token.zhuyin;
+}
+
+/* Latin spelling of the whole key: initial, final and tone concatenated.
+ * The result is a copy made with g_strdup.
+ */
+const char *
+PinyinKey::get_key_string () const
+{
+    char buffer [16];
+    const char *initial_part = get_initial_string ();
+    const char *final_part = get_final_string ();
+    const char *tone_part = get_tone_string ();
+
+    g_snprintf (buffer, 15, "%s%s%s", initial_part, final_part, tone_part);
+
+    return g_strdup (buffer);
+}
+
+/* Zhuyin spelling of the whole key: initial, final and tone concatenated.
+ * The result is a copy made with g_strdup.
+ */
+const char *
+PinyinKey::get_key_zhuyin_string () const
+{
+    char buffer [32];
+    const char *initial_part = get_initial_zhuyin_string ();
+    const char *final_part = get_final_zhuyin_string ();
+    const char *tone_part = get_tone_zhuyin_string ();
+
+    g_snprintf (buffer, 31, "%s%s%s", initial_part, final_part, tone_part);
+
+    return g_strdup (buffer);
+}
+
+/* Parse one key from str (at most len characters) into this object, using
+ * a PinyinDefaultParser.  Returns 0 without parsing when str is NULL or
+ * empty, otherwise whatever parse_one_key returns.
+ */
+int
+PinyinKey::set (const PinyinValidator &validator, const char *str, int len)
+{
+    const bool has_text = str && *str;
+    if (!has_text)
+        return 0;
+
+    PinyinDefaultParser default_parser;
+    return default_parser.parse_one_key (validator, *this, str, len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of PinyinValidator
+BitmapPinyinValidator::BitmapPinyinValidator (const PinyinLargeTable *table)
+{
+ initialize (table); /* build the bitmap from table; a NULL table leaves it all zero */
+}
+
+/* Rebuild the bitmap: one bit per key value, set when table does NOT
+ * contain the key.  With a NULL table the bitmap stays all zero.
+ */
+void
+BitmapPinyinValidator::initialize (const PinyinLargeTable *table)
+{
+    memset (m_bitmap, 0, sizeof (m_bitmap));
+
+    if (!table)
+        return;
+
+    for (guint16 val = 0; val <= PinyinKey::max_value; ++val) {
+        if (table->has_key (PinyinKey (val)))
+            continue;
+        /* byte val / 8, bit val mod 8 */
+        m_bitmap [val / 8] |= (1 << (val & 7));
+    }
+}
+
+/* A key is accepted when it is not empty and its bit in the bitmap is
+ * clear.
+ */
+bool
+BitmapPinyinValidator::operator () (PinyinKey key) const
+{
+    if (key.is_empty ())
+        return false;
+
+    const guint16 val = key.get_value ();
+    const int mask = 1 << (val & 7);
+
+    return !(m_bitmap [val / 8] & mask);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of PinyinParser
+PinyinParser::~PinyinParser () /* out-of-line destructor with an empty body */
+{
+}
+
+struct PinyinReplaceRulePair /* one rewrite rule of PinyinParser::normalize */
+{
+ PinyinInitial initial; /* initial to match */
+ PinyinFinal final; /* final to match */
+ PinyinInitial new_initial; /* initial set on a match */
+ PinyinFinal new_final; /* final set on a match */
+};
+
+/* Orders rules by initial first and by final second; the replacement
+ * fields do not take part in the comparison.
+ */
+class PinyinReplaceRulePairLessThan
+{
+public:
+    bool operator () (const PinyinReplaceRulePair &lhs, const PinyinReplaceRulePair &rhs) const {
+        if (lhs.initial != rhs.initial)
+            return lhs.initial < rhs.initial;
+        return lhs.final < rhs.final;
+    }
+};
+
+/* Rewrite key to its normalized spelling.
+ *
+ * The (initial, final) pair of key is looked up in a rule table; on a
+ * match both are replaced by the rule's new values.  The active rules
+ * turn v, van, ve and vn after j, q and x into u, uan, ue and un, and
+ * ve after n and l into ue.  Keys without a matching rule stay unchanged.
+ *
+ * The table is searched with lower_bound, so its rows have to stay
+ * ordered by PinyinReplaceRulePairLessThan (initial, then final).
+ */
+void
+PinyinParser::normalize (PinyinKey &key)
+{
+    static const PinyinReplaceRulePair rules [] =
+    {
+#if 0
+        {PINYIN_ZeroInitial, PINYIN_I, PINYIN_Yi, PINYIN_I},
+        {PINYIN_ZeroInitial, PINYIN_Ia, PINYIN_Yi, PINYIN_A},
+        {PINYIN_ZeroInitial, PINYIN_Ian, PINYIN_Yi, PINYIN_An},
+        {PINYIN_ZeroInitial, PINYIN_Iang, PINYIN_Yi, PINYIN_Ang},
+        {PINYIN_ZeroInitial, PINYIN_Iao, PINYIN_Yi, PINYIN_Ao},
+        {PINYIN_ZeroInitial, PINYIN_Ie, PINYIN_Yi, PINYIN_E},
+        {PINYIN_ZeroInitial, PINYIN_In, PINYIN_Yi, PINYIN_In},
+        {PINYIN_ZeroInitial, PINYIN_Ing, PINYIN_Yi, PINYIN_Ing},
+        {PINYIN_ZeroInitial, PINYIN_Iong, PINYIN_Yi, PINYIN_Ong},
+        {PINYIN_ZeroInitial, PINYIN_Iu, PINYIN_Yi, PINYIN_Ou},
+        {PINYIN_ZeroInitial, PINYIN_U, PINYIN_Wu, PINYIN_U},
+        {PINYIN_ZeroInitial, PINYIN_Ua, PINYIN_Wu, PINYIN_A},
+        {PINYIN_ZeroInitial, PINYIN_Uai, PINYIN_Wu, PINYIN_Ai},
+        {PINYIN_ZeroInitial, PINYIN_Uan, PINYIN_Wu, PINYIN_An},
+        {PINYIN_ZeroInitial, PINYIN_Uang, PINYIN_Wu, PINYIN_Ang},
+        {PINYIN_ZeroInitial, PINYIN_Ue, PINYIN_Wu, PINYIN_E},
+        {PINYIN_ZeroInitial, PINYIN_Ueng, PINYIN_Wu, PINYIN_Eng},
+        {PINYIN_ZeroInitial, PINYIN_Ui, PINYIN_Wu, PINYIN_Ei},
+        {PINYIN_ZeroInitial, PINYIN_Un, PINYIN_Wu, PINYIN_En},
+        {PINYIN_ZeroInitial, PINYIN_Uo, PINYIN_Wu, PINYIN_O},
+        {PINYIN_ZeroInitial, PINYIN_V, PINYIN_Yi, PINYIN_U},
+        {PINYIN_ZeroInitial, PINYIN_Van, PINYIN_Yi, PINYIN_Uan},
+        {PINYIN_ZeroInitial, PINYIN_Ve, PINYIN_Yi, PINYIN_Ue},
+        {PINYIN_ZeroInitial, PINYIN_Vn, PINYIN_Yi, PINYIN_Un},
+#endif
+        {PINYIN_Ji, PINYIN_V, PINYIN_Ji, PINYIN_U},
+        {PINYIN_Ji, PINYIN_Van, PINYIN_Ji, PINYIN_Uan},
+        {PINYIN_Ji, PINYIN_Ve, PINYIN_Ji, PINYIN_Ue},
+        {PINYIN_Ji, PINYIN_Vn, PINYIN_Ji, PINYIN_Un},
+        {PINYIN_Ne, PINYIN_Ve, PINYIN_Ne, PINYIN_Ue},
+        {PINYIN_Le, PINYIN_Ve, PINYIN_Le, PINYIN_Ue},
+        {PINYIN_Qi, PINYIN_V, PINYIN_Qi, PINYIN_U},
+        {PINYIN_Qi, PINYIN_Van, PINYIN_Qi, PINYIN_Uan},
+        {PINYIN_Qi, PINYIN_Ve, PINYIN_Qi, PINYIN_Ue},
+        {PINYIN_Qi, PINYIN_Vn, PINYIN_Qi, PINYIN_Un},
+        {PINYIN_Xi, PINYIN_V, PINYIN_Xi, PINYIN_U},
+        {PINYIN_Xi, PINYIN_Van, PINYIN_Xi, PINYIN_Uan},
+        {PINYIN_Xi, PINYIN_Ve, PINYIN_Xi, PINYIN_Ue},
+        {PINYIN_Xi, PINYIN_Vn, PINYIN_Xi, PINYIN_Un}
+    };
+    static const PinyinReplaceRulePair *rules_start = rules;
+    static const PinyinReplaceRulePair *rules_end = rules + sizeof(rules)/sizeof(PinyinReplaceRulePair);
+
+    PinyinReplaceRulePair kp;
+
+    kp.initial = key.get_initial ();
+    kp.final = key.get_final ();
+
+    const PinyinReplaceRulePair *p = std_lite::lower_bound (rules_start, rules_end, kp, PinyinReplaceRulePairLessThan ());
+
+    /* lower_bound returns rules_end when every rule sorts before kp;
+     * that position is one past the table and must not be read.
+     */
+    if (p != rules_end && p->initial == kp.initial && p->final == kp.final) {
+        key.set_initial (p->new_initial);
+        key.set_final (p->new_final);
+    }
+}
+
+//============== Internal functions used by PinyinDefaultParser ==============
+/* Find the longest initial at the start of str.
+ *
+ * len limits how many characters of str may be used; a negative len means
+ * no limit.  initial receives the match, or PINYIN_ZeroInitial when there
+ * is none.  Returns the length of the matched spelling (0 for no match).
+ */
+static int
+__default_parser_parse_initial (PinyinInitial &initial, const char *str, int len)
+{
+    initial = PINYIN_ZeroInitial;
+
+    if (!str || *str < 'a' || *str > 'z')
+        return 0;
+
+    const PinyinTokenIndex &slot = __pinyin_initials_index [*str - 'a'];
+    if (slot.start <= 0)
+        return 0;
+
+    int matched = 0;
+    const int stop = slot.start + slot.num;
+
+    for (int cand = slot.start; cand < stop; ++cand) {
+        const PinyinToken &token = __pinyin_initials [cand];
+
+        /* too long for the input, or shorter than the match so far */
+        if (len >= 0 && len < token.latin_len)
+            continue;
+        if (token.latin_len < matched)
+            continue;
+
+        /* the first letter is already known to match via the index */
+        int pos = 1;
+        while (pos < token.latin_len && str [pos] == token.latin [pos])
+            ++pos;
+
+        if (pos == token.latin_len) {
+            initial = static_cast<PinyinInitial>(cand);
+            matched = token.latin_len;
+        }
+    }
+
+    return matched;
+}
+/* Find the longest final at the start of str.
+ *
+ * len limits how many characters of str may be used; a negative len means
+ * no limit.  final receives the match, or PINYIN_ZeroFinal when there is
+ * none.  Returns the length of the matched spelling (0 for no match).
+ */
+static int
+__default_parser_parse_final (PinyinFinal &final, const char *str, int len)
+{
+    final = PINYIN_ZeroFinal;
+
+    if (!str || *str < 'a' || *str > 'z')
+        return 0;
+
+    const PinyinTokenIndex &slot = __pinyin_finals_index [*str - 'a'];
+    if (slot.start <= 0)
+        return 0;
+
+    int matched = 0;
+    const int stop = slot.start + slot.num;
+
+    for (int cand = slot.start; cand < stop; ++cand) {
+        const PinyinToken &token = __pinyin_finals [cand];
+
+        /* too long for the input, or shorter than the match so far */
+        if (len >= 0 && len < token.latin_len)
+            continue;
+        if (token.latin_len < matched)
+            continue;
+
+        /* the first letter is already known to match via the index */
+        int pos = 1;
+        while (pos < token.latin_len && str [pos] == token.latin [pos])
+            ++pos;
+
+        if (pos == token.latin_len) {
+            final = static_cast<PinyinFinal>(cand);
+            matched = token.latin_len;
+        }
+    }
+
+    return matched;
+}
+/**
+ * Read a single tone digit from the start of str.
+ *
+ * @param tone receives the tone, or PINYIN_ZeroTone when no digit in the
+ *             range PINYIN_First..PINYIN_LastTone is present.
+ * @param str  input text; may be NULL.
+ * @param len  number of usable chars; 0 means nothing to read, a negative
+ *             value means "no limit".
+ * @return 1 when a tone digit was consumed, otherwise 0.
+ */
+static int
+__default_parser_parse_tone (PinyinTone &tone, const char *str, int len)
+{
+    tone = PINYIN_ZeroTone;
+
+    if (!str || len == 0)
+        return 0;
+
+    const int digit = (*str) - '0';
+    if (digit < PINYIN_First || digit > PINYIN_LastTone)
+        return 0;
+
+    tone = static_cast<PinyinTone>(digit);
+    return 1;
+}
+
+/**
+ * Parse the longest prefix of str that forms a key accepted by validator.
+ *
+ * Each attempt reads a final, or (when no final starts here) an initial
+ * followed by an optional final, then an optional tone digit. The key is
+ * normalized and validated. On rejection the input window is shrunk to one
+ * char less than what the attempt consumed and the attempt is repeated.
+ *
+ * @param validator decides whether a candidate key is acceptable.
+ * @param key       receives the parsed key; left cleared when nothing valid
+ *                  is found.
+ * @param str       input text.
+ * @param len       number of chars of str to consider; negative means
+ *                  strlen (str).
+ * @return the number of chars consumed by the key, 0 if none.
+ */
+static int
+__default_parser_parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1)
+{
+ int initial_len = 0;
+ int final_len = 0;
+ int tone_len = 0;
+
+ const char *ptr;
+
+ PinyinInitial initial;
+ PinyinFinal final;
+ PinyinTone tone;
+
+ key.clear ();
+
+ if (!str || !len) return 0;
+
+ if (len < 0) len = strlen (str);
+
+ while (len > 0) {
+ ptr = str;
+
+ initial = PINYIN_ZeroInitial;
+ final = PINYIN_ZeroFinal;
+ tone = PINYIN_ZeroTone;
+
+ // First try to read a final directly (a key without an initial).
+ final_len = __default_parser_parse_final (final, ptr, len);
+ ptr += final_len;
+ len -= final_len;
+
+ // No final starts here, so try an initial followed by a final.
+ if (final == PINYIN_ZeroFinal) {
+ initial_len = __default_parser_parse_initial (initial, ptr, len);
+ ptr += initial_len;
+ len -= initial_len;
+ if (len){
+ final_len = __default_parser_parse_final (final, ptr, len);
+ ptr += final_len;
+ len -= final_len;
+ }
+ }
+
+ if (len)
+ tone_len = __default_parser_parse_tone (tone, ptr, len);
+
+ key.set (initial, final, tone);
+
+ PinyinParser::normalize (key);
+
+ // A valid key was found, return.
+ if (validator (key)) break;
+
+ // The key is invalid, reduce the len and find again.
+ // The new window is one char shorter than what this attempt consumed,
+ // so the loop ends once an attempt consumed at most one char.
+ len = initial_len + final_len + tone_len - 1;
+
+ initial_len = final_len = tone_len = 0;
+
+ key.clear ();
+ }
+
+ // The lengths are non-zero only if the loop was left through the break above.
+ len = initial_len + final_len + tone_len;
+
+ return len;
+}
+
+/**
+ * Memo entry of __default_parser_parse_recursive, one per input position.
+ */
+struct DefaultParserCacheElement
+{
+ PinyinKey key; /* first key of the best sequence starting at this position */
+ PinyinKeyPos pos; /* position and length of that key in the input */
+ int num_keys; /* number of keys in the best sequence; negative = not computed yet */
+ int parsed_len; /* chars consumed by the best sequence, including a skipped separator */
+ int next_start; /* cache index of the element holding the next key */
+};
+
+typedef GArray* DefaultParserCache; /* Array of DefaultParserCacheElement */
+
+/**
+ * Memoized recursive search for the best segmentation of str into keys.
+ *
+ * The longest valid first key is parsed, then the rest of the input is
+ * parsed recursively. When the first key ends in a letter that could also
+ * open the next key, a shorter first key is tried as well and kept if the
+ * resulting sequence is at least as long and uses no more keys.
+ *
+ * @param validator  decides whether a candidate key is acceptable.
+ * @param cache      one DefaultParserCacheElement per input position;
+ *                   entries with num_keys < 0 are not computed yet.
+ * @param real_start receives the cache index of the first key; this is
+ *                   start, or start + 1 when a leading separator was skipped.
+ * @param num_keys   receives the number of keys in the best sequence.
+ * @param str        text to parse, beginning at input position start.
+ * @param len        number of chars of str to consider.
+ * @param start      input position of str [0], used as the cache index.
+ * @return the number of chars consumed, including a skipped separator.
+ *         When 0 is returned right at the end of input, real_start and
+ *         num_keys are left untouched.
+ */
+static int
+__default_parser_parse_recursive (const PinyinValidator &validator,
+                                  DefaultParserCache &cache,
+                                  int &real_start,
+                                  int &num_keys,
+                                  const char *str,
+                                  int len,
+                                  int start)
+{
+    if (*str == 0 || len == 0) return 0;
+
+    int used_len = 0;
+
+    real_start = 0;
+    num_keys = 0;
+
+    // Skip one separator in front of the key.
+    if (*str == '\'' || *str == ' ') {
+        ++used_len;
+        ++str;
+        ++start;
+        --len;
+    }
+
+    // Check the remaining length first so *str is only read while input is
+    // left. isalpha () requires a value representable as unsigned char; a
+    // negative plain char (non-ASCII byte) would be undefined behavior.
+    if (!len || !isalpha (static_cast<unsigned char>(*str)))
+        return 0;
+
+    real_start = start;
+
+    // The best keys starting from this position have been found already,
+    // just return the result.
+    DefaultParserCacheElement* element = &g_array_index
+        (cache, DefaultParserCacheElement, start);
+
+    if (element->num_keys >=0) {
+        num_keys = element->num_keys;
+        return element->parsed_len;
+    }
+
+    PinyinKey first_key;
+    PinyinKey best_first_key;
+    PinyinKeyPos pos;
+
+    int first_len = 0;
+    int best_first_len = 0;
+
+    int remained_len = 0;
+    int best_remained_len = 0;
+
+    int remained_keys = 0;
+    int best_remained_keys = 0;
+
+    int remained_start = 0;
+    int best_remained_start = 0;
+
+    first_len = __default_parser_parse_one_key (validator, first_key, str, len);
+
+    // No key starts here: remember the failure so it is not retried.
+    if (!first_len) {
+        element = &g_array_index(cache, DefaultParserCacheElement, start);
+
+        element->key = PinyinKey ();
+        element->num_keys = 0;
+        element->parsed_len = 0;
+        element->next_start = start;
+        return 0;
+    }
+
+    best_first_key = first_key;
+    best_first_len = first_len;
+
+    if (len > first_len) {
+        char ch1 = str [first_len -1];
+        char ch2 = str [first_len];
+
+        best_remained_len = __default_parser_parse_recursive (validator,
+                                                              cache,
+                                                              best_remained_start,
+                                                              best_remained_keys,
+                                                              str + first_len,
+                                                              len - first_len,
+                                                              start + first_len);
+
+        // For those keys which the last char is 'g' or 'n' or 'r', try put the end char into the next key.
+        if (first_len > 1 &&
+            (((ch1=='g' || ch1=='n' || ch1=='r') && (ch2=='a' || ch2=='e' || ch2=='i' || ch2=='o' || ch2=='u' || ch2=='v')) ||
+             ((ch1=='a' || ch1=='e' || ch1=='o') && (ch2=='i' || ch2=='n' || ch2=='o' || ch2=='r' || ch2=='u')))) {
+
+            first_len = __default_parser_parse_one_key (validator, first_key, str, first_len - 1);
+
+            if (first_len) {
+                remained_len = __default_parser_parse_recursive (validator,
+                                                                 cache,
+                                                                 remained_start,
+                                                                 remained_keys,
+                                                                 str + first_len,
+                                                                 len - first_len,
+                                                                 start + first_len);
+
+                // Only referenced by the disabled (#if 0) heuristic below.
+                DefaultParserCacheElement* best_remained_element = &g_array_index
+                    (cache, DefaultParserCacheElement, best_remained_start);
+
+                // A better seq was found.
+                if (remained_len != 0 && (remained_len + first_len) >= (best_remained_len + best_first_len) &&
+                    (remained_keys <= best_remained_keys || best_remained_keys == 0)) {
+#if 0
+                    if ((remained_len + first_len) > (best_remained_len + best_first_len) ||
+                        remained_keys < best_remained_keys ||
+                        best_remained_element->key.get_final () == PINYIN_ZeroFinal ||
+                        best_remained_element->key.get_initial () == PINYIN_Wu ||
+                        best_remained_element->key.get_initial () == PINYIN_Yi) {
+#endif
+                        best_first_len = first_len;
+                        best_first_key = first_key;
+                        best_remained_len = remained_len;
+                        best_remained_keys = remained_keys;
+                        best_remained_start = remained_start;
+#if 0
+                    }
+#endif
+                }
+            }
+        }
+    }
+
+    num_keys = best_remained_keys + 1;
+
+    // Record the best sequence for this position.
+    element = &g_array_index
+        (cache, DefaultParserCacheElement, start);
+
+    pos.set_pos(start);
+    pos.set_length(best_first_len);
+
+    element->key = best_first_key;
+    element->pos = pos;
+    element->num_keys = num_keys;
+    element->parsed_len = used_len + best_first_len + best_remained_len;
+    element->next_start = best_remained_start;
+
+    return element->parsed_len;
+}
+//============================================================================
+
+// Empty destructor; nothing is released here.
+PinyinDefaultParser::~PinyinDefaultParser ()
+{
+}
+
+// Parse a single full-pinyin key from str; forwards to
+// __default_parser_parse_one_key and returns the number of chars it consumed.
+int
+PinyinDefaultParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const
+{
+ return __default_parser_parse_one_key (validator, key, str, len);
+}
+
+/**
+ * Split a full-pinyin string into keys.
+ *
+ * @param validator decides whether a candidate key is acceptable.
+ * @param keys      emptied, then filled with the parsed PinyinKey values.
+ * @param poses     emptied, then filled with one PinyinKeyPos per key.
+ * @param str       input text; may be NULL.
+ * @param len       number of chars to parse; negative means strlen (str).
+ * @return the number of chars consumed.
+ */
+int
+PinyinDefaultParser::parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len) const
+{
+    g_array_set_size(keys, 0);
+    g_array_set_size(poses, 0);
+
+    if (!str || !len) return 0;
+
+    if (len < 0) len = strlen (str);
+
+    // Template entry: a negative num_keys marks "not computed yet".
+    DefaultParserCacheElement elm;
+
+    elm.num_keys = -1;
+    elm.parsed_len = 0;
+    elm.next_start = 0;
+
+    // One memo entry per input position.
+    DefaultParserCache cache = g_array_new (FALSE, TRUE, sizeof (DefaultParserCacheElement));
+    g_array_set_size(cache, len);
+    for (int index = 0; index < len; ++index)
+        g_array_index(cache, DefaultParserCacheElement, index) = elm;
+
+    int start = 0;
+    int num_keys = 0;
+
+    len = __default_parser_parse_recursive (validator, cache, start, num_keys, str, len, 0);
+
+    // Follow the next_start chain to collect the chosen keys in order.
+    for (int i = 0; i < num_keys; ++i) {
+        DefaultParserCacheElement* element = &g_array_index
+            (cache, DefaultParserCacheElement, start);
+        g_array_append_val(keys, element->key);
+        g_array_append_val(poses, element->pos);
+        start = element->next_start;
+    }
+
+    // The results were copied into keys/poses; release the memo table,
+    // which would otherwise leak on every call.
+    g_array_free (cache, TRUE);
+
+    return len;
+}
+
+// Construct a parser using one of the predefined ShuangPin schemes.
+PinyinShuangPinParser::PinyinShuangPinParser (PinyinShuangPinScheme scheme)
+{
+ set_scheme (scheme);
+}
+
+// Construct a parser from caller-supplied maps; both maps are copied.
+PinyinShuangPinParser::PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2])
+{
+ set_scheme (initial_map, final_map);
+}
+
+// Empty destructor; nothing is released here.
+PinyinShuangPinParser::~PinyinShuangPinParser ()
+{
+}
+
+/**
+ * Parse one ShuangPin key from str.
+ *
+ * Input chars 'a'..'z' map to slots 0..25 and ';' to slot 26 of the scheme
+ * maps. A two-char key (initial + final) is tried first, then a one-char
+ * key made of a final only. A following tone digit is consumed when the
+ * toned key validates.
+ *
+ * @param validator decides whether a candidate key is acceptable.
+ * @param key       receives the normalized key; empty when 0 is returned.
+ * @param str       input text; may be NULL.
+ * @param len       number of chars to consider; negative means strlen (str).
+ * @return the number of chars consumed (0..3).
+ */
+int
+PinyinShuangPinParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const
+{
+    key.clear ();
+
+    if (!str || !len || ! (*str)) return 0;
+
+    if (len < 0) len = strlen (str);
+
+    PinyinInitial initial = PINYIN_ZeroInitial;
+    PinyinFinal final = PINYIN_ZeroFinal;
+    PinyinFinal final_cands [4] = { PINYIN_ZeroFinal, PINYIN_ZeroFinal, PINYIN_ZeroFinal, PINYIN_ZeroFinal };
+
+    PinyinTone tone = PINYIN_ZeroTone;
+
+    int idx [2] = {-1, -1};
+    int used_len = 0;
+
+    size_t i;
+    bool matched = false;
+
+    // Map the first two chars to scheme slots; -1 marks an unusable char.
+    for (i = 0; i < 2 && i < (size_t) len; ++i) {
+        if (str [i] >= 'a' && str [i] <= 'z') idx [i] = str [i] - 'a';
+        else if (str [i] == ';') idx [i] = 26;
+    }
+
+    // parse initial or final
+    if (idx [0] >= 0) {
+        initial = m_initial_map [idx[0]];
+        final_cands [0] = m_final_map [idx[0]][0];
+        final_cands [1] = m_final_map [idx[0]][1];
+    }
+
+    if (initial == PINYIN_ZeroInitial && final_cands [0] == PINYIN_ZeroFinal)
+        return 0;
+
+    // parse final, if str [0] == 'o' (idx [0] == 14) then just skip to parse final.
+    if (idx [1] >= 0 && (initial != PINYIN_ZeroInitial || idx[0] == 14)) {
+        final_cands [2] = m_final_map [idx [1]][0];
+        final_cands [3] = m_final_map [idx [1]][1];
+
+        for (i = 2; i < 4; ++i) {
+            if (final_cands [i] != PINYIN_ZeroFinal) {
+                key.set (initial, final_cands [i]);
+                PinyinParser::normalize (key);
+
+                if (validator (key)) {
+                    final = final_cands [i];
+                    matched = true;
+                    used_len = 2;
+                    str += 2;
+                    len -= 2;
+                    break;
+                }
+            }
+        }
+    }
+
+    // No two-char key: try the first char alone as a final without initial.
+    if (!matched) {
+        initial = PINYIN_ZeroInitial;
+        for (i = 0; i < 2; ++i) {
+            key.set (initial, final_cands [i]);
+            PinyinParser::normalize (key);
+
+            if (validator (key)) {
+                final = final_cands [i];
+                matched = true;
+                used_len = 1;
+                ++str;
+                --len;
+                break;
+            }
+        }
+    }
+
+    if (!matched) {
+        // Do not hand a rejected candidate back to the caller.
+        key.clear ();
+        return 0;
+    }
+
+    // parse tone
+    if (len) {
+        int kt = (*str) - '0';
+        if (kt >= PINYIN_First && kt <= PINYIN_LastTone) {
+            tone = static_cast<PinyinTone>(kt);
+
+            // Validate the toned key in normalized form, like every other
+            // candidate in this function.
+            key.set (initial, final, tone);
+            PinyinParser::normalize (key);
+
+            if (validator (key)) {
+                return used_len + 1;
+            }
+
+            // Tone rejected and not consumed: restore the toneless key
+            // that was validated above.
+            key.set (initial, final);
+            PinyinParser::normalize (key);
+        }
+    }
+
+    return used_len;
+}
+
+/**
+ * Split a ShuangPin string into keys.
+ *
+ * Separators ('\'' and ' ') are skipped. Parsing stops at the first
+ * position where no key can be parsed.
+ *
+ * @param validator decides whether a candidate key is acceptable.
+ * @param keys      emptied, then filled with the parsed PinyinKey values.
+ * @param poses     emptied, then filled with one PinyinKeyPos per key.
+ * @param str       input text; may be NULL.
+ * @param len       number of chars to parse; negative means strlen (str).
+ * @return the number of chars consumed, separators included.
+ */
+int
+PinyinShuangPinParser::parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len) const
+{
+    g_array_set_size(keys, 0);
+    g_array_set_size(poses, 0);
+
+    if (!str || !len || ! (*str)) return 0;
+
+    if (len < 0) len = strlen (str);
+
+    int used_len = 0;
+
+    PinyinKey key;
+    PinyinKeyPos pos;
+
+    while (used_len < len) {
+        if (*str == '\'' || *str == ' ') {
+            ++str;
+            ++used_len;
+            continue;
+        }
+
+        // Pass only the unread remainder, so the per-key parser never looks
+        // beyond the len chars the caller asked for.
+        int one_len = parse_one_key (validator, key, str, len - used_len);
+
+        if (!one_len)
+            break;
+
+        pos.set_pos(used_len);
+        pos.set_length(one_len);
+        g_array_append_val(keys, key);
+        g_array_append_val(poses, pos);
+
+        str += one_len;
+        used_len += one_len;
+    }
+
+    return used_len;
+}
+
+/**
+ * Load the maps of a predefined ShuangPin scheme.
+ *
+ * Any scheme value without its own case below selects the ZRM maps.
+ */
+void
+PinyinShuangPinParser::set_scheme (PinyinShuangPinScheme scheme)
+{
+    // Start from the fallback (ZRM) and override for the other schemes.
+    const PinyinInitial *initials = __shuang_pin_zrm_initial_map;
+    const PinyinFinal (*finals)[2] = __shuang_pin_zrm_final_map;
+
+    switch (scheme) {
+    case SHUANG_PIN_STONE:
+        initials = __shuang_pin_stone_initial_map;
+        finals = __shuang_pin_stone_final_map;
+        break;
+    case SHUANG_PIN_MS:
+        initials = __shuang_pin_ms_initial_map;
+        finals = __shuang_pin_ms_final_map;
+        break;
+    case SHUANG_PIN_ZIGUANG:
+        initials = __shuang_pin_ziguang_initial_map;
+        finals = __shuang_pin_ziguang_final_map;
+        break;
+    case SHUANG_PIN_ABC:
+        initials = __shuang_pin_abc_initial_map;
+        finals = __shuang_pin_abc_final_map;
+        break;
+    case SHUANG_PIN_LIUSHI:
+        initials = __shuang_pin_liushi_initial_map;
+        finals = __shuang_pin_liushi_final_map;
+        break;
+    case SHUANG_PIN_ZRM:
+    default:
+        break;
+    }
+
+    set_scheme (initials, finals);
+}
+
+/**
+ * Copy caller-supplied scheme maps into the parser.
+ *
+ * @param initial_map 27 initials, one per input slot.
+ * @param final_map   27 pairs of candidate finals, one pair per input slot.
+ */
+void
+PinyinShuangPinParser::set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2])
+{
+    for (size_t slot = 0; slot != 27; ++slot) {
+        m_initial_map [slot] = initial_map [slot];
+        for (size_t alt = 0; alt != 2; ++alt)
+            m_final_map [slot][alt] = final_map [slot][alt];
+    }
+}
+
+/**
+ * Copy the parser's current scheme maps into caller-supplied arrays.
+ *
+ * @param initial_map receives 27 initials, one per input slot.
+ * @param final_map   receives 27 pairs of candidate finals.
+ */
+void
+PinyinShuangPinParser::get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2])
+{
+    for (size_t slot = 0; slot != 27; ++slot) {
+        initial_map [slot] = m_initial_map [slot];
+        for (size_t alt = 0; alt != 2; ++alt)
+            final_map [slot][alt] = m_final_map [slot][alt];
+    }
+}
+
+namespace novel{
+
+//////////////////////////////////////////////////////////////////////////////
+// implementation of the PinyinKey comparison functions
+/**
+ * Three-way comparison of two initials under the custom settings.
+ *
+ * @return 0 when the initials are equal or form a pair whose ambiguity is
+ *         enabled in custom; otherwise -1 if lhs < rhs, else 1.
+ */
+int pinyin_compare_initial (const PinyinCustomSettings &custom,
+                            PinyinInitial lhs,
+                            PinyinInitial rhs)
+{
+    // One row per ambiguity switch and the two initials it makes interchangeable.
+    struct AmbiguousInitials {
+        PinyinAmbiguity amb;
+        PinyinInitial one;
+        PinyinInitial other;
+    };
+    static const AmbiguousInitials pairs [] = {
+        { PINYIN_AmbZhiZi, PINYIN_Zhi, PINYIN_Zi },
+        { PINYIN_AmbChiCi, PINYIN_Chi, PINYIN_Ci },
+        { PINYIN_AmbShiSi, PINYIN_Shi, PINYIN_Si },
+        { PINYIN_AmbLeRi,  PINYIN_Le,  PINYIN_Ri },
+        { PINYIN_AmbNeLe,  PINYIN_Ne,  PINYIN_Le },
+        { PINYIN_AmbFoHe,  PINYIN_Fo,  PINYIN_He }
+    };
+
+    if (lhs == rhs)
+        return 0;
+
+    for (size_t i = 0; i < sizeof (pairs) / sizeof (pairs [0]); ++i) {
+        if (!custom.use_ambiguities [pairs [i].amb])
+            continue;
+        if ((lhs == pairs [i].one && rhs == pairs [i].other) ||
+            (lhs == pairs [i].other && rhs == pairs [i].one))
+            return 0;
+    }
+
+    return (lhs < rhs) ? -1 : 1;
+}
+
+/**
+ * Three-way comparison of two finals under the custom settings.
+ *
+ * @return 0 when the finals are equal, form a pair whose ambiguity is
+ *         enabled in custom, or when custom.use_incomplete is set and
+ *         either side is PINYIN_ZeroFinal; otherwise -1 if lhs < rhs,
+ *         else 1.
+ */
+int pinyin_compare_final (const PinyinCustomSettings &custom,
+                          PinyinFinal lhs,
+                          PinyinFinal rhs)
+{
+    if (lhs == rhs)
+        return 0;
+
+    const bool an_ang = custom.use_ambiguities [PINYIN_AmbAnAng] &&
+        ((lhs == PINYIN_An && rhs == PINYIN_Ang) ||
+         (lhs == PINYIN_Ang && rhs == PINYIN_An));
+
+    const bool en_eng = custom.use_ambiguities [PINYIN_AmbEnEng] &&
+        ((lhs == PINYIN_En && rhs == PINYIN_Eng) ||
+         (lhs == PINYIN_Eng && rhs == PINYIN_En));
+
+    const bool in_ing = custom.use_ambiguities [PINYIN_AmbInIng] &&
+        ((lhs == PINYIN_In && rhs == PINYIN_Ing) ||
+         (lhs == PINYIN_Ing && rhs == PINYIN_In));
+
+    if (an_ang || en_eng || in_ing)
+        return 0;
+
+    // With incomplete keys allowed, a missing final matches any final.
+    if (custom.use_incomplete &&
+        (lhs == PINYIN_ZeroFinal || rhs == PINYIN_ZeroFinal))
+        return 0;
+
+    return (lhs < rhs) ? -1 : 1;
+}
+
+/**
+ * Three-way comparison of two tones.
+ *
+ * The custom settings are not consulted.
+ *
+ * @return 0 when the tones are equal or either one is PINYIN_ZeroTone;
+ *         otherwise -1 if lhs < rhs, else 1.
+ */
+int pinyin_compare_tone (const PinyinCustomSettings &custom,
+                         PinyinTone lhs,
+                         PinyinTone rhs)
+{
+    const bool wildcard = (lhs == PINYIN_ZeroTone || rhs == PINYIN_ZeroTone);
+
+    if (wildcard || lhs == rhs)
+        return 0;
+
+    return (lhs < rhs) ? -1 : 1;
+}
+
+};
diff --git a/src/storage/pinyin_base.h b/src/storage/pinyin_base.h
new file mode 100644
index 0000000..374cc53
--- /dev/null
+++ b/src/storage/pinyin_base.h
@@ -0,0 +1,728 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2002,2003,2006 James Su
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/** @file pinyin_base.h
+ * @brief the definitions of pinyin related classes and structs.
+ */
+
+#ifndef PINYIN_BASE_H
+#define PINYIN_BASE_H
+
+#include <glib.h>
+
+namespace novel{
+
+// Predefinition of some classes and structs
+struct PinyinKey;
+
+class PinyinValidator;
+class PinyinParser;
+
+/**
+ * Position and length of the input text a PinyinKey was parsed from.
+ */
+struct PinyinKeyPos{
+    int m_pos;    /**< offset of the first char of the key in the input */
+    size_t m_len; /**< number of chars the key covers */
+
+    PinyinKeyPos() : m_pos(0), m_len(0) {}
+
+    void set_pos(int pos){
+        m_pos = pos;
+    }
+    void set_length(size_t len){
+        m_len = len;
+    }
+
+    // The getters are const so they can be used through const references.
+    int get_pos() const {
+        return m_pos;
+    }
+    /** Offset one past the last char of the key. */
+    int get_end_pos() const {
+        return static_cast<int>(m_pos + m_len);
+    }
+    size_t get_length() const {
+        return m_len;
+    }
+};
+
+typedef GArray* PinyinKeyVector; /* Array of PinyinKey */
+typedef GArray* PinyinKeyPosVector; /* Array of PinyinKeyPos */
+
+
+struct PinyinCustomSettings;
+
+/**
+ * @brief enums of pinyin initial element.
+ *
+ * A pinyin key can be divided into three tokens:
+ * Initial -- such as B P M F D T N L etc.
+ * Final -- such as A O E I U V etc.
+ * Tone -- can be 1, 2, 3, 4 and 5.
+ *
+ * The values are contiguous, starting at 0.
+ */
+enum PinyinInitial
+{
+ PINYIN_ZeroInitial = 0, /**< zero initial. indicates invalid initial */
+ PINYIN_Bo = 1,
+ PINYIN_Ci = 2,
+ PINYIN_Chi = 3,
+ PINYIN_De = 4,
+ PINYIN_Fo = 5,
+ PINYIN_He = 6,
+ PINYIN_Ge = 7,
+ PINYIN_Ji = 8,
+ PINYIN_Ke = 9,
+ PINYIN_Mo =10,
+ PINYIN_Ne =11,
+ PINYIN_Le =12,
+ PINYIN_Ri =13,
+ PINYIN_Po =14,
+ PINYIN_Qi =15,
+ PINYIN_Si =16,
+ PINYIN_Shi =17,
+ PINYIN_Te =18,
+ PINYIN_Wu =19,
+ PINYIN_Xi =20,
+ PINYIN_Yi =21,
+ PINYIN_Zi =22,
+ PINYIN_Zhi =23,
+ PINYIN_LastInitial = PINYIN_Zhi, /**< the last initial */
+ PINYIN_Number_Of_Initials = PINYIN_LastInitial + 1 /**< count of initials, zero initial included */
+};
+
+/**
+ * @brief enums of pinyin final element.
+ *
+ * The values are contiguous, starting at 0.
+ */
+enum PinyinFinal
+{
+ PINYIN_ZeroFinal = 0, /**< zero final. indicates invalid final */
+ PINYIN_A = 1,
+ PINYIN_Ai = 2,
+ PINYIN_An = 3,
+ PINYIN_Ang = 4,
+ PINYIN_Ao = 5,
+ PINYIN_E = 6,
+ PINYIN_Ea = 7,
+ PINYIN_Ei = 8,
+ PINYIN_En = 9,
+ PINYIN_Eng =10,
+ PINYIN_Er =11,
+ PINYIN_I =12,
+ PINYIN_Ia =13,
+ PINYIN_Ian =14,
+ PINYIN_Iang =15,
+ PINYIN_Iao =16,
+ PINYIN_Ie =17,
+ PINYIN_In =18,
+ PINYIN_Ing =19,
+ PINYIN_Iong =20,
+ PINYIN_Iu =21,
+ PINYIN_Ng =22,
+ PINYIN_O =23,
+ PINYIN_Ong =24,
+ PINYIN_Ou =25,
+ PINYIN_U =26,
+ PINYIN_Ua =27,
+ PINYIN_Uai =28,
+ PINYIN_Uan =29,
+ PINYIN_Uang =30,
+ PINYIN_Ue =31,
+ PINYIN_Ueng =32,
+ PINYIN_Ui =33,
+ PINYIN_Un =34,
+ PINYIN_Uo =35,
+ PINYIN_V =36,
+ PINYIN_Van =37,
+ PINYIN_Ve =38,
+ PINYIN_Vn =39,
+ PINYIN_LastFinal = PINYIN_Vn, /**< the last final */
+ PINYIN_Number_Of_Finals = PINYIN_LastFinal + 1 /**< count of finals, zero final included */
+};
+
+/**
+ * @brief enums of pinyin tone element.
+ *
+ * The numeric value of each tone equals the digit typed for it.
+ */
+enum PinyinTone
+{
+ PINYIN_ZeroTone = 0, /**< zero tone. this will be matched with all other tones. */
+ PINYIN_First = 1,
+ PINYIN_Second = 2,
+ PINYIN_Third = 3,
+ PINYIN_Fourth = 4,
+ PINYIN_Fifth = 5,
+ PINYIN_LastTone = PINYIN_Fifth, /**< the last tone */
+ PINYIN_Number_Of_Tones = PINYIN_LastTone + 1 /**< count of tones, zero tone included */
+};
+
+/**
+ * @brief enums of Shuang Pin Schemes.
+ *
+ * Selects the key maps used by PinyinShuangPinParser.
+ */
+enum PinyinShuangPinScheme
+{
+ SHUANG_PIN_STONE = 0,
+ SHUANG_PIN_ZRM = 1,
+ SHUANG_PIN_MS = 2,
+ SHUANG_PIN_ZIGUANG = 3,
+ SHUANG_PIN_ABC = 4,
+ SHUANG_PIN_LIUSHI = 5,
+ SHUANG_PIN_CUSTOMIZED = 6, /**< no predefined maps; presumably used with caller-supplied maps -- TODO confirm */
+ SHUANG_PIN_DEFAULT = SHUANG_PIN_ZRM /**< scheme used when none is given */
+};
+
+/**
+ * @brief enums of ZhuYin Schemes.
+ */
+enum PinyinZhuYinScheme
+{
+ ZHUYIN_ZHUYIN = 0,
+ ZHUYIN_STANDARD = 1,
+ ZHUYIN_HSU = 2,
+ ZHUYIN_IBM = 3,
+ ZHUYIN_GIN_YIEH = 4,
+ ZHUYIN_ET = 5,
+ ZHUYIN_ET26 = 6,
+ ZHUYIN_DEFAULT = ZHUYIN_STANDARD /**< alias of ZHUYIN_STANDARD */
+};
+
+/**
+ * @brief enums of pinyin ambiguities.
+ *
+ * Some pinyin elements may be confused with each other by some users;
+ * these ambiguities can be allowed.
+ *
+ * The values index PinyinCustomSettings::use_ambiguities.
+ */
+enum PinyinAmbiguity
+{
+ PINYIN_AmbAny= 0, /**< selects all ambiguities in set_use_ambiguities () */
+ PINYIN_AmbZhiZi,
+ PINYIN_AmbChiCi,
+ PINYIN_AmbShiSi,
+ PINYIN_AmbNeLe,
+ PINYIN_AmbLeRi,
+ PINYIN_AmbFoHe,
+ PINYIN_AmbAnAng,
+ PINYIN_AmbEnEng,
+ PINYIN_AmbInIng,
+ PINYIN_AmbLast = PINYIN_AmbInIng /**< the last ambiguity */
+};
+
+/**
+ * @brief Structure to hold pinyin custom settings.
+ *
+ * Users can customize the behavior of libpinyin through these settings.
+ */
+struct PinyinCustomSettings
+{
+    bool use_incomplete;
+    /**< allow incomplete pinyin keys which only have an initial. */
+
+    bool use_ambiguities [PINYIN_AmbLast + 1];
+    /**< one switch per PinyinAmbiguity; slot PINYIN_AmbAny (0) tells
+     *   whether any of the individual ambiguities is switched on. */
+
+    PinyinCustomSettings ();
+
+    void set_use_incomplete (bool use) { use_incomplete = use; }
+
+    /**
+     * Switch one ambiguity on or off; PINYIN_AmbAny switches all of them.
+     */
+    void set_use_ambiguities (PinyinAmbiguity amb, bool use)
+    {
+        if (amb == PINYIN_AmbAny) {
+            for (size_t slot = 0; slot <= PINYIN_AmbLast; ++slot)
+                use_ambiguities [slot] = use;
+            return;
+        }
+
+        use_ambiguities [static_cast<size_t>(amb)] = use;
+
+        // Recompute the summary slot from the individual switches.
+        bool any_enabled = false;
+        for (size_t slot = 1; slot <= PINYIN_AmbLast && !any_enabled; ++slot)
+            any_enabled = use_ambiguities [slot];
+        use_ambiguities [0] = any_enabled;
+    }
+
+    bool operator == (const PinyinCustomSettings &rhs) const
+    {
+        bool same = (use_incomplete == rhs.use_incomplete);
+
+        for (size_t slot = 0; same && slot <= PINYIN_AmbLast; ++slot)
+            same = (use_ambiguities [slot] == rhs.use_ambiguities [slot]);
+
+        return same;
+    }
+
+    bool operator != (const PinyinCustomSettings &rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    /**
+     * Pack the settings into an integer: bit 0 is use_incomplete,
+     * bit i+1 is use_ambiguities [i].
+     */
+    guint32 to_value () const
+    {
+        guint32 val = use_incomplete ? 1 : 0;
+
+        for (size_t slot = 0; slot <= PINYIN_AmbLast; ++slot) {
+            if (use_ambiguities [slot])
+                val |= (1 << (slot + 1));
+        }
+
+        return val;
+    }
+
+    /**
+     * Restore the settings from an integer produced by to_value ().
+     */
+    void from_value (guint32 val)
+    {
+        use_incomplete = ((val & 1) != 0);
+
+        for (size_t slot = 0; slot <= PINYIN_AmbLast; ++slot)
+            use_ambiguities [slot] = ((val & (1 << (slot + 1))) != 0);
+    }
+};
+
+/**
+ * @brief Pinyin key class.
+ *
+ * A pinyin key is a composed element of an initial, a final and a tone,
+ * which represents one or several Chinese ideographs
+ *
+ * The key itself holds no position or length information about the
+ * string it was parsed from; that is what PinyinKeyPos is for.
+ */
+struct PinyinKey
+{
+ friend class PinyinBitmapIndexLevel;
+ friend inline int pinyin_exact_compare(const PinyinKey key_lhs[],
+ const PinyinKey key_rhs[],
+ int word_length);
+ friend inline int pinyin_compare_with_ambiguities
+ (const PinyinCustomSettings &custom,
+ const PinyinKey* key_lhs,
+ const PinyinKey* key_rhs,
+ int word_length);
+ friend inline void compute_lower_value(const PinyinCustomSettings &custom,
+ PinyinKey in_keys[],
+ PinyinKey out_keys[],
+ int word_length);
+ friend inline void compute_upper_value(const PinyinCustomSettings &custom,
+ PinyinKey in_keys[],
+ PinyinKey out_keys[],
+ int word_length);
+
+private:
+ /* The three fields use 14 bits of one guint16. The widths cover the
+ * enum ranges: initials up to 23, finals up to 39, tones up to 5. */
+ guint16 m_initial : 5; /**< pinyin initial */
+ guint16 m_final : 6; /**< pinyin final */
+ guint16 m_tone : 3; /**< pinyin tone */
+public:
+ /**
+ * @brief Minimal numerical value of a PinyinKey
+ * @sa get_value();
+ */
+ static const guint16 min_value;
+
+ /**
+ * @brief Maximal numerical value of a PinyinKey
+ * @sa get_value();
+ */
+ static const guint16 max_value;
+
+public:
+ /**
+ * Constructor.
+ *
+ * The default constructor of class PinyinKey.
+ */
+ PinyinKey (PinyinInitial initial = PINYIN_ZeroInitial,
+ PinyinFinal final = PINYIN_ZeroFinal,
+ PinyinTone tone = PINYIN_ZeroTone)
+ : m_initial (initial), m_final (final), m_tone (tone)
+ {
+ }
+
+ /**
+ * Constructor.
+ *
+ * Construct a PinyinKey object from a key string, with
+ * specified validator.
+ *
+ * @sa PinyinValidator
+ */
+ PinyinKey (const PinyinValidator &validator, const char *str, int len = -1)
+ {
+ set (validator, str, len);
+ }
+
+ /**
+ * Constructor.
+ *
+ * Construct a PinyinKey object from its numerical value.
+ *
+ * @sa get_value();
+ */
+ PinyinKey (guint16 value)
+ {
+ set (value);
+ }
+ /**
+ * Clear the PinyinKey object.
+ */
+
+ void clear ()
+ {
+ m_initial = PINYIN_ZeroInitial;
+ m_final = PINYIN_ZeroFinal;
+ m_tone = PINYIN_ZeroTone;
+ }
+
+ /**
+ * Read PinyinKey value from a key string.
+ *
+ * @param validator a PinyinValidator object to validate the key.
+ * @param str a Latin string including one or more pinyin keys.
+ * @param len the length of str; defaults to -1.
+ * @return the number of characters used by this pinyin key.
+ */
+ int set (const PinyinValidator &validator, const char *str, int len = -1);
+
+ /**
+ * Set PinyinKey's value to initial, final and tone.
+ */
+ void set (PinyinInitial initial = PINYIN_ZeroInitial,
+ PinyinFinal final = PINYIN_ZeroFinal,
+ PinyinTone tone = PINYIN_ZeroTone)
+ {
+ m_initial = initial;
+ m_final = final;
+ m_tone = tone;
+ }
+
+ /**
+ * @brief Set this PinyinKey from its numerical value.
+ *
+ * This is the inverse of get_value().
+ */
+ void set (guint16 value)
+ {
+ m_tone = value % PINYIN_Number_Of_Tones;
+ value /= PINYIN_Number_Of_Tones;
+ m_final = value % PINYIN_Number_Of_Finals;
+ m_initial = value / PINYIN_Number_Of_Finals;
+ }
+
+ /**
+ * @brief Get numerical value of this PinyinKey
+ *
+ * The value is a mixed-radix number with the initial as the most
+ * significant digit, then the final, then the tone.
+ */
+ guint16 get_value () const
+ {
+ return (m_initial * PINYIN_Number_Of_Finals + m_final) * PINYIN_Number_Of_Tones + m_tone;
+ }
+
+ /**
+ * Set PinyinKey's initial value to initial.
+ */
+ void set_initial (PinyinInitial initial = PINYIN_ZeroInitial)
+ {
+ m_initial = initial;
+ }
+
+ /**
+ * Set PinyinKey's final value to final.
+ */
+ void set_final (PinyinFinal final = PINYIN_ZeroFinal)
+ {
+ m_final = final;
+ }
+
+ /**
+ * Set PinyinKey's tone value to tone.
+ */
+ void set_tone (PinyinTone tone = PINYIN_ZeroTone)
+ {
+ m_tone = tone;
+ }
+
+ /**
+ * Get initial value of this key.
+ */
+ PinyinInitial get_initial () const
+ {
+ return static_cast<PinyinInitial>(m_initial);
+ }
+
+ /**
+ * Get final value of this key.
+ */
+ PinyinFinal get_final () const
+ {
+ return static_cast<PinyinFinal>(m_final);
+ }
+
+ /**
+ * Get tone value of this key.
+ */
+ PinyinTone get_tone () const
+ {
+ return static_cast<PinyinTone>(m_tone);
+ }
+
+ /**
+ * Get Latin name of this key's initial.
+ */
+ const char* get_initial_string () const;
+
+ /**
+ * Get Chinese ZhuYin name of this key's initial, in UTF-8 encoding.
+ */
+ const char* get_initial_zhuyin_string () const;
+
+ /**
+ * Get Latin name of this key's final.
+ */
+ const char* get_final_string () const;
+
+ /**
+ * Get Chinese ZhuYin name of this key's final, in UTF-8 encoding.
+ */
+ const char* get_final_zhuyin_string () const;
+
+ /**
+ * Get Latin name of this key's tone.
+ */
+ const char* get_tone_string () const;
+
+ /**
+ * Get Chinese ZhuYin name of this key's tone, in UTF-8 encoding.
+ */
+ const char* get_tone_zhuyin_string () const;
+
+ /**
+ * Get Latin name of this key.
+ */
+ const char * get_key_string () const;
+
+ /**
+ * Get Chinese ZhuYin name of this key, in UTF-8 encoding.
+ */
+ const char * get_key_zhuyin_string () const;
+
+ /**
+ * Check if this key is empty.
+ */
+ bool is_empty () const
+ {
+ return m_initial == PINYIN_ZeroInitial && m_final == PINYIN_ZeroFinal && m_tone == PINYIN_ZeroTone;
+ }
+
+ /**
+ * Check if this key has an initial, a final and a tone.
+ */
+ bool is_complete () const
+ {
+ return m_initial != PINYIN_ZeroInitial && m_final != PINYIN_ZeroFinal && m_tone != PINYIN_ZeroTone;
+ }
+
+ /**
+ * Exact comparison of initial, final and tone; no ambiguities apply.
+ */
+ bool operator == (PinyinKey rhs) const
+ {
+ return m_initial == rhs.m_initial && m_final == rhs.m_final && m_tone == rhs.m_tone;
+ }
+
+ bool operator != (PinyinKey rhs) const
+ {
+ return m_initial != rhs.m_initial || m_final != rhs.m_final || m_tone != rhs.m_tone;
+ }
+
+ /**
+ * Lexicographic order by initial, then final, then tone.
+ */
+ bool operator < (PinyinKey rhs) const
+ {
+ if (m_initial < rhs.m_initial) return true;
+ if (m_initial > rhs.m_initial) return false;
+ if (m_final < rhs.m_final) return true;
+ if (m_final > rhs.m_final) return false;
+ return m_tone < rhs.m_tone;
+ }
+
+ bool operator > (PinyinKey rhs) const
+ {
+ if (m_initial > rhs.m_initial) return true;
+ if (m_initial < rhs.m_initial) return false;
+ if (m_final > rhs.m_final) return true;
+ if (m_final < rhs.m_final) return false;
+ return m_tone > rhs.m_tone;
+ }
+};
+
+/**
+ * Abstract validator of PinyinKey objects.
+ *
+ * Parsers call a validator to decide whether a candidate key is
+ * acceptable. Concrete validators derive from this class.
+ */
+class PinyinValidator
+{
+public:
+    /**
+     * Virtual destructor, so a validator can be destroyed safely through
+     * a pointer or reference to this base class.
+     */
+    virtual ~PinyinValidator () {}
+
+    /**
+     * Overloaded operator () function to validate a pinyin key.
+     *
+     * @param key The key to be validated.
+     * @return true if the key is valid.
+     */
+    virtual bool operator () (PinyinKey key) const = 0;
+};
+
+class PinyinLargeTable;
+/**
+ * Validator of PinyinKey object.
+ *
+ * This class is for validating a PinyinKey object against a bitmap that
+ * is set up from a PinyinLargeTable.
+ */
+class BitmapPinyinValidator:public PinyinValidator
+{
+ /* One bit per (initial, final, tone) combination, rounded up to whole
+ * bytes. NOTE(review): presumably indexed by PinyinKey::get_value() --
+ * confirm against the implementation. */
+ char m_bitmap [(PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 7) / 8];
+
+public:
+ BitmapPinyinValidator (const PinyinLargeTable *table = 0);
+
+ /**
+ * initialize the validator with the specified PinyinLargeTable.
+ *
+ * @param table the table to initialize from; defaults to a null pointer.
+ */
+ void initialize (const PinyinLargeTable *table = 0);
+
+ /**
+ * Overloaded operator () function to validate a pinyin key.
+ *
+ * @param key The key to be validated.
+ * @return true if the key is valid.
+ */
+ virtual bool operator () (PinyinKey key) const;
+};
+
+/**
+ * NULL Validator of PinyinKey object.
+ *
+ * A validator that accepts every PinyinKey.
+ */
+class NullPinyinValidator:public PinyinValidator
+{
+public:
+    /**
+     * Overloaded operator () function to validate a pinyin key.
+     *
+     * The key is not inspected.
+     *
+     * @return always true.
+     */
+    virtual bool operator () (PinyinKey /* key */) const
+    {
+        return true;
+    }
+};
+
+/**
+ * @brief Class to translate string into PinyinKey.
+ *
+ * Abstract base class of the concrete parsers.
+ */
+class PinyinParser
+{
+public:
+ virtual ~PinyinParser ();
+
+ /**
+ * @brief Translate only one PinyinKey from a string.
+ *
+ * @param validator PinyinValidator object to validate the result.
+ * @param key Stores result PinyinKey.
+ * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string,
+ * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme,
+ * it's an UTF-8 string which contains ZhuYin chars.
+ * @param len The length of str, in number of chars rather than bytes.
+ *
+ * @return the number of chars that were actually used.
+ */
+ virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const = 0;
+
+ /**
+ * @brief Handy wrapper function of parse_one_key(), which computes the
+ * length of str with g_utf8_strlen () instead of taking it as a parameter.
+ */
+ int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char * &str) const
+ {
+ return parse_one_key (validator, key, str, g_utf8_strlen (str, -1));
+ }
+
+ /**
+ * @brief Translate the source string into a set of PinyinKeys.
+ *
+ * @param validator PinyinValidator object to validate the result.
+ * @param keys Stores result PinyinKeys.
+ * @param poses Stores one PinyinKeyPos per result key.
+ * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string,
+ * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme,
+ * it's an UTF-8 string which contains ZhuYin chars.
+ * @param len The length of str, in number of chars rather than bytes.
+ *
+ * @return the number of chars that were actually used.
+ */
+ virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys,PinyinKeyPosVector & poses, const char *str, int len = -1) const = 0;
+
+public:
+ /**
+ * @brief Rewrite key in place into its normalized form.
+ */
+ static void normalize (PinyinKey &key);
+};
+
+/**
+ * The default Pinyin Parser which parses full pinyin string into PinyinKeys.
+ */
+class PinyinDefaultParser : public PinyinParser
+{
+public:
+ virtual ~PinyinDefaultParser ();
+
+ virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const;
+ virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const;
+
+public:
+ /* Keep the base-class overloads visible; the overrides above would
+ * otherwise hide them. */
+ using PinyinParser::parse_one_key;
+ using PinyinParser::parse;
+};
+
+/* Parser for ShuangPin input.
+ *
+ * The valid input chars of ShuangPin is a-z and ';'
+ */
+class PinyinShuangPinParser : public PinyinParser
+{
+ /* One slot per valid input char: 26 letters plus ';'. */
+ PinyinInitial m_initial_map [27];
+ /* Two candidate finals per input char. */
+ PinyinFinal m_final_map [27][2];
+
+public:
+ /**
+ * Constructor
+ *
+ * @param scheme the predefined ShuangPin scheme to be used.
+ */
+ PinyinShuangPinParser (PinyinShuangPinScheme scheme = SHUANG_PIN_DEFAULT);
+
+ /**
+ * Constructor
+ *
+ * @param initial_map the initial of each input char.
+ * @param final_map the two candidate finals of each input char.
+ */
+ PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]);
+
+ virtual ~PinyinShuangPinParser ();
+
+ virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const;
+ virtual int parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const;
+
+public:
+ /* Replace the current maps with a predefined scheme or with custom maps. */
+ void set_scheme (PinyinShuangPinScheme scheme);
+ void set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]);
+
+ /* Copy the current maps into the caller's arrays. */
+ void get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2]);
+
+public:
+ /* Keep the base-class overloads visible; the overrides above would
+ * otherwise hide them. */
+ using PinyinParser::parse_one_key;
+ using PinyinParser::parse;
+};
+
+/* Three-way comparisons of key elements: each returns 0 when the two
+ * values are considered equal, -1 when lhs < rhs and 1 otherwise. */
+
+/* Equal initials, or a pair whose ambiguity is enabled in custom, give 0. */
+int pinyin_compare_initial (const PinyinCustomSettings &custom,
+ PinyinInitial lhs,
+ PinyinInitial rhs);
+
+/* Equal finals, a pair whose ambiguity is enabled in custom, or a zero
+ * final on either side while custom.use_incomplete is set, give 0. */
+int pinyin_compare_final (const PinyinCustomSettings &custom,
+ PinyinFinal lhs,
+ PinyinFinal rhs);
+
+/* Equal tones, or a zero tone on either side, give 0. */
+int pinyin_compare_tone (const PinyinCustomSettings &custom,
+ PinyinTone lhs,
+ PinyinTone rhs);
+};
+
+using namespace novel;
+
+#endif
diff --git a/src/storage/pinyin_large_table.cpp b/src/storage/pinyin_large_table.cpp
new file mode 100644
index 0000000..794cca5
--- /dev/null
+++ b/src/storage/pinyin_large_table.cpp
@@ -0,0 +1,690 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <assert.h>
+#include <string.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+
+
+/* Creates an empty index: stores the settings pointer and zero-fills the
+ * whole (initial, final, tone) slot table. */
+PinyinBitmapIndexLevel::PinyinBitmapIndexLevel(PinyinCustomSettings * custom)
+    : m_custom(custom)
+{
+    memset(m_pinyin_length_indexes, 0, sizeof m_pinyin_length_indexes);
+}
+
+/* Releases every PinyinLengthIndexLevel owned by the table and clears its
+ * slot.  Clearing matters: load() calls reset() and then only overwrites the
+ * non-empty slots, and the destructor calls reset() again, so a slot left
+ * pointing at freed memory would be used or deleted a second time. */
+void PinyinBitmapIndexLevel::reset(){
+    for ( int k = PINYIN_ZeroInitial; k < PINYIN_Number_Of_Initials; k++)
+        for ( int m = PINYIN_ZeroFinal; m < PINYIN_Number_Of_Finals; m++)
+            for ( int n = PINYIN_ZeroTone; n < PINYIN_Number_Of_Tones; n++){
+                PinyinLengthIndexLevel * &length_array =
+                    m_pinyin_length_indexes[k][m][n];
+                /* deleting a null pointer is a no-op, no guard needed */
+                delete length_array;
+                length_array = NULL;
+            }
+}
+
+/* Public lookup entry point; the work starts at the initial level. */
+int PinyinBitmapIndexLevel::search( int phrase_length, /* in */ PinyinKey keys[],
+    /* out */ PhraseIndexRanges ranges) const
+{
+    const int found = initial_level_search(phrase_length, keys, ranges);
+    return found;
+}
+
+/* First search stage: dispatches on the initial of keys[0].
+ * The key's own initial is always searched; when the matching entry of
+ * custom.use_ambiguities is enabled, the confusable initial is searched as
+ * well and the results are OR-ed together.
+ * Returns the combined result flags of the final_level_search() calls. */
+int PinyinBitmapIndexLevel::initial_level_search(int phrase_length,
+ /* in */PinyinKey keys[],
+ /* out */ PhraseIndexRanges ranges) const{
+
+/* MATCH(AMBIGUITY, ORIGIN, ANOTHER) expands to the `case ORIGIN:` arm that
+ * searches ORIGIN and, if AMBIGUITY is enabled, ANOTHER too. */
+#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \
+ { \
+ result |= final_level_search((PinyinInitial)first_key.m_initial,\
+ phrase_length, keys, ranges); \
+ if ( custom.use_ambiguities [AMBIGUITY] ){ \
+ result |= final_level_search(ANOTHER, \
+ phrase_length, keys, ranges); \
+ } \
+ return result; \
+ }
+
+ //deal with the ambiguities
+
+ int result = 0;
+ PinyinKey& first_key = keys[0];
+ PinyinCustomSettings & custom= *m_custom;
+
+ switch(first_key.m_initial){
+
+ MATCH(PINYIN_AmbZhiZi, PINYIN_Zi, PINYIN_Zhi);
+ MATCH(PINYIN_AmbZhiZi, PINYIN_Zhi, PINYIN_Zi);
+ MATCH(PINYIN_AmbChiCi, PINYIN_Ci, PINYIN_Chi);
+ MATCH(PINYIN_AmbChiCi, PINYIN_Chi, PINYIN_Ci);
+ MATCH(PINYIN_AmbShiSi, PINYIN_Si, PINYIN_Shi);
+ MATCH(PINYIN_AmbShiSi, PINYIN_Shi, PINYIN_Si);
+ MATCH(PINYIN_AmbLeRi, PINYIN_Ri, PINYIN_Le);
+ MATCH(PINYIN_AmbNeLe, PINYIN_Ne, PINYIN_Le);
+ MATCH(PINYIN_AmbFoHe, PINYIN_Fo, PINYIN_He);
+ MATCH(PINYIN_AmbFoHe, PINYIN_He, PINYIN_Fo);
+
+ /* PINYIN_Le takes part in two ambiguities (Le/Ri and Ne/Le), so it
+ * cannot be expressed with the two-way MATCH macro. */
+ case PINYIN_Le:
+ {
+ result |= final_level_search((PinyinInitial)first_key.m_initial,
+ phrase_length, keys, ranges);
+ if ( custom.use_ambiguities [PINYIN_AmbLeRi] )
+ result |= final_level_search(PINYIN_Ri, phrase_length,
+ keys, ranges);
+ if ( custom.use_ambiguities [PINYIN_AmbNeLe] )
+ result |= final_level_search(PINYIN_Ne, phrase_length,
+ keys, ranges);
+ return result;
+ }
+ /* every other initial has no confusable partner here */
+ default:
+ {
+ return final_level_search((PinyinInitial)first_key.m_initial,
+ phrase_length,
+ keys, ranges);
+ }
+ }
+#undef MATCH
+}
+
+/* Second search stage: dispatches on the final of keys[0] for an initial
+ * that the previous stage has already chosen.
+ * - A zero final means an incomplete key: nothing is searched unless
+ *   custom.use_incomplete is set, in which case every final from PINYIN_A
+ *   upwards is tried.
+ * - For the an/ang, en/eng and in/ing pairs the confusable final is also
+ *   searched when the matching ambiguity is enabled.
+ * Returns the OR of the tone_level_search() results. */
+int PinyinBitmapIndexLevel::final_level_search(PinyinInitial initial,
+ int phrase_length,
+ /* in */PinyinKey keys[],
+ /* out */ PhraseIndexRanges ranges) const{
+/* MATCH(AMBIGUITY, ORIGIN, ANOTHER) expands to the `case ORIGIN:` arm that
+ * searches ORIGIN and, if AMBIGUITY is enabled, ANOTHER too. */
+#define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \
+ { \
+ result = tone_level_search(initial,(PinyinFinal) first_key.m_final,\
+ phrase_length, keys, ranges); \
+ if ( custom.use_ambiguities [AMBIGUITY] ){ \
+ result |= tone_level_search(initial, ANOTHER, \
+ phrase_length, keys, ranges); \
+ } \
+ return result; \
+ }
+
+ int result = 0;
+ PinyinKey& first_key = keys[0];
+ PinyinCustomSettings & custom= *m_custom;
+
+ switch(first_key.m_final){
+ case PINYIN_ZeroFinal:
+ {
+ /* incomplete pinyin is only expanded when the user enabled it */
+ if (!custom.use_incomplete )
+ return result;
+ for ( int i = PINYIN_A; i < PINYIN_Number_Of_Finals; ++i){
+ result |= tone_level_search(initial,(PinyinFinal)i ,
+ phrase_length, keys, ranges);
+ }
+ return result;
+ }
+
+ MATCH(PINYIN_AmbAnAng, PINYIN_An, PINYIN_Ang);
+ MATCH(PINYIN_AmbAnAng, PINYIN_Ang, PINYIN_An);
+ MATCH(PINYIN_AmbEnEng, PINYIN_En, PINYIN_Eng);
+ MATCH(PINYIN_AmbEnEng, PINYIN_Eng, PINYIN_En);
+ MATCH(PINYIN_AmbInIng, PINYIN_In, PINYIN_Ing);
+ MATCH(PINYIN_AmbInIng, PINYIN_Ing, PINYIN_In);
+
+ default:
+ {
+ return tone_level_search(initial,(PinyinFinal)first_key.m_final,
+ phrase_length, keys, ranges);
+ }
+ }
+#undef MATCH
+}
+
+/* Third search stage: picks the tone buckets for an already chosen
+ * (initial, final) pair and forwards the remaining keys to them.
+ * A toneless first key is looked up in every tone bucket; a toned first key
+ * is looked up in the toneless bucket and in its own tone bucket.
+ * Returns the OR of the PinyinLengthIndexLevel::search() results. */
+int PinyinBitmapIndexLevel::tone_level_search(PinyinInitial initial,
+    PinyinFinal final,
+    int phrase_length,
+    /* in */PinyinKey keys[],
+    /* out */ PhraseIndexRanges ranges) const{
+    PinyinKey & head = keys[0];
+    PinyinCustomSettings * settings = m_custom;
+    int found = 0;
+
+    if ( PINYIN_ZeroTone == head.m_tone ){
+        //deal with ZeroTone in pinyin table files.
+        for ( int tone = PINYIN_ZeroTone; tone < PINYIN_Number_Of_Tones; ++tone){
+            PinyinLengthIndexLevel * bucket =
+                m_pinyin_length_indexes[initial][final][(PinyinTone)tone];
+            if ( bucket )
+                found |= bucket->search(phrase_length - 1, settings,
+                                        keys + 1, ranges);
+        }
+        return found;
+    }
+
+    PinyinLengthIndexLevel * toneless =
+        m_pinyin_length_indexes[initial][final][PINYIN_ZeroTone];
+    if ( toneless )
+        found |= toneless->search(phrase_length - 1, settings,
+                                  keys + 1, ranges);
+
+    PinyinLengthIndexLevel * toned =
+        m_pinyin_length_indexes[initial][final][(PinyinTone) head.m_tone];
+    if ( toned )
+        found |= toned->search(phrase_length - 1, settings,
+                               keys + 1, ranges);
+    return found;
+}
+
+/* Starts with an empty pointer array; GLib zero-fills slots added later
+ * because the array is created with clear_ = TRUE. */
+PinyinLengthIndexLevel::PinyinLengthIndexLevel()
+    : m_pinyin_array_indexes(g_array_new(FALSE, TRUE, sizeof(void *)))
+{
+}
+
+/* Deletes the PinyinArrayIndexLevel stored in each slot, then frees the
+ * GArray itself.  Slot i is treated as a PinyinArrayIndexLevel<i>; the
+ * switch exists because the template argument must be a compile-time
+ * constant. */
+PinyinLengthIndexLevel::~PinyinLengthIndexLevel(){
+/* CASE(x): delete the (possibly NULL) PinyinArrayIndexLevel<x> in slot x. */
+#define CASE(x) case x: \
+ { \
+ PinyinArrayIndexLevel<x> * array = g_array_index \
+ (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x); \
+ if (array) \
+ delete array; \
+ break; \
+ }
+ for ( int i = 0 ; i < m_pinyin_array_indexes->len; ++i){
+ switch (i){
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ /* more than 16 slots is not supported by this class */
+ default:
+ assert(false);
+ }
+ }
+ g_array_free(m_pinyin_array_indexes, TRUE);
+#undef CASE
+}
+
+/* Looks up the keys that follow the first key of a phrase.
+ *
+ * phrase_length  number of keys in `keys` (the first key of the phrase was
+ *                already consumed by PinyinBitmapIndexLevel).
+ * custom         settings used for the ambiguity-aware comparison.
+ * keys           the remaining keys.
+ * ranges         receives the matching token ranges.
+ *
+ * Returns SEARCH_NONE when no slot exists for this length; otherwise the
+ * result of the slot's search, with SEARCH_CONTINUED added when slots for
+ * longer phrases exist as well. */
+int PinyinLengthIndexLevel::search( int phrase_length,
+    /* in */ PinyinCustomSettings * custom,
+    /* in */ PinyinKey keys[],
+    /* out */ PhraseIndexRanges ranges){
+    int result = SEARCH_NONE;
+    if(m_pinyin_array_indexes->len < phrase_length + 1)
+        return result;
+    if (m_pinyin_array_indexes->len > phrase_length + 1)
+        result |= SEARCH_CONTINUED;
+
+/* CASE(len): search slot `len` as a PinyinArrayIndexLevel<len>. */
+#define CASE(len) case len: \
+    { \
+        PinyinArrayIndexLevel<len> * array = g_array_index \
+            (m_pinyin_array_indexes, PinyinArrayIndexLevel<len> *, len); \
+        if ( !array ) \
+            return result; \
+        result |= array->search(custom, keys, ranges); \
+        return result; \
+    }
+
+    switch ( phrase_length ){
+        CASE(0);
+        CASE(1);
+        CASE(2);
+        CASE(3);
+        CASE(4);
+        CASE(5);
+        CASE(6);
+        CASE(7);
+        CASE(8);
+        CASE(9);
+        CASE(10);
+        CASE(11);
+        CASE(12);
+        CASE(13);
+        CASE(14);
+        CASE(15);
+    default:
+        assert(false);
+    }
+#undef CASE
+    /* Only reachable for an unsupported phrase_length when assert() is
+     * compiled out; flowing off the end of a non-void function would be
+     * undefined behaviour, so report "nothing found" instead. */
+    return SEARCH_NONE;
+}
+
+/* Searches the sorted item array of this level.  The ambiguity-expanded
+ * lower and upper key bounds delimit a candidate window with binary
+ * searches; convert() then filters that window and emits token ranges. */
+template<size_t phrase_length>
+int PinyinArrayIndexLevel<phrase_length>::search(/* in */ PinyinCustomSettings * custom, /* in */ PinyinKey keys[], /* out */ PhraseIndexRanges ranges){
+    typedef PinyinIndexItem<phrase_length> Item;
+
+    PhraseExactLessThan<phrase_length> less_than;
+    Item * const first = (Item *) m_chunk.begin();
+    Item * const last = (Item *) m_chunk.end();
+
+    //compute the smallest and largest keys that may still match
+    PinyinKey low_keys[phrase_length], high_keys[phrase_length];
+    compute_lower_value(*custom, keys, low_keys, phrase_length);
+    compute_upper_value(*custom, keys, high_keys, phrase_length);
+    Item low_item(low_keys, -1), high_item(high_keys, -1);
+
+    //do the search
+    Item * window_begin = std_lite::lower_bound(first, last, low_item, less_than);
+    Item * window_end = std_lite::upper_bound(first, last, high_item, less_than);
+
+    return convert(custom, keys, window_begin, window_end, ranges);
+}
+
+/* Filters the candidate window [begin, end) and appends the tokens of the
+ * matching items to `ranges` as PhraseIndexRange entries.
+ *
+ * An item matches when pinyin_compare_with_ambiguities() reports 0 for it
+ * against `keys`.  Tokens whose library slot in `ranges` is NULL are
+ * ignored.  Consecutive tokens are merged into a single range.
+ *
+ * Returns SEARCH_OK if at least one token was accepted, SEARCH_NONE
+ * otherwise. */
+template<size_t phrase_length>
+int PinyinArrayIndexLevel<phrase_length>::convert(PinyinCustomSettings * custom, PinyinKey keys[], PinyinIndexItem<phrase_length> * begin, PinyinIndexItem<phrase_length> * end, PhraseIndexRanges ranges){
+    PinyinIndexItem<phrase_length> * iter;
+    PhraseIndexRange cursor;
+    GArray * head, *cursor_head = NULL;
+    int result = SEARCH_NONE;
+    /* -1 marks "no range open yet" */
+    cursor.m_range_begin = -1; cursor.m_range_end = -1;
+    for ( iter = begin; iter != end; ++iter){
+        /* Skip every item that does not compare equal.  The previous
+         * spelling `! 0 == cmp` parsed as `(!0) == cmp`, i.e. `1 == cmp`,
+         * and therefore let most non-matching items through. */
+        if ( 0 !=
+             pinyin_compare_with_ambiguities
+             (*custom, keys, iter->m_keys, phrase_length))
+            continue;
+        phrase_token_t token = iter->m_token;
+        head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)];
+        if ( NULL == head )
+            continue;
+
+        result |= SEARCH_OK;
+
+        if ( cursor.m_range_begin == -1 ){
+            /* open the first range */
+            cursor.m_range_begin = token;
+            cursor.m_range_end = token + 1;
+            cursor_head = head;
+        }else if (cursor.m_range_end == token &&
+                  PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_end) ==
+                  PHRASE_INDEX_LIBRARY_INDEX(token) ){
+            /* token directly extends the open range */
+            cursor.m_range_end++;
+        }else {
+            /* flush the open range and start a new one */
+            g_array_append_val(cursor_head, cursor);
+            cursor.m_range_begin = token; cursor.m_range_end = token + 1;
+            cursor_head = head;
+        }
+    }
+    if ( cursor.m_range_begin == -1 )
+        return result;
+
+    /* flush the last open range */
+    g_array_append_val(cursor_head, cursor);
+    return result;
+}
+
+/* Registers `token` under `keys`.  The first key selects the
+ * (initial, final, tone) slot, which is created on demand; the remaining
+ * keys are handed to that slot.  Returns the slot's add_index() result. */
+int PinyinBitmapIndexLevel::add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    const PinyinKey head = keys[0];
+    PinyinLengthIndexLevel * & slot =
+        m_pinyin_length_indexes[head.m_initial][head.m_final][head.m_tone];
+    if ( NULL == slot )
+        slot = new PinyinLengthIndexLevel();
+    return slot->add_index(phrase_length - 1, keys + 1, token);
+}
+
+/* Removes `token` from the entry addressed by `keys`.  The first key
+ * selects the (initial, final, tone) slot; the remaining keys are handed to
+ * that slot's remove_index().
+ * Returns REMOVE_ITEM_DONOT_EXISTS when the slot is empty, otherwise the
+ * slot's result. */
+int PinyinBitmapIndexLevel::remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    PinyinKey firstkey = keys[0];
+    PinyinLengthIndexLevel * &length_array =
+        m_pinyin_length_indexes[firstkey.m_initial][firstkey.m_final][firstkey.m_tone];
+    if ( length_array )
+        /* this used to call add_index(), which inserted the token instead
+         * of removing it */
+        return length_array->remove_index(phrase_length - 1, keys + 1, token);
+    return REMOVE_ITEM_DONOT_EXISTS;
+}
+
+/* Adds `token` under the remaining `keys` of a phrase.  The slot array is
+ * grown to cover index phrase_length if needed, the
+ * PinyinArrayIndexLevel<phrase_length> in that slot is created on demand,
+ * and the insertion is delegated to it.
+ * Returns the result of PinyinArrayIndexLevel::add_index().
+ * NOTE(review): the default arm only asserts; with assert() compiled out
+ * the function would fall off the end without returning a value.
+ * NOTE(review): a negative phrase_length is compared against the unsigned
+ * array length below -- confirm callers never pass one. */
+int PinyinLengthIndexLevel::add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+ assert(phrase_length + 1 < MAX_PHRASE_LENGTH);
+ /* make sure slot `phrase_length` exists */
+ if ( m_pinyin_array_indexes -> len <= phrase_length )
+ g_array_set_size(m_pinyin_array_indexes, phrase_length + 1);
+/* CASE(x): insert into slot x, creating its PinyinArrayIndexLevel<x> first
+ * when the slot is still NULL. */
+#define CASE(x) case x: \
+ { \
+ PinyinArrayIndexLevel<x> * &array = g_array_index \
+ (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x); \
+ if ( !array ) \
+ array = new PinyinArrayIndexLevel<x>; \
+ return array->add_index(keys, token); \
+ }
+ switch(phrase_length){
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+#undef CASE
+}
+
+/* Removes `token` from the entry addressed by the remaining `keys` of a
+ * phrase.
+ * Returns REMOVE_ITEM_DONOT_EXISTS when there is no slot (or no array in
+ * the slot) for phrase_length, otherwise the result of
+ * PinyinArrayIndexLevel::remove_index().  The "nothing to remove" paths
+ * used to return `false`, which is not one of the REMOVE_* result codes
+ * that the sibling remove_index() functions report. */
+int PinyinLengthIndexLevel::remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    assert(phrase_length + 1 < MAX_PHRASE_LENGTH);
+    if ( m_pinyin_array_indexes -> len <= phrase_length )
+        return REMOVE_ITEM_DONOT_EXISTS;
+/* CASE(x): remove from the PinyinArrayIndexLevel<x> stored in slot x. */
+#define CASE(x) case x: \
+    { \
+        PinyinArrayIndexLevel<x> * &array = g_array_index \
+            (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> *, x); \
+        if ( !array ) \
+            return REMOVE_ITEM_DONOT_EXISTS; \
+        return array->remove_index(keys, token); \
+    }
+    switch(phrase_length){
+        CASE(0);
+        CASE(1);
+        CASE(2);
+        CASE(3);
+        CASE(4);
+        CASE(5);
+        CASE(6);
+        CASE(7);
+        CASE(8);
+        CASE(9);
+        CASE(10);
+        CASE(11);
+        CASE(12);
+        CASE(13);
+        CASE(14);
+        CASE(15);
+    default:
+        assert(false);
+    }
+#undef CASE
+    /* Only reachable for an unsupported phrase_length when assert() is
+     * compiled out; never fall off the end of a non-void function. */
+    return REMOVE_ITEM_DONOT_EXISTS;
+}
+
+/* Inserts (keys, token) into the sorted item array.  Within the run of
+ * items that have exactly the same keys the items are kept ordered by
+ * token.  Returns INSERT_ITEM_EXISTS if the pair is already present,
+ * INSERT_OK after a successful insertion. */
+template<size_t phrase_length>
+int PinyinArrayIndexLevel<phrase_length>::add_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    typedef PinyinIndexItem<phrase_length> Item;
+
+    PhraseExactLessThan<phrase_length> less_than;
+    Item inserted(keys, token);
+    Item * first = (Item *) m_chunk.begin();
+    Item * last = (Item *) m_chunk.end();
+
+    std_lite::pair<Item *, Item *> same_keys;
+    same_keys = std_lite::equal_range(first, last, inserted, less_than);
+
+    //walk the items with identical keys to find the insertion point
+    Item * spot = same_keys.first;
+    while ( spot != same_keys.second ){
+        if ( spot->m_token == token )
+            return INSERT_ITEM_EXISTS;
+        if ( spot->m_token > token )
+            break;
+        ++spot;
+    }
+
+    int offset = (spot - first) * sizeof(Item);
+    m_chunk.insert_content(offset, &inserted, sizeof(Item));
+    return INSERT_OK;
+}
+
+/* Removes the item (keys, token) from the sorted item array.
+ * Returns REMOVE_OK when the item was found and removed, and
+ * REMOVE_ITEM_DONOT_EXISTS when no item with exactly these keys carries
+ * `token`. */
+template<size_t phrase_length>
+int PinyinArrayIndexLevel<phrase_length>::remove_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+    PhraseExactLessThan<phrase_length> m_lessthan;
+    PinyinIndexItem<phrase_length> * buf_begin, * buf_end;
+
+    PinyinIndexItem<phrase_length> new_elem(keys, token);
+    buf_begin = (PinyinIndexItem<phrase_length> *) m_chunk.begin();
+    buf_end = (PinyinIndexItem<phrase_length> *) m_chunk.end();
+
+    std_lite::pair<PinyinIndexItem<phrase_length> *, PinyinIndexItem<phrase_length> *> range;
+    range = std_lite::equal_range(buf_begin, buf_end, new_elem, m_lessthan);
+
+    PinyinIndexItem<phrase_length> * cur_elem;
+    for ( cur_elem = range.first;
+          cur_elem != range.second; ++cur_elem){
+        if ( cur_elem->m_token == token )
+            break;
+    }
+    /* Not found.  Test the iterator itself: range.second may be the end of
+     * the buffer, or an item with different keys, so dereferencing it (as
+     * the code used to) could read out of bounds or remove the wrong item. */
+    if ( cur_elem == range.second )
+        return REMOVE_ITEM_DONOT_EXISTS;
+
+    int offset = (cur_elem - buf_begin) *
+        sizeof(PinyinIndexItem<phrase_length>);
+    m_chunk.remove_content(offset, sizeof (PinyinIndexItem<phrase_length>));
+    return REMOVE_OK;
+}
+
+/* Fills the table from a text stream.  Each record consists of four
+ * whitespace-separated fields: pinyin, phrase, token and frequency.  Only
+ * the pinyin and the token are used here; the pinyin is parsed into keys
+ * and (keys, token) is added to the index.
+ *
+ * Reading stops at the first record that cannot be read completely.
+ * Returns true when that happened at end of file, false when the stream
+ * contained a malformed record or a read error occurred. */
+bool PinyinLargeTable::load_text(FILE * infile){
+    char pinyin[256];
+    char phrase[256];
+    /* read the numbers into objects whose type matches the conversion
+     * specifier exactly, then convert */
+    unsigned long token_value = 0;
+    unsigned long freq = 0;
+
+    /* parse() is a const member, so one parser serves all records */
+    PinyinDefaultParser parser;
+    NullPinyinValidator validator;
+
+    /* %255s bounds the writes into the 256-byte buffers.  Checking the
+     * conversion count (instead of looping on !feof) avoids processing a
+     * stale record once the input is exhausted. */
+    while ( 4 == fscanf(infile, "%255s %255s %lu %lu",
+                        pinyin, phrase, &token_value, &freq) ){
+        phrase_token_t token = (phrase_token_t) token_value;
+        PinyinKeyVector keys;
+        PinyinKeyPosVector poses;
+
+        keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
+        poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
+        parser.parse(validator, keys, poses, pinyin);
+
+        /* add_index() reads keys[0]; skip records whose pinyin yielded
+         * no key at all */
+        if ( keys->len > 0 )
+            add_index( keys->len, (PinyinKey *)keys->data, token);
+
+        g_array_free(keys, TRUE);
+        g_array_free(poses, TRUE);
+    }
+    return 0 != feof(infile);
+}
+
+/* Rebuilds the slot table from the binary image in `chunk`.
+ * Layout read here, starting at `offset`: a table of
+ * (Initials * Finals * Tones + 1) table_offset_t values, followed by a
+ * c_separate byte.  Consecutive table entries delimit the data of one
+ * slot; equal entries denote an empty slot.  The last byte of each
+ * non-empty slot's span is expected to be c_separate and is not passed to
+ * the slot's load().
+ * Always returns true; inconsistencies are only caught by assert(). */
+bool PinyinBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
+ table_offset_t end){
+ /* drop whatever was loaded before */
+ reset();
+ char * buf_begin = (char *) chunk->begin();
+ table_offset_t phrase_begin, phrase_end;
+ table_offset_t * index = (table_offset_t *) (buf_begin + offset);
+ phrase_end = *index;
+ for ( int m = 0; m < PINYIN_Number_Of_Initials; ++m )
+ for ( int n = 0; n < PINYIN_Number_Of_Finals; ++n)
+ for ( int k = 0; k < PINYIN_Number_Of_Tones; ++k){
+ /* the end of the previous slot is the start of this one */
+ phrase_begin = phrase_end;
+ index++;
+ phrase_end = *index;
+ if ( phrase_begin == phrase_end ) //null pointer
+ continue;
+ PinyinLengthIndexLevel * phrases = new PinyinLengthIndexLevel;
+ m_pinyin_length_indexes[m][n][k] = phrases;
+ /* phrase_end - 1 excludes the trailing separator byte */
+ phrases->load(chunk, phrase_begin, phrase_end - 1);
+ assert( phrase_end <= end );
+ assert( *(buf_begin + phrase_end - 1) == c_separate);
+ }
+ /* the offset table itself must be terminated by a separator */
+ offset += (PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 1) * sizeof ( table_offset_t);
+ assert( c_separate == *(buf_begin + offset));
+ return true;
+}
+
+/* Serialises the slot table into `new_chunk`, starting at `offset`.
+ * Writes a table of (Initials * Finals * Tones + 1) table_offset_t values,
+ * a c_separate byte, and then the data of every non-empty slot, each
+ * followed by a c_separate byte.  Table entry i + 1 holds the offset just
+ * past slot i's data; an empty slot repeats the previous offset.
+ * `end` receives the offset just past the last byte written.
+ * Always returns true. */
+bool PinyinBitmapIndexLevel::store(MemoryChunk * new_chunk,
+ table_offset_t offset,
+ table_offset_t & end){
+ table_offset_t phrase_end;
+ /* `index` walks the offset table, `offset` walks the data area */
+ table_offset_t index = offset;
+ offset += (PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 1) * sizeof ( table_offset_t);
+ //add '#'
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ /* first table entry: start of the data area */
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ for ( int m = 0; m < PINYIN_Number_Of_Initials; ++m )
+ for ( int n = 0; n < PINYIN_Number_Of_Finals; ++n)
+ for ( int k = 0; k < PINYIN_Number_Of_Tones; ++k){
+ PinyinLengthIndexLevel * phrases = m_pinyin_length_indexes[m][n][k];
+ if ( !phrases ){ //null pointer
+ /* empty slot: begin == end in the table */
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ continue;
+ }
+ phrases->store(new_chunk, offset, phrase_end); //has a end '#'
+ offset = phrase_end;
+ //add '#'
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ }
+ end = offset;
+ return true;
+}
+
+/* Rebuilds the slot array from the binary image in `chunk`.
+ * Layout read here, starting at `offset`: a guint32 slot count, a table of
+ * (count + 1) table_offset_t values and a c_separate byte.  Consecutive
+ * table entries delimit the data of one slot; equal entries denote an
+ * empty (NULL) slot.  The last byte of each non-empty slot's span is
+ * expected to be c_separate and is not passed to the slot's load().
+ *
+ * Slot i is created as PinyinArrayIndexLevel<i>, the same type that
+ * add_index(), search(), store() and the destructor assume for slot i.
+ * Always returns true; inconsistencies are only caught by assert(). */
+bool PinyinLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
+    char * buf_begin = (char *) chunk->begin();
+    guint32 nindex = *((guint32 *)(buf_begin + offset));
+    table_offset_t * index = (table_offset_t *)
+        (buf_begin + offset + sizeof(guint32));
+
+    table_offset_t phrase_begin, phrase_end = *index;
+    /* release the array allocated by the constructor instead of leaking it */
+    if ( m_pinyin_array_indexes )
+        g_array_free(m_pinyin_array_indexes, TRUE);
+    m_pinyin_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
+    for ( size_t i = 0; i < nindex; i++){
+        /* the end of the previous slot is the start of this one */
+        phrase_begin = phrase_end;
+        index++;
+        phrase_end = *index;
+        if ( phrase_begin == phrase_end ){
+            void * null = NULL;
+            g_array_append_val(m_pinyin_array_indexes , null);
+            continue;
+        }
+
+/* CASE(x): load slot x as a PinyinArrayIndexLevel<x>.  The labels used to be
+ * `x - 1` with i counted from 1, which selected PinyinArrayIndexLevel<i + 1>
+ * for array position i - 1 and hit the default arm for the top slots. */
+#define CASE(x) case x: \
+        { \
+            PinyinArrayIndexLevel<x> * phrase = new PinyinArrayIndexLevel<x>; \
+            phrase->load(chunk, phrase_begin, phrase_end - 1); \
+            assert( *(buf_begin + phrase_end - 1) == c_separate); \
+            assert( phrase_end <= end ); \
+            g_array_append_val(m_pinyin_array_indexes, phrase); \
+            break; \
+        }
+        switch ( i ){
+            CASE(0);
+            CASE(1);
+            CASE(2);
+            CASE(3);
+            CASE(4);
+            CASE(5);
+            CASE(6);
+            CASE(7);
+            CASE(8);
+            CASE(9);
+            CASE(10);
+            CASE(11);
+            CASE(12);
+            CASE(13);
+            CASE(14);
+            CASE(15);
+        default:
+            assert(false);
+        }
+#undef CASE
+    }
+    /* the offset table itself must be terminated by a separator */
+    offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
+    assert ( c_separate == * (buf_begin + offset) );
+    return true;
+}
+
+/* Serialises the slot array into `new_chunk`, starting at `offset`.
+ * Writes a guint32 slot count, a table of (count + 1) table_offset_t
+ * values, a c_separate byte, and then the data of every non-NULL slot,
+ * each followed by a c_separate byte.  Table entry i + 1 holds the offset
+ * just past slot i's data; a NULL slot repeats the previous offset.
+ * `end` receives the offset just past the last byte written.
+ * Always returns true. */
+bool PinyinLengthIndexLevel::store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end){
+ guint32 nindex = m_pinyin_array_indexes->len;
+ new_chunk->set_content(offset, &nindex, sizeof(guint32));
+ /* `index` walks the offset table, `offset` walks the data area */
+ table_offset_t index = offset + sizeof(guint32);
+
+ offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
+ new_chunk->set_content(offset, &c_separate, sizeof(char));
+ offset += sizeof(char);
+ /* first table entry: start of the data area */
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));
+ index += sizeof(table_offset_t);
+ table_offset_t phrase_end;
+ for ( size_t i = 0 ; i < m_pinyin_array_indexes->len; ++i){
+/* CASE(x): store slot i as a PinyinArrayIndexLevel<x>.  The `continue` in
+ * the NULL branch applies to the enclosing for loop. */
+#define CASE(x) case x: { \
+ PinyinArrayIndexLevel<x> * phrase = g_array_index \
+ (m_pinyin_array_indexes, PinyinArrayIndexLevel<x> * , i); \
+ if ( !phrase ){ \
+ new_chunk->set_content \
+ (index, &offset, sizeof(table_offset_t)); \
+ index += sizeof(table_offset_t); \
+ continue; \
+ } \
+ phrase->store(new_chunk, offset, phrase_end); \
+ offset = phrase_end; \
+ /*add '#'*/ \
+ new_chunk->set_content(offset, &c_separate, sizeof(char)); \
+ offset += sizeof(char); \
+ new_chunk->set_content(index, &offset, sizeof(table_offset_t));\
+ index += sizeof(table_offset_t); \
+ break; \
+ }
+ switch ( i ){
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ default:
+ assert(false);
+ }
+#undef CASE
+ }
+ end = offset;
+ return true;
+}
+
+/* Points this level's chunk at the byte range [offset, end) of `chunk`.
+ * Always returns true. */
+template<size_t phrase_length>
+bool PinyinArrayIndexLevel<phrase_length>::
+load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){
+    char * const region = (char *) chunk->begin() + offset;
+    m_chunk.set_chunk(region, end - offset, NULL);
+    return true;
+}
+
+/* Copies this level's item array into `new_chunk` at `offset` and reports
+ * the offset just past the copy through `end`.  Always returns true. */
+template<size_t phrase_length>
+bool PinyinArrayIndexLevel<phrase_length>::
+store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end)
+{
+    //write the raw items, then tell the caller where they stop
+    new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
+    end = offset + m_chunk.size();
+    return true;
+}
diff --git a/src/storage/pinyin_large_table.h b/src/storage/pinyin_large_table.h
new file mode 100755
index 0000000..71b3640
--- /dev/null
+++ b/src/storage/pinyin_large_table.h
@@ -0,0 +1,178 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef PINYIN_LARGE_TABLE_H
+#define PINYIN_LARGE_TABLE_H
+
+#include <stdio.h>
+#include "novel_types.h"
+#include "memory_chunk.h"
+
+namespace novel{
+
+/* Because this table is not large,
+ * it is stored in the user's home directory.
+ */
+
+class PinyinLengthIndexLevel;
+
+/* Top level of the pinyin index: a dense table addressed by the
+ * (initial, final, tone) of a phrase's first key.  Each non-NULL entry is a
+ * PinyinLengthIndexLevel that indexes the remaining keys. */
+class PinyinBitmapIndexLevel{
+ friend class PinyinLargeTable;
+ /* settings consulted for ambiguity / incomplete-pinyin handling */
+ PinyinCustomSettings * m_custom;
+protected:
+ /* entries are deleted by reset(), which the destructor calls */
+ PinyinLengthIndexLevel * m_pinyin_length_indexes[PINYIN_Number_Of_Initials]
+ [PINYIN_Number_Of_Finals]
+ [PINYIN_Number_Of_Tones];
+ //search function
+ /* the three stages narrow the lookup by initial, final and tone */
+ int initial_level_search(int word_length, /* in */PinyinKey keys[],
+ /* out */ PhraseIndexRanges ranges) const;
+ int final_level_search(PinyinInitial initial, int word_length, /* in */PinyinKey keys[], /* out */ PhraseIndexRanges ranges) const;
+ int tone_level_search(PinyinInitial initial, PinyinFinal final, int word_length, /* in */PinyinKey keys[], /* out */ PhraseIndexRanges ranges) const;
+ /* deletes every entry of the table */
+ void reset();
+public:
+ PinyinBitmapIndexLevel(PinyinCustomSettings * custom);
+ ~PinyinBitmapIndexLevel(){
+ reset();
+ }
+
+ /* binary (de)serialisation; store() reports the end offset via `end` */
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t &end);
+
+ /*bool load_text(FILE * file);*/
+ /*bool save_text(FILE * file);*/
+
+ /*search/add_index method */
+ int search( int phrase_length, /* in */ PinyinKey keys[],
+ /* out */ PhraseIndexRanges ranges) const;
+ int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+ int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+
+/* Second level of the pinyin index: an array of pointers addressed by the
+ * number of keys that follow the first key.  Slot n holds a
+ * PinyinArrayIndexLevel<n> or NULL. */
+class PinyinLengthIndexLevel{
+protected:
+ /* GArray of PinyinArrayIndexLevel<n> pointers, stored as void * */
+ GArray* m_pinyin_array_indexes;
+public:
+ PinyinLengthIndexLevel();
+ ~PinyinLengthIndexLevel();
+ /* binary (de)serialisation; store() reports the end offset via `end` */
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end);
+
+ /*search/add_index method */
+ /* phrase_length counts the keys in `keys`, i.e. without the first key */
+ int search( int phrase_length, /* in */ PinyinCustomSettings * custom,
+ /* in */ PinyinKey keys[],
+ /* out */ PhraseIndexRanges ranges);
+ int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+ int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+
+/* Leaf level of the pinyin index: a sorted array of
+ * PinyinIndexItem<phrase_length> kept in a MemoryChunk. */
+template<size_t phrase_length>
+class PinyinArrayIndexLevel{
+protected:
+ /* raw storage of the sorted items */
+ MemoryChunk m_chunk;
+ /* turns the candidate window [begin, end) into token ranges */
+ int convert(PinyinCustomSettings * custom,
+ PinyinKey keys[],
+ PinyinIndexItem<phrase_length> * begin,
+ PinyinIndexItem<phrase_length> * end,
+ PhraseIndexRanges ranges);
+public:
+ /* binary (de)serialisation; store() reports the end offset via `end` */
+ bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
+ bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end);
+
+ /*search/add_index method */
+ int search(/* in */ PinyinCustomSettings * custom,
+ /* in */ PinyinKey keys[],
+ /* out */ PhraseIndexRanges ranges);
+ int add_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+ int remove_index(/* in */ PinyinKey keys[] ,/* in */ phrase_token_t token);
+};
+
+
+/* TODO: add file version check */
+/* Facade over PinyinBitmapIndexLevel that also keeps the MemoryChunk a
+ * binary table was loaded from. */
+class PinyinLargeTable{
+protected:
+ PinyinBitmapIndexLevel m_bitmap_table;
+ /* chunk passed to load(); deleted by reset() and by the destructor */
+ MemoryChunk * m_chunk;
+
+ /* deletes the currently held chunk, if any */
+ void reset(){
+ if ( m_chunk ){
+ delete m_chunk;
+ m_chunk = NULL;
+ }
+ }
+
+public:
+ PinyinLargeTable(PinyinCustomSettings * custom):
+ m_bitmap_table(custom){
+ m_chunk = NULL;
+ }
+
+ ~PinyinLargeTable(){
+ reset();
+ }
+
+ /*load/save method*/
+ /* Takes ownership of `chunk`: the previously held chunk is deleted and
+ * `chunk` is deleted later by reset() or the destructor.
+ * NOTE(review): passing the chunk that is already held would delete it
+ * before use -- confirm callers never do that. */
+ bool load(MemoryChunk * chunk){
+ reset();
+ m_chunk = chunk;
+ return m_bitmap_table.load(chunk, 0 , chunk->size());
+ }
+
+ /* serialises the table into new_chunk starting at offset 0 */
+ bool store(MemoryChunk * new_chunk){
+ table_offset_t end;
+ return m_bitmap_table.store(new_chunk, 0, end);
+ }
+
+ /* fills the table from a text stream */
+ bool load_text(FILE * file);
+/*
+ bool save_text(FILE * file){
+ return m_bitmap_table.save_text(file);
+ }
+*/
+
+ /*search/add_index method */
+ int search( int phrase_length, /* in */ PinyinKey keys[],
+ /* out */ PhraseIndexRanges ranges){
+ return m_bitmap_table.search(phrase_length, keys, ranges);
+ }
+
+ int add_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+ return m_bitmap_table.add_index(phrase_length, keys, token);
+ }
+ int remove_index( int phrase_length, /* in */ PinyinKey keys[] ,/* in */ phrase_token_t token){
+ return m_bitmap_table.remove_index(phrase_length, keys, token);
+ }
+
+ /* Reports whether a one-key lookup of `key` yields SEARCH_OK.  Only
+ * slot 1 of `ranges` is given an array, so only tokens mapped to that
+ * slot can produce a hit.
+ * NOTE(review): the meaning of library index 1 is not visible here --
+ * confirm it is the intended library. */
+ bool has_key(PinyinKey key) const {
+ PhraseIndexRanges ranges;
+ memset(ranges, 0, sizeof(ranges));
+ ranges[1] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
+ int result = m_bitmap_table.search(1, &key, ranges);
+ g_array_free(ranges[1], TRUE);
+ ranges[1] = NULL;
+ return result & SEARCH_OK;
+ }
+};
+
+};
+
+using namespace novel;
+#endif
diff --git a/src/storage/pinyin_phrase.h b/src/storage/pinyin_phrase.h
new file mode 100644
index 0000000..07ee0de
--- /dev/null
+++ b/src/storage/pinyin_phrase.h
@@ -0,0 +1,298 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef PINYIN_PHRASE_H
+#define PINYIN_PHRASE_H
+
+#include <string.h>
+#include "stl_lite.h"
+
+namespace novel{
+
+/* Maps any integer to its sign: 1 for positive, -1 for negative, 0 for 0. */
+static inline int pinyin_utility_sign(int value){
+    return (value > 0) - (value < 0);
+}
+
+/* Exact three-way comparison of two key sequences of equal length.
+ * All initials are compared first, then all finals, then all tones; the
+ * first difference decides.  Returns -1, 0 or 1. */
+inline int pinyin_exact_compare(const PinyinKey key_lhs[],
+                                const PinyinKey key_rhs[],
+                                int phrase_length){
+    int pos;
+
+    //pass 1: initials
+    for ( pos = 0; pos < phrase_length; ++pos ){
+        const int diff = key_lhs[pos].m_initial - key_rhs[pos].m_initial;
+        if ( 0 != diff )
+            return pinyin_utility_sign(diff);
+    }
+    //pass 2: finals
+    for ( pos = 0; pos < phrase_length; ++pos ){
+        const int diff = key_lhs[pos].m_final - key_rhs[pos].m_final;
+        if ( 0 != diff )
+            return pinyin_utility_sign(diff);
+    }
+    //pass 3: tones
+    for ( pos = 0; pos < phrase_length; ++pos ){
+        const int diff = key_lhs[pos].m_tone - key_rhs[pos].m_tone;
+        if ( 0 != diff )
+            return pinyin_utility_sign(diff);
+    }
+    return 0;
+}
+
+
+/* Settings-aware comparison of two key sequences of equal length.
+ * All initials are compared first, then all finals, then all tones, each
+ * with the matching pinyin_compare_* helper; the first non-zero helper
+ * result is returned unchanged.  Returns 0 when every component matches. */
+inline int pinyin_compare_with_ambiguities(const PinyinCustomSettings &custom,
+                                           const PinyinKey* key_lhs,
+                                           const PinyinKey* key_rhs,
+                                           int phrase_length){
+    int pos;
+
+    //pass 1: initials
+    for ( pos = 0; pos < phrase_length; ++pos ){
+        const int diff = pinyin_compare_initial
+            (custom,
+             (PinyinInitial)key_lhs[pos].m_initial,
+             (PinyinInitial)key_rhs[pos].m_initial);
+        if ( 0 != diff )
+            return diff;
+    }
+    //pass 2: finals
+    for ( pos = 0; pos < phrase_length; ++pos ){
+        const int diff = pinyin_compare_final
+            (custom,
+             (PinyinFinal)key_lhs[pos].m_final,
+             (PinyinFinal)key_rhs[pos].m_final);
+        if ( 0 != diff )
+            return diff;
+    }
+    //pass 3: tones
+    for ( pos = 0; pos < phrase_length; ++pos ){
+        const int diff = pinyin_compare_tone
+            (custom,
+             (PinyinTone)key_lhs[pos].m_tone,
+             (PinyinTone)key_rhs[pos].m_tone);
+        if ( 0 != diff )
+            return diff;
+    }
+    return 0;
+}
+
+//compute pinyin lower bound
+//maybe replace by table lookup
+/* Computes, key by key, the lowest key that still compares equal to the
+ * input key under `custom`: each of initial, final and tone is walked
+ * downwards while the pinyin_compare_* helper keeps returning 0.
+ *
+ * in_keys        the keys to bound (phrase_length entries).
+ * out_keys       receives the lower bound (phrase_length entries).
+ * phrase_length  number of keys; 0 is valid and touches neither array. */
+inline void compute_lower_value(const PinyinCustomSettings &custom,
+                                PinyinKey in_keys[],
+                                PinyinKey out_keys[],
+                                int phrase_length){
+    for ( int i = 0; i < phrase_length; i++){
+        int k; int sel;
+        /* read the key inside the loop only: reading in_keys[0] up front
+         * was an out-of-bounds access whenever phrase_length is 0 */
+        PinyinKey aKey = in_keys[i];
+        //deal with initial
+        sel = aKey.m_initial;
+        for( k = aKey.m_initial - 1; k >= PINYIN_ZeroInitial; k--){
+            if ( 0 != pinyin_compare_initial(custom,
+                                             (PinyinInitial)k,
+                                             (PinyinInitial)aKey.m_initial) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_initial = (PinyinInitial)sel;
+        //deal with final
+        sel = aKey.m_final;
+        for( k = aKey.m_final - 1; k >= PINYIN_ZeroFinal; k--){
+            if ( 0 != pinyin_compare_final(custom,
+                                           (PinyinFinal)k,
+                                           (PinyinFinal)aKey.m_final) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_final = (PinyinFinal)sel;
+        //deal with tone
+        sel = aKey.m_tone;
+        for( k = aKey.m_tone - 1; k >= PINYIN_ZeroTone; k--){
+            if ( 0 != pinyin_compare_tone(custom,
+                                          (PinyinTone)k,
+                                          (PinyinTone)aKey.m_tone) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_tone = (PinyinTone)sel;
+        //save the result
+        out_keys[i] = aKey;
+    }
+}
+
+//compute pinyin upper bound
+//maybe replace by table lookup
+/* Computes, key by key, the highest key that still compares equal to the
+ * input key under `custom`: each of initial, final and tone is walked
+ * upwards while the pinyin_compare_* helper keeps returning 0.
+ *
+ * in_keys        the keys to bound (phrase_length entries).
+ * out_keys       receives the upper bound (phrase_length entries).
+ * phrase_length  number of keys; 0 is valid and touches neither array. */
+inline void compute_upper_value(const PinyinCustomSettings &custom,
+                                PinyinKey in_keys[],
+                                PinyinKey out_keys[],
+                                int phrase_length){
+    for ( int i = 0; i < phrase_length; i++){
+        int k; int sel;
+        /* read the key inside the loop only: reading in_keys[0] up front
+         * was an out-of-bounds access whenever phrase_length is 0 */
+        PinyinKey aKey = in_keys[i];
+        //deal with initial
+        sel = aKey.m_initial;
+        for( k = aKey.m_initial + 1; k <= PINYIN_LastInitial; k++){
+            if ( 0 != pinyin_compare_initial(custom, (PinyinInitial)k, (PinyinInitial)aKey.m_initial) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_initial = (PinyinInitial)sel;
+        //deal with final
+        sel = aKey.m_final;
+        for( k = aKey.m_final + 1; k <= PINYIN_LastFinal; k++){
+            if ( 0 != pinyin_compare_final(custom, (PinyinFinal)k, (PinyinFinal)aKey.m_final) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_final = (PinyinFinal)sel;
+        //deal with tone
+        sel = aKey.m_tone;
+        for( k = aKey.m_tone + 1; k <= PINYIN_LastTone; k++){
+            if ( 0 != pinyin_compare_tone(custom, (PinyinTone)k, (PinyinTone)aKey.m_tone) )
+                break;
+            else
+                sel = k;
+        }
+        aKey.m_tone = (PinyinTone)sel;
+        //save the result
+        out_keys[i] = aKey;
+    }
+}
+
+/* One index record: a phrase token together with the phrase_length keys
+ * that address it. */
+template<int phrase_length>
+struct PinyinIndexItem{
+    phrase_token_t m_token;
+    PinyinKey m_keys[phrase_length];
+public:
+    /* copies phrase_length keys from `keys` and stores `token` */
+    PinyinIndexItem<phrase_length>(PinyinKey * keys, phrase_token_t token){
+        m_token = token;
+        memmove(m_keys, keys, phrase_length * sizeof(PinyinKey));
+    }
+};
+
+/*
+//just need less than mode
+//this method mainly used in pinyin lookup
+template<int phrase_length>
+class PhraseCompareWithAmbiguities
+ : public std_lite::binary_function <const PinyinIndexItem <phrase_length>,
+ const PinyinIndexItem <phrase_length>, int>
+{
+ const PinyinCustomSettings & m_custom;
+public:
+ PhraseCompareWithAmbiguities<phrase_length>
+ (const PinyinCustomSettings & custom):m_custom(custom){}
+
+ int operator () (const PinyinIndexItem<phrase_length> &lhs,
+ const PinyinIndexItem<phrase_length> &rhs) const{
+ PinyinKey * key_lhs = (PinyinKey *) lhs.m_keys;
+ PinyinKey * key_rhs = (PinyinKey *) rhs.m_keys;
+ return pinyin_compare_with_ambiguities(m_custom,
+ key_lhs, key_rhs, phrase_length);
+ }
+};
+*/
+
+//for find the element in the phrase array
+/* Three-way comparison functor for index items: compares only the keys,
+ * exactly, and ignores the token. */
+template<int phrase_length>
+class PhraseExactCompare
+    : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                        ,const PinyinIndexItem<phrase_length>, int>
+{
+public:
+    int operator () (const PinyinIndexItem<phrase_length> &lhs,
+                     const PinyinIndexItem<phrase_length> &rhs) const{
+        const PinyinKey * left_keys = lhs.m_keys;
+        const PinyinKey * right_keys = rhs.m_keys;
+        return pinyin_exact_compare(left_keys, right_keys, phrase_length);
+    }
+};
+
+/*
+//for find the element in the phrase array
+template<int phrase_length>
+class PhraseExactCompareWithToken
+ : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+ ,const PinyinIndexItem<phrase_length>, int>
+{
+public:
+ int operator () (const PinyinIndexItem<phrase_length> &lhs,
+ const PinyinIndexItem<phrase_length> &rhs) const{
+ PinyinKey * key_lhs = (PinyinKey *) lhs.m_keys;
+ PinyinKey * key_rhs = (PinyinKey *) rhs.m_keys;
+
+ phrase_token_t token_lhs = lhs.m_token;
+ phrase_token_t token_rhs = rhs.m_token;
+
+ int result = pinyin_exact_compare(key_lhs, key_rhs, phrase_length);
+ if ( !result )
+ return result;
+ return pinyin_utility_sign(token_lhs - token_rhs);
+ }
+};
+*/
+
+/* Strict "less than" predicate for index items, built on
+ * PhraseExactCompare; true when the comparison yields -1. */
+template<int phrase_length>
+class PhraseExactLessThan
+    : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+                                        ,const PinyinIndexItem<phrase_length>,
+                                        bool>
+{
+ private:
+    PhraseExactCompare<phrase_length> m_compare;
+ public:
+    bool operator () (const PinyinIndexItem<phrase_length> &lhs,
+                      const PinyinIndexItem<phrase_length> &rhs) const{
+        const int order = m_compare(lhs, rhs);
+        return order == -1;
+    }
+};
+
+/*
+template<int phrase_length>
+class PhraseExactLessThanWithToken
+ : public std_lite::binary_function <const PinyinIndexItem<phrase_length>
+ ,const PinyinIndexItem<phrase_length>,
+ bool>
+{
+ private:
+ PhraseExactCompareWithToken<phrase_length> m_compare;
+ public:
+ bool operator () (const PinyinIndexItem<phrase_length> &lhs,
+ const PinyinIndexItem<phrase_length> &rhs) const{
+ return -1 == m_compare(lhs, rhs);
+ }
+};
+*/
+
+};
+
+using namespace novel;
+
+#endif
diff --git a/src/storage/pinyin_zhuyin_map_data.h b/src/storage/pinyin_zhuyin_map_data.h
new file mode 100644
index 0000000..7557c5e
--- /dev/null
+++ b/src/storage/pinyin_zhuyin_map_data.h
@@ -0,0 +1,582 @@
+static const PinyinKey __zhuyin_standard_map [][3] = /* Key map for the "standard" zhuyin layout (going by the name): one row per key, each row holding up to three candidate PinyinKey values. */
+{ /* Rows are listed in ASCII order from ' ' to '|' (93 rows); presumably indexed by (key - ' ') -- TODO confirm against the code that reads this table. PinyinKey(0) with a blank label appears to mark an unused slot. */
+/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ' */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* , */{PinyinKey(42) /* eh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* - */{PinyinKey(66) /* er */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* . */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* / */{PinyinKey(60) /* eng */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 0 */{PinyinKey(18) /* an */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 1 */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 2 */{PinyinKey(960) /* d */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 3 */{PinyinKey(3) /* 3 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 4 */{PinyinKey(4) /* 4 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 5 */{PinyinKey(5520) /* zh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 6 */{PinyinKey(2) /* 2 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 7 */{PinyinKey(5) /* 5 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 8 */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 9 */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ; */{PinyinKey(24) /* ang */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* = */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ? */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* a */{PinyinKey(2640) /* m */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* b */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* c */{PinyinKey(1680) /* h */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* d */{PinyinKey(2160) /* k */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* e */{PinyinKey(1440) /* g */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* f */{PinyinKey(3360) /* q */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* g */{PinyinKey(4080) /* sh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* h */{PinyinKey(480) /* c */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* i */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* j */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* k */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* l */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* m */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* n */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* o */{PinyinKey(48) /* ei */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* p */{PinyinKey(54) /* en */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* q */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* r */{PinyinKey(1920) /* j */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* s */{PinyinKey(2880) /* n */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* t */{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* u */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* v */{PinyinKey(4800) /* x */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* w */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* x */{PinyinKey(2400) /* l */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* y */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* z */{PinyinKey(1200) /* f */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+}; /* Value labels as they appear above: digits 1-5 carry that number, vowel-like labels (a, ang, ...) carry multiples of 6, consonant-like labels (b, zh, ...) multiples of 240; how PinyinKey decodes them is not shown here. */
+
+static const PinyinKey __zhuyin_hsu_map [][3] = /* Key map for the "hsu" zhuyin layout (going by the name): only the space bar and lowercase letters are mapped, and one key may carry up to three candidates (e.g. 'j' -> j, zh, 4). */
+{ /* Rows are listed in ASCII order from ' ' to '|' (93 rows); presumably indexed by (key - ' ') -- TODO confirm against the code that reads this table. PinyinKey(0) with a blank label appears to mark an unused slot. */
+/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ' */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* , */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* - */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* . */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* / */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 0 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 1 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 2 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 3 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 4 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 5 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 6 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 7 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 8 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 9 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ; */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* = */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ? */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* a */{PinyinKey(480) /* c */, PinyinKey(48) /* ei */, PinyinKey(0) /* */},
+/* b */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* c */{PinyinKey(4800) /* x */, PinyinKey(4080) /* sh */, PinyinKey(0) /* */},
+/* d */{PinyinKey(960) /* d */, PinyinKey(2) /* 2 */, PinyinKey(0) /* */},
+/* e */{PinyinKey(72) /* i */, PinyinKey(42) /* eh */, PinyinKey(0) /* */},
+/* f */{PinyinKey(1200) /* f */, PinyinKey(3) /* 3 */, PinyinKey(0) /* */},
+/* g */{PinyinKey(1440) /* g */, PinyinKey(36) /* e */, PinyinKey(0) /* */},
+/* h */{PinyinKey(1680) /* h */, PinyinKey(138) /* o */, PinyinKey(0) /* */},
+/* i */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* j */{PinyinKey(1920) /* j */, PinyinKey(5520) /* zh */, PinyinKey(4) /* 4 */},
+/* k */{PinyinKey(2160) /* k */, PinyinKey(24) /* ang */, PinyinKey(0) /* */},
+/* l */{PinyinKey(2400) /* l */, PinyinKey(60) /* eng */, PinyinKey(66) /* er */},
+/* m */{PinyinKey(2640) /* m */, PinyinKey(18) /* an */, PinyinKey(0) /* */},
+/* n */{PinyinKey(2880) /* n */, PinyinKey(54) /* en */, PinyinKey(0) /* */},
+/* o */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* p */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* r */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* s */{PinyinKey(3840) /* s */, PinyinKey(5) /* 5 */, PinyinKey(0) /* */},
+/* t */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* u */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* v */{PinyinKey(3360) /* q */, PinyinKey(720) /* ch */, PinyinKey(0) /* */},
+/* w */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* x */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* y */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* z */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+}; /* Candidates within a row are stored left to right; how a reader chooses among them is not visible in this table. */
+
+static const PinyinKey __zhuyin_ibm_map [][3] = /* Key map for the "ibm" zhuyin layout (going by the name): one row per key; every mapped key here uses only the first of its three slots. */
+{ /* Rows are listed in ASCII order from ' ' to '|' (93 rows); presumably indexed by (key - ' ') -- TODO confirm against the code that reads this table. PinyinKey(0) with a blank label appears to mark an unused slot. */
+/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ' */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* , */{PinyinKey(3) /* 3 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* - */{PinyinKey(1680) /* h */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* . */{PinyinKey(4) /* 4 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* / */{PinyinKey(5) /* 5 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 0 */{PinyinKey(2160) /* k */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 1 */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 2 */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 3 */{PinyinKey(2640) /* m */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 4 */{PinyinKey(1200) /* f */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 5 */{PinyinKey(960) /* d */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 6 */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 7 */{PinyinKey(2880) /* n */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 8 */{PinyinKey(2400) /* l */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 9 */{PinyinKey(1440) /* g */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ; */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* = */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ? */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* a */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* b */{PinyinKey(60) /* eng */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* c */{PinyinKey(54) /* en */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* d */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* e */{PinyinKey(4800) /* x */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* f */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* g */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* h */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* i */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* j */{PinyinKey(42) /* eh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* k */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* l */{PinyinKey(48) /* ei */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* m */{PinyinKey(2) /* 2 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* n */{PinyinKey(66) /* er */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* o */{PinyinKey(480) /* c */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* p */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* q */{PinyinKey(1920) /* j */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* r */{PinyinKey(5520) /* zh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* s */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* t */{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* u */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* v */{PinyinKey(24) /* ang */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* w */{PinyinKey(3360) /* q */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* x */{PinyinKey(18) /* an */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* y */{PinyinKey(4080) /* sh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* z */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+}; /* The label after each value names what the number stands for (a digit 1-5 or a pinyin letter group); the decoding done by PinyinKey is not shown here. */
+
+static const PinyinKey __zhuyin_gin_yieh_map [][3] = /* Key map for the "gin yieh" zhuyin layout (going by the name): one row per key; every mapped key here uses only the first of its three slots. */
+{ /* Rows are listed in ASCII order from ' ' to '|' (93 rows); presumably indexed by (key - ' ') -- TODO confirm against the code that reads this table. PinyinKey(0) with a blank label appears to mark an unused slot. */
+/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ' */{PinyinKey(60) /* eng */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* , */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* - */{PinyinKey(54) /* en */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* . */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* / */{PinyinKey(18) /* an */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 0 */{PinyinKey(48) /* ei */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 1 */{PinyinKey(5) /* 5 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 2 */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 3 */{PinyinKey(960) /* d */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 4 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 5 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 6 */{PinyinKey(5520) /* zh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 7 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 8 */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 9 */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ; */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* = */{PinyinKey(66) /* er */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ? */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* [ */{PinyinKey(24) /* ang */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* a */{PinyinKey(3) /* 3 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* b */{PinyinKey(4800) /* x */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* c */{PinyinKey(2400) /* l */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* d */{PinyinKey(2880) /* n */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* e */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* f */{PinyinKey(2160) /* k */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* g */{PinyinKey(3360) /* q */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* h */{PinyinKey(4080) /* sh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* i */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* j */{PinyinKey(480) /* c */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* k */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* l */{PinyinKey(42) /* eh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* m */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* n */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* o */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* p */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* q */{PinyinKey(2) /* 2 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* r */{PinyinKey(1440) /* g */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* s */{PinyinKey(2640) /* m */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* t */{PinyinKey(1920) /* j */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* u */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* v */{PinyinKey(1680) /* h */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* w */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* x */{PinyinKey(1200) /* f */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* y */{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* z */{PinyinKey(4) /* 4 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+}; /* The label after each value names what the number stands for (a digit 1-5 or a pinyin letter group); the decoding done by PinyinKey is not shown here. */
+
+static const PinyinKey __zhuyin_et_map [][3] = /* Key map for the "et" zhuyin layout (going by the name): one row per key; every mapped key here uses only the first of its three slots. */
+{ /* Rows are listed in ASCII order from ' ' to '|' (93 rows); presumably indexed by (key - ' ') -- TODO confirm against the code that reads this table. PinyinKey(0) with a blank label appears to mark an unused slot. */
+/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ' */{PinyinKey(480) /* c */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* , */{PinyinKey(5520) /* zh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* - */{PinyinKey(60) /* eng */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* . */{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* / */{PinyinKey(4080) /* sh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 0 */{PinyinKey(24) /* ang */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 1 */{PinyinKey(5) /* 5 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 2 */{PinyinKey(2) /* 2 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 3 */{PinyinKey(3) /* 3 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 4 */{PinyinKey(4) /* 4 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 5 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 6 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 7 */{PinyinKey(3360) /* q */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 8 */{PinyinKey(18) /* an */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 9 */{PinyinKey(54) /* en */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ; */{PinyinKey(5280) /* z */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* = */{PinyinKey(66) /* er */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ? */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* a */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* b */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* c */{PinyinKey(4800) /* x */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* d */{PinyinKey(960) /* d */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* e */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* f */{PinyinKey(1200) /* f */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* g */{PinyinKey(1920) /* j */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* h */{PinyinKey(1680) /* h */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* i */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* j */{PinyinKey(3600) /* r */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* k */{PinyinKey(2160) /* k */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* l */{PinyinKey(2400) /* l */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* m */{PinyinKey(2640) /* m */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* n */{PinyinKey(2880) /* n */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* o */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* p */{PinyinKey(3120) /* p */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* q */{PinyinKey(48) /* ei */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* r */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* s */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* t */{PinyinKey(4320) /* t */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* u */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* v */{PinyinKey(1440) /* g */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* w */{PinyinKey(42) /* eh */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* x */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* y */{PinyinKey(150) /* ou */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* z */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+}; /* The label after each value names what the number stands for (a digit 1-5 or a pinyin letter group); the decoding done by PinyinKey is not shown here. */
+
+/*
+ * Key-to-zhuyin lookup table for the "et26" keyboard scheme (presumably the
+ * ETen 26-key bopomofo layout -- TODO confirm against the scheme enum).
+ *
+ * One row per printable ASCII character, in ASCII order from ' ' through
+ * '|', as labelled by the comment that starts each row.  A row holds up to
+ * three candidate PinyinKey values; the comment following each entry names
+ * the initial, final or tone digit that the numeric value is meant to encode.
+ * Several letters carry two candidates here (e.g. 'c' -> x / sh,
+ * 'd' -> d / tone 5).  PinyinKey(0) appears to mark an unused slot.
+ */
+static const PinyinKey __zhuyin_et26_map [][3] =
+{
+/* */{PinyinKey(1) /* 1 */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ! */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* " */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* # */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* $ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* % */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* & */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ' */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ( */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ) */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* * */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* + */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* , */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* - */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* . */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* / */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 0 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 1 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 2 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 3 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 4 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 5 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 6 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 7 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 8 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* 9 */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* : */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ; */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* < */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* = */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* > */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ? */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* @ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* A */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* B */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* C */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* D */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* E */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* F */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* G */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* H */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* I */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* J */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* K */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* L */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* M */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* N */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* O */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* P */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Q */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* R */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* S */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* T */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* U */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* V */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* W */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* X */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Y */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* Z */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* [ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* \ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ] */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ^ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* _ */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* ` */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* a */{PinyinKey(6) /* a */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* b */{PinyinKey(240) /* b */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* c */{PinyinKey(4800) /* x */, PinyinKey(4080) /* sh */, PinyinKey(0) /* */},
+/* d */{PinyinKey(960) /* d */, PinyinKey(5) /* 5 */, PinyinKey(0) /* */},
+/* e */{PinyinKey(72) /* i */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* f */{PinyinKey(1200) /* f */, PinyinKey(2) /* 2 */, PinyinKey(0) /* */},
+/* g */{PinyinKey(1920) /* j */, PinyinKey(5520) /* zh */, PinyinKey(0) /* */},
+/* h */{PinyinKey(1680) /* h */, PinyinKey(66) /* er */, PinyinKey(0) /* */},
+/* i */{PinyinKey(12) /* ai */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* j */{PinyinKey(3600) /* r */, PinyinKey(3) /* 3 */, PinyinKey(0) /* */},
+/* k */{PinyinKey(2160) /* k */, PinyinKey(4) /* 4 */, PinyinKey(0) /* */},
+/* l */{PinyinKey(2400) /* l */, PinyinKey(60) /* eng */, PinyinKey(0) /* */},
+/* m */{PinyinKey(2640) /* m */, PinyinKey(18) /* an */, PinyinKey(0) /* */},
+/* n */{PinyinKey(2880) /* n */, PinyinKey(54) /* en */, PinyinKey(0) /* */},
+/* o */{PinyinKey(138) /* o */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* p */{PinyinKey(3120) /* p */, PinyinKey(150) /* ou */, PinyinKey(0) /* */},
+/* q */{PinyinKey(5280) /* z */, PinyinKey(48) /* ei */, PinyinKey(0) /* */},
+/* r */{PinyinKey(36) /* e */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* s */{PinyinKey(3840) /* s */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* t */{PinyinKey(4320) /* t */, PinyinKey(24) /* ang */, PinyinKey(0) /* */},
+/* u */{PinyinKey(216) /* v */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* v */{PinyinKey(1440) /* g */, PinyinKey(3360) /* q */, PinyinKey(0) /* */},
+/* w */{PinyinKey(480) /* c */, PinyinKey(42) /* eh */, PinyinKey(0) /* */},
+/* x */{PinyinKey(156) /* u */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* y */{PinyinKey(720) /* ch */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* z */{PinyinKey(30) /* ao */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* { */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+/* | */{PinyinKey(0) /* */, PinyinKey(0) /* */, PinyinKey(0) /* */},
+};
+
diff --git a/src/training/Makefile.am b/src/training/Makefile.am
new file mode 100644
index 0000000..520e4e1
--- /dev/null
+++ b/src/training/Makefile.am
@@ -0,0 +1,36 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+## Header search paths shared by every training tool.  AM_CPPFLAGS is used
+## instead of INCLUDES, which automake documents as a deprecated older name
+## for the same variable.
+AM_CPPFLAGS = -I$(top_srcdir)/src/include \
+              -I$(top_srcdir)/src/storage \
+              @GLIB2_CPPFLAGS@
+
+## Offline training utilities: built in place, never installed.
+noinst_PROGRAMS = gen_ngram gen_unigram estimate_interpolation
+
+## Each tool is a single translation unit linked against the storage library.
+gen_ngram_SOURCES = gen_ngram.cpp
+
+gen_ngram_LDADD = ../storage/libstorage.la @GLIB2_LDFLAGS@
+
+gen_unigram_SOURCES = gen_unigram.cpp
+
+gen_unigram_LDADD = ../storage/libstorage.la @GLIB2_LDFLAGS@
+
+estimate_interpolation_SOURCES = estimate_interpolation.cpp
+
+estimate_interpolation_LDADD = ../storage/libstorage.la @GLIB2_LDFLAGS@
diff --git a/src/training/estimate_interpolation.cpp b/src/training/estimate_interpolation.cpp
new file mode 100644
index 0000000..1a547bc
--- /dev/null
+++ b/src/training/estimate_interpolation.cpp
@@ -0,0 +1,151 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2008 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <math.h>
+#include <glib.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "ngram.h"
+
+/**
+ * compute_interpolation:
+ * @deleted_bigram: successor counts read from the "deleted" bigram database
+ *                  for one preceding token; must not be NULL.
+ * @unigram: phrase index used to look up unigram frequencies.
+ * @bigram: successor counts from the trained bigram database for the same
+ *          preceding token, or NULL when that token has no entry.
+ *
+ * Estimates the weight lambda that mixes the bigram and unigram estimates.
+ * Starting from 0.6, lambda is replaced by
+ *
+ *   sum_t deleted_freq(t) * lambda*Pb(t) / (lambda*Pb(t) + (1-lambda)*Pu(t))
+ *   ------------------------------------------------------------------------
+ *                     total frequency of @deleted_bigram
+ *
+ * where t runs over every token stored in @deleted_bigram, Pb is the
+ * relative frequency in @bigram and Pu the relative frequency in @unigram.
+ * The update repeats until two successive values differ by at most 0.001.
+ *
+ * Returns: the last computed lambda.
+ */
+parameter_t compute_interpolation(SingleGram * deleted_bigram,
+                                  FacadePhraseIndex * unigram,
+                                  SingleGram * bigram){
+    const parameter_t epsilon = 0.001;
+    parameter_t lambda = 0, next_lambda = 0.6;
+
+    /* Calls whose results are needed must not live inside assert():
+     * building with NDEBUG would drop them and leave the outputs unset. */
+    bool success = false;
+
+    /* The deleted entries and their total do not depend on lambda, so they
+     * are fetched once instead of on every iteration. */
+    PhraseIndexRange range;
+    range.m_range_begin = token_min;
+    range.m_range_end = token_max;
+
+    BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem));
+    deleted_bigram->search(&range, array);
+
+    guint32 table_num = 0;
+    success = deleted_bigram->get_total_freq(table_num);
+    assert(success);
+
+    while ( fabs(lambda - next_lambda) > epsilon ){
+        lambda = next_lambda;
+        next_lambda = 0;
+
+        for ( guint i = 0; i < array->len; ++i ){
+            BigramPhraseItem * entry = &g_array_index(array, BigramPhraseItem, i);
+            //get the phrase token
+            phrase_token_t token = entry->m_token;
+
+            guint32 deleted_freq = 0;
+            success = deleted_bigram->get_freq(token, deleted_freq);
+            assert(success);
+
+            /* bigram share: lambda * Pb(token); zero when unseen */
+            parameter_t bigram_poss = 0;
+            guint32 bigram_freq = 0;
+            if ( bigram && bigram->get_freq(token, bigram_freq) ){
+                guint32 bigram_total = 0;
+                success = bigram->get_total_freq(bigram_total);
+                assert(success);
+                assert(0 != bigram_total);
+                bigram_poss = bigram_freq / (parameter_t) bigram_total;
+            }
+            const parameter_t numerator = lambda * bigram_poss;
+
+            /* unigram share: (1 - lambda) * Pu(token); zero when unknown */
+            parameter_t unigram_poss = 0;
+            PhraseItem phrase_item;
+            if ( unigram->get_phrase_item(token, phrase_item) ){
+                guint32 unigram_freq = phrase_item.get_unigram_frequency();
+                guint32 unigram_total = unigram->get_phrase_index_total_freq();
+                /* an empty index would otherwise yield inf/NaN */
+                if ( 0 != unigram_total )
+                    unigram_poss = unigram_freq / (parameter_t) unigram_total;
+            }
+            const parameter_t part_of_denominator = (1 - lambda) * unigram_poss;
+
+            /* neither model knows the token: it contributes nothing */
+            if ( 0 == (numerator + part_of_denominator) )
+                continue;
+
+            next_lambda += deleted_freq * (numerator / (numerator + part_of_denominator));
+        }
+        next_lambda /= table_num;
+    }
+
+    g_array_free(array, TRUE);
+    (void) success; /* only read by assert(); silences NDEBUG warnings */
+
+    lambda = next_lambda;
+    return lambda;
+}
+
+/*
+ * Entry point of estimate_interpolation.
+ *
+ * Loads the gb_char and gbk_char phrase tables, attaches the bigram and the
+ * deleted bigram databases, then calls compute_interpolation() once for every
+ * token that the deleted bigram database reports as a system item.  Each
+ * resulting lambda is printed, followed by the mean of all of them.
+ * Command line arguments are not examined; every path is hard coded.
+ */
+int main(int argc, char * argv[]){
+    FacadePhraseIndex phrase_index;
+
+    //gb_char binary file, registered under index 1
+    MemoryChunk * gb_chunk = new MemoryChunk;
+    gb_chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, gb_chunk);
+
+    //gbk_char binary file, registered under index 2
+    MemoryChunk * gbk_chunk = new MemoryChunk;
+    gbk_chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, gbk_chunk);
+
+    Bigram bigram;
+    bigram.attach("../../data/bigram.db", NULL);
+
+    Bigram deleted_bigram;
+    deleted_bigram.attach("../../data/deleted_bigram.db", NULL);
+
+    GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    //only system entries are expected in the deleted bigram database
+    deleted_bigram.get_all_items(system_items, user_items);
+    assert(0 == user_items->len);
+    g_array_free(user_items, TRUE);
+
+    parameter_t lambda_sum = 0;
+    int lambda_count = 0;
+
+    for ( guint index = 0; index < system_items->len; ++index ){
+        phrase_token_t history = g_array_index(system_items, phrase_token_t, index);
+
+        SingleGram * trained = NULL, * trained_user = NULL;
+        bigram.load(history, trained, trained_user);
+        assert(NULL == trained_user);
+
+        SingleGram * held_out = NULL, * held_out_user = NULL;
+        deleted_bigram.load(history, held_out, held_out_user);
+        assert(NULL == held_out_user);
+
+        const parameter_t lambda =
+            compute_interpolation(held_out, &phrase_index, trained);
+
+        printf("lambda:%f\n", lambda);
+
+        lambda_sum += lambda;
+        ++lambda_count;
+
+        //trained may be NULL; deleting a null pointer is a no-op
+        delete trained;
+        delete held_out;
+    }
+
+    printf("average lambda:%f\n", (lambda_sum/lambda_count));
+    g_array_free(system_items, TRUE);
+}
+
diff --git a/src/training/gen_ngram.cpp b/src/training/gen_ngram.cpp
new file mode 100644
index 0000000..4dfea78
--- /dev/null
+++ b/src/training/gen_ngram.cpp
@@ -0,0 +1,179 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "ngram.h"
+
+
+/* Phrase string -> phrase token lookup.  Created in main() with g_free as
+ * the key destructor; init_phrases() inserts g_strdup'ed phrase strings as
+ * keys and stores each token in the value pointer via GUINT_TO_POINTER. */
+static GHashTable * g_phrases;
+
+/**
+ * init_phrases:
+ * @infile: open text stream holding whitespace separated records of the
+ *          form "pinyin phrase token freq" (gb_char.table, gbk_char.table).
+ *
+ * Inserts every record's phrase into g_phrases, mapped to its token.  The
+ * pinyin and freq fields are parsed only to step over them.
+ *
+ * String fields are limited to the buffer size, the numeric conversions
+ * match the variables they write to, and a record is accepted only when all
+ * four fields were read, so a final record without a trailing newline is
+ * kept and malformed input cannot misalign the following records.
+ *
+ * Returns: true when the whole stream was consumed, false when reading
+ * stopped early at a malformed or truncated record or on a read error.
+ */
+bool init_phrases(FILE * infile){
+    char pinyin[256];
+    char phrase[256];
+    unsigned int token = 0;
+    unsigned long freq = 0;
+    int fields;
+
+    while ( 4 == (fields = fscanf(infile, "%255s %255s %u %lu",
+                                  pinyin, phrase, &token, &freq)) ){
+        g_hash_table_insert(g_phrases, g_strdup(phrase),
+                            GUINT_TO_POINTER(token));
+    }
+
+    /* fscanf reports EOF only when input ended before the first field */
+    return EOF == fields && !ferror(infile);
+}
+
+/* Writes the command line synopsis to stdout and terminates the process
+ * with exit status 1; never returns. */
+void print_help(){
+    fputs("gen_ngram [--skip-pi-gram-training] [--skip-unigram-training]\n"
+          " [--bigram-file <FILENAME>]\n", stdout);
+    exit(1);
+}
+
+/*
+ * Entry point of gen_ngram.
+ *
+ * Reads a corpus from stdin, one phrase per line, and counts it into the
+ * phrase index (unigram frequencies) and the bigram database.  A line that
+ * is not found in g_phrases gets token 0, which also makes the next known
+ * phrase count as following sentence_start instead of a previous phrase.
+ *
+ * Options:
+ *   --skip-pi-gram-training   do not count phrases that follow sentence_start
+ *   --skip-unigram-training   do not touch unigram frequencies
+ *   --bigram-file <FILENAME>  bigram database to update
+ *   --help                    print the synopsis and exit
+ * Unrecognised arguments are ignored.
+ *
+ * Returns 0; exits with status 1 when a phrase table cannot be opened.
+ */
+int main(int argc, char * argv[]){
+    bool train_pi_gram = true;
+    bool train_unigram = true;
+    const char * bigram_filename = "../../data/bigram.db";
+
+    setlocale(LC_ALL,"");
+    for ( int i = 1; i < argc; ++i ){
+        if ( strcmp("--help", argv[i] ) == 0){
+            print_help();
+        }else if ( strcmp("--skip-pi-gram-training", argv[i] ) == 0) {
+            train_pi_gram = false;
+        }else if ( strcmp("--skip-unigram-training", argv[i] ) == 0) {
+            train_unigram = false;
+        }else if ( strcmp("--bigram-file", argv[i] ) == 0){
+            //print_help() does not return, so argv[i] is valid below
+            if ( ++i >= argc )
+                print_help();
+            bigram_filename = argv[i];
+        }
+    }
+
+    g_phrases = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+    //init phrase lookup
+    FILE * gb_file = fopen("../../data/gb_char.table", "r");
+    if ( gb_file == NULL ){
+        fprintf(stderr, "can't open gb_char.table!\n");
+        exit(1);
+    }
+    init_phrases(gb_file);
+    fclose(gb_file);
+    FILE * gbk_file = fopen("../../data/gbk_char.table", "r");
+    if ( gbk_file == NULL ){
+        fprintf(stderr, "can't open gbk_char.table!\n");
+        exit(1);
+    }
+    init_phrases(gbk_file);
+    fclose(gbk_file);
+
+    FacadePhraseIndex phrase_index;
+
+    //gb_char binary file
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, chunk);
+
+    //gbk_char binary file
+    chunk = new MemoryChunk;
+    chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, chunk);
+
+    Bigram bigram;
+    bigram.attach(NULL, bigram_filename);
+
+    //getline() allocates and grows the buffer itself
+    char * linebuf = NULL;
+    size_t size = 0;
+    ssize_t length;
+    phrase_token_t last_token, cur_token = last_token = 0;
+
+    //getline() returns -1 on end of file and on error; testing for that
+    //ends the loop in both cases and still handles a last line that has
+    //no trailing newline
+    while ( (length = getline(&linebuf, &size, stdin)) != -1 ){
+        //strip the line terminator only when there is one
+        if ( length > 0 && '\n' == linebuf[length - 1] )
+            linebuf[length - 1] = '\0';
+
+        phrase_token_t token = 0;
+        gpointer orig_key, value;
+        if ( g_hash_table_lookup_extended(g_phrases, linebuf,
+                                          &orig_key, &value) )
+            token = GPOINTER_TO_UINT(value);
+
+        last_token = cur_token;
+        cur_token = token;
+
+        //unknown phrase: nothing to count
+        if ( 0 == cur_token )
+            continue;
+
+        //training uni-gram
+        if ( train_unigram )
+            phrase_index.add_unigram_frequency(cur_token, 1);
+
+        //training bi-gram; no previous phrase means cur_token follows
+        //sentence_start, which is only counted with pi-gram training
+        const bool follows_start = ( 0 == last_token );
+        if ( follows_start && !train_pi_gram )
+            continue;
+        phrase_token_t history = follows_start ? sentence_start : last_token;
+
+        SingleGram * system = NULL, * user = NULL;
+        bigram.load(history, system, user);
+        assert(NULL == system);
+        if ( NULL == user )
+            user = new SingleGram;
+
+        //start from zero so that a missing entry counts up from 0
+        guint32 freq = 0, total_freq = 0;
+        //increase freq
+        user->get_freq(cur_token, freq);
+        user->set_freq(cur_token, freq + 1);
+        //increase total freq
+        user->get_total_freq(total_freq);
+        user->set_total_freq(total_freq + 1);
+
+        bigram.store(history, user);
+        delete user;
+    }
+    free(linebuf);
+
+    //write both phrase tables back, then hand the chunks to the index again
+    MemoryChunk * new_chunk = new MemoryChunk;
+    phrase_index.store(1, new_chunk);
+    new_chunk->save("../../data/gb_char.bin");
+    phrase_index.load(1, new_chunk);
+
+    new_chunk = new MemoryChunk;
+    phrase_index.store(2, new_chunk);
+    new_chunk->save("../../data/gbk_char.bin");
+    phrase_index.load(2, new_chunk);
+
+    return 0;
+}
diff --git a/src/training/gen_unigram.cpp b/src/training/gen_unigram.cpp
new file mode 100644
index 0000000..7e76693
--- /dev/null
+++ b/src/training/gen_unigram.cpp
@@ -0,0 +1,65 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+
+//Adds one to the unigram frequency of every token in a fixed token range,
+//then writes both phrase tables back to disk.
+
+int main(int argc, char * argv[]){
+
+    FacadePhraseIndex phrase_index;
+
+    //gb_char binary file, registered under index 1
+    MemoryChunk * gb_chunk = new MemoryChunk;
+    gb_chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, gb_chunk);
+
+    //gbk_char binary file, registered under index 2
+    MemoryChunk * gbk_chunk = new MemoryChunk;
+    gbk_chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, gbk_chunk);
+
+    //16777217 is (1 << 24) + 1; presumably the first gb_char token --
+    //TODO confirm how tokens encode the table index
+    const size_t first_token = 16777217;
+    const size_t last_token = 16870566;
+    size_t token = first_token;
+    while ( token <= last_token ){
+        phrase_index.add_unigram_frequency(token, 1);
+        ++token;
+    }
+
+#if 0
+    //disabled: the same pass over the range starting at (2 << 24) + 1
+    for ( size_t i = 33554433; i <= 33570193 ; ++i){
+        phrase_index.add_unigram_frequency(i, 1);
+    }
+#endif
+
+    MemoryChunk * saved_chunk = new MemoryChunk;
+    phrase_index.store(1, saved_chunk);
+    saved_chunk->save("../../data/gb_char.bin");
+    phrase_index.load(1, saved_chunk);
+
+    saved_chunk = new MemoryChunk;
+    phrase_index.store(2, saved_chunk);
+    saved_chunk->save("../../data/gbk_char.bin");
+    phrase_index.load(2, saved_chunk);
+
+    return 0;
+}