diff options
author | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
commit | f41d1fdf83408e042ab07925710a8913bad0c27c (patch) | |
tree | 1757833ac4cdd0830834d2f9ef92be07c0bc1a5b /src/storage/pinyin_base.h | |
parent | 34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff) | |
download | libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip |
import from pinyin.
Diffstat (limited to 'src/storage/pinyin_base.h')
-rw-r--r-- | src/storage/pinyin_base.h | 728 |
1 files changed, 728 insertions, 0 deletions
diff --git a/src/storage/pinyin_base.h b/src/storage/pinyin_base.h new file mode 100644 index 0000000..374cc53 --- /dev/null +++ b/src/storage/pinyin_base.h @@ -0,0 +1,728 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2002,2003,2006 James Su + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/** @file pinyin_base.h + * @brief the definitions of pinyin related classes and structs. + */ + +#ifndef PINYIN_BASE_H +#define PINYIN_BASE_H + +#include <glib.h> + +namespace novel{ + +// Predefinition of some classes and structs +struct PinyinKey; + +class PinyinValidator; +class PinyinParser; + +struct PinyinKeyPos{ + int m_pos; + size_t m_len; + PinyinKeyPos(){ + m_pos = 0; + m_len = 0; + } + void set_pos(int pos){ + m_pos = pos; + } + void set_length(size_t len){ + m_len = len; + } + int get_pos(){ + return m_pos; + } + int get_end_pos(){ + return m_pos + m_len; + } + size_t get_length(){ + return m_len; + } +}; + +typedef GArray* PinyinKeyVector; /* Array of PinyinKey */ +typedef GArray* PinyinKeyPosVector; /* Array of PinyinKeyPos */ + + +struct PinyinCustomSettings; + +/** + * @brief enums of pinyin initial element. + * + * A pinyin key can be divided into three tokens: + * Initial -- such as B P M F D T N L etc. + * Final -- such as A O E I U V etc. + * Tone -- can be 1, 2, 3, 4 and 5. + */ +enum PinyinInitial +{ + PINYIN_ZeroInitial = 0, /**< zero initial. indicates invaild initial */ + PINYIN_Bo = 1, + PINYIN_Ci = 2, + PINYIN_Chi = 3, + PINYIN_De = 4, + PINYIN_Fo = 5, + PINYIN_He = 6, + PINYIN_Ge = 7, + PINYIN_Ji = 8, + PINYIN_Ke = 9, + PINYIN_Mo =10, + PINYIN_Ne =11, + PINYIN_Le =12, + PINYIN_Ri =13, + PINYIN_Po =14, + PINYIN_Qi =15, + PINYIN_Si =16, + PINYIN_Shi =17, + PINYIN_Te =18, + PINYIN_Wu =19, + PINYIN_Xi =20, + PINYIN_Yi =21, + PINYIN_Zi =22, + PINYIN_Zhi =23, + PINYIN_LastInitial = PINYIN_Zhi, /**< the last initial */ + PINYIN_Number_Of_Initials = PINYIN_LastInitial + 1 +}; + +/** + * @brief enums of pinyin final element. + */ +enum PinyinFinal +{ + PINYIN_ZeroFinal = 0, /**< zero final. indicates invalid final */ + PINYIN_A = 1, + PINYIN_Ai = 2, + PINYIN_An = 3, + PINYIN_Ang = 4, + PINYIN_Ao = 5, + PINYIN_E = 6, + PINYIN_Ea = 7, + PINYIN_Ei = 8, + PINYIN_En = 9, + PINYIN_Eng =10, + PINYIN_Er =11, + PINYIN_I =12, + PINYIN_Ia =13, + PINYIN_Ian =14, + PINYIN_Iang =15, + PINYIN_Iao =16, + PINYIN_Ie =17, + PINYIN_In =18, + PINYIN_Ing =19, + PINYIN_Iong =20, + PINYIN_Iu =21, + PINYIN_Ng =22, + PINYIN_O =23, + PINYIN_Ong =24, + PINYIN_Ou =25, + PINYIN_U =26, + PINYIN_Ua =27, + PINYIN_Uai =28, + PINYIN_Uan =29, + PINYIN_Uang =30, + PINYIN_Ue =31, + PINYIN_Ueng =32, + PINYIN_Ui =33, + PINYIN_Un =34, + PINYIN_Uo =35, + PINYIN_V =36, + PINYIN_Van =37, + PINYIN_Ve =38, + PINYIN_Vn =39, + PINYIN_LastFinal = PINYIN_Vn, /**< the last final */ + PINYIN_Number_Of_Finals = PINYIN_LastFinal + 1 +}; + +/** + * @brief enums of pinyin tone element. + */ +enum PinyinTone +{ + PINYIN_ZeroTone = 0, /**< zero tone. this will be matched with all other tones. */ + PINYIN_First = 1, + PINYIN_Second = 2, + PINYIN_Third = 3, + PINYIN_Fourth = 4, + PINYIN_Fifth = 5, + PINYIN_LastTone = PINYIN_Fifth, /**< the last tone */ + PINYIN_Number_Of_Tones = PINYIN_LastTone + 1 +}; + +/** + * @brief enums of Shuang Pin Schemes. + */ +enum PinyinShuangPinScheme +{ + SHUANG_PIN_STONE = 0, + SHUANG_PIN_ZRM = 1, + SHUANG_PIN_MS = 2, + SHUANG_PIN_ZIGUANG = 3, + SHUANG_PIN_ABC = 4, + SHUANG_PIN_LIUSHI = 5, + SHUANG_PIN_CUSTOMIZED = 6, + SHUANG_PIN_DEFAULT = SHUANG_PIN_ZRM +}; + +/** + * @brief enums of ZhuYin Schemes. + */ +enum PinyinZhuYinScheme +{ + ZHUYIN_ZHUYIN = 0, + ZHUYIN_STANDARD = 1, + ZHUYIN_HSU = 2, + ZHUYIN_IBM = 3, + ZHUYIN_GIN_YIEH = 4, + ZHUYIN_ET = 5, + ZHUYIN_ET26 = 6, + ZHUYIN_DEFAULT = ZHUYIN_STANDARD +}; + +/** + * @brief enums of pinyin ambiguities. + * + * Some pinyin element maybe confused by somebody, + * We allow these ambiguities. + */ +enum PinyinAmbiguity +{ + PINYIN_AmbAny= 0, + PINYIN_AmbZhiZi, + PINYIN_AmbChiCi, + PINYIN_AmbShiSi, + PINYIN_AmbNeLe, + PINYIN_AmbLeRi, + PINYIN_AmbFoHe, + PINYIN_AmbAnAng, + PINYIN_AmbEnEng, + PINYIN_AmbInIng, + PINYIN_AmbLast = PINYIN_AmbInIng +}; + +/** + * @brief Structure to hold pinyin custom settings. + * + * user can custom the behavor of libpinyin by these settings. + */ +struct PinyinCustomSettings +{ + bool use_incomplete; + /**< allow incomplete pinyin key which only has inital. */ + + bool use_ambiguities [PINYIN_AmbLast + 1]; + /**< allow ambiguous pinyin elements or not. */ + + PinyinCustomSettings (); + + void set_use_incomplete (bool use) { use_incomplete = use; } + void set_use_ambiguities (PinyinAmbiguity amb, bool use) + { + if (amb == PINYIN_AmbAny) + for (size_t i=0; i<=PINYIN_AmbLast; ++i) use_ambiguities [i] = use; + else { + use_ambiguities [0] = false; + use_ambiguities [static_cast<size_t>(amb)] = use; + for (size_t i=1; i<=PINYIN_AmbLast; ++i) + if (use_ambiguities [i]) { + use_ambiguities [0] = true; + break; + } + } + } + + bool operator == (const PinyinCustomSettings &rhs) const + { + if (use_incomplete != rhs.use_incomplete) + return false; + + for (size_t i=0; i <= PINYIN_AmbLast; ++i) + if (use_ambiguities [i] != rhs.use_ambiguities [i]) + return false; + + return true; + } + + bool operator != (const PinyinCustomSettings &rhs) const + { + return !(*this == rhs); + } + + guint32 to_value () const + { + guint32 val = 0; + + if (use_incomplete) val |= 1; + + for (size_t i=0; i <= PINYIN_AmbLast; ++i) + if (use_ambiguities [i]) + val |= (1 << (i+1)); + + return val; + } + + void from_value (guint32 val) + { + use_incomplete = ((val & 1) != 0); + + for (size_t i=0; i <= PINYIN_AmbLast; ++i) + use_ambiguities [i] = ((val & (1 << (i+1))) != 0); + } +}; + +/** + * @brief Pinyin key class. + * + * A pinyin key is a composed element of an initial, a final and a tone, + * which represents one or several Chinese ideographs + * + * The position and length information for the portion of string, from which + * the PinyinKey is parsed, are also stored in this structure. + */ +struct PinyinKey +{ + friend class PinyinBitmapIndexLevel; + friend inline int pinyin_exact_compare(const PinyinKey key_lhs[], + const PinyinKey key_rhs[], + int word_length); + friend inline int pinyin_compare_with_ambiguities + (const PinyinCustomSettings &custom, + const PinyinKey* key_lhs, + const PinyinKey* key_rhs, + int word_length); + friend inline void compute_lower_value(const PinyinCustomSettings &custom, + PinyinKey in_keys[], + PinyinKey out_keys[], + int word_length); + friend inline void compute_upper_value(const PinyinCustomSettings &custom, + PinyinKey in_keys[], + PinyinKey out_keys[], + int word_length); + +private: + guint16 m_initial : 5; /**< pinyin initial */ + guint16 m_final : 6; /**< pinyin final */ + guint16 m_tone : 3; /**< pinyin tone */ +public: + /** + * @brief Minimal numerical value of a PinyinKey + * @sa get_value(); + */ + static const guint16 min_value; + + /** + * @brief Maximal numerical value of a PinyinKey + * @sa get_value(); + */ + static const guint16 max_value; + +public: + /** + * Constructor. + * + * The default constructor of class PinyinKey. + */ + PinyinKey (PinyinInitial initial = PINYIN_ZeroInitial, + PinyinFinal final = PINYIN_ZeroFinal, + PinyinTone tone = PINYIN_ZeroTone) + : m_initial (initial), m_final (final), m_tone (tone) + { + } + + /** + * Constructor. + * + * Construct a PinyinKey object from a key string, with + * specified validator. + * + * @sa PinyinValidator + */ + PinyinKey (const PinyinValidator &validator, const char *str, int len = -1) + { + set (validator, str, len); + } + + PinyinKey (guint16 value) + { + set (value); + } + /** + * Clear the PinyinKey object. + */ + + void clear () + { + m_initial = PINYIN_ZeroInitial; + m_final = PINYIN_ZeroFinal; + m_tone = PINYIN_ZeroTone; + } + + /** + * Read PinyinKey value from a key string. + * + * @param validator a PinyinValidator object to validate the key. + * @param key a Latin string including one or more pinyin keys. + * @return the number of characters used by this pinyin key. + */ + int set (const PinyinValidator &validator, const char *str, int len = -1); + + /** + * Set PinyinKey's value to initial, final and tone. + */ + void set (PinyinInitial initial = PINYIN_ZeroInitial, + PinyinFinal final = PINYIN_ZeroFinal, + PinyinTone tone = PINYIN_ZeroTone) + { + m_initial = initial; + m_final = final; + m_tone = tone; + } + + /** + * @brief Set this PinyinKey from its numerical value. + */ + void set (guint16 value) + { + m_tone = value % PINYIN_Number_Of_Tones; + value /= PINYIN_Number_Of_Tones; + m_final = value % PINYIN_Number_Of_Finals; + m_initial = value / PINYIN_Number_Of_Finals; + } + + /** + * @brief Get numerical value of this PinyinKey + */ + guint16 get_value () const + { + return (m_initial * PINYIN_Number_Of_Finals + m_final) * PINYIN_Number_Of_Tones + m_tone; + } + + /** + * Set PinyinKey's initial value to initial. + */ + void set_initial (PinyinInitial initial = PINYIN_ZeroInitial) + { + m_initial = initial; + } + + /** + * Set PinyinKey's final value to final. + */ + void set_final (PinyinFinal final = PINYIN_ZeroFinal) + { + m_final = final; + } + + /** + * Set PinyinKey's tone value to tone. + */ + void set_tone (PinyinTone tone = PINYIN_ZeroTone) + { + m_tone = tone; + } + + /** + * Get initial value of this key. + */ + PinyinInitial get_initial () const + { + return static_cast<PinyinInitial>(m_initial); + } + + /** + * Get final value of this key. + */ + PinyinFinal get_final () const + { + return static_cast<PinyinFinal>(m_final); + } + + /** + * Get tone value of this key. + */ + PinyinTone get_tone () const + { + return static_cast<PinyinTone>(m_tone); + } + + /** + * Get Latin name of this key's initial. + */ + const char* get_initial_string () const; + + /** + * Get Chinese ZhuYin name of this key's initial, in UTF-8 encoding. + */ + const char* get_initial_zhuyin_string () const; + + /** + * Get Latin name of this key's final. + */ + const char* get_final_string () const; + + /** + * Get Chinese ZhuYin name of this key's final, in UTF-8 encoding. + */ + const char* get_final_zhuyin_string () const; + + /** + * Get Latin name of this key's tone. + */ + const char* get_tone_string () const; + + /** + * Get Chinese ZhuYin name of this key's tone, in UTF-8 encoding. + */ + const char* get_tone_zhuyin_string () const; + + /** + * Get Latin name of this key. + */ + const char * get_key_string () const; + + /** + * Get Chinese ZhuYin name of this key, in UTF-8 encoding. + */ + const char * get_key_zhuyin_string () const; + + /** + * Check if this key is empty. + */ + bool is_empty () const + { + return m_initial == PINYIN_ZeroInitial && m_final == PINYIN_ZeroFinal && m_tone == PINYIN_ZeroTone; + } + + /** + * Check if this key has both initial, final and tone. + */ + bool is_complete () const + { + return m_initial != PINYIN_ZeroInitial && m_final != PINYIN_ZeroFinal && m_tone != PINYIN_ZeroTone; + } + + bool operator == (PinyinKey rhs) const + { + return m_initial == rhs.m_initial && m_final == rhs.m_final && m_tone == rhs.m_tone; + } + + bool operator != (PinyinKey rhs) const + { + return m_initial != rhs.m_initial || m_final != rhs.m_final || m_tone != rhs.m_tone; + } + + bool operator < (PinyinKey rhs) const + { + if (m_initial < rhs.m_initial) return true; + if (m_initial > rhs.m_initial) return false; + if (m_final < rhs.m_final) return true; + if (m_final > rhs.m_final) return false; + return m_tone < rhs.m_tone; + } + + bool operator > (PinyinKey rhs) const + { + if (m_initial > rhs.m_initial) return true; + if (m_initial < rhs.m_initial) return false; + if (m_final > rhs.m_final) return true; + if (m_final < rhs.m_final) return false; + return m_tone > rhs.m_tone; + } +}; + +/** + * NULL Validator of PinyinKey object. + * + * This class is for validating a PinyinKey object. + */ +class PinyinValidator +{ +public: + /** + * Overloaded operator () function to validate a pinyin key. + * + * @param key The key to be validated. + * @return true if the key is valid. + */ + virtual bool operator () (PinyinKey key) const = 0; +}; + +class PinyinLargeTable; +/** + * Validator of PinyinKey object. + * + * This class is for validating a PinyinKey object. + */ +class BitmapPinyinValidator:public PinyinValidator +{ + char m_bitmap [(PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 7) / 8]; + +public: + BitmapPinyinValidator (const PinyinLargeTable *table = 0); + + /** + * initialize the validator with specified custom settings + * and PinyinLargeTable. + */ + void initialize (const PinyinLargeTable *table = 0); + + /** + * Overloaded operator () function to validate a pinyin key. + * + * @param key The key to be validated. + * @return true if the key is valid. + */ + virtual bool operator () (PinyinKey key) const; +}; + +/** + * NULL Validator of PinyinKey object. + * + * This class is for validating a PinyinKey object. + */ +class NullPinyinValidator:public PinyinValidator +{ +public: + /** + * Overloaded operator () function to validate a pinyin key. + * + * @param key The key to be validated. + * @return true if the key is valid. + */ + virtual bool operator () (PinyinKey key) const{ + return true; + } +}; + +/** + * @brief Class to translate string into PinyinKey. + */ +class PinyinParser +{ +public: + virtual ~PinyinParser (); + + /** + * @brief Translate only one PinyinKey from a string. + * + * @param validator PinyinValidator object to valid result. + * @param key Stores result PinyinKey. + * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string, + * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme, + * it's an UTF-8 string which contains ZhuYin chars. + * @param len The length of str, in number of chars rather than bytes. + * + * @return the number of chars were actually used. + */ + virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const = 0; + + /** + * @brief Handy wrapper function of parse_one_key(), which accept a String object instead of char *. + */ + int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char * &str) const + { + return parse_one_key (validator, key, str, g_utf8_strlen (str, -1)); + } + + /** + * @brief Translate the source string into a set of PinyinKeys. + * + * @param validator PinyinValidator object to valid result. + * @param keys Stores result PinyinKeys. + * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string, + * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme, + * it's an UTF-8 string which contains ZhuYin chars. + * @param len The length of str, in number of chars rather than bytes. + * + * @return the number of chars were actually used. + */ + virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys,PinyinKeyPosVector & poses, const char *str, int len = -1) const = 0; + +public: + static void normalize (PinyinKey &key); +}; + +/** + * The default Pinyin Parser which parses full pinyin string into PinyinKeys. + */ +class PinyinDefaultParser : public PinyinParser +{ +public: + virtual ~PinyinDefaultParser (); + + virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const; + virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const; + +public: + using PinyinParser::parse_one_key; + using PinyinParser::parse; +}; + +/* The valid input chars of ShuangPin is a-z and ';' + */ +class PinyinShuangPinParser : public PinyinParser +{ + PinyinInitial m_initial_map [27]; + PinyinFinal m_final_map [27][2]; + +public: + /** + * Constructor + * + * @param scheme the predefined ShuangPin scheme to be used. + */ + PinyinShuangPinParser (PinyinShuangPinScheme scheme = SHUANG_PIN_DEFAULT); + PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]); + + virtual ~PinyinShuangPinParser (); + + virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const; + virtual int parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const; + +public: + void set_scheme (PinyinShuangPinScheme scheme); + void set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]); + + void get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2]); + +public: + using PinyinParser::parse_one_key; + using PinyinParser::parse; +}; + +int pinyin_compare_initial (const PinyinCustomSettings &custom, + PinyinInitial lhs, + PinyinInitial rhs); + +int pinyin_compare_final (const PinyinCustomSettings &custom, + PinyinFinal lhs, + PinyinFinal rhs); + +int pinyin_compare_tone (const PinyinCustomSettings &custom, + PinyinTone lhs, + PinyinTone rhs); +}; + +using namespace novel; + +#endif |