/* * libpinyin * Library to deal with pinyin. * * Copyright (C) 2002,2003,2006 James Su * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /** @file pinyin_base.h * @brief the definitions of pinyin related classes and structs. */ #ifndef PINYIN_BASE_H #define PINYIN_BASE_H #include #include namespace pinyin{ // Predefinition of some classes and structs struct PinyinKey; class PinyinValidator; class PinyinParser; struct PinyinKeyPos{ int m_pos; size_t m_len; PinyinKeyPos(){ m_pos = 0; m_len = 0; } void set_pos(int pos){ m_pos = pos; } void set_length(size_t len){ m_len = len; } int get_pos(){ return m_pos; } int get_end_pos(){ return m_pos + m_len; } size_t get_length(){ return m_len; } }; typedef GArray* PinyinKeyVector; /* Array of PinyinKey */ typedef GArray* PinyinKeyPosVector; /* Array of PinyinKeyPos */ struct PinyinCustomSettings; /** * @brief enums of pinyin initial element. * * A pinyin key can be divided into three tokens: * Initial -- such as B P M F D T N L etc. * Final -- such as A O E I U V etc. * Tone -- can be 1, 2, 3, 4 and 5. */ enum PinyinInitial { PINYIN_ZeroInitial = 0, /**< zero initial. indicates invaild initial */ PINYIN_Bo = 1, PINYIN_Ci = 2, PINYIN_Chi = 3, PINYIN_De = 4, PINYIN_Fo = 5, PINYIN_He = 6, PINYIN_Ge = 7, PINYIN_Ke = 8, PINYIN_Ji = 9, PINYIN_Mo =10, PINYIN_Ne =11, PINYIN_Le =12, PINYIN_Ri =13, PINYIN_Po =14, PINYIN_Qi =15, PINYIN_Si =16, PINYIN_Shi =17, PINYIN_Te =18, PINYIN_Wu =19, PINYIN_Xi =20, PINYIN_Yi =21, PINYIN_Zi =22, PINYIN_Zhi =23, PINYIN_LastInitial = PINYIN_Zhi, /**< the last initial */ PINYIN_Number_Of_Initials = PINYIN_LastInitial + 1 }; /** * @brief enums of pinyin final element. */ enum PinyinFinal { PINYIN_ZeroFinal = 0, /**< zero final. indicates invalid final */ PINYIN_A = 1, PINYIN_Ai = 2, PINYIN_An = 3, PINYIN_Ang = 4, PINYIN_Ao = 5, PINYIN_E = 6, PINYIN_Ea = 7, PINYIN_Ei = 8, PINYIN_En = 9, PINYIN_Eng =10, PINYIN_Er =11, PINYIN_I =12, PINYIN_Ia =13, PINYIN_Ian =14, PINYIN_Iang =15, PINYIN_Iao =16, PINYIN_Ie =17, PINYIN_In =18, PINYIN_Ing =19, PINYIN_Iong =20, PINYIN_Iu =21, PINYIN_Ng =22, PINYIN_O =23, PINYIN_Ong =24, PINYIN_Ou =25, PINYIN_U =26, PINYIN_Ua =27, PINYIN_Uai =28, PINYIN_Uan =29, PINYIN_Uang =30, PINYIN_Ue =31, PINYIN_Ueng =32, PINYIN_Ui =33, PINYIN_Un =34, PINYIN_Uo =35, PINYIN_V =36, PINYIN_Van =37, PINYIN_Ve =38, PINYIN_Vn =39, PINYIN_LastFinal = PINYIN_Vn, /**< the last final */ PINYIN_Number_Of_Finals = PINYIN_LastFinal + 1 }; /** * @brief enums of pinyin tone element. */ enum PinyinTone { PINYIN_ZeroTone = 0, /**< zero tone. this will be matched with all other tones. */ PINYIN_First = 1, PINYIN_Second = 2, PINYIN_Third = 3, PINYIN_Fourth = 4, PINYIN_Fifth = 5, PINYIN_LastTone = PINYIN_Fifth, /**< the last tone */ PINYIN_Number_Of_Tones = PINYIN_LastTone + 1 }; /** * @brief enums of Shuang Pin Schemes. */ enum PinyinShuangPinScheme { SHUANG_PIN_STONE = 0, SHUANG_PIN_ZRM = 1, SHUANG_PIN_MS = 2, SHUANG_PIN_ZIGUANG = 3, SHUANG_PIN_ABC = 4, SHUANG_PIN_LIUSHI = 5, SHUANG_PIN_CUSTOMIZED = 6, SHUANG_PIN_DEFAULT = SHUANG_PIN_ZRM }; /** * @brief enums of ZhuYin Schemes. */ enum PinyinZhuYinScheme { ZHUYIN_ZHUYIN = 0, ZHUYIN_STANDARD = 1, ZHUYIN_HSU = 2, ZHUYIN_IBM = 3, ZHUYIN_GIN_YIEH = 4, ZHUYIN_ET = 5, ZHUYIN_ET26 = 6, ZHUYIN_DEFAULT = ZHUYIN_STANDARD }; /** * @brief enums of pinyin ambiguities. * * Some pinyin element maybe confused by somebody, * We allow these ambiguities. */ enum PinyinAmbiguity { PINYIN_AmbAny= 0, PINYIN_AmbZhiZi, PINYIN_AmbChiCi, PINYIN_AmbShiSi, PINYIN_AmbNeLe, PINYIN_AmbLeRi, PINYIN_AmbFoHe, PINYIN_AmbGeKe, PINYIN_AmbAnAng, PINYIN_AmbEnEng, PINYIN_AmbInIng, PINYIN_AmbLast = PINYIN_AmbInIng }; /** * @brief Structure to hold pinyin custom settings. * * user can custom the behavor of libpinyin by these settings. */ struct PinyinCustomSettings { bool use_incomplete; /**< allow incomplete pinyin key which only has inital. */ bool use_ambiguities [PINYIN_AmbLast + 1]; /**< allow ambiguous pinyin elements or not. */ PinyinCustomSettings (); void set_use_incomplete (bool use) { use_incomplete = use; } void set_use_ambiguities (PinyinAmbiguity amb, bool use) { if (amb == PINYIN_AmbAny) for (size_t i=0; i<=PINYIN_AmbLast; ++i) use_ambiguities [i] = use; else { use_ambiguities [0] = false; use_ambiguities [static_cast(amb)] = use; for (size_t i=1; i<=PINYIN_AmbLast; ++i) if (use_ambiguities [i]) { use_ambiguities [0] = true; break; } } } bool operator == (const PinyinCustomSettings &rhs) const { if (use_incomplete != rhs.use_incomplete) return false; for (size_t i=0; i <= PINYIN_AmbLast; ++i) if (use_ambiguities [i] != rhs.use_ambiguities [i]) return false; return true; } bool operator != (const PinyinCustomSettings &rhs) const { return !(*this == rhs); } guint32 to_value () const { guint32 val = 0; if (use_incomplete) val |= 1; for (size_t i=0; i <= PINYIN_AmbLast; ++i) if (use_ambiguities [i]) val |= (1 << (i+1)); return val; } void from_value (guint32 val) { use_incomplete = ((val & 1) != 0); for (size_t i=0; i <= PINYIN_AmbLast; ++i) use_ambiguities [i] = ((val & (1 << (i+1))) != 0); } }; /** * @brief Pinyin key class. * * A pinyin key is a composed element of an initial, a final and a tone, * which represents one or several Chinese ideographs * * The position and length information for the portion of string, from which * the PinyinKey is parsed, are also stored in this structure. */ struct PinyinKey { friend class PinyinBitmapIndexLevel; friend inline int pinyin_exact_compare(const PinyinKey key_lhs[], const PinyinKey key_rhs[], int word_length); friend inline int pinyin_compare_with_ambiguities (const PinyinCustomSettings &custom, const PinyinKey* key_lhs, const PinyinKey* key_rhs, int word_length); friend inline void compute_lower_value(const PinyinCustomSettings &custom, PinyinKey in_keys[], PinyinKey out_keys[], int word_length); friend inline void compute_upper_value(const PinyinCustomSettings &custom, PinyinKey in_keys[], PinyinKey out_keys[], int word_length); private: guint16 m_initial : 5; /**< pinyin initial */ guint16 m_final : 6; /**< pinyin final */ guint16 m_tone : 3; /**< pinyin tone */ public: /** * @brief Minimal numerical value of a PinyinKey * @sa get_value(); */ static const guint16 min_value; /** * @brief Maximal numerical value of a PinyinKey * @sa get_value(); */ static const guint16 max_value; public: /** * Constructor. * * The default constructor of class PinyinKey. */ PinyinKey (PinyinInitial initial = PINYIN_ZeroInitial, PinyinFinal final = PINYIN_ZeroFinal, PinyinTone tone = PINYIN_ZeroTone) : m_initial (initial), m_final (final), m_tone (tone) { } /** * Constructor. * * Construct a PinyinKey object from a key string, with * specified validator. * * @sa PinyinValidator */ PinyinKey (const PinyinValidator &validator, const char *str, int len = -1) { set (validator, str, len); } PinyinKey (guint16 value) { set (value); } /** * Clear the PinyinKey object. */ void clear () { m_initial = PINYIN_ZeroInitial; m_final = PINYIN_ZeroFinal; m_tone = PINYIN_ZeroTone; } /** * Read PinyinKey value from a key string. * * @param validator a PinyinValidator object to validate the key. * @param key a Latin string including one or more pinyin keys. * @return the number of characters used by this pinyin key. */ int set (const PinyinValidator &validator, const char *str, int len = -1); /** * Set PinyinKey's value to initial, final and tone. */ void set (PinyinInitial initial = PINYIN_ZeroInitial, PinyinFinal final = PINYIN_ZeroFinal, PinyinTone tone = PINYIN_ZeroTone) { m_initial = initial; m_final = final; m_tone = tone; } /** * @brief Set this PinyinKey from its numerical value. */ void set (guint16 value) { m_tone = value % PINYIN_Number_Of_Tones; value /= PINYIN_Number_Of_Tones; m_final = value % PINYIN_Number_Of_Finals; m_initial = value / PINYIN_Number_Of_Finals; } /** * @brief Get numerical value of this PinyinKey */ guint16 get_value () const { return (m_initial * PINYIN_Number_Of_Finals + m_final) * PINYIN_Number_Of_Tones + m_tone; } /** * Set PinyinKey's initial value to initial. */ void set_initial (PinyinInitial initial = PINYIN_ZeroInitial) { m_initial = initial; } /** * Set PinyinKey's final value to final. */ void set_final (PinyinFinal final = PINYIN_ZeroFinal) { m_final = final; } /** * Set PinyinKey's tone value to tone. */ void set_tone (PinyinTone tone = PINYIN_ZeroTone) { m_tone = tone; } /** * Get initial value of this key. */ PinyinInitial get_initial () const { return static_cast(m_initial); } /** * Get final value of this key. */ PinyinFinal get_final () const { return static_cast(m_final); } /** * Get tone value of this key. */ PinyinTone get_tone () const { return static_cast(m_tone); } /** * Get Latin name of this key's initial. */ const char* get_initial_string () const; /** * Get Chinese ZhuYin name of this key's initial, in UTF-8 encoding. */ const char* get_initial_zhuyin_string () const; /** * Get Latin name of this key's final. */ const char* get_final_string () const; /** * Get Chinese ZhuYin name of this key's final, in UTF-8 encoding. */ const char* get_final_zhuyin_string () const; /** * Get Latin name of this key's tone. */ const char* get_tone_string () const; /** * Get Chinese ZhuYin name of this key's tone, in UTF-8 encoding. */ const char* get_tone_zhuyin_string () const; /** * Get Latin name of this key. */ const char * get_key_string () const; /** * Get Chinese ZhuYin name of this key, in UTF-8 encoding. */ const char * get_key_zhuyin_string () const; /** * Check if this key is empty. */ bool is_empty () const { return m_initial == PINYIN_ZeroInitial && m_final == PINYIN_ZeroFinal && m_tone == PINYIN_ZeroTone; } /** * Check if this key has both initial, final and tone. */ bool is_complete () const { return m_initial != PINYIN_ZeroInitial && m_final != PINYIN_ZeroFinal && m_tone != PINYIN_ZeroTone; } bool operator == (PinyinKey rhs) const { return m_initial == rhs.m_initial && m_final == rhs.m_final && m_tone == rhs.m_tone; } bool operator != (PinyinKey rhs) const { return m_initial != rhs.m_initial || m_final != rhs.m_final || m_tone != rhs.m_tone; } bool operator < (PinyinKey rhs) const { if (m_initial < rhs.m_initial) return true; if (m_initial > rhs.m_initial) return false; if (m_final < rhs.m_final) return true; if (m_final > rhs.m_final) return false; return m_tone < rhs.m_tone; } bool operator > (PinyinKey rhs) const { if (m_initial > rhs.m_initial) return true; if (m_initial < rhs.m_initial) return false; if (m_final > rhs.m_final) return true; if (m_final < rhs.m_final) return false; return m_tone > rhs.m_tone; } }; /** * NULL Validator of PinyinKey object. * * This class is for validating a PinyinKey object. */ class PinyinValidator { public: /** * Overloaded operator () function to validate a pinyin key. * * @param key The key to be validated. * @return true if the key is valid. */ virtual bool operator () (PinyinKey key) const = 0; }; class PinyinLargeTable; /** * Validator of PinyinKey object. * * This class is for validating a PinyinKey object. */ class BitmapPinyinValidator:public PinyinValidator { char m_bitmap [(PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 7) / 8]; public: BitmapPinyinValidator (const PinyinLargeTable *table = 0); /** * initialize the validator with specified custom settings * and PinyinLargeTable. */ void initialize (const PinyinLargeTable *table = 0); /** * Overloaded operator () function to validate a pinyin key. * * @param key The key to be validated. * @return true if the key is valid. */ virtual bool operator () (PinyinKey key) const; }; /** * NULL Validator of PinyinKey object. * * This class is for validating a PinyinKey object. */ class NullPinyinValidator:public PinyinValidator { public: /** * Overloaded operator () function to validate a pinyin key. * * @param key The key to be validated. * @return true if the key is valid. */ virtual bool operator () (PinyinKey key) const{ return true; } }; /** * @brief Class to translate string into PinyinKey. */ class PinyinParser { public: virtual ~PinyinParser (); /** * @brief Translate only one PinyinKey from a string. * * @param validator PinyinValidator object to valid result. * @param key Stores result PinyinKey. * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string, * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme, * it's an UTF-8 string which contains ZhuYin chars. * @param len The length of str, in number of chars rather than bytes. * * @return the number of chars were actually used. */ virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const = 0; /** * @brief Handy wrapper function of parse_one_key(), which accept a String object instead of char *. */ int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char * &str) const { return parse_one_key (validator, key, str, strlen (str)); } /** * @brief Translate the source string into a set of PinyinKeys. * * @param validator PinyinValidator object to valid result. * @param keys Stores result PinyinKeys. * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string, * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme, * it's an UTF-8 string which contains ZhuYin chars. * @param len The length of str, in number of chars rather than bytes. * * @return the number of chars were actually used. */ virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys,PinyinKeyPosVector & poses, const char *str, int len = -1) const = 0; public: static void normalize (PinyinKey &key); }; /** * The default Pinyin Parser which parses full pinyin string into PinyinKeys. */ class PinyinDefaultParser : public PinyinParser { public: virtual ~PinyinDefaultParser (); virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const; virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const; public: using PinyinParser::parse_one_key; using PinyinParser::parse; }; /* The valid input chars of ShuangPin is a-z and ';' */ class PinyinShuangPinParser : public PinyinParser { PinyinInitial m_initial_map [27]; PinyinFinal m_final_map [27][2]; public: /** * Constructor * * @param scheme the predefined ShuangPin scheme to be used. */ PinyinShuangPinParser (PinyinShuangPinScheme scheme = SHUANG_PIN_DEFAULT); PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]); virtual ~PinyinShuangPinParser (); virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const; virtual int parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const; public: void set_scheme (PinyinShuangPinScheme scheme); void set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]); void get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2]); public: using PinyinParser::parse_one_key; using PinyinParser::parse; }; int pinyin_compare_initial (const PinyinCustomSettings &custom, PinyinInitial lhs, PinyinInitial rhs); int pinyin_compare_final (const PinyinCustomSettings &custom, PinyinFinal lhs, PinyinFinal rhs); int pinyin_compare_tone (const PinyinCustomSettings &custom, PinyinTone lhs, PinyinTone rhs); }; #endif