summaryrefslogtreecommitdiffstats
path: root/src/storage/pinyin_base.h
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2010-08-03 10:42:47 +0800
committerPeng Wu <alexepico@gmail.com>2010-08-03 10:42:47 +0800
commitf41d1fdf83408e042ab07925710a8913bad0c27c (patch)
tree1757833ac4cdd0830834d2f9ef92be07c0bc1a5b /src/storage/pinyin_base.h
parent34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff)
downloadlibpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz
libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz
libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip
import from pinyin.
Diffstat (limited to 'src/storage/pinyin_base.h')
-rw-r--r--src/storage/pinyin_base.h728
1 files changed, 728 insertions, 0 deletions
diff --git a/src/storage/pinyin_base.h b/src/storage/pinyin_base.h
new file mode 100644
index 0000000..374cc53
--- /dev/null
+++ b/src/storage/pinyin_base.h
@@ -0,0 +1,728 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2002,2003,2006 James Su
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/** @file pinyin_base.h
+ * @brief the definitions of pinyin related classes and structs.
+ */
+
+#ifndef PINYIN_BASE_H
+#define PINYIN_BASE_H
+
+#include <glib.h>
+
+namespace novel{
+
+// Predefinition of some classes and structs
+struct PinyinKey;
+
+class PinyinValidator;
+class PinyinParser;
+
+struct PinyinKeyPos{
+ int m_pos;
+ size_t m_len;
+ PinyinKeyPos(){
+ m_pos = 0;
+ m_len = 0;
+ }
+ void set_pos(int pos){
+ m_pos = pos;
+ }
+ void set_length(size_t len){
+ m_len = len;
+ }
+ int get_pos(){
+ return m_pos;
+ }
+ int get_end_pos(){
+ return m_pos + m_len;
+ }
+ size_t get_length(){
+ return m_len;
+ }
+};
+
+typedef GArray* PinyinKeyVector; /* Array of PinyinKey */
+typedef GArray* PinyinKeyPosVector; /* Array of PinyinKeyPos */
+
+
+struct PinyinCustomSettings;
+
+/**
+ * @brief enums of pinyin initial element.
+ *
+ * A pinyin key can be divided into three tokens:
+ * Initial -- such as B P M F D T N L etc.
+ * Final -- such as A O E I U V etc.
+ * Tone -- can be 1, 2, 3, 4 and 5.
+ */
+enum PinyinInitial
+{
+ PINYIN_ZeroInitial = 0, /**< zero initial. indicates invaild initial */
+ PINYIN_Bo = 1,
+ PINYIN_Ci = 2,
+ PINYIN_Chi = 3,
+ PINYIN_De = 4,
+ PINYIN_Fo = 5,
+ PINYIN_He = 6,
+ PINYIN_Ge = 7,
+ PINYIN_Ji = 8,
+ PINYIN_Ke = 9,
+ PINYIN_Mo =10,
+ PINYIN_Ne =11,
+ PINYIN_Le =12,
+ PINYIN_Ri =13,
+ PINYIN_Po =14,
+ PINYIN_Qi =15,
+ PINYIN_Si =16,
+ PINYIN_Shi =17,
+ PINYIN_Te =18,
+ PINYIN_Wu =19,
+ PINYIN_Xi =20,
+ PINYIN_Yi =21,
+ PINYIN_Zi =22,
+ PINYIN_Zhi =23,
+ PINYIN_LastInitial = PINYIN_Zhi, /**< the last initial */
+ PINYIN_Number_Of_Initials = PINYIN_LastInitial + 1
+};
+
+/**
+ * @brief enums of pinyin final element.
+ */
+enum PinyinFinal
+{
+ PINYIN_ZeroFinal = 0, /**< zero final. indicates invalid final */
+ PINYIN_A = 1,
+ PINYIN_Ai = 2,
+ PINYIN_An = 3,
+ PINYIN_Ang = 4,
+ PINYIN_Ao = 5,
+ PINYIN_E = 6,
+ PINYIN_Ea = 7,
+ PINYIN_Ei = 8,
+ PINYIN_En = 9,
+ PINYIN_Eng =10,
+ PINYIN_Er =11,
+ PINYIN_I =12,
+ PINYIN_Ia =13,
+ PINYIN_Ian =14,
+ PINYIN_Iang =15,
+ PINYIN_Iao =16,
+ PINYIN_Ie =17,
+ PINYIN_In =18,
+ PINYIN_Ing =19,
+ PINYIN_Iong =20,
+ PINYIN_Iu =21,
+ PINYIN_Ng =22,
+ PINYIN_O =23,
+ PINYIN_Ong =24,
+ PINYIN_Ou =25,
+ PINYIN_U =26,
+ PINYIN_Ua =27,
+ PINYIN_Uai =28,
+ PINYIN_Uan =29,
+ PINYIN_Uang =30,
+ PINYIN_Ue =31,
+ PINYIN_Ueng =32,
+ PINYIN_Ui =33,
+ PINYIN_Un =34,
+ PINYIN_Uo =35,
+ PINYIN_V =36,
+ PINYIN_Van =37,
+ PINYIN_Ve =38,
+ PINYIN_Vn =39,
+ PINYIN_LastFinal = PINYIN_Vn, /**< the last final */
+ PINYIN_Number_Of_Finals = PINYIN_LastFinal + 1
+};
+
+/**
+ * @brief enums of pinyin tone element.
+ */
+enum PinyinTone
+{
+ PINYIN_ZeroTone = 0, /**< zero tone. this will be matched with all other tones. */
+ PINYIN_First = 1,
+ PINYIN_Second = 2,
+ PINYIN_Third = 3,
+ PINYIN_Fourth = 4,
+ PINYIN_Fifth = 5,
+ PINYIN_LastTone = PINYIN_Fifth, /**< the last tone */
+ PINYIN_Number_Of_Tones = PINYIN_LastTone + 1
+};
+
+/**
+ * @brief enums of Shuang Pin Schemes.
+ */
+enum PinyinShuangPinScheme
+{
+ SHUANG_PIN_STONE = 0,
+ SHUANG_PIN_ZRM = 1,
+ SHUANG_PIN_MS = 2,
+ SHUANG_PIN_ZIGUANG = 3,
+ SHUANG_PIN_ABC = 4,
+ SHUANG_PIN_LIUSHI = 5,
+ SHUANG_PIN_CUSTOMIZED = 6,
+ SHUANG_PIN_DEFAULT = SHUANG_PIN_ZRM
+};
+
+/**
+ * @brief enums of ZhuYin Schemes.
+ */
+enum PinyinZhuYinScheme
+{
+ ZHUYIN_ZHUYIN = 0,
+ ZHUYIN_STANDARD = 1,
+ ZHUYIN_HSU = 2,
+ ZHUYIN_IBM = 3,
+ ZHUYIN_GIN_YIEH = 4,
+ ZHUYIN_ET = 5,
+ ZHUYIN_ET26 = 6,
+ ZHUYIN_DEFAULT = ZHUYIN_STANDARD
+};
+
+/**
+ * @brief enums of pinyin ambiguities.
+ *
+ * Some pinyin element maybe confused by somebody,
+ * We allow these ambiguities.
+ */
+enum PinyinAmbiguity
+{
+ PINYIN_AmbAny= 0,
+ PINYIN_AmbZhiZi,
+ PINYIN_AmbChiCi,
+ PINYIN_AmbShiSi,
+ PINYIN_AmbNeLe,
+ PINYIN_AmbLeRi,
+ PINYIN_AmbFoHe,
+ PINYIN_AmbAnAng,
+ PINYIN_AmbEnEng,
+ PINYIN_AmbInIng,
+ PINYIN_AmbLast = PINYIN_AmbInIng
+};
+
+/**
+ * @brief Structure to hold pinyin custom settings.
+ *
+ * user can custom the behavor of libpinyin by these settings.
+ */
+struct PinyinCustomSettings
+{
+ bool use_incomplete;
+ /**< allow incomplete pinyin key which only has inital. */
+
+ bool use_ambiguities [PINYIN_AmbLast + 1];
+ /**< allow ambiguous pinyin elements or not. */
+
+ PinyinCustomSettings ();
+
+ void set_use_incomplete (bool use) { use_incomplete = use; }
+ void set_use_ambiguities (PinyinAmbiguity amb, bool use)
+ {
+ if (amb == PINYIN_AmbAny)
+ for (size_t i=0; i<=PINYIN_AmbLast; ++i) use_ambiguities [i] = use;
+ else {
+ use_ambiguities [0] = false;
+ use_ambiguities [static_cast<size_t>(amb)] = use;
+ for (size_t i=1; i<=PINYIN_AmbLast; ++i)
+ if (use_ambiguities [i]) {
+ use_ambiguities [0] = true;
+ break;
+ }
+ }
+ }
+
+ bool operator == (const PinyinCustomSettings &rhs) const
+ {
+ if (use_incomplete != rhs.use_incomplete)
+ return false;
+
+ for (size_t i=0; i <= PINYIN_AmbLast; ++i)
+ if (use_ambiguities [i] != rhs.use_ambiguities [i])
+ return false;
+
+ return true;
+ }
+
+ bool operator != (const PinyinCustomSettings &rhs) const
+ {
+ return !(*this == rhs);
+ }
+
+ guint32 to_value () const
+ {
+ guint32 val = 0;
+
+ if (use_incomplete) val |= 1;
+
+ for (size_t i=0; i <= PINYIN_AmbLast; ++i)
+ if (use_ambiguities [i])
+ val |= (1 << (i+1));
+
+ return val;
+ }
+
+ void from_value (guint32 val)
+ {
+ use_incomplete = ((val & 1) != 0);
+
+ for (size_t i=0; i <= PINYIN_AmbLast; ++i)
+ use_ambiguities [i] = ((val & (1 << (i+1))) != 0);
+ }
+};
+
+/**
+ * @brief Pinyin key class.
+ *
+ * A pinyin key is a composed element of an initial, a final and a tone,
+ * which represents one or several Chinese ideographs
+ *
+ * The position and length information for the portion of string, from which
+ * the PinyinKey is parsed, are also stored in this structure.
+ */
+struct PinyinKey
+{
+ friend class PinyinBitmapIndexLevel;
+ friend inline int pinyin_exact_compare(const PinyinKey key_lhs[],
+ const PinyinKey key_rhs[],
+ int word_length);
+ friend inline int pinyin_compare_with_ambiguities
+ (const PinyinCustomSettings &custom,
+ const PinyinKey* key_lhs,
+ const PinyinKey* key_rhs,
+ int word_length);
+ friend inline void compute_lower_value(const PinyinCustomSettings &custom,
+ PinyinKey in_keys[],
+ PinyinKey out_keys[],
+ int word_length);
+ friend inline void compute_upper_value(const PinyinCustomSettings &custom,
+ PinyinKey in_keys[],
+ PinyinKey out_keys[],
+ int word_length);
+
+private:
+ guint16 m_initial : 5; /**< pinyin initial */
+ guint16 m_final : 6; /**< pinyin final */
+ guint16 m_tone : 3; /**< pinyin tone */
+public:
+ /**
+ * @brief Minimal numerical value of a PinyinKey
+ * @sa get_value();
+ */
+ static const guint16 min_value;
+
+ /**
+ * @brief Maximal numerical value of a PinyinKey
+ * @sa get_value();
+ */
+ static const guint16 max_value;
+
+public:
+ /**
+ * Constructor.
+ *
+ * The default constructor of class PinyinKey.
+ */
+ PinyinKey (PinyinInitial initial = PINYIN_ZeroInitial,
+ PinyinFinal final = PINYIN_ZeroFinal,
+ PinyinTone tone = PINYIN_ZeroTone)
+ : m_initial (initial), m_final (final), m_tone (tone)
+ {
+ }
+
+ /**
+ * Constructor.
+ *
+ * Construct a PinyinKey object from a key string, with
+ * specified validator.
+ *
+ * @sa PinyinValidator
+ */
+ PinyinKey (const PinyinValidator &validator, const char *str, int len = -1)
+ {
+ set (validator, str, len);
+ }
+
+ PinyinKey (guint16 value)
+ {
+ set (value);
+ }
+ /**
+ * Clear the PinyinKey object.
+ */
+
+ void clear ()
+ {
+ m_initial = PINYIN_ZeroInitial;
+ m_final = PINYIN_ZeroFinal;
+ m_tone = PINYIN_ZeroTone;
+ }
+
+ /**
+ * Read PinyinKey value from a key string.
+ *
+ * @param validator a PinyinValidator object to validate the key.
+ * @param key a Latin string including one or more pinyin keys.
+ * @return the number of characters used by this pinyin key.
+ */
+ int set (const PinyinValidator &validator, const char *str, int len = -1);
+
+ /**
+ * Set PinyinKey's value to initial, final and tone.
+ */
+ void set (PinyinInitial initial = PINYIN_ZeroInitial,
+ PinyinFinal final = PINYIN_ZeroFinal,
+ PinyinTone tone = PINYIN_ZeroTone)
+ {
+ m_initial = initial;
+ m_final = final;
+ m_tone = tone;
+ }
+
+ /**
+ * @brief Set this PinyinKey from its numerical value.
+ */
+ void set (guint16 value)
+ {
+ m_tone = value % PINYIN_Number_Of_Tones;
+ value /= PINYIN_Number_Of_Tones;
+ m_final = value % PINYIN_Number_Of_Finals;
+ m_initial = value / PINYIN_Number_Of_Finals;
+ }
+
+ /**
+ * @brief Get numerical value of this PinyinKey
+ */
+ guint16 get_value () const
+ {
+ return (m_initial * PINYIN_Number_Of_Finals + m_final) * PINYIN_Number_Of_Tones + m_tone;
+ }
+
+ /**
+ * Set PinyinKey's initial value to initial.
+ */
+ void set_initial (PinyinInitial initial = PINYIN_ZeroInitial)
+ {
+ m_initial = initial;
+ }
+
+ /**
+ * Set PinyinKey's final value to final.
+ */
+ void set_final (PinyinFinal final = PINYIN_ZeroFinal)
+ {
+ m_final = final;
+ }
+
+ /**
+ * Set PinyinKey's tone value to tone.
+ */
+ void set_tone (PinyinTone tone = PINYIN_ZeroTone)
+ {
+ m_tone = tone;
+ }
+
+ /**
+ * Get initial value of this key.
+ */
+ PinyinInitial get_initial () const
+ {
+ return static_cast<PinyinInitial>(m_initial);
+ }
+
+ /**
+ * Get final value of this key.
+ */
+ PinyinFinal get_final () const
+ {
+ return static_cast<PinyinFinal>(m_final);
+ }
+
+ /**
+ * Get tone value of this key.
+ */
+ PinyinTone get_tone () const
+ {
+ return static_cast<PinyinTone>(m_tone);
+ }
+
+ /**
+ * Get Latin name of this key's initial.
+ */
+ const char* get_initial_string () const;
+
+ /**
+ * Get Chinese ZhuYin name of this key's initial, in UTF-8 encoding.
+ */
+ const char* get_initial_zhuyin_string () const;
+
+ /**
+ * Get Latin name of this key's final.
+ */
+ const char* get_final_string () const;
+
+ /**
+ * Get Chinese ZhuYin name of this key's final, in UTF-8 encoding.
+ */
+ const char* get_final_zhuyin_string () const;
+
+ /**
+ * Get Latin name of this key's tone.
+ */
+ const char* get_tone_string () const;
+
+ /**
+ * Get Chinese ZhuYin name of this key's tone, in UTF-8 encoding.
+ */
+ const char* get_tone_zhuyin_string () const;
+
+ /**
+ * Get Latin name of this key.
+ */
+ const char * get_key_string () const;
+
+ /**
+ * Get Chinese ZhuYin name of this key, in UTF-8 encoding.
+ */
+ const char * get_key_zhuyin_string () const;
+
+ /**
+ * Check if this key is empty.
+ */
+ bool is_empty () const
+ {
+ return m_initial == PINYIN_ZeroInitial && m_final == PINYIN_ZeroFinal && m_tone == PINYIN_ZeroTone;
+ }
+
+ /**
+ * Check if this key has both initial, final and tone.
+ */
+ bool is_complete () const
+ {
+ return m_initial != PINYIN_ZeroInitial && m_final != PINYIN_ZeroFinal && m_tone != PINYIN_ZeroTone;
+ }
+
+ bool operator == (PinyinKey rhs) const
+ {
+ return m_initial == rhs.m_initial && m_final == rhs.m_final && m_tone == rhs.m_tone;
+ }
+
+ bool operator != (PinyinKey rhs) const
+ {
+ return m_initial != rhs.m_initial || m_final != rhs.m_final || m_tone != rhs.m_tone;
+ }
+
+ bool operator < (PinyinKey rhs) const
+ {
+ if (m_initial < rhs.m_initial) return true;
+ if (m_initial > rhs.m_initial) return false;
+ if (m_final < rhs.m_final) return true;
+ if (m_final > rhs.m_final) return false;
+ return m_tone < rhs.m_tone;
+ }
+
+ bool operator > (PinyinKey rhs) const
+ {
+ if (m_initial > rhs.m_initial) return true;
+ if (m_initial < rhs.m_initial) return false;
+ if (m_final > rhs.m_final) return true;
+ if (m_final < rhs.m_final) return false;
+ return m_tone > rhs.m_tone;
+ }
+};
+
+/**
+ * NULL Validator of PinyinKey object.
+ *
+ * This class is for validating a PinyinKey object.
+ */
+class PinyinValidator
+{
+public:
+ /**
+ * Overloaded operator () function to validate a pinyin key.
+ *
+ * @param key The key to be validated.
+ * @return true if the key is valid.
+ */
+ virtual bool operator () (PinyinKey key) const = 0;
+};
+
+class PinyinLargeTable;
+/**
+ * Validator of PinyinKey object.
+ *
+ * This class is for validating a PinyinKey object.
+ */
+class BitmapPinyinValidator:public PinyinValidator
+{
+ char m_bitmap [(PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones + 7) / 8];
+
+public:
+ BitmapPinyinValidator (const PinyinLargeTable *table = 0);
+
+ /**
+ * initialize the validator with specified custom settings
+ * and PinyinLargeTable.
+ */
+ void initialize (const PinyinLargeTable *table = 0);
+
+ /**
+ * Overloaded operator () function to validate a pinyin key.
+ *
+ * @param key The key to be validated.
+ * @return true if the key is valid.
+ */
+ virtual bool operator () (PinyinKey key) const;
+};
+
+/**
+ * NULL Validator of PinyinKey object.
+ *
+ * This class is for validating a PinyinKey object.
+ */
+class NullPinyinValidator:public PinyinValidator
+{
+public:
+ /**
+ * Overloaded operator () function to validate a pinyin key.
+ *
+ * @param key The key to be validated.
+ * @return true if the key is valid.
+ */
+ virtual bool operator () (PinyinKey key) const{
+ return true;
+ }
+};
+
+/**
+ * @brief Class to translate string into PinyinKey.
+ */
+class PinyinParser
+{
+public:
+ virtual ~PinyinParser ();
+
+ /**
+ * @brief Translate only one PinyinKey from a string.
+ *
+ * @param validator PinyinValidator object to valid result.
+ * @param key Stores result PinyinKey.
+ * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string,
+ * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme,
+ * it's an UTF-8 string which contains ZhuYin chars.
+ * @param len The length of str, in number of chars rather than bytes.
+ *
+ * @return the number of chars were actually used.
+ */
+ virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const = 0;
+
+ /**
+ * @brief Handy wrapper function of parse_one_key(), which accept a String object instead of char *.
+ */
+ int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char * &str) const
+ {
+ return parse_one_key (validator, key, str, g_utf8_strlen (str, -1));
+ }
+
+ /**
+ * @brief Translate the source string into a set of PinyinKeys.
+ *
+ * @param validator PinyinValidator object to valid result.
+ * @param keys Stores result PinyinKeys.
+ * @param str Input string in UTF-8 encoding, in most case this string is just a plain ASCII string,
+ * but for ZhuYin Parser works in ZHUYIN_ZHUYIN scheme,
+ * it's an UTF-8 string which contains ZhuYin chars.
+ * @param len The length of str, in number of chars rather than bytes.
+ *
+ * @return the number of chars were actually used.
+ */
+ virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys,PinyinKeyPosVector & poses, const char *str, int len = -1) const = 0;
+
+public:
+ static void normalize (PinyinKey &key);
+};
+
+/**
+ * The default Pinyin Parser which parses full pinyin string into PinyinKeys.
+ */
+class PinyinDefaultParser : public PinyinParser
+{
+public:
+ virtual ~PinyinDefaultParser ();
+
+ virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const;
+ virtual int parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const;
+
+public:
+ using PinyinParser::parse_one_key;
+ using PinyinParser::parse;
+};
+
+/* The valid input chars of ShuangPin is a-z and ';'
+ */
+class PinyinShuangPinParser : public PinyinParser
+{
+ PinyinInitial m_initial_map [27];
+ PinyinFinal m_final_map [27][2];
+
+public:
+ /**
+ * Constructor
+ *
+ * @param scheme the predefined ShuangPin scheme to be used.
+ */
+ PinyinShuangPinParser (PinyinShuangPinScheme scheme = SHUANG_PIN_DEFAULT);
+ PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]);
+
+ virtual ~PinyinShuangPinParser ();
+
+ virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const;
+ virtual int parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const;
+
+public:
+ void set_scheme (PinyinShuangPinScheme scheme);
+ void set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]);
+
+ void get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2]);
+
+public:
+ using PinyinParser::parse_one_key;
+ using PinyinParser::parse;
+};
+
+int pinyin_compare_initial (const PinyinCustomSettings &custom,
+ PinyinInitial lhs,
+ PinyinInitial rhs);
+
+int pinyin_compare_final (const PinyinCustomSettings &custom,
+ PinyinFinal lhs,
+ PinyinFinal rhs);
+
+int pinyin_compare_tone (const PinyinCustomSettings &custom,
+ PinyinTone lhs,
+ PinyinTone rhs);
+};
+
+using namespace novel;
+
+#endif