summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-08-31 15:28:04 +0800
committerPeng Wu <alexepico@gmail.com>2011-08-31 15:28:04 +0800
commit35c9c51a81780340924cb0edef4b77e719f15b92 (patch)
tree82836eef7c0f2308530e0e4921aeb08c208c5c16 /src
parent420ffb6d69189f5e0c4e80e7f829cde61fccab4f (diff)
downloadlibpinyin-35c9c51a81780340924cb0edef4b77e719f15b92.tar.gz
libpinyin-35c9c51a81780340924cb0edef4b77e719f15b92.tar.xz
libpinyin-35c9c51a81780340924cb0edef4b77e719f15b92.zip
add chewing parser
Diffstat (limited to 'src')
-rw-r--r--src/storage/pinyin_base.cpp296
-rw-r--r--src/storage/pinyin_base.h52
2 files changed, 346 insertions, 2 deletions
diff --git a/src/storage/pinyin_base.cpp b/src/storage/pinyin_base.cpp
index f23c4cf..467bdb3 100644
--- a/src/storage/pinyin_base.cpp
+++ b/src/storage/pinyin_base.cpp
@@ -755,7 +755,7 @@ PinyinParser::normalize (PinyinKey &key)
{
static const PinyinReplaceRulePair rules [] =
{
-#if 0
+#if 1
{PINYIN_ZeroInitial, PINYIN_I, PINYIN_Yi, PINYIN_I},
{PINYIN_ZeroInitial, PINYIN_Ia, PINYIN_Yi, PINYIN_A},
{PINYIN_ZeroInitial, PINYIN_Ian, PINYIN_Yi, PINYIN_An},
@@ -1135,7 +1135,7 @@ PinyinDefaultParser::parse (const PinyinValidator &validator, PinyinKeyVector &
DefaultParserCache cache = g_array_new (FALSE, TRUE, sizeof (DefaultParserCacheElement));
g_array_set_size(cache, len);
- for ( size_t index = 0 ; index < len ; index++){
+ for ( int index = 0 ; index < len ; index++){
DefaultParserCacheElement * element =
&g_array_index(cache,DefaultParserCacheElement, index);
*element = elm;
@@ -1352,6 +1352,298 @@ PinyinShuangPinParser::get_scheme (PinyinInitial initial_map[27], PinyinFinal fi
}
}
+PinyinZhuYinParser::PinyinZhuYinParser (PinyinZhuYinScheme scheme)
+ : m_scheme (scheme)
+{
+}
+
+PinyinZhuYinParser::~PinyinZhuYinParser ()
+{
+}
+
+int
+PinyinZhuYinParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const
+{
+ PinyinKey candkeys[4][3];
+ gunichar ch;
+
+ if (len < 0) len = g_utf8_strlen (str, -1);
+
+ for (int i= 0; i < 4 && i < len; ++i) {
+ ch = g_utf8_get_char (str);
+ if (!get_keys (candkeys[i], ch))
+ break;
+ str = g_utf8_next_char (str);
+ }
+
+ return pack_keys (key, validator, candkeys);
+}
+
+int
+PinyinZhuYinParser::parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len) const
+{
+ g_array_set_size(keys, 0);
+ g_array_set_size(poses, 0);
+
+ if (!str || !len || ! (*str)) return 0;
+
+ int used_len = 0;
+
+ PinyinKey key;
+ PinyinKeyPos pos;
+
+ if (len < 0) len = g_utf8_strlen (str, -1);
+
+ while (used_len < len) {
+ if (g_utf8_get_char (str) == ' ') {
+ ++used_len;
+ str = g_utf8_next_char (str);
+ continue;
+ }
+
+ int one_len = parse_one_key (validator, key, str, len);
+
+ if (one_len) {
+ pos.set_pos (used_len);
+ pos.set_length (one_len);
+ g_array_append_val (keys, key);
+ g_array_append_val (poses, pos);
+ } else {
+ break;
+ }
+
+ /* utf8 next n chars. */
+ for ( int i = 0; i < one_len; ++i ) {
+ str = g_utf8_next_char (str);
+ }
+ used_len += one_len;
+ }
+
+ return used_len;
+}
+
+void
+PinyinZhuYinParser::set_scheme (PinyinZhuYinScheme scheme)
+{
+ m_scheme = scheme;
+}
+
+PinyinZhuYinScheme
+PinyinZhuYinParser::get_scheme () const
+{
+ return m_scheme;
+}
+
+bool
+PinyinZhuYinParser::get_keys (PinyinKey keys[], gunichar ch) const
+{
+ if (m_scheme == ZHUYIN_ZHUYIN) {
+ if (ch == 0x20 || ch == 0x02C9) keys [0].set_tone (PINYIN_First);
+ else if (ch == 0x02CA) keys [0].set_tone (PINYIN_Second);
+ else if (ch == 0x02C7) keys [0].set_tone (PINYIN_Third);
+ else if (ch == 0x02CB) keys [0].set_tone (PINYIN_Fourth);
+ else if (ch == 0x02D9) keys [0].set_tone (PINYIN_Fifth);
+ else if (ch >= 0x3105 && ch <= 0x3129) {
+ keys[0] = __zhuyin_zhuyin_map[ch - 0x3105][0];
+ keys[1] = __zhuyin_zhuyin_map[ch - 0x3105][1];
+ keys[2] = __zhuyin_zhuyin_map[ch - 0x3105][2];
+ }
+ } else if (ch >= 0x20 && ch <= 0x7D) {
+ keys[0] = __zhuyin_maps[m_scheme][ch - 0x20][0];
+ keys[1] = __zhuyin_maps[m_scheme][ch - 0x20][1];
+ keys[2] = __zhuyin_maps[m_scheme][ch - 0x20][2];
+ } else {
+ keys[0].clear ();
+ keys[1].clear ();
+ keys[2].clear ();
+ }
+
+ return !keys[0].is_empty ();
+}
+
+struct ZhuYinFinalReplaceRulePair
+{
+ PinyinFinal final1;
+ PinyinFinal final2;
+ PinyinFinal new_final;
+};
+
+class ZhuYinFinalReplaceRulePairLessThan
+{
+public:
+ bool operator () (const ZhuYinFinalReplaceRulePair &lhs, const ZhuYinFinalReplaceRulePair &rhs) const {
+ if (lhs.final1 < rhs.final1) return true;
+ if (lhs.final1 > rhs.final1) return false;
+ return lhs.final2 < rhs.final2;
+ }
+};
+
+int
+PinyinZhuYinParser::pack_keys (PinyinKey &key, const PinyinValidator &validator, const PinyinKey keys[][3]) const
+{
+ static const ZhuYinFinalReplaceRulePair final_rules [] =
+ {
+ {PINYIN_I, PINYIN_A, PINYIN_Ia},
+ {PINYIN_I, PINYIN_An, PINYIN_Ian},
+ {PINYIN_I, PINYIN_Ang, PINYIN_Iang},
+ {PINYIN_I, PINYIN_Ao, PINYIN_Iao},
+ {PINYIN_I, PINYIN_Ea, PINYIN_Ie},
+ {PINYIN_I, PINYIN_En, PINYIN_In},
+ {PINYIN_I, PINYIN_Eng, PINYIN_Ing},
+ {PINYIN_I, PINYIN_O, PINYIN_I},
+ {PINYIN_I, PINYIN_Ou, PINYIN_Iu},
+ {PINYIN_U, PINYIN_A, PINYIN_Ua},
+ {PINYIN_U, PINYIN_Ai, PINYIN_Uai},
+ {PINYIN_U, PINYIN_An, PINYIN_Uan},
+ {PINYIN_U, PINYIN_Ang, PINYIN_Uang},
+ {PINYIN_U, PINYIN_Ei, PINYIN_Ui},
+ {PINYIN_U, PINYIN_En, PINYIN_Un},
+ {PINYIN_U, PINYIN_Eng, PINYIN_Ueng},
+ {PINYIN_U, PINYIN_O, PINYIN_Uo},
+ {PINYIN_V, PINYIN_An, PINYIN_Van},
+ {PINYIN_V, PINYIN_Ea, PINYIN_Ve},
+ {PINYIN_V, PINYIN_En, PINYIN_Vn},
+ {PINYIN_V, PINYIN_Eng, PINYIN_Iong}
+ };
+
+ static const ZhuYinFinalReplaceRulePair *final_rules_start = final_rules;
+ static const ZhuYinFinalReplaceRulePair *final_rules_end = final_rules + sizeof(final_rules)/sizeof(ZhuYinFinalReplaceRulePair);
+
+ PinyinInitial initial;
+ PinyinFinal final1;
+ PinyinFinal final2;
+ PinyinTone tone;
+
+ PinyinKey best_key;
+ int best_used_keys = 0;
+ int best_score = -1;
+ bool best_key_valid = false;
+
+ size_t num;
+ size_t size [4];
+ size_t possibles [4];
+
+ for (num=0; !keys[num][0].is_empty () && num<4; ++num) {
+ for (size[num]=0; !keys[num][size[num]].is_empty () && size[num]<3; ++size[num]);
+
+ possibles[num] = (num > 0 ? possibles[num-1] : 1) * size[num];
+ }
+
+ while (num) {
+ for (size_t i=0; i<possibles[num-1]; ++i) {
+ size_t n = i;
+ int score = 1;
+ int used_keys = 0;
+
+ initial = PINYIN_ZeroInitial;
+ final1 = final2 = PINYIN_ZeroFinal;
+ tone = PINYIN_ZeroTone;
+
+ for (size_t t=0; t<num; ++t) {
+ size_t idx = n % size[t];
+ n /= size[t];
+
+ if (keys[t][idx].get_initial () && !initial) {
+ initial = keys[t][idx].get_initial ();
+ if (final1) score = 0;
+ } else if (keys[t][idx].get_final () && !(final1 && final2)) {
+ if (!final1) final1 = keys[t][idx].get_final ();
+ else if (!final2) final2 = keys[t][idx].get_final ();
+ } else if (keys[t][idx].get_tone () && !tone) {
+ tone = keys[t][idx].get_tone ();
+ } else {
+ break;
+ }
+
+ used_keys = t+1;
+
+ // No initial and final allowed after tone key.
+ if (tone) break;
+ }
+
+ // A better candidate has been found.
+ if (best_score > score)
+ continue;
+
+ // Is it possible?
+ if (!initial && !final1 && !final2)
+ continue;
+
+ if (final1 && final2) {
+ if (final2 == PINYIN_I || final2 == PINYIN_U || final2 == PINYIN_V)
+ std_lite::swap (final1, final2);
+
+ // Invalid finals.
+ if (final1 != PINYIN_I && final1 != PINYIN_U && final1 != PINYIN_V)
+ continue;
+
+ // In such case, there must be no initial,
+ // otherwise it's illegal.
+ if (final1 == PINYIN_I && final2 == PINYIN_O) {
+ if (!initial) {
+ initial = PINYIN_Yi;
+ final1 = PINYIN_O;
+ final2 = PINYIN_ZeroFinal;
+ } else {
+ continue;
+ }
+ } else {
+ ZhuYinFinalReplaceRulePair fp;
+ fp.final1 = final1;
+ fp.final2 = final2;
+
+ const ZhuYinFinalReplaceRulePair *p =
+ std_lite::lower_bound (final_rules_start, final_rules_end, fp, ZhuYinFinalReplaceRulePairLessThan ());
+
+ // It's invalid that got two finals but they are not in our rules
+ if (p != final_rules_end && p->final1 == fp.final1 && p->final2 == fp.final2)
+ final1 = p->new_final;
+ else
+ continue;
+
+ if (final1 == PINYIN_Ueng && initial)
+ final1 = PINYIN_Ong;
+ }
+ } else if ((initial == PINYIN_Zhi || initial == PINYIN_Chi || initial == PINYIN_Shi ||
+ initial == PINYIN_Zi || initial == PINYIN_Ci || initial == PINYIN_Si ||
+ initial == PINYIN_Ri) && !final1) {
+ final1 = PINYIN_I;
+ }
+
+ key.set (initial, final1, tone);
+ PinyinParser::normalize (key);
+
+ bool key_valid;
+ if (best_score < score ||
+ (best_score == score &&
+ (best_used_keys < used_keys ||
+ ((key_valid = validator (key)) && !best_key_valid)))) {
+
+ best_key = key;
+ best_used_keys = used_keys;
+ best_score = score;
+ best_key_valid = key_valid;
+
+ // Break loop if a valid key with tone has been found.
+ if (key_valid && final1 && tone) {
+ num = 0;
+ break;
+ }
+ }
+ }
+
+ if (num > (size_t)best_used_keys)
+ num = best_used_keys;
+ else
+ break;
+ }
+
+ // CAUTION: The best key maybe not a valid key
+ key = best_key;
+ // pos.set_length (best_used_keys);
+ return best_used_keys;
+}
+
namespace pinyin{
//////////////////////////////////////////////////////////////////////////////
diff --git a/src/storage/pinyin_base.h b/src/storage/pinyin_base.h
index c093d60..163c9eb 100644
--- a/src/storage/pinyin_base.h
+++ b/src/storage/pinyin_base.h
@@ -618,6 +618,57 @@ public:
using PinyinParser::parse;
};
+/**
+ * @brief Class to parse ZhuYin input string
+ *
+ * Several keyboard scheme are supported:
+ * * ZHUYIN_ZHUYIN Parse original ZhuYin string, such as ㄅㄧㄢ
+ * * ZHUYIN_STANDARD Standard ZhuYin keyboard, which maps 1 to Bo(ㄅ), q to Po(ㄆ) etc.
+ * * ZHUYIN_HSU Hsu ZhuYin keyboard, which uses a-z (except q) chars.
+ * * ZHUYIN_IBM IBM ZhuYin keyboard, which maps 1 to Bo(ㄅ), 2 to Po(ㄆ) etc.
+ * * ZHUYIN_GIN_YIEH Gin-Yieh ZhuYin keyboard.
+ * * ZHUYIN_ET Eten (倚天) ZhuYin keyboard.
+ * * ZHUYIN_ET26 Eten (倚天) ZhuYin keyboard, which only uses a-z chars.
+ *
+ * In order to enable upper-level input method to display intermediate inputted string in ZhuYin chars,
+ * ZhuYin parser may return invalid keys, so that PinyinKey::get_key_zhuyin_string() can be called for
+ * each of these keys to get the intermediate inputted ZhuYin string.
+ *
+ * UTF-8 string is used in ZhuYin Parser, because the requirement of supporting original ZhuYin strings.
+ * So that the length of inputted string is calculated in number of utf8 chars instead of bytes.
+ */
+class PinyinZhuYinParser : public PinyinParser
+{
+ PinyinZhuYinScheme m_scheme;
+
+public:
+ /**
+ * Constructor
+ *
+ * @param scheme the predefined ZhuYIn scheme to be used.
+ */
+ PinyinZhuYinParser (PinyinZhuYinScheme scheme = ZHUYIN_DEFAULT);
+
+ virtual ~PinyinZhuYinParser ();
+
+ virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const;
+ virtual int parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const;
+
+public:
+ void set_scheme (PinyinZhuYinScheme scheme);
+ PinyinZhuYinScheme get_scheme () const;
+
+private:
+ bool get_keys (PinyinKey keys[], gunichar ch) const;
+
+ int pack_keys (PinyinKey &key, const PinyinValidator &validator, const PinyinKey keys[][3]) const;
+
+public:
+ using PinyinParser::parse_one_key;
+ using PinyinParser::parse;
+};
+
+
int pinyin_compare_initial (const PinyinCustomSettings &custom,
PinyinInitial lhs,
PinyinInitial rhs);
@@ -629,6 +680,7 @@ int pinyin_compare_final (const PinyinCustomSettings &custom,
int pinyin_compare_tone (const PinyinCustomSettings &custom,
PinyinTone lhs,
PinyinTone rhs);
+
};
#endif