diff options
author | Peng Wu <alexepico@gmail.com> | 2011-08-31 15:28:04 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-08-31 15:28:04 +0800 |
commit | 35c9c51a81780340924cb0edef4b77e719f15b92 (patch) | |
tree | 82836eef7c0f2308530e0e4921aeb08c208c5c16 /src | |
parent | 420ffb6d69189f5e0c4e80e7f829cde61fccab4f (diff) | |
download | libpinyin-35c9c51a81780340924cb0edef4b77e719f15b92.tar.gz libpinyin-35c9c51a81780340924cb0edef4b77e719f15b92.tar.xz libpinyin-35c9c51a81780340924cb0edef4b77e719f15b92.zip |
add chewing parser
Diffstat (limited to 'src')
-rw-r--r-- | src/storage/pinyin_base.cpp | 347 | ||||
-rw-r--r-- | src/storage/pinyin_base.h | 52 |
2 files changed, 397 insertions, 2 deletions
diff --git a/src/storage/pinyin_base.cpp b/src/storage/pinyin_base.cpp
index f23c4cf..467bdb3 100644
--- a/src/storage/pinyin_base.cpp
+++ b/src/storage/pinyin_base.cpp
@@ -755,7 +755,7 @@ PinyinParser::normalize (PinyinKey &key)
 {
     static const PinyinReplaceRulePair rules [] =
     {
-#if 0
+#if 1
         {PINYIN_ZeroInitial, PINYIN_I, PINYIN_Yi, PINYIN_I},
         {PINYIN_ZeroInitial, PINYIN_Ia, PINYIN_Yi, PINYIN_A},
         {PINYIN_ZeroInitial, PINYIN_Ian, PINYIN_Yi, PINYIN_An},
@@ -1135,7 +1135,7 @@ PinyinDefaultParser::parse (const PinyinValidator &validator, PinyinKeyVector &
 
     DefaultParserCache cache = g_array_new (FALSE, TRUE, sizeof (DefaultParserCacheElement));
     g_array_set_size(cache, len);
-    for ( size_t index = 0 ; index < len ; index++){
+    for ( int index = 0 ; index < len ; index++){
         DefaultParserCacheElement * element =
             &g_array_index(cache,DefaultParserCacheElement, index);
         *element = elm;
@@ -1352,6 +1352,349 @@ PinyinShuangPinParser::get_scheme (PinyinInitial initial_map[27], PinyinFinal fi
     }
 }
 
+PinyinZhuYinParser::PinyinZhuYinParser (PinyinZhuYinScheme scheme)
+    : m_scheme (scheme)
+{
+}
+
+PinyinZhuYinParser::~PinyinZhuYinParser ()
+{
+}
+
+/**
+ * Parse one key from the head of a UTF-8 string.
+ *
+ * At most four chars (and never more than len) are mapped to candidate
+ * keys with get_keys (), stopping at the first char that yields no key;
+ * pack_keys () then chooses the result among the candidates.
+ *
+ * @param validator passed on to pack_keys ().
+ * @param key receives the key chosen by pack_keys ().
+ * @param str the UTF-8 string to parse.
+ * @param len length of str in UTF-8 chars; measured with g_utf8_strlen () if negative.
+ * @return the value returned by pack_keys ().
+ */
+int
+PinyinZhuYinParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const
+{
+    PinyinKey candkeys[4][3];
+    gunichar ch;
+
+    if (len < 0) len = g_utf8_strlen (str, -1);
+
+    for (int i= 0; i < 4 && i < len; ++i) {
+        ch = g_utf8_get_char (str);
+        if (!get_keys (candkeys[i], ch))
+            break;
+        str = g_utf8_next_char (str);
+    }
+
+    return pack_keys (key, validator, candkeys);
+}
+
+/**
+ * Parse a UTF-8 string into a sequence of keys.
+ *
+ * Spaces in front of a key are skipped but counted as used. Parsing stops
+ * at the first position where parse_one_key () returns 0.
+ *
+ * @param validator passed on to parse_one_key ().
+ * @param keys emptied first, then receives the parsed keys.
+ * @param poses emptied first, then receives the position and length of each key.
+ * @param str the UTF-8 string to parse.
+ * @param len length of str in UTF-8 chars; measured with g_utf8_strlen () if negative.
+ * @return the number of UTF-8 chars used, skipped spaces included.
+ */
+int
+PinyinZhuYinParser::parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len) const
+{
+    g_array_set_size(keys, 0);
+    g_array_set_size(poses, 0);
+
+    if (!str || !len || ! (*str)) return 0;
+
+    int used_len = 0;
+
+    PinyinKey key;
+    PinyinKeyPos pos;
+
+    if (len < 0) len = g_utf8_strlen (str, -1);
+
+    while (used_len < len) {
+        if (g_utf8_get_char (str) == ' ') {
+            ++used_len;
+            str = g_utf8_next_char (str);
+            continue;
+        }
+
+        /* Pass only what is left of the input, not the total length. */
+        int one_len = parse_one_key (validator, key, str, len - used_len);
+
+        if (one_len) {
+            pos.set_pos (used_len);
+            pos.set_length (one_len);
+            g_array_append_val (keys, key);
+            g_array_append_val (poses, pos);
+        } else {
+            break;
+        }
+
+        /* utf8 next n chars. */
+        for ( int i = 0; i < one_len; ++i ) {
+            str = g_utf8_next_char (str);
+        }
+        used_len += one_len;
+    }
+
+    return used_len;
+}
+
+/** Set the ZhuYin scheme that get_keys () looks chars up in. */
+void
+PinyinZhuYinParser::set_scheme (PinyinZhuYinScheme scheme)
+{
+    m_scheme = scheme;
+}
+
+/** @return the ZhuYin scheme currently in use. */
+PinyinZhuYinScheme
+PinyinZhuYinParser::get_scheme () const
+{
+    return m_scheme;
+}
+
+/**
+ * Look up the candidate keys of one input char under the current scheme.
+ *
+ * @param keys array of at least three keys. It is left untouched when the
+ *             scheme is ZHUYIN_ZHUYIN and ch is neither one of the tone
+ *             chars handled below nor within U+3105..U+3129.
+ * @param ch the input char.
+ * @return true if keys[0] is not empty afterwards.
+ */
+bool
+PinyinZhuYinParser::get_keys (PinyinKey keys[], gunichar ch) const
+{
+    if (m_scheme == ZHUYIN_ZHUYIN) {
+        if (ch == 0x20 || ch == 0x02C9) keys [0].set_tone (PINYIN_First);
+        else if (ch == 0x02CA) keys [0].set_tone (PINYIN_Second);
+        else if (ch == 0x02C7) keys [0].set_tone (PINYIN_Third);
+        else if (ch == 0x02CB) keys [0].set_tone (PINYIN_Fourth);
+        else if (ch == 0x02D9) keys [0].set_tone (PINYIN_Fifth);
+        else if (ch >= 0x3105 && ch <= 0x3129) {
+            keys[0] = __zhuyin_zhuyin_map[ch - 0x3105][0];
+            keys[1] = __zhuyin_zhuyin_map[ch - 0x3105][1];
+            keys[2] = __zhuyin_zhuyin_map[ch - 0x3105][2];
+        }
+    } else if (ch >= 0x20 && ch <= 0x7D) {
+        keys[0] = __zhuyin_maps[m_scheme][ch - 0x20][0];
+        keys[1] = __zhuyin_maps[m_scheme][ch - 0x20][1];
+        keys[2] = __zhuyin_maps[m_scheme][ch - 0x20][2];
+    } else {
+        keys[0].clear ();
+        keys[1].clear ();
+        keys[2].clear ();
+    }
+
+    return !keys[0].is_empty ();
+}
+
+/** A pair of finals and the single final that replaces them. */
+struct ZhuYinFinalReplaceRulePair
+{
+    PinyinFinal final1;
+    PinyinFinal final2;
+    PinyinFinal new_final;
+};
+
+/** Orders rules by final1, then final2, as needed to search the rule table. */
+class ZhuYinFinalReplaceRulePairLessThan
+{
+public:
+    bool operator () (const ZhuYinFinalReplaceRulePair &lhs, const ZhuYinFinalReplaceRulePair &rhs) const {
+        if (lhs.final1 < rhs.final1) return true;
+        if (lhs.final1 > rhs.final1) return false;
+        return lhs.final2 < rhs.final2;
+    }
+};
+
+/**
+ * Choose the best key that can be built from up to four rows of candidates.
+ *
+ * @param key receives the best key found, which may fail the validator.
+ * @param validator breaks ties in favour of keys it accepts.
+ * @param keys candidate rows as filled in by get_keys (); an empty first
+ *             element ends the rows, an empty element ends a row.
+ * @return the number of rows used by the best key.
+ */
+int
+PinyinZhuYinParser::pack_keys (PinyinKey &key, const PinyinValidator &validator, const PinyinKey keys[][3]) const
+{
+    static const ZhuYinFinalReplaceRulePair final_rules [] =
+    {
+        {PINYIN_I, PINYIN_A, PINYIN_Ia},
+        {PINYIN_I, PINYIN_An, PINYIN_Ian},
+        {PINYIN_I, PINYIN_Ang, PINYIN_Iang},
+        {PINYIN_I, PINYIN_Ao, PINYIN_Iao},
+        {PINYIN_I, PINYIN_Ea, PINYIN_Ie},
+        {PINYIN_I, PINYIN_En, PINYIN_In},
+        {PINYIN_I, PINYIN_Eng, PINYIN_Ing},
+        {PINYIN_I, PINYIN_O, PINYIN_I},
+        {PINYIN_I, PINYIN_Ou, PINYIN_Iu},
+        {PINYIN_U, PINYIN_A, PINYIN_Ua},
+        {PINYIN_U, PINYIN_Ai, PINYIN_Uai},
+        {PINYIN_U, PINYIN_An, PINYIN_Uan},
+        {PINYIN_U, PINYIN_Ang, PINYIN_Uang},
+        {PINYIN_U, PINYIN_Ei, PINYIN_Ui},
+        {PINYIN_U, PINYIN_En, PINYIN_Un},
+        {PINYIN_U, PINYIN_Eng, PINYIN_Ueng},
+        {PINYIN_U, PINYIN_O, PINYIN_Uo},
+        {PINYIN_V, PINYIN_An, PINYIN_Van},
+        {PINYIN_V, PINYIN_Ea, PINYIN_Ve},
+        {PINYIN_V, PINYIN_En, PINYIN_Vn},
+        {PINYIN_V, PINYIN_Eng, PINYIN_Iong}
+    };
+
+    static const ZhuYinFinalReplaceRulePair *final_rules_start = final_rules;
+    static const ZhuYinFinalReplaceRulePair *final_rules_end = final_rules + sizeof(final_rules)/sizeof(ZhuYinFinalReplaceRulePair);
+
+    PinyinInitial initial;
+    PinyinFinal final1;
+    PinyinFinal final2;
+    PinyinTone tone;
+
+    PinyinKey best_key;
+    int best_used_keys = 0;
+    int best_score = -1;
+    bool best_key_valid = false;
+
+    size_t num;
+    size_t size [4];
+    size_t possibles [4];
+
+    // Test the index first so that a full table is never read past its end.
+    for (num=0; num<4 && !keys[num][0].is_empty (); ++num) {
+        for (size[num]=0; size[num]<3 && !keys[num][size[num]].is_empty (); ++size[num]);
+
+        possibles[num] = (num > 0 ? possibles[num-1] : 1) * size[num];
+    }
+
+    while (num) {
+        for (size_t i=0; i<possibles[num-1]; ++i) {
+            size_t n = i;
+            int score = 1;
+            int used_keys = 0;
+
+            initial = PINYIN_ZeroInitial;
+            final1 = final2 = PINYIN_ZeroFinal;
+            tone = PINYIN_ZeroTone;
+
+            for (size_t t=0; t<num; ++t) {
+                size_t idx = n % size[t];
+                n /= size[t];
+
+                if (keys[t][idx].get_initial () && !initial) {
+                    initial = keys[t][idx].get_initial ();
+                    if (final1) score = 0;
+                } else if (keys[t][idx].get_final () && !(final1 && final2)) {
+                    if (!final1) final1 = keys[t][idx].get_final ();
+                    else if (!final2) final2 = keys[t][idx].get_final ();
+                } else if (keys[t][idx].get_tone () && !tone) {
+                    tone = keys[t][idx].get_tone ();
+                } else {
+                    break;
+                }
+
+                used_keys = t+1;
+
+                // No initial and final allowed after tone key.
+                if (tone) break;
+            }
+
+            // A better candidate has been found.
+            if (best_score > score)
+                continue;
+
+            // Is it possible?
+            if (!initial && !final1 && !final2)
+                continue;
+
+            if (final1 && final2) {
+                if (final2 == PINYIN_I || final2 == PINYIN_U || final2 == PINYIN_V)
+                    std_lite::swap (final1, final2);
+
+                // Invalid finals.
+                if (final1 != PINYIN_I && final1 != PINYIN_U && final1 != PINYIN_V)
+                    continue;
+
+                // In such case, there must be no initial,
+                // otherwise it's illegal.
+                if (final1 == PINYIN_I && final2 == PINYIN_O) {
+                    if (!initial) {
+                        initial = PINYIN_Yi;
+                        final1 = PINYIN_O;
+                        final2 = PINYIN_ZeroFinal;
+                    } else {
+                        continue;
+                    }
+                } else {
+                    ZhuYinFinalReplaceRulePair fp;
+                    fp.final1 = final1;
+                    fp.final2 = final2;
+
+                    const ZhuYinFinalReplaceRulePair *p =
+                        std_lite::lower_bound (final_rules_start, final_rules_end, fp, ZhuYinFinalReplaceRulePairLessThan ());
+
+                    // It's invalid that got two finals but they are not in our rules
+                    if (p != final_rules_end && p->final1 == fp.final1 && p->final2 == fp.final2)
+                        final1 = p->new_final;
+                    else
+                        continue;
+
+                    if (final1 == PINYIN_Ueng && initial)
+                        final1 = PINYIN_Ong;
+                }
+            } else if ((initial == PINYIN_Zhi || initial == PINYIN_Chi || initial == PINYIN_Shi ||
+                        initial == PINYIN_Zi || initial == PINYIN_Ci || initial == PINYIN_Si ||
+                        initial == PINYIN_Ri) && !final1) {
+                final1 = PINYIN_I;
+            }
+
+            key.set (initial, final1, tone);
+            PinyinParser::normalize (key);
+
+            // Validate up front: key_valid is read on every path below.
+            const bool key_valid = validator (key);
+            if (best_score < score ||
+                (best_score == score &&
+                 (best_used_keys < used_keys ||
+                  (key_valid && !best_key_valid)))) {
+
+                best_key = key;
+                best_used_keys = used_keys;
+                best_score = score;
+                best_key_valid = key_valid;
+
+                // Break loop if a valid key with tone has been found.
+                if (key_valid && final1 && tone) {
+                    num = 0;
+                    break;
+                }
+            }
+        }
+
+        if (num > (size_t)best_used_keys)
+            num = best_used_keys;
+        else
+            break;
+    }
+
+    // CAUTION: The best key may not be a valid key
+    key = best_key;
+    // pos.set_length (best_used_keys);
+    return best_used_keys;
+}
+
 namespace pinyin{
 
 //////////////////////////////////////////////////////////////////////////////
diff --git a/src/storage/pinyin_base.h b/src/storage/pinyin_base.h
index c093d60..163c9eb 100644
--- a/src/storage/pinyin_base.h
+++ b/src/storage/pinyin_base.h
@@ -618,6 +618,57 @@ public:
     using PinyinParser::parse;
 };
 
+/**
+ * @brief Class to parse ZhuYin input string
+ *
+ * Several keyboard schemes are supported:
+ * * ZHUYIN_ZHUYIN Parse original ZhuYin string, such as ㄅㄧㄢ
+ * * ZHUYIN_STANDARD Standard ZhuYin keyboard, which maps 1 to Bo(ㄅ), q to Po(ㄆ) etc.
+ * * ZHUYIN_HSU Hsu ZhuYin keyboard, which uses a-z (except q) chars.
+ * * ZHUYIN_IBM IBM ZhuYin keyboard, which maps 1 to Bo(ㄅ), 2 to Po(ㄆ) etc.
+ * * ZHUYIN_GIN_YIEH Gin-Yieh ZhuYin keyboard.
+ * * ZHUYIN_ET Eten (倚天) ZhuYin keyboard.
+ * * ZHUYIN_ET26 Eten (倚天) ZhuYin keyboard, which only uses a-z chars.
+ *
+ * In order to enable upper-level input method to display intermediate inputted string in ZhuYin chars,
+ * ZhuYin parser may return invalid keys, so that PinyinKey::get_key_zhuyin_string() can be called for
+ * each of these keys to get the intermediate inputted ZhuYin string.
+ *
+ * UTF-8 string is used in ZhuYin Parser, because of the requirement to support original ZhuYin strings.
+ * Therefore the length of the inputted string is counted in utf8 chars instead of bytes.
+ */
+class PinyinZhuYinParser : public PinyinParser
+{
+    PinyinZhuYinScheme m_scheme;
+
+public:
+    /**
+     * Constructor
+     *
+     * @param scheme the predefined ZhuYin scheme to be used.
+     */
+    PinyinZhuYinParser (PinyinZhuYinScheme scheme = ZHUYIN_DEFAULT);
+
+    virtual ~PinyinZhuYinParser ();
+
+    virtual int parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) const;
+    virtual int parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len = -1) const;
+
+public:
+    void set_scheme (PinyinZhuYinScheme scheme);
+    PinyinZhuYinScheme get_scheme () const;
+
+private:
+    bool get_keys (PinyinKey keys[], gunichar ch) const;
+
+    int pack_keys (PinyinKey &key, const PinyinValidator &validator, const PinyinKey keys[][3]) const;
+
+public:
+    using PinyinParser::parse_one_key;
+    using PinyinParser::parse;
+};
+
+
 int pinyin_compare_initial (const PinyinCustomSettings &custom,
                             PinyinInitial lhs,
                             PinyinInitial rhs);
@@ -629,6 +680,7 @@ int pinyin_compare_final (const PinyinCustomSettings &custom,
 int pinyin_compare_tone (const PinyinCustomSettings &custom,
                          PinyinTone lhs,
                          PinyinTone rhs);
+
 };
 
 #endif