diff options
author | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
commit | f41d1fdf83408e042ab07925710a8913bad0c27c (patch) | |
tree | 1757833ac4cdd0830834d2f9ef92be07c0bc1a5b /src/storage/pinyin_base.cpp | |
parent | 34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff) | |
download | libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip |
import from pinyin.
Diffstat (limited to 'src/storage/pinyin_base.cpp')
-rw-r--r-- | src/storage/pinyin_base.cpp | 1425 |
1 files changed, 1425 insertions, 0 deletions
diff --git a/src/storage/pinyin_base.cpp b/src/storage/pinyin_base.cpp new file mode 100644 index 0000000..cffee3c --- /dev/null +++ b/src/storage/pinyin_base.cpp @@ -0,0 +1,1425 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2002,2003,2006 James Su + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "stl_lite.h" +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "pinyin_large_table.h" + +// Internal data definition + +/** + * struct of pinyin token. + * + * this struct store the informations of a pinyin token + * (an initial or final) + */ +struct PinyinToken +{ + const char *latin; /**< Latin name of the token. */ + const char *zhuyin; /**< Zhuyin name in UTF-8. */ + int latin_len; /**< length of Latin name. */ + int zhuyin_len; /**< length of Chinese name. */ +}; + +/** + * struct to index PinyinToken list. + */ +struct PinyinTokenIndex +{ + int start; + int num; +}; + +static const PinyinToken __pinyin_initials[] = +{ + {"", "", 0, 0}, + {"b", "ㄅ", 1, 1}, + {"c", "ㄘ", 1, 1}, + {"ch","ㄔ", 2, 1}, + {"d", "ㄉ", 1, 1}, + {"f", "ㄈ", 1, 1}, + {"h", "ㄏ", 1, 1}, + {"g", "ㄍ", 1, 1}, + {"j", "ㄐ", 1, 1}, + {"k", "ㄎ", 1, 1}, + {"m", "ㄇ", 1, 1}, + {"n", "ㄋ", 1, 1}, + {"l", "ㄌ", 1, 1}, + {"r", "ㄖ", 1, 1}, + {"p", "ㄆ", 1, 1}, + {"q", "ㄑ", 1, 1}, + {"s", "ㄙ", 1, 1}, + {"sh","ㄕ", 2, 1}, + {"t", "ㄊ", 1, 1}, + {"w", "ㄨ", 1, 1}, //Should be omitted in some case. + {"x", "ㄒ", 1, 1}, + {"y", "ㄧ", 1, 1}, //Should be omitted in some case. + {"z", "ㄗ", 1, 1}, + {"zh","ㄓ", 2, 1} +}; + +static const PinyinToken __pinyin_finals[] = +{ + {"", "", 0, 0}, + {"a", "ㄚ", 1, 1}, + {"ai", "ㄞ", 2, 1}, + {"an", "ㄢ", 2, 1}, + {"ang", "ㄤ", 3, 1}, + {"ao", "ㄠ", 2, 1}, + {"e", "ㄜ", 1, 1}, + {"ea", "ㄝ", 2, 1}, + {"ei", "ㄟ", 2, 1}, + {"en", "ㄣ", 2, 1}, + {"eng", "ㄥ", 3, 1}, + {"er", "ㄦ", 2, 1}, + {"i", "ㄧ", 1, 1}, + {"ia", "ㄧㄚ", 2, 2}, + {"ian", "ㄧㄢ", 3, 2}, + {"iang","ㄧㄤ", 4, 2}, + {"iao", "ㄧㄠ", 3, 2}, + {"ie", "ㄧㄝ", 2, 2}, + {"in", "ㄧㄣ", 2, 2}, + {"ing", "ㄧㄥ", 3, 2}, + {"iong","ㄩㄥ", 4, 2}, + {"iu", "ㄧㄡ", 2, 2}, + {"ng", "ㄣ", 2, 1}, + {"o", "ㄛ", 1, 1}, + {"ong", "ㄨㄥ", 3, 2}, + {"ou", "ㄡ", 2, 1}, + {"u", "ㄨ", 1, 1}, + {"ua", "ㄨㄚ", 2, 2}, + {"uai", "ㄨㄞ", 3, 2}, + {"uan", "ㄨㄢ", 3, 2}, + {"uang","ㄨㄤ", 4, 2}, + {"ue", "ㄩㄝ", 2, 2}, + {"ueng","ㄨㄥ", 4, 2}, + {"ui", "ㄨㄟ", 2, 2}, + {"un", "ㄨㄣ", 2, 2}, + {"uo", "ㄨㄛ", 2, 2}, + {"v", "ㄩ", 1, 1}, + {"van", "ㄩㄢ", 3, 2}, + {"ve", "ㄩㄝ", 2, 2}, + {"vn", "ㄩㄣ", 2, 2} +}; + +static const PinyinToken __pinyin_tones [] = +{ + {"", "", 0, 0}, + {"1", "ˉ", 1, 1}, + {"2", "ˊ", 1, 1}, + {"3", "ˇ", 1, 1}, + {"4", "ˋ", 1, 1}, + {"5", "˙", 1, 1} +}; + +static const PinyinTokenIndex __pinyin_initials_index[] = +{ + //a b c d e f g h i j k l m + {-1,0},{1,1}, {2,2}, {4,1}, {-1,0},{5,1}, {7,1}, {6,1}, {-1,0},{8,1}, {9,1}, {12,1},{10,1}, + //n o p q r s t u v w x y z + {11,1},{-1,0},{14,1},{15,1},{13,1},{16,2},{18,1},{-1,0},{-1,0},{19,1},{20,1},{21,1},{22,2} +}; + +static const PinyinTokenIndex __pinyin_finals_index[] = +{ + //a b c d e f g h i j k l m + {1,5}, {-1,0},{-1,0},{-1,0},{6,6},{-1,0},{-1,0},{-1,0},{12,10},{-1,0},{-1,0},{-1,0},{-1,0}, + //n o p q r s t u v w x y z + {22,1},{23,3},{-1,0},{-1,0},{-1,0},{-1,0},{-1,0},{26,10},{36,4},{-1,0},{-1,0},{-1,0},{-1,0} +}; + + + +static const PinyinInitial __shuang_pin_stone_initial_map [] = +{ + PINYIN_ZeroInitial, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Shi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Chi, // U + PINYIN_Zhi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_stone_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ia, PINYIN_Ua }, // B + { PINYIN_Uan, PINYIN_ZeroFinal }, // C + { PINYIN_Ao, PINYIN_ZeroFinal }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_An, PINYIN_ZeroFinal }, // F + { PINYIN_Ang, PINYIN_ZeroFinal }, // G + { PINYIN_Uang,PINYIN_Iang }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_Ian, PINYIN_ZeroFinal }, // J + { PINYIN_Iao, PINYIN_ZeroFinal }, // K + { PINYIN_In, PINYIN_ZeroFinal }, // L + { PINYIN_Ie, PINYIN_ZeroFinal }, // M + { PINYIN_Iu, PINYIN_ZeroFinal }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Ou, PINYIN_ZeroFinal }, // P + { PINYIN_Ing, PINYIN_Er }, // Q + { PINYIN_En, PINYIN_ZeroFinal }, // R + { PINYIN_Ai, PINYIN_ZeroFinal }, // S + { PINYIN_Ng, PINYIN_Eng }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_Ui }, // V + { PINYIN_Ei, PINYIN_ZeroFinal }, // W + { PINYIN_Uai, PINYIN_Ue }, // X + { PINYIN_Ong, PINYIN_Iong }, // Y + { PINYIN_Un, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_zrm_initial_map [] = +{ + PINYIN_ZeroInitial, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Chi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Shi, // U + PINYIN_Zhi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_zrm_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ou, PINYIN_ZeroFinal }, // B + { PINYIN_Iao, PINYIN_ZeroFinal }, // C + { PINYIN_Uang,PINYIN_Iang }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_En, PINYIN_ZeroFinal }, // F + { PINYIN_Ng, PINYIN_Eng }, // G + { PINYIN_Ang, PINYIN_ZeroFinal }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_An, PINYIN_ZeroFinal }, // J + { PINYIN_Ao, PINYIN_ZeroFinal }, // K + { PINYIN_Ai, PINYIN_ZeroFinal }, // L + { PINYIN_Ian, PINYIN_ZeroFinal }, // M + { PINYIN_In, PINYIN_ZeroFinal }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Un, PINYIN_ZeroFinal }, // P + { PINYIN_Iu, PINYIN_ZeroFinal }, // Q + { PINYIN_Uan, PINYIN_Er }, // R + { PINYIN_Ong, PINYIN_Iong }, // S + { PINYIN_Ue, PINYIN_ZeroFinal }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_Ui }, // V + { PINYIN_Ia, PINYIN_Ua }, // W + { PINYIN_Ie, PINYIN_ZeroFinal }, // X + { PINYIN_Ing, PINYIN_Uai }, // Y + { PINYIN_Ei, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_ms_initial_map [] = +{ + PINYIN_ZeroInitial, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Chi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Shi, // U + PINYIN_Zhi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_ms_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ou, PINYIN_ZeroFinal }, // B + { PINYIN_Iao, PINYIN_ZeroFinal }, // C + { PINYIN_Uang,PINYIN_Iang }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_En, PINYIN_ZeroFinal }, // F + { PINYIN_Ng, PINYIN_Eng }, // G + { PINYIN_Ang, PINYIN_ZeroFinal }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_An, PINYIN_ZeroFinal }, // J + { PINYIN_Ao, PINYIN_ZeroFinal }, // K + { PINYIN_Ai, PINYIN_ZeroFinal }, // L + { PINYIN_Ian, PINYIN_ZeroFinal }, // M + { PINYIN_In, PINYIN_ZeroFinal }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Un, PINYIN_ZeroFinal }, // P + { PINYIN_Iu, PINYIN_ZeroFinal }, // Q + { PINYIN_Uan, PINYIN_Er }, // R + { PINYIN_Ong, PINYIN_Iong }, // S + { PINYIN_Ue, PINYIN_ZeroFinal }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_Ui }, // V + { PINYIN_Ia, PINYIN_Ua }, // W + { PINYIN_Ie, PINYIN_ZeroFinal }, // X + { PINYIN_Uai, PINYIN_V }, // Y + { PINYIN_Ei, PINYIN_ZeroFinal }, // Z + { PINYIN_Ing, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_ziguang_initial_map [] = +{ + PINYIN_Chi, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Shi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Zhi, // U + PINYIN_ZeroInitial, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_ziguang_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Iao, PINYIN_ZeroFinal }, // B + { PINYIN_Ing, PINYIN_ZeroFinal }, // C + { PINYIN_Ie, PINYIN_ZeroFinal }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_Ian, PINYIN_ZeroFinal }, // F + { PINYIN_Uang,PINYIN_Iang }, // G + { PINYIN_Ong, PINYIN_Iong }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_Iu, PINYIN_Er }, // J + { PINYIN_Ei, PINYIN_ZeroFinal }, // K + { PINYIN_Uan, PINYIN_ZeroFinal }, // L + { PINYIN_Un, PINYIN_ZeroFinal }, // M + { PINYIN_Ui, PINYIN_Ue }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Ai, PINYIN_ZeroFinal }, // P + { PINYIN_Ao, PINYIN_ZeroFinal }, // Q + { PINYIN_An, PINYIN_ZeroFinal }, // R + { PINYIN_Ang, PINYIN_ZeroFinal }, // S + { PINYIN_Ng, PINYIN_Eng }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_ZeroFinal }, // V + { PINYIN_En, PINYIN_ZeroFinal }, // W + { PINYIN_Ia, PINYIN_Ua }, // X + { PINYIN_In, PINYIN_Uai }, // Y + { PINYIN_Ou, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_abc_initial_map [] = +{ + PINYIN_Zhi, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_Chi, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_ZeroInitial, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_ZeroInitial, // U + PINYIN_Shi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_abc_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ou, PINYIN_ZeroFinal }, // B + { PINYIN_In, PINYIN_Uai }, // C + { PINYIN_Ia, PINYIN_Ua }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_En, PINYIN_ZeroFinal }, // F + { PINYIN_Ng, PINYIN_Eng }, // G + { PINYIN_Ang, PINYIN_ZeroFinal }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_An, PINYIN_ZeroFinal }, // J + { PINYIN_Ao, PINYIN_ZeroFinal }, // K + { PINYIN_Ai, PINYIN_ZeroFinal }, // L + { PINYIN_Ui, PINYIN_Ue }, // M + { PINYIN_Un, PINYIN_ZeroFinal }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Uan, PINYIN_ZeroFinal }, // P + { PINYIN_Ei, PINYIN_ZeroFinal }, // Q + { PINYIN_Iu, PINYIN_Er }, // R + { PINYIN_Ong, PINYIN_Iong }, // S + { PINYIN_Uang,PINYIN_Iang }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_ZeroFinal }, // V + { PINYIN_Ian, PINYIN_ZeroFinal }, // W + { PINYIN_Ie, PINYIN_ZeroFinal }, // X + { PINYIN_Ing, PINYIN_ZeroFinal }, // Y + { PINYIN_Iao, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + + +static const PinyinInitial __shuang_pin_liushi_initial_map [] = +{ + PINYIN_ZeroInitial, // A + PINYIN_Bo, // B + PINYIN_Ci, // C + PINYIN_De, // D + PINYIN_ZeroInitial, // E + PINYIN_Fo, // F + PINYIN_Ge, // G + PINYIN_He, // H + PINYIN_Chi, // I + PINYIN_Ji, // J + PINYIN_Ke, // K + PINYIN_Le, // L + PINYIN_Mo, // M + PINYIN_Ne, // N + PINYIN_ZeroInitial, // O + PINYIN_Po, // P + PINYIN_Qi, // Q + PINYIN_Ri, // R + PINYIN_Si, // S + PINYIN_Te, // T + PINYIN_Shi, // U + PINYIN_Zhi, // V + PINYIN_Wu, // W + PINYIN_Xi, // X + PINYIN_Yi, // Y + PINYIN_Zi, // Z + PINYIN_ZeroInitial, // ; +}; + +static const PinyinFinal __shuang_pin_liushi_final_map [][2] = +{ + { PINYIN_A, PINYIN_ZeroFinal }, // A + { PINYIN_Ao, PINYIN_ZeroFinal }, // B + { PINYIN_Ang, PINYIN_ZeroFinal }, // C + { PINYIN_Uan, PINYIN_ZeroFinal }, // D + { PINYIN_E, PINYIN_ZeroFinal }, // E + { PINYIN_An, PINYIN_ZeroFinal }, // F + { PINYIN_Ong, PINYIN_Iong }, // G + { PINYIN_Ui, PINYIN_Ue }, // H + { PINYIN_I, PINYIN_ZeroFinal }, // I + { PINYIN_Ia, PINYIN_Ua }, // J + { PINYIN_Un, PINYIN_ZeroFinal }, // K + { PINYIN_Iu, PINYIN_ZeroFinal }, // L + { PINYIN_In, PINYIN_ZeroFinal }, // M + { PINYIN_Uang,PINYIN_Iang }, // N + { PINYIN_Uo, PINYIN_O }, // O + { PINYIN_Ng, PINYIN_Eng }, // P + { PINYIN_Ing, PINYIN_ZeroFinal }, // Q + { PINYIN_Ou, PINYIN_Er }, // R + { PINYIN_Ai, PINYIN_ZeroFinal }, // S + { PINYIN_Ian, PINYIN_ZeroFinal }, // T + { PINYIN_U, PINYIN_ZeroFinal }, // U + { PINYIN_V, PINYIN_En }, // V + { PINYIN_Ei, PINYIN_ZeroFinal }, // W + { PINYIN_Ie, PINYIN_ZeroFinal }, // X + { PINYIN_Uai, PINYIN_ZeroFinal }, // Y + { PINYIN_Iao, PINYIN_ZeroFinal }, // Z + { PINYIN_ZeroFinal, PINYIN_ZeroFinal }, // ; +}; + +static const size_t __zhuyin_zhuyin_map_start_char = 0x3105; +static const size_t __zhuyin_zhuyin_map_tone_start_idx = 37; +static const PinyinKey __zhuyin_zhuyin_map [][3] = +{ + {PinyinKey(PINYIN_Bo),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Po),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Mo),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Fo),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_De),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Te),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ne),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Le),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ge),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ke),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_He),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ji),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Qi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Xi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Zhi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Chi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Shi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ri),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Zi),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Ci),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_Si),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_A),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_O),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_E),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ea),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ai),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ei),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ao),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ou),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_An),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_En),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Ang),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Eng),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_Er),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_I),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_U),PinyinKey(),PinyinKey()}, + {PinyinKey(PINYIN_ZeroInitial,PINYIN_V),PinyinKey(),PinyinKey()}, +}; + +static const size_t __zhuyin_map_start_char = 0x20; +#include "pinyin_zhuyin_map_data.h" + +static const PinyinKey (*__zhuyin_maps []) [3] = { + __zhuyin_zhuyin_map, + __zhuyin_standard_map, + __zhuyin_hsu_map, + __zhuyin_ibm_map, + __zhuyin_gin_yieh_map, + __zhuyin_et_map, + __zhuyin_et26_map, + 0 +}; + + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinCustomSettings + +PinyinCustomSettings::PinyinCustomSettings () + : use_incomplete (true) +{ + for (size_t i=0; i<=PINYIN_AmbLast; ++i) + use_ambiguities [i] = false; +} + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinKey + +const guint16 PinyinKey::min_value = 0; +const guint16 PinyinKey::max_value = PINYIN_Number_Of_Initials * PINYIN_Number_Of_Finals * PINYIN_Number_Of_Tones - 1; + +const char* +PinyinKey::get_initial_string () const +{ + return __pinyin_initials [m_initial].latin; +} + +const char* +PinyinKey::get_initial_zhuyin_string () const +{ + if ((m_initial == PINYIN_Wu && m_final == PINYIN_U) || + (m_initial == PINYIN_Yi && + (m_final == PINYIN_I || m_final == PINYIN_In || m_final == PINYIN_Ing || m_final == PINYIN_Ong || + m_final == PINYIN_U || m_final == PINYIN_Ue || m_final == PINYIN_Uan || m_final == PINYIN_Un))) + return ""; + + return __pinyin_initials [m_initial].zhuyin; +} + +const char* +PinyinKey::get_final_string () const +{ + return __pinyin_finals [m_final].latin; +} + +const char* +PinyinKey::get_final_zhuyin_string () const +{ + if (m_initial == PINYIN_Yi && m_final == PINYIN_Ong) { + return __pinyin_finals [PINYIN_Iong].zhuyin; + } else if (m_initial == PINYIN_Yi || m_initial == PINYIN_Ji || m_initial == PINYIN_Qi || m_initial == PINYIN_Xi) { + switch (m_final) { + case PINYIN_U: + return __pinyin_finals [PINYIN_V].zhuyin; + case PINYIN_Ue: + return __pinyin_finals [PINYIN_Ve].zhuyin; + case PINYIN_Uan: + return __pinyin_finals [PINYIN_Van].zhuyin; + case PINYIN_Un: + return __pinyin_finals [PINYIN_Vn].zhuyin; + } + if (m_initial == PINYIN_Yi && m_final == PINYIN_E) + return __pinyin_finals [PINYIN_Ea].zhuyin; + } else if ((m_initial == PINYIN_Ne || m_initial == PINYIN_Le) && m_final == PINYIN_Ue) { + return __pinyin_finals [PINYIN_Ve].zhuyin; + } else if ((m_initial == PINYIN_Zhi || m_initial == PINYIN_Chi || m_initial == PINYIN_Shi || + m_initial == PINYIN_Zi || m_initial == PINYIN_Ci || m_initial == PINYIN_Si || + m_initial == PINYIN_Ri) && m_final == PINYIN_I) { + return ""; + } + + return __pinyin_finals [m_final].zhuyin; +} + +const char* +PinyinKey::get_tone_string () const +{ + return __pinyin_tones [m_tone].latin; +} + +const char* +PinyinKey::get_tone_zhuyin_string () const +{ + return __pinyin_tones [m_tone].zhuyin; +} + +const char * +PinyinKey::get_key_string () const +{ + char key [16]; + g_snprintf (key, 15, "%s%s%s", get_initial_string(), get_final_string(), get_tone_string ()); + + return g_strdup(key); +} + +const char * +PinyinKey::get_key_zhuyin_string () const +{ + char key [32]; + g_snprintf (key, 31, "%s%s%s", get_initial_zhuyin_string(), get_final_zhuyin_string(), get_tone_zhuyin_string ()); + + return g_strdup (key); +} + +int +PinyinKey::set (const PinyinValidator &validator, const char *str, int len) +{ + if (!str || ! (*str)) + return 0; + + PinyinDefaultParser parser; + + return parser.parse_one_key (validator, *this, str, len); +} + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinValidator +BitmapPinyinValidator::BitmapPinyinValidator (const PinyinLargeTable *table) +{ + initialize (table); +} + +void +BitmapPinyinValidator::initialize (const PinyinLargeTable *table) +{ + memset (m_bitmap, 0, sizeof (m_bitmap)); + + if (!table) return; + + for (guint16 val=0; val<=PinyinKey::max_value; ++val) + if (!table->has_key (PinyinKey (val))) + m_bitmap [val >> 3] |= (1 << (val % 8)); +} + +bool +BitmapPinyinValidator::operator () (PinyinKey key) const +{ + if (key.is_empty ()) return false; + + guint16 val = key.get_value (); + + return (m_bitmap [ val >> 3 ] & (1 << (val % 8))) == 0; +} + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinParser +PinyinParser::~PinyinParser () +{ +} + +struct PinyinReplaceRulePair +{ + PinyinInitial initial; + PinyinFinal final; + PinyinInitial new_initial; + PinyinFinal new_final; +}; + +class PinyinReplaceRulePairLessThan +{ +public: + bool operator () (const PinyinReplaceRulePair &lhs, const PinyinReplaceRulePair &rhs) const { + if (lhs.initial < rhs.initial) return true; + if (lhs.initial > rhs.initial) return false; + return lhs.final < rhs.final; + } +}; + +void +PinyinParser::normalize (PinyinKey &key) +{ + static const PinyinReplaceRulePair rules [] = + { +#if 0 + {PINYIN_ZeroInitial, PINYIN_I, PINYIN_Yi, PINYIN_I}, + {PINYIN_ZeroInitial, PINYIN_Ia, PINYIN_Yi, PINYIN_A}, + {PINYIN_ZeroInitial, PINYIN_Ian, PINYIN_Yi, PINYIN_An}, + {PINYIN_ZeroInitial, PINYIN_Iang, PINYIN_Yi, PINYIN_Ang}, + {PINYIN_ZeroInitial, PINYIN_Iao, PINYIN_Yi, PINYIN_Ao}, + {PINYIN_ZeroInitial, PINYIN_Ie, PINYIN_Yi, PINYIN_E}, + {PINYIN_ZeroInitial, PINYIN_In, PINYIN_Yi, PINYIN_In}, + {PINYIN_ZeroInitial, PINYIN_Ing, PINYIN_Yi, PINYIN_Ing}, + {PINYIN_ZeroInitial, PINYIN_Iong, PINYIN_Yi, PINYIN_Ong}, + {PINYIN_ZeroInitial, PINYIN_Iu, PINYIN_Yi, PINYIN_Ou}, + {PINYIN_ZeroInitial, PINYIN_U, PINYIN_Wu, PINYIN_U}, + {PINYIN_ZeroInitial, PINYIN_Ua, PINYIN_Wu, PINYIN_A}, + {PINYIN_ZeroInitial, PINYIN_Uai, PINYIN_Wu, PINYIN_Ai}, + {PINYIN_ZeroInitial, PINYIN_Uan, PINYIN_Wu, PINYIN_An}, + {PINYIN_ZeroInitial, PINYIN_Uang, PINYIN_Wu, PINYIN_Ang}, + {PINYIN_ZeroInitial, PINYIN_Ue, PINYIN_Wu, PINYIN_E}, + {PINYIN_ZeroInitial, PINYIN_Ueng, PINYIN_Wu, PINYIN_Eng}, + {PINYIN_ZeroInitial, PINYIN_Ui, PINYIN_Wu, PINYIN_Ei}, + {PINYIN_ZeroInitial, PINYIN_Un, PINYIN_Wu, PINYIN_En}, + {PINYIN_ZeroInitial, PINYIN_Uo, PINYIN_Wu, PINYIN_O}, + {PINYIN_ZeroInitial, PINYIN_V, PINYIN_Yi, PINYIN_U}, + {PINYIN_ZeroInitial, PINYIN_Van, PINYIN_Yi, PINYIN_Uan}, + {PINYIN_ZeroInitial, PINYIN_Ve, PINYIN_Yi, PINYIN_Ue}, + {PINYIN_ZeroInitial, PINYIN_Vn, PINYIN_Yi, PINYIN_Un}, +#endif + {PINYIN_Ji, PINYIN_V, PINYIN_Ji, PINYIN_U}, + {PINYIN_Ji, PINYIN_Van, PINYIN_Ji, PINYIN_Uan}, + {PINYIN_Ji, PINYIN_Ve, PINYIN_Ji, PINYIN_Ue}, + {PINYIN_Ji, PINYIN_Vn, PINYIN_Ji, PINYIN_Un}, + {PINYIN_Ne, PINYIN_Ve, PINYIN_Ne, PINYIN_Ue}, + {PINYIN_Le, PINYIN_Ve, PINYIN_Le, PINYIN_Ue}, + {PINYIN_Qi, PINYIN_V, PINYIN_Qi, PINYIN_U}, + {PINYIN_Qi, PINYIN_Van, PINYIN_Qi, PINYIN_Uan}, + {PINYIN_Qi, PINYIN_Ve, PINYIN_Qi, PINYIN_Ue}, + {PINYIN_Qi, PINYIN_Vn, PINYIN_Qi, PINYIN_Un}, + {PINYIN_Xi, PINYIN_V, PINYIN_Xi, PINYIN_U}, + {PINYIN_Xi, PINYIN_Van, PINYIN_Xi, PINYIN_Uan}, + {PINYIN_Xi, PINYIN_Ve, PINYIN_Xi, PINYIN_Ue}, + {PINYIN_Xi, PINYIN_Vn, PINYIN_Xi, PINYIN_Un} + }; + static const PinyinReplaceRulePair *rules_start = rules; + static const PinyinReplaceRulePair *rules_end = rules + sizeof(rules)/sizeof(PinyinReplaceRulePair); + + PinyinReplaceRulePair kp; + + kp.initial = key.get_initial (); + kp.final = key.get_final (); + + const PinyinReplaceRulePair *p = std_lite::lower_bound (rules_start, rules_end, kp, PinyinReplaceRulePairLessThan ()); + + if (p->initial == kp.initial && p->final == kp.final) { + key.set_initial (p->new_initial); + key.set_final (p->new_final); + } +} + +//============== Internal functions used by PinyinDefaultParser ============== +static int +__default_parser_parse_initial (PinyinInitial &initial, const char *str, int len) +{ + int lastlen = 0; + + initial = PINYIN_ZeroInitial; + + if (str && *str >= 'a' && *str <= 'z') { + int start = __pinyin_initials_index [*str - 'a'].start; + int end = __pinyin_initials_index [*str - 'a'].num + start; + + if (start > 0) { + for (int i = start; i < end; ++i) { + if ((len < 0 || len >= __pinyin_initials [i].latin_len) && __pinyin_initials [i].latin_len >= lastlen) { + int j; + for (j = 1; j < __pinyin_initials [i].latin_len; ++j) { + if (str [j] != __pinyin_initials [i].latin [j]) + break; + } + if (j == __pinyin_initials [i].latin_len) { + initial = static_cast<PinyinInitial>(i); + lastlen = __pinyin_initials [i].latin_len; + } + } + } + } + } + + return lastlen; +} +static int +__default_parser_parse_final (PinyinFinal &final, const char *str, int len) +{ + int lastlen = 0; + + final = PINYIN_ZeroFinal; + + if (str && *str >= 'a' && *str <= 'z') { + int start = __pinyin_finals_index [*str - 'a'].start; + int end = __pinyin_finals_index [*str - 'a'].num + start; + + if (start > 0) { + for (int i = start; i < end; ++i) { + if ((len < 0 || len >= __pinyin_finals [i].latin_len) && __pinyin_finals [i].latin_len >= lastlen) { + int j; + for (j = 1; j < __pinyin_finals [i].latin_len; ++j) { + if (str [j] != __pinyin_finals [i].latin [j]) + break; + } + if (j == __pinyin_finals [i].latin_len) { + final = static_cast<PinyinFinal>(i); + lastlen = __pinyin_finals [i].latin_len; + } + } + } + } + } + + return lastlen; +} +static int +__default_parser_parse_tone (PinyinTone &tone, const char *str, int len) +{ + tone = PINYIN_ZeroTone; + + if (str && (len >= 1 || len < 0)) { + int kt = (*str) - '0'; + if (kt >= PINYIN_First && kt <= PINYIN_LastTone) { + tone = static_cast<PinyinTone>(kt); + return 1; + } + } + return 0; +} + +static int +__default_parser_parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len = -1) +{ + int initial_len = 0; + int final_len = 0; + int tone_len = 0; + + const char *ptr; + + PinyinInitial initial; + PinyinFinal final; + PinyinTone tone; + + key.clear (); + + if (!str || !len) return 0; + + if (len < 0) len = strlen (str); + + while (len > 0) { + ptr = str; + + initial = PINYIN_ZeroInitial; + final = PINYIN_ZeroFinal; + tone = PINYIN_ZeroTone; + + final_len = __default_parser_parse_final (final, ptr, len); + ptr += final_len; + len -= final_len; + + // An initial is present + if (final == PINYIN_ZeroFinal) { + initial_len = __default_parser_parse_initial (initial, ptr, len); + ptr += initial_len; + len -= initial_len; + if (len){ + final_len = __default_parser_parse_final (final, ptr, len); + ptr += final_len; + len -= final_len; + } + } + + if (len) + tone_len = __default_parser_parse_tone (tone, ptr, len); + + key.set (initial, final, tone); + + PinyinParser::normalize (key); + + // A valid key was found, return. + if (validator (key)) break; + + // The key is invalid, reduce the len and find again. + len = initial_len + final_len + tone_len - 1; + + initial_len = final_len = tone_len = 0; + + key.clear (); + } + + len = initial_len + final_len + tone_len; + + return len; +} + +struct DefaultParserCacheElement +{ + PinyinKey key; + PinyinKeyPos pos; + int num_keys; + int parsed_len; + int next_start; +}; + +typedef GArray* DefaultParserCache; /* Array of DefaultParserCacheElement */ + +static int +__default_parser_parse_recursive (const PinyinValidator &validator, + DefaultParserCache &cache, + int &real_start, + int &num_keys, + const char *str, + int len, + int start) +{ + if (*str == 0 || len == 0) return 0; + + int used_len = 0; + + real_start = 0; + num_keys = 0; + + if (*str == '\'' || *str == ' ') { + ++used_len; + ++str; + ++start; + --len; + } + + if (!isalpha (*str) || !len) + return 0; + + real_start = start; + + // The best keys start from this position have been found, just return the result. + DefaultParserCacheElement* element = &g_array_index + (cache, DefaultParserCacheElement, start); + + + if (element->num_keys >=0) { + num_keys = element->num_keys; + return element->parsed_len; + } + + PinyinKey first_key; + PinyinKey best_first_key; + PinyinKeyPos pos; + + int first_len = 0; + int best_first_len = 0; + + int remained_len = 0; + int best_remained_len = 0; + + int remained_keys = 0; + int best_remained_keys = 0; + + int remained_start = 0; + int best_remained_start = 0; + + first_len = __default_parser_parse_one_key (validator, first_key, str, len); + + if (!first_len) { + element = &g_array_index(cache, DefaultParserCacheElement, start); + + element->key = PinyinKey (); + element->num_keys = 0; + element->parsed_len = 0; + element->next_start = start; + return 0; + } + + best_first_key = first_key; + best_first_len = first_len; + + if (len > first_len) { + char ch1 = str [first_len -1]; + char ch2 = str [first_len]; + + best_remained_len = __default_parser_parse_recursive (validator, + cache, + best_remained_start, + best_remained_keys, + str + first_len, + len - first_len, + start + first_len); + + // For those keys which the last char is 'g' or 'n' or 'r', try put the end char into the next key. + if (first_len > 1 && + (((ch1=='g' || ch1=='n' || ch1=='r') && (ch2=='a' || ch2=='e' || ch2=='i' || ch2=='o' || ch2=='u' || ch2=='v')) || + ((ch1=='a' || ch1=='e' || ch1=='o') && (ch2=='i' || ch2=='n' || ch2=='o' || ch2=='r' || ch2=='u')))) { + + first_len = __default_parser_parse_one_key (validator, first_key, str, first_len - 1); + + if (first_len) { + remained_len = __default_parser_parse_recursive (validator, + cache, + remained_start, + remained_keys, + str + first_len, + len - first_len, + start + first_len); + + + DefaultParserCacheElement* best_remained_element = &g_array_index + (cache, DefaultParserCacheElement, best_remained_start); + + // A better seq was found. + if (remained_len != 0 && (remained_len + first_len) >= (best_remained_len + best_first_len) && + (remained_keys <= best_remained_keys || best_remained_keys == 0)) { +#if 0 + if ((remained_len + first_len) > (best_remained_len + best_first_len) || + remained_keys < best_remained_keys || + best_remained_element->key.get_final () == PINYIN_ZeroFinal || + best_remained_element->key.get_initial () == PINYIN_Wu || + best_remained_element->key.get_initial () == PINYIN_Yi) { +#endif + best_first_len = first_len; + best_first_key = first_key; + best_remained_len = remained_len; + best_remained_keys = remained_keys; + best_remained_start = remained_start; +#if 0 + } +#endif + } + } + } + } + + num_keys = best_remained_keys + 1; + + + element = &g_array_index + (cache, DefaultParserCacheElement, start); + + pos.set_pos(start); + pos.set_length(best_first_len); + + element->key = best_first_key; + element->pos = pos; + element->num_keys = num_keys; + element->parsed_len = used_len + best_first_len + best_remained_len; + element->next_start = best_remained_start; + + return element->parsed_len; +} +//============================================================================ + +PinyinDefaultParser::~PinyinDefaultParser () +{ +} + +int +PinyinDefaultParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const +{ + return __default_parser_parse_one_key (validator, key, str, len); +} + +int +PinyinDefaultParser::parse (const PinyinValidator &validator, PinyinKeyVector & keys, PinyinKeyPosVector & poses, const char *str, int len) const +{ + g_array_set_size(keys, 0); + g_array_set_size(poses, 0); + + if (!str || !len) return 0; + + if (len < 0) len = strlen (str); + + DefaultParserCacheElement elm; + + elm.num_keys = -1L; + elm.parsed_len = 0; + elm.next_start = 0; + + DefaultParserCache cache = g_array_new (FALSE, TRUE, sizeof (DefaultParserCacheElement)); + g_array_set_size(cache, len); + for ( size_t index = 0 ; index < len ; index++){ + DefaultParserCacheElement * element = + &g_array_index(cache,DefaultParserCacheElement, index); + *element = elm; + } + int start = 0; + int num_keys = 0; + + len = __default_parser_parse_recursive (validator, cache, start, num_keys, str, len, 0); + + for (size_t i=0; i<(size_t)num_keys; ++i) { + DefaultParserCacheElement* element = &g_array_index + (cache, DefaultParserCacheElement, start); + g_array_append_val(keys, element->key); + g_array_append_val(poses, element->pos); + start = element->next_start; + } + + return len; +} + +PinyinShuangPinParser::PinyinShuangPinParser (PinyinShuangPinScheme scheme) +{ + set_scheme (scheme); +} + +PinyinShuangPinParser::PinyinShuangPinParser (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]) +{ + set_scheme (initial_map, final_map); +} + +PinyinShuangPinParser::~PinyinShuangPinParser () +{ +} + +int +PinyinShuangPinParser::parse_one_key (const PinyinValidator &validator, PinyinKey &key, const char *str, int len) const +{ + key.clear (); + + if (!str || !len || ! (*str)) return 0; + + if (len < 0) len = strlen (str); + + PinyinInitial initial = PINYIN_ZeroInitial; + PinyinFinal final = PINYIN_ZeroFinal; + PinyinFinal final_cands [4] = { PINYIN_ZeroFinal, PINYIN_ZeroFinal, PINYIN_ZeroFinal, PINYIN_ZeroFinal }; + + PinyinTone tone = PINYIN_ZeroTone; + + int idx [2] = {-1, -1}; + int used_len = 0; + + size_t i; + bool matched = false; + + for (i = 0; i < 2 && i < (size_t) len; ++i) { + if (str [i] >= 'a' && str [i] <= 'z') idx [i] = str [i] - 'a'; + else if (str [i] == ';') idx [i] = 26; + } + + // parse initial or final + if (idx [0] >= 0) { + initial = m_initial_map [idx[0]]; + final_cands [0] = m_final_map [idx[0]][0]; + final_cands [1] = m_final_map [idx[0]][1]; + } + + if (initial == PINYIN_ZeroInitial && final_cands [0] == PINYIN_ZeroFinal) + return 0; + + // parse final, if str [0] == 'o' (idx [0] == 14) then just skip to parse final. + if (idx [1] >= 0 && (initial != PINYIN_ZeroInitial || idx[0] == 14)) { + final_cands [2] = m_final_map [idx [1]][0]; + final_cands [3] = m_final_map [idx [1]][1]; + + for (i = 2; i < 4; ++i) { + if (final_cands [i] != PINYIN_ZeroFinal) { + key.set (initial, final_cands [i]); + PinyinParser::normalize (key); + + if (validator (key)) { + final = final_cands [i]; + matched = true; + used_len = 2; + str += 2; + len -= 2; + break; + } + } + } + } + + if (!matched) { + initial = PINYIN_ZeroInitial; + for (i = 0; i < 2; ++i) { + key.set (initial, final_cands [i]); + PinyinParser::normalize (key); + + if (validator (key)) { + final = final_cands [i]; + matched = true; + used_len = 1; + ++str; + --len; + break; + } + } + } + + if (!matched) return 0; + + // parse tone + if (len) { + int kt = (*str) - '0'; + if (kt >= PINYIN_First && kt <= PINYIN_LastTone) { + tone = static_cast<PinyinTone>(kt); + + key.set (initial, final, tone); + + if (validator (key)) { + return used_len + 1; + } + } + } + + return used_len; +} + +int +PinyinShuangPinParser::parse (const PinyinValidator &validator, PinyinKeyVector &keys, PinyinKeyPosVector & poses, const char *str, int len) const +{ + g_array_set_size(keys, 0); + g_array_set_size(poses, 0); + + if (!str || !len || ! (*str)) return 0; + + if (len < 0) len = strlen (str); + + int used_len = 0; + + PinyinKey key; + PinyinKeyPos pos; + + while (used_len < len) { + if (*str == '\'' || *str == ' ') { + ++str; + ++used_len; + continue; + } + + int one_len = parse_one_key (validator, key, str, len); + + if (one_len) { + pos.set_pos(used_len); + pos.set_length(one_len); + g_array_append_val(keys, key); + g_array_append_val(poses, pos); + } else { + break; + } + + str += one_len; + used_len += one_len; + } + + return used_len; +} + +void +PinyinShuangPinParser::set_scheme (PinyinShuangPinScheme scheme) +{ + switch (scheme) { + case SHUANG_PIN_STONE: + set_scheme (__shuang_pin_stone_initial_map, __shuang_pin_stone_final_map); + break; + case SHUANG_PIN_ZRM: + set_scheme (__shuang_pin_zrm_initial_map, __shuang_pin_zrm_final_map); + break; + case SHUANG_PIN_MS: + set_scheme (__shuang_pin_ms_initial_map, __shuang_pin_ms_final_map); + break; + case SHUANG_PIN_ZIGUANG: + set_scheme (__shuang_pin_ziguang_initial_map, __shuang_pin_ziguang_final_map); + break; + case SHUANG_PIN_ABC: + set_scheme (__shuang_pin_abc_initial_map, __shuang_pin_abc_final_map); + break; + case SHUANG_PIN_LIUSHI: + set_scheme (__shuang_pin_liushi_initial_map, __shuang_pin_liushi_final_map); + break; + default: + set_scheme (__shuang_pin_zrm_initial_map, __shuang_pin_zrm_final_map); + return; + } +} + +void +PinyinShuangPinParser::set_scheme (const PinyinInitial initial_map[27], const PinyinFinal final_map[27][2]) +{ + for (size_t i = 0; i < 27; ++i) { + m_initial_map [i] = initial_map [i]; + m_final_map [i][0] = final_map [i][0]; + m_final_map [i][1] = final_map [i][1]; + } +} + +void +PinyinShuangPinParser::get_scheme (PinyinInitial initial_map[27], PinyinFinal final_map[27][2]) +{ + for (size_t i = 0; i < 27; ++i) { + initial_map [i] = m_initial_map [i]; + final_map [i][0] = m_final_map [i][0]; + final_map [i][1] = m_final_map [i][1]; + } +} + +namespace novel{ + +////////////////////////////////////////////////////////////////////////////// +// implementation of PinyinKey comparision classe +int pinyin_compare_initial (const PinyinCustomSettings &custom, + PinyinInitial lhs, + PinyinInitial rhs) +{ + if ((lhs == rhs) || + (custom.use_ambiguities [PINYIN_AmbZhiZi] && + ((lhs == PINYIN_Zhi && rhs == PINYIN_Zi) || + (lhs == PINYIN_Zi && rhs == PINYIN_Zhi))) || + + (custom.use_ambiguities [PINYIN_AmbChiCi] && + ((lhs == PINYIN_Chi && rhs == PINYIN_Ci) || + (lhs == PINYIN_Ci && rhs == PINYIN_Chi))) || + + (custom.use_ambiguities [PINYIN_AmbShiSi] && + ((lhs == PINYIN_Shi && rhs == PINYIN_Si) || + (lhs == PINYIN_Si && rhs == PINYIN_Shi))) || + + (custom.use_ambiguities [PINYIN_AmbLeRi] && + ((lhs == PINYIN_Le && rhs == PINYIN_Ri) || + (lhs == PINYIN_Ri && rhs == PINYIN_Le))) || + + (custom.use_ambiguities [PINYIN_AmbNeLe] && + ((lhs == PINYIN_Ne && rhs == PINYIN_Le) || + (lhs == PINYIN_Le && rhs == PINYIN_Ne))) || + + (custom.use_ambiguities [PINYIN_AmbFoHe] && + ((lhs == PINYIN_Fo && rhs == PINYIN_He) || + (lhs == PINYIN_He && rhs == PINYIN_Fo))) + ) + return 0; + else if (lhs < rhs) return -1; + return 1; +} + +int pinyin_compare_final (const PinyinCustomSettings &custom, + PinyinFinal lhs, + PinyinFinal rhs) +{ + if(((lhs == rhs) || + (custom.use_ambiguities [PINYIN_AmbAnAng] && + ((lhs == PINYIN_An && rhs == PINYIN_Ang) || + (lhs == PINYIN_Ang && rhs == PINYIN_An))) || + + (custom.use_ambiguities [PINYIN_AmbEnEng] && + ((lhs == PINYIN_En && rhs == PINYIN_Eng) || + (lhs == PINYIN_Eng && rhs == PINYIN_En))) || + + (custom.use_ambiguities [PINYIN_AmbInIng] && + ((lhs == PINYIN_In && rhs == PINYIN_Ing) || + (lhs == PINYIN_Ing && rhs == PINYIN_In))))) + return 0; + else if (custom.use_incomplete && (lhs == PINYIN_ZeroFinal || rhs == PINYIN_ZeroFinal)) + return 0; + else if (lhs < rhs) return -1; + return 1; +} + +int pinyin_compare_tone (const PinyinCustomSettings &custom, + PinyinTone lhs, + PinyinTone rhs) +{ + if(lhs == rhs || !lhs || !rhs) + return 0; + else if (lhs < rhs) return -1; + return 1; +} + +}; |