diff options
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/Makefile.data | 15 | ||||
-rw-r--r-- | scripts/bopomofo.py | 530 | ||||
-rw-r--r-- | scripts/chewing.py | 73 | ||||
-rw-r--r-- | scripts/chewing_enum.h.in | 45 | ||||
-rw-r--r-- | scripts/chewing_table.h.in | 50 | ||||
-rw-r--r-- | scripts/chewingkey.py | 150 | ||||
-rw-r--r-- | scripts/correct.py | 95 | ||||
-rw-r--r-- | scripts/double_pinyin_table.h.in | 56 | ||||
-rw-r--r-- | scripts/genbopomofoheader.py | 123 | ||||
-rw-r--r-- | scripts/genchewingkey.py | 41 | ||||
-rw-r--r-- | scripts/gendoublepinyinheader.py | 69 | ||||
-rw-r--r-- | scripts/genpinyinheader.py | 46 | ||||
-rw-r--r-- | scripts/genpinyins.py | 57 | ||||
-rw-r--r-- | scripts/genpinyintable.py | 115 | ||||
-rw-r--r-- | scripts/genspecialtable.py | 93 | ||||
-rw-r--r-- | scripts/pinyin.py | 400 | ||||
-rw-r--r-- | scripts/pinyin_parser_table.h.in | 34 | ||||
-rw-r--r-- | scripts/pinyintable.py | 168 | ||||
-rw-r--r-- | scripts/specials.txt | 0 | ||||
-rw-r--r-- | scripts/specialtable.py | 123 | ||||
-rw-r--r-- | scripts/utils.py | 41 |
21 files changed, 2324 insertions, 0 deletions
diff --git a/scripts/Makefile.data b/scripts/Makefile.data new file mode 100644 index 0000000..7929e97 --- /dev/null +++ b/scripts/Makefile.data @@ -0,0 +1,15 @@ +all: pinyins.txt + + +pinyins.txt: + python3 genpinyins.py + + +update-header: + python3 genpinyinheader.py > ../src/storage/pinyin_parser_table.h + python3 gendoublepinyinheader.py > ../src/storage/double_pinyin_table.h + python3 genbopomofoheader.py > ../src/storage/chewing_table.h + python3 genchewingkey.py > ../src/storage/chewing_enum.h + + +.PHONY: pinyins.txt diff --git a/scripts/bopomofo.py b/scripts/bopomofo.py new file mode 100644 index 0000000..91a8744 --- /dev/null +++ b/scripts/bopomofo.py @@ -0,0 +1,530 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (c) 2010 BYVoid <byvoid1@gmail.com> +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + +BOPOMOFO_PINYIN_MAP = { + "ㄅ" : "b", + "ㄅㄚ" : "ba", + "ㄅㄛ" : "bo", + "ㄅㄞ" : "bai", + "ㄅㄟ" : "bei", + "ㄅㄠ" : "bao", + "ㄅㄢ" : "ban", + "ㄅㄣ" : "ben", + "ㄅㄤ" : "bang", + "ㄅㄥ" : "beng", + "ㄅㄧ" : "bi", + "ㄅㄧㄝ" : "bie", + "ㄅㄧㄠ" : "biao", + "ㄅㄧㄢ" : "bian", + "ㄅㄧㄣ" : "bin", + "ㄅㄧㄥ" : "bing", + "ㄅㄨ" : "bu", + "ㄆ" : "p", + "ㄆㄚ" : "pa", + "ㄆㄛ" : "po", + "ㄆㄞ" : "pai", + "ㄆㄟ" : "pei", + "ㄆㄠ" : "pao", + "ㄆㄡ" : "pou", + "ㄆㄢ" : "pan", + "ㄆㄣ" : "pen", + "ㄆㄤ" : "pang", + "ㄆㄥ" : "peng", + "ㄆㄧ" : "pi", + "ㄆㄧㄝ" : "pie", + "ㄆㄧㄠ" : "piao", + "ㄆㄧㄢ" : "pian", + "ㄆㄧㄣ" : "pin", + "ㄆㄧㄥ" : "ping", + "ㄆㄨ" : "pu", + "ㄇ" : "m", + "ㄇㄚ" : "ma", + "ㄇㄛ" : "mo", + "ㄇㄜ" : "me", + "ㄇㄞ" : "mai", + "ㄇㄟ" : "mei", + "ㄇㄠ" : "mao", + "ㄇㄡ" : "mou", + "ㄇㄢ" : "man", + "ㄇㄣ" : "men", + "ㄇㄤ" : "mang", + "ㄇㄥ" : "meng", + "ㄇㄧ" : "mi", + "ㄇㄧㄝ" : "mie", + "ㄇㄧㄠ" : "miao", + "ㄇㄧㄡ" : "miu", + "ㄇㄧㄢ" : "mian", + "ㄇㄧㄣ" : "min", + "ㄇㄧㄥ" : "ming", + "ㄇㄨ" : "mu", + "ㄈ" : "f", + "ㄈㄚ" : "fa", + "ㄈㄛ" : "fo", + "ㄈㄜ" : "fe", + "ㄈㄟ" : "fei", + "ㄈㄡ" : "fou", + "ㄈㄢ" : "fan", + "ㄈㄣ" : "fen", + "ㄈㄤ" : "fang", + "ㄈㄥ" : "feng", + "ㄈㄨ" : "fu", + "ㄉ" : "d", + "ㄉㄚ" : "da", + "ㄉㄜ" : "de", + "ㄉㄞ" : "dai", + "ㄉㄟ" : "dei", + "ㄉㄠ" : "dao", + "ㄉㄡ" : "dou", + "ㄉㄢ" : "dan", + "ㄉㄣ" : "den", + "ㄉㄤ" : "dang", + "ㄉㄥ" : "deng", + "ㄉㄧ" : "di", + "ㄉㄧㄚ" : "dia", + "ㄉㄧㄝ" : "die", + "ㄉㄧㄠ" : "diao", + "ㄉㄧㄡ" : "diu", + "ㄉㄧㄢ" : "dian", + "ㄉㄧㄣ" : "din", + "ㄉㄧㄥ" : "ding", + "ㄉㄨ" : "du", + "ㄉㄨㄛ" : "duo", + "ㄉㄨㄟ" : "dui", + "ㄉㄨㄢ" : "duan", + "ㄉㄨㄣ" : "dun", + "ㄉㄨㄥ" : "dong", + "ㄊ" : "t", + "ㄊㄚ" : "ta", + "ㄊㄜ" : "te", + "ㄊㄞ" : "tai", + "ㄊㄠ" : "tao", + "ㄊㄡ" : "tou", + "ㄊㄢ" : "tan", + "ㄊㄤ" : "tang", + "ㄊㄥ" : "teng", + "ㄊㄧ" : "ti", + "ㄊㄧㄝ" : "tie", + "ㄊㄧㄠ" : "tiao", + "ㄊㄧㄢ" : "tian", + "ㄊㄧㄥ" : "ting", + "ㄊㄨ" : "tu", + "ㄊㄨㄛ" : "tuo", + "ㄊㄨㄟ" : "tui", + "ㄊㄨㄢ" : "tuan", + "ㄊㄨㄣ" : "tun", + "ㄊㄨㄥ" : "tong", + "ㄋ" : "n", + "ㄋㄚ" : "na", + "ㄋㄜ" : "ne", + "ㄋㄞ" : "nai", + "ㄋㄟ" : "nei", + "ㄋㄠ" : "nao", + "ㄋㄡ" : "nou", + "ㄋㄢ" : "nan", + "ㄋㄣ" : "nen", + "ㄋㄤ" : "nang", + "ㄋㄥ" : 
"neng", + "ㄋㄧ" : "ni", + "ㄋㄧㄚ" : "nia", + "ㄋㄧㄝ" : "nie", + "ㄋㄧㄠ" : "niao", + "ㄋㄧㄡ" : "niu", + "ㄋㄧㄢ" : "nian", + "ㄋㄧㄣ" : "nin", + "ㄋㄧㄤ" : "niang", + "ㄋㄧㄥ" : "ning", + "ㄋㄨ" : "nu", + "ㄋㄨㄛ" : "nuo", + "ㄋㄨㄢ" : "nuan", + "ㄋㄨㄣ" : "nun", + "ㄋㄨㄥ" : "nong", + "ㄋㄩ" : "nv", + "ㄋㄩㄝ" : "nve", + "ㄌ" : "l", + "ㄌㄚ" : "la", + "ㄌㄛ" : "lo", + "ㄌㄜ" : "le", + "ㄌㄞ" : "lai", + "ㄌㄟ" : "lei", + "ㄌㄠ" : "lao", + "ㄌㄡ" : "lou", + "ㄌㄢ" : "lan", + "ㄌㄣ" : "len", + "ㄌㄤ" : "lang", + "ㄌㄥ" : "leng", + "ㄌㄧ" : "li", + "ㄌㄧㄚ" : "lia", + "ㄌㄧㄝ" : "lie", + "ㄌㄧㄠ" : "liao", + "ㄌㄧㄡ" : "liu", + "ㄌㄧㄢ" : "lian", + "ㄌㄧㄣ" : "lin", + "ㄌㄧㄤ" : "liang", + "ㄌㄧㄥ" : "ling", + "ㄌㄨ" : "lu", + "ㄌㄨㄛ" : "luo", + "ㄌㄨㄢ" : "luan", + "ㄌㄨㄣ" : "lun", + "ㄌㄨㄥ" : "long", + "ㄌㄩ" : "lv", + "ㄌㄩㄝ" : "lve", + "ㄍ" : "g", + "ㄍㄚ" : "ga", + "ㄍㄜ" : "ge", + "ㄍㄞ" : "gai", + "ㄍㄟ" : "gei", + "ㄍㄠ" : "gao", + "ㄍㄡ" : "gou", + "ㄍㄢ" : "gan", + "ㄍㄣ" : "gen", + "ㄍㄤ" : "gang", + "ㄍㄥ" : "geng", + "ㄍㄨ" : "gu", + "ㄍㄨㄚ" : "gua", + "ㄍㄨㄛ" : "guo", + "ㄍㄨㄞ" : "guai", + "ㄍㄨㄟ" : "gui", + "ㄍㄨㄢ" : "guan", + "ㄍㄨㄣ" : "gun", + "ㄍㄨㄤ" : "guang", + "ㄍㄨㄥ" : "gong", + "ㄎ" : "k", + "ㄎㄚ" : "ka", + "ㄎㄜ" : "ke", + "ㄎㄞ" : "kai", + "ㄎㄟ" : "kei", + "ㄎㄠ" : "kao", + "ㄎㄡ" : "kou", + "ㄎㄢ" : "kan", + "ㄎㄣ" : "ken", + "ㄎㄤ" : "kang", + "ㄎㄥ" : "keng", + "ㄎㄨ" : "ku", + "ㄎㄨㄚ" : "kua", + "ㄎㄨㄛ" : "kuo", + "ㄎㄨㄞ" : "kuai", + "ㄎㄨㄟ" : "kui", + "ㄎㄨㄢ" : "kuan", + "ㄎㄨㄣ" : "kun", + "ㄎㄨㄤ" : "kuang", + "ㄎㄨㄥ" : "kong", + "ㄏ" : "h", + "ㄏㄚ" : "ha", + "ㄏㄜ" : "he", + "ㄏㄞ" : "hai", + "ㄏㄟ" : "hei", + "ㄏㄠ" : "hao", + "ㄏㄡ" : "hou", + "ㄏㄢ" : "han", + "ㄏㄣ" : "hen", + "ㄏㄤ" : "hang", + "ㄏㄥ" : "heng", + "ㄏㄨ" : "hu", + "ㄏㄨㄚ" : "hua", + "ㄏㄨㄛ" : "huo", + "ㄏㄨㄞ" : "huai", + "ㄏㄨㄟ" : "hui", + "ㄏㄨㄢ" : "huan", + "ㄏㄨㄣ" : "hun", + "ㄏㄨㄤ" : "huang", + "ㄏㄨㄥ" : "hong", + "ㄐ" : "j", + "ㄐㄧ" : "ji", + "ㄐㄧㄚ" : "jia", + "ㄐㄧㄝ" : "jie", + "ㄐㄧㄠ" : "jiao", + "ㄐㄧㄡ" : "jiu", + "ㄐㄧㄢ" : "jian", + "ㄐㄧㄣ" : "jin", + "ㄐㄧㄤ" : "jiang", + "ㄐㄧㄥ" : "jing", + "ㄐㄩ" : "ju", + "ㄐㄩㄝ" : "jue", + "ㄐㄩㄢ" : "juan", + "ㄐㄩㄣ" : "jun", + "ㄐㄩㄥ" : "jiong", + "ㄑ" : "q", + 
"ㄑㄧ" : "qi", + "ㄑㄧㄚ" : "qia", + "ㄑㄧㄝ" : "qie", + "ㄑㄧㄠ" : "qiao", + "ㄑㄧㄡ" : "qiu", + "ㄑㄧㄢ" : "qian", + "ㄑㄧㄣ" : "qin", + "ㄑㄧㄤ" : "qiang", + "ㄑㄧㄥ" : "qing", + "ㄑㄩ" : "qu", + "ㄑㄩㄝ" : "que", + "ㄑㄩㄢ" : "quan", + "ㄑㄩㄣ" : "qun", + "ㄑㄩㄥ" : "qiong", + "ㄒ" : "x", + "ㄒㄧ" : "xi", + "ㄒㄧㄚ" : "xia", + "ㄒㄧㄝ" : "xie", + "ㄒㄧㄠ" : "xiao", + "ㄒㄧㄡ" : "xiu", + "ㄒㄧㄢ" : "xian", + "ㄒㄧㄣ" : "xin", + "ㄒㄧㄤ" : "xiang", + "ㄒㄧㄥ" : "xing", + "ㄒㄩ" : "xu", + "ㄒㄩㄝ" : "xue", + "ㄒㄩㄢ" : "xuan", + "ㄒㄩㄣ" : "xun", + "ㄒㄩㄥ" : "xiong", + "ㄓ" : "zhi", + "ㄓㄚ" : "zha", + "ㄓㄜ" : "zhe", + "ㄓㄞ" : "zhai", + "ㄓㄟ" : "zhei", + "ㄓㄠ" : "zhao", + "ㄓㄡ" : "zhou", + "ㄓㄢ" : "zhan", + "ㄓㄣ" : "zhen", + "ㄓㄤ" : "zhang", + "ㄓㄥ" : "zheng", + "ㄓㄨ" : "zhu", + "ㄓㄨㄚ" : "zhua", + "ㄓㄨㄛ" : "zhuo", + "ㄓㄨㄞ" : "zhuai", + "ㄓㄨㄟ" : "zhui", + "ㄓㄨㄢ" : "zhuan", + "ㄓㄨㄣ" : "zhun", + "ㄓㄨㄤ" : "zhuang", + "ㄓㄨㄥ" : "zhong", + "ㄔ" : "chi", + "ㄔㄚ" : "cha", + "ㄔㄜ" : "che", + "ㄔㄞ" : "chai", + "ㄔㄠ" : "chao", + "ㄔㄡ" : "chou", + "ㄔㄢ" : "chan", + "ㄔㄣ" : "chen", + "ㄔㄤ" : "chang", + "ㄔㄥ" : "cheng", + "ㄔㄨ" : "chu", + "ㄔㄨㄚ" : "chua", + "ㄔㄨㄛ" : "chuo", + "ㄔㄨㄞ" : "chuai", + "ㄔㄨㄟ" : "chui", + "ㄔㄨㄢ" : "chuan", + "ㄔㄨㄣ" : "chun", + "ㄔㄨㄤ" : "chuang", + "ㄔㄨㄥ" : "chong", + "ㄕ" : "shi", + "ㄕㄚ" : "sha", + "ㄕㄜ" : "she", + "ㄕㄞ" : "shai", + "ㄕㄟ" : "shei", + "ㄕㄠ" : "shao", + "ㄕㄡ" : "shou", + "ㄕㄢ" : "shan", + "ㄕㄣ" : "shen", + "ㄕㄤ" : "shang", + "ㄕㄥ" : "sheng", + "ㄕㄨ" : "shu", + "ㄕㄨㄚ" : "shua", + "ㄕㄨㄛ" : "shuo", + "ㄕㄨㄞ" : "shuai", + "ㄕㄨㄟ" : "shui", + "ㄕㄨㄢ" : "shuan", + "ㄕㄨㄣ" : "shun", + "ㄕㄨㄤ" : "shuang", + "ㄖ" : "ri", + "ㄖㄜ" : "re", + "ㄖㄠ" : "rao", + "ㄖㄡ" : "rou", + "ㄖㄢ" : "ran", + "ㄖㄣ" : "ren", + "ㄖㄤ" : "rang", + "ㄖㄥ" : "reng", + "ㄖㄨ" : "ru", + "ㄖㄨㄚ" : "rua", + "ㄖㄨㄛ" : "ruo", + "ㄖㄨㄟ" : "rui", + "ㄖㄨㄢ" : "ruan", + "ㄖㄨㄣ" : "run", + "ㄖㄨㄥ" : "rong", + "ㄗ" : "zi", + "ㄗㄚ" : "za", + "ㄗㄜ" : "ze", + "ㄗㄞ" : "zai", + "ㄗㄟ" : "zei", + "ㄗㄠ" : "zao", + "ㄗㄡ" : "zou", + "ㄗㄢ" : "zan", + "ㄗㄣ" : "zen", + "ㄗㄤ" : "zang", + "ㄗㄥ" : "zeng", + "ㄗㄨ" : "zu", + "ㄗㄨㄛ" : "zuo", + "ㄗㄨㄟ" : "zui", + "ㄗㄨㄢ" : "zuan", + 
"ㄗㄨㄣ" : "zun", + "ㄗㄨㄥ" : "zong", + "ㄘ" : "ci", + "ㄘㄚ" : "ca", + "ㄘㄜ" : "ce", + "ㄘㄞ" : "cai", + "ㄘㄠ" : "cao", + "ㄘㄡ" : "cou", + "ㄘㄢ" : "can", + "ㄘㄣ" : "cen", + "ㄘㄤ" : "cang", + "ㄘㄥ" : "ceng", + "ㄘㄨ" : "cu", + "ㄘㄨㄛ" : "cuo", + "ㄘㄨㄟ" : "cui", + "ㄘㄨㄢ" : "cuan", + "ㄘㄨㄣ" : "cun", + "ㄘㄨㄥ" : "cong", + "ㄙ" : "si", + "ㄙㄚ" : "sa", + "ㄙㄜ" : "se", + "ㄙㄞ" : "sai", + "ㄙㄠ" : "sao", + "ㄙㄡ" : "sou", + "ㄙㄢ" : "san", + "ㄙㄣ" : "sen", + "ㄙㄤ" : "sang", + "ㄙㄥ" : "seng", + "ㄙㄨ" : "su", + "ㄙㄨㄛ" : "suo", + "ㄙㄨㄟ" : "sui", + "ㄙㄨㄢ" : "suan", + "ㄙㄨㄣ" : "sun", + "ㄙㄨㄥ" : "song", + "ㄚ" : "a", + "ㄛ" : "o", + "ㄜ" : "e", + "ㄞ" : "ai", + "ㄟ" : "ei", + "ㄠ" : "ao", + "ㄡ" : "ou", + "ㄢ" : "an", + "ㄣ" : "en", + "ㄤ" : "ang", + "ㄥ" : "eng", + "ㄦ" : "er", + "ㄧ" : "yi", + "ㄧㄚ" : "ya", + "ㄧㄛ" : "yo", + "ㄧㄝ" : "ye", + "ㄧㄞ" : "yai", + "ㄧㄠ" : "yao", + "ㄧㄡ" : "you", + "ㄧㄢ" : "yan", + "ㄧㄣ" : "yin", + "ㄧㄤ" : "yang", + "ㄧㄥ" : "ying", + "ㄨ" : "wu", + "ㄨㄚ" : "wa", + "ㄨㄛ" : "wo", + "ㄨㄞ" : "wai", + "ㄨㄟ" : "wei", + "ㄨㄢ" : "wan", + "ㄨㄣ" : "wen", + "ㄨㄤ" : "wang", + "ㄨㄥ" : "weng", + "ㄩ" : "yu", + "ㄩㄝ" : "yue", + "ㄩㄢ" : "yuan", + "ㄩㄣ" : "yun", + "ㄩㄥ" : "yong", + "ㄫ" : "ng", +} + +PINYIN_BOPOMOFO_MAP = dict([(v, k) for k, v in BOPOMOFO_PINYIN_MAP.items()]) + +SPECIAL_INITIAL_SET = {'ci', 'chi', 'si', 'shi', 'zi', 'zhi', 'ri'} + +''' +SHENG_YUN_BOPOMOFO_MAP = { + "b" : "ㄅ", + "p" : "ㄆ", + "m" : "ㄇ", + "f" : "ㄈ", + "d" : "ㄉ", + "t" : "ㄊ", + "n" : "ㄋ", + "l" : "ㄌ", + "g" : "ㄍ", + "k" : "ㄎ", + "h" : "ㄏ", + "j" : "ㄐ", + "q" : "ㄑ", + "x" : "ㄒ", + "zh" : "ㄓ", + "ch" : "ㄔ", + "sh" : "ㄕ", + "r" : "ㄖ", + "z" : "ㄗ", + "c" : "ㄘ", + "s" : "ㄙ", + + # 韻母為u,ue,un,uan,ong時ㄧ省略 + "y" : ("ㄧ", (("u", "ue", "un", "uan", "ong"), "")), + "w" : "ㄨ", + "a" : "ㄚ", + "o" : "ㄛ", + "e" : ("ㄜ", ("y", "ㄝ")), # y後面為ㄝ + + # zh ch sh r z c s y後面為空 + "i" : ("ㄧ", (("zh", "ch", "sh", "r", "z", "c", "s", "y"), "")), + + # jqxy後面為ㄩ w後面為空 + "u" : ("ㄨ", ("jqxy", "ㄩ")), + "v" : "ㄩ", + "ai" : "ㄞ", + "ei" : "ㄟ", + "ao" : "ㄠ", + "ou" : "ㄡ", + "an" : "ㄢ", + "en" : "ㄣ", + 
"ang" : "ㄤ", + "eng" : "ㄥ", + "er" : "ㄦ", + "ia" : "ㄧㄚ", + "ie" : "ㄧㄝ", + "iai" : "ㄧㄞ", + "iao" : "ㄧㄠ", + "iu" : "ㄧㄡ", + "ian" : "ㄧㄢ", + "in" : ("ㄧㄣ", ("y", "ㄣ")), #y後面為ㄣ + "iang" : "ㄧㄤ", + "ing" : ("ㄧㄥ", ("y", "ㄥ")), #y後面為ㄥ + "ua" : "ㄨㄚ", + "uo" : "ㄨㄛ", + "ue" : "ㄩㄝ", + # TODO: "ve" is OK? + "ve" : "ㄩㄝ", + "uai" : "ㄨㄞ", + "ui" : "ㄨㄟ", + "uan" : ("ㄨㄢ", ("jqxy", "ㄩㄢ")), # jqxy後面是ㄩㄢ + "un" : ("ㄨㄣ", ("jqxy", "ㄩㄣ")), # jqxy後面是ㄩㄣ + "uang" : ("ㄨㄤ", ("jqxy", "ㄩㄤ")), # jqxy後面是ㄩㄤ + "ong" : ("ㄨㄥ", ("jqxy", "ㄩㄥ")), # y後面為ㄩㄥ + "iong" : "ㄩㄥ", +} +''' diff --git a/scripts/chewing.py b/scripts/chewing.py new file mode 100644 index 0000000..b49c84f --- /dev/null +++ b/scripts/chewing.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + +ASCII_CHEWING_INITIAL_MAP = { + "CHEWING_B" : "ㄅ", + "CHEWING_C" : "ㄘ", + "CHEWING_CH" : "ㄔ", + "CHEWING_D" : "ㄉ", + "CHEWING_F" : "ㄈ", + "CHEWING_H" : "ㄏ", + "CHEWING_G" : "ㄍ", + "CHEWING_K" : "ㄎ", + "CHEWING_J" : "ㄐ", + "CHEWING_M" : "ㄇ", + "CHEWING_N" : "ㄋ", + "CHEWING_L" : "ㄌ", + "CHEWING_R" : "ㄖ", + "CHEWING_P" : "ㄆ", + "CHEWING_Q" : "ㄑ", + "CHEWING_S" : "ㄙ", + "CHEWING_SH" : "ㄕ", + "CHEWING_T" : "ㄊ", + "CHEWING_X" : "ㄒ", + "CHEWING_Z" : "ㄗ", + "CHEWING_ZH" : "ㄓ", +} + +CHEWING_ASCII_INITIAL_MAP = dict([(v, k) for k, v in ASCII_CHEWING_INITIAL_MAP.items()]) + +ASCII_CHEWING_MIDDLE_MAP = { + "CHEWING_I" : "ㄧ", + "CHEWING_U" : "ㄨ", + "CHEWING_V" : "ㄩ", +} + +CHEWING_ASCII_MIDDLE_MAP = dict([(v, k) for k, v in ASCII_CHEWING_MIDDLE_MAP.items()]) + +ASCII_CHEWING_FINAL_MAP = { + "CHEWING_A" : "ㄚ", + "CHEWING_AI" : "ㄞ", + "CHEWING_AN" : "ㄢ", + "CHEWING_ANG" : "ㄤ", + "CHEWING_AO" : "ㄠ", + "CHEWING_E" : "ㄝ", # merge "ㄝ" and "ㄜ" + "CHEWING_EI" : "ㄟ", + "CHEWING_EN" : "ㄣ", + "CHEWING_ENG" : "ㄥ", + "CHEWING_ER" : "ㄦ", + "CHEWING_NG" : "ㄫ", + "CHEWING_O" : "ㄛ", + "CHEWING_OU" : "ㄡ", +} + +CHEWING_ASCII_FINAL_MAP = dict([(v, k) for k, v in ASCII_CHEWING_FINAL_MAP.items()]) diff --git a/scripts/chewing_enum.h.in b/scripts/chewing_enum.h.in new file mode 100644 index 0000000..46072df --- /dev/null +++ b/scripts/chewing_enum.h.in @@ -0,0 +1,45 @@ +#ifndef CHEWING_ENUM_H +#define CHEWING_ENUM_H + +namespace pinyin{ + +/** + * @brief enums of chewing initial element. + */ + +enum ChewingInitial +{ +@CHEWING_INITIAL@ +}; + + +/** + * @brief enums of chewing middle element. + */ + +enum ChewingMiddle +{ +@CHEWING_MIDDLE@ +}; + + +/** + * @brief enums of chewing final element. + */ +enum ChewingFinal +{ +@CHEWING_FINAL@ +}; + + +/** + * @brief enums of chewing tone element. 
+ */ +enum ChewingTone +{ +@CHEWING_TONE@ +}; + +}; + +#endif diff --git a/scripts/chewing_table.h.in b/scripts/chewing_table.h.in new file mode 100644 index 0000000..8780b17 --- /dev/null +++ b/scripts/chewing_table.h.in @@ -0,0 +1,50 @@ +#ifndef CHEWING_TABLE_H +#define CHEWING_TABLE_H + +namespace pinyin{ + +const chewing_symbol_item_t chewing_standard_symbols[] = { +@STANDARD_SYMBOLS@ +}; + +const chewing_tone_item_t chewing_standard_tones[] = { +@STANDARD_TONES@ +}; + + +const chewing_symbol_item_t chewing_ginyieh_symbols[] = { +@GINYIEH_SYMBOLS@ +}; + +const chewing_tone_item_t chewing_ginyieh_tones[] = { +@GINYIEH_TONES@ +}; + +const chewing_symbol_item_t chewing_eten_symbols[] = { +@ETEN_SYMBOLS@ +}; + +const chewing_tone_item_t chewing_eten_tones[] = { +@ETEN_TONES@ +}; + +const chewing_symbol_item_t chewing_ibm_symbols[] = { +@IBM_SYMBOLS@ +}; + +const chewing_tone_item_t chewing_ibm_tones[] = { +@IBM_TONES@ +}; + +const char * chewing_tone_table[CHEWING_NUMBER_OF_TONES] = { +"", +"ˉ", +"ˊ", +"ˇ", +"ˋ", +"˙" +}; + +}; + +#endif diff --git a/scripts/chewingkey.py b/scripts/chewingkey.py new file mode 100644 index 0000000..5f5770f --- /dev/null +++ b/scripts/chewingkey.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +CHEWING_INITIAL_LIST = [ + 'CHEWING_ZERO_INITIAL', #Zero Initial + 'CHEWING_B', #"ㄅ" + 'CHEWING_C', #"ㄘ" + 'CHEWING_CH', #"ㄔ" + 'CHEWING_D', #"ㄉ" + 'CHEWING_F', #"ㄈ" + 'CHEWING_H', #"ㄏ" + 'CHEWING_G', #"ㄍ" + 'CHEWING_K', #"ㄎ" + 'CHEWING_J', #"ㄐ" + 'CHEWING_M', #"ㄇ" + 'CHEWING_N', #"ㄋ" + 'CHEWING_L', #"ㄌ" + 'CHEWING_R', #"ㄖ" + 'CHEWING_P', #"ㄆ" + 'CHEWING_Q', #"ㄑ" + 'CHEWING_S', #"ㄙ" + 'CHEWING_SH', #"ㄕ" + 'CHEWING_T', #"ㄊ" + 'PINYIN_W', #Invalid Chewing + 'CHEWING_X', #"ㄒ" + 'PINYIN_Y', #Invalid Chewing + 'CHEWING_Z', #"ㄗ" + 'CHEWING_ZH' #"ㄓ" +] + + +CHEWING_MIDDLE_LIST = [ + 'CHEWING_ZERO_MIDDLE', #Zero Middle + 'CHEWING_I', #"ㄧ" + 'CHEWING_U', #"ㄨ" + 'CHEWING_V' #"ㄩ" +] + + +CHEWING_FINAL_LIST = [ + 'CHEWING_ZERO_FINAL', #Zero Final + 'CHEWING_A', #"ㄚ" + 'CHEWING_AI', #"ㄞ" + 'CHEWING_AN', #"ㄢ" + 'CHEWING_ANG', #"ㄤ" + 'CHEWING_AO', #"ㄠ" + 'CHEWING_E', #"ㄝ" and "ㄜ" + 'INVALID_EA', #Invalid Pinyin/Chewing + 'CHEWING_EI', #"ㄟ" + 'CHEWING_EN', #"ㄣ" + 'CHEWING_ENG', #"ㄥ" + 'CHEWING_ER', #"ㄦ" + 'CHEWING_NG', #"ㄫ" + 'CHEWING_O', #"ㄛ" + 'PINYIN_ONG', #"ueng" + 'CHEWING_OU', #"ㄡ" + 'PINYIN_IN', #"ien" + 'PINYIN_ING' #"ieng" +] + + +CHEWING_TONE_LIST = [ + 'CHEWING_ZERO_TONE', #Zero Tone + 'CHEWING_1', #" " + 'CHEWING_2', #'ˊ' + 'CHEWING_3', #'ˇ' + 'CHEWING_4', #'ˋ' + 'CHEWING_5' #'˙' +] + + +def gen_entries(items, last_enum, num_enum): + entries = [] + for enum, item in enumerate(items, start=0): + entry = '{0} = {1}'.format(item, enum) + entries.append(entry) + + #last enum + entry = last_enum + ' = ' + items[-1] + entries.append(entry) + + #num enum + entry = num_enum + entries.append(entry) + + return ",\n".join(entries) + + +def gen_initials(): + return gen_entries(CHEWING_INITIAL_LIST, 'CHEWING_LAST_INITIAL', + 'CHEWING_NUMBER_OF_INITIALS = 
CHEWING_LAST_INITIAL + 1') + + +def gen_middles(): + return gen_entries(CHEWING_MIDDLE_LIST, 'CHEWING_LAST_MIDDLE', + 'CHEWING_NUMBER_OF_MIDDLES = CHEWING_LAST_MIDDLE + 1') + + +def gen_finals(): + return gen_entries(CHEWING_FINAL_LIST, 'CHEWING_LAST_FINAL', + 'CHEWING_NUMBER_OF_FINALS = CHEWING_LAST_FINAL + 1') + + +def gen_tones(): + return gen_entries(CHEWING_TONE_LIST, 'CHEWING_LAST_TONE', + 'CHEWING_NUMBER_OF_TONES = CHEWING_LAST_TONE + 1') + + +def gen_table_index(content_table): + entries = [] + for i in range(0, len(CHEWING_INITIAL_LIST)): + initial = CHEWING_INITIAL_LIST[i] + for m in range(0, len(CHEWING_MIDDLE_LIST)): + middle = CHEWING_MIDDLE_LIST[m] + for f in range(0, len(CHEWING_FINAL_LIST)): + final = CHEWING_FINAL_LIST[f] + chewingkey = 'ChewingKey({0}, {1}, {2})'.format(initial, middle, final) + index = -1 + try: + index = [x[2] for x in content_table].index(chewingkey) + except ValueError: + pass + + entry = '{0:<7} /* {1} */'.format(index, chewingkey) + entries.append(entry) + return ",\n".join(entries) + + +### main function ### +if __name__ == "__main__": + print(gen_initials() + gen_middles() + gen_finals() + gen_tones()) diff --git a/scripts/correct.py b/scripts/correct.py new file mode 100644 index 0000000..ffd5998 --- /dev/null +++ b/scripts/correct.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (c) 2007-2008 Peng Huang <shawn.p.huang@gmail.com> +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +auto_correct = [ + # "correct", "wrong" + ("ng", "gn"), + ("ng", "mg"), + ("iu", "iou"), + ("ui", "uei"), + ("un", "uen"), +# ("ue", "ve"), + ("ve", "ue"), + ("ong", "on"), +] + +auto_correct_ext = [ + # "correct", "wrong", flag + ("ju", "jv", "PINYIN_CORRECT_V_U"), + ("qu", "qv", "PINYIN_CORRECT_V_U"), + ("xu", "xv", "PINYIN_CORRECT_V_U"), + ("yu", "yv", "PINYIN_CORRECT_V_U"), + + ("jue", "jve", "PINYIN_CORRECT_V_U"), + ("que", "qve", "PINYIN_CORRECT_V_U"), + ("xue", "xve", "PINYIN_CORRECT_V_U"), + ("yue", "yve", "PINYIN_CORRECT_V_U"), + + ("juan", "jvan", "PINYIN_CORRECT_V_U"), + ("quan", "qvan", "PINYIN_CORRECT_V_U"), + ("xuan", "xvan", "PINYIN_CORRECT_V_U"), + ("yuan", "yvan", "PINYIN_CORRECT_V_U"), + + ("jun", "jvn", "PINYIN_CORRECT_V_U"), + ("qun", "qvn", "PINYIN_CORRECT_V_U"), + ("xun", "xvn", "PINYIN_CORRECT_V_U"), + ("yun", "yvn", "PINYIN_CORRECT_V_U"), + +# ("juang", "jvang", "PINYIN_CORRECT_V_U"), +# ("quang", "qvang", "PINYIN_CORRECT_V_U"), +# ("xuang", "xvang", "PINYIN_CORRECT_V_U"), +# ("yuang", "yvang", "PINYIN_CORRECT_V_U"), + +# ("jun", "jven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("qun", "qven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("xun", "xven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("yun", "yven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +] + + +''' +fuzzy_shengmu = [ + ("c", "ch"), + ("ch", "c"), + ("z", "zh"), + ("zh", "z"), + ("s", "sh"), + ("sh", "s"), + ("l", "n"), + ("n", "l"), + ("f", "h"), + ("h", "f"), + ("l", "r"), + ("r", "l"), + ("k", "g"), + ("g", "k"), +] + +fuzzy_yunmu = [ + ("an", "ang"), + ("ang", "an"), + ("en", "eng"), + ("eng", "en"), + ("in", "ing"), + ("ing", "in"), +] +''' diff --git 
a/scripts/double_pinyin_table.h.in b/scripts/double_pinyin_table.h.in new file mode 100644 index 0000000..15a8ee9 --- /dev/null +++ b/scripts/double_pinyin_table.h.in @@ -0,0 +1,56 @@ +#ifndef DOUBLE_PINYIN_TABLE_H +#define DOUBLE_PINYIN_TABLE_H + +namespace pinyin{ + +const double_pinyin_scheme_shengmu_item_t double_pinyin_mspy_sheng[] = { +@MSPY_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_mspy_yun[] = { +@MSPY_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_zrm_sheng[] = { +@ZRM_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_zrm_yun[] = { +@ZRM_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_abc_sheng[] = { +@ABC_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_abc_yun[] = { +@ABC_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_zgpy_sheng[] = { +@ZGPY_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_zgpy_yun[] = { +@ZGPY_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_pyjj_sheng[] = { +@PYJJ_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_pyjj_yun[] = { +@PYJJ_YUN@ +}; + +const double_pinyin_scheme_shengmu_item_t double_pinyin_xhe_sheng[] = { +@XHE_SHENG@ +}; + +const double_pinyin_scheme_yunmu_item_t double_pinyin_xhe_yun[] = { +@XHE_YUN@ +}; + +}; + +#endif diff --git a/scripts/genbopomofoheader.py b/scripts/genbopomofoheader.py new file mode 100644 index 0000000..cb0fa86 --- /dev/null +++ b/scripts/genbopomofoheader.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (c) 2010 BYVoid <byvoid1@gmail.com> +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +from operator import itemgetter +from utils import expand_file + +bopomofo = [ + 'ㄅ', 'ㄆ', 'ㄇ', 'ㄈ', 'ㄉ', 'ㄊ', 'ㄋ', 'ㄌ', 'ㄍ', 'ㄎ', + 'ㄏ', 'ㄐ', 'ㄑ', 'ㄒ', 'ㄓ', 'ㄔ', 'ㄕ', 'ㄖ', 'ㄗ', 'ㄘ', 'ㄙ', + + 'ㄧ', 'ㄨ', 'ㄩ', 'ㄚ', 'ㄛ', 'ㄜ', 'ㄝ', 'ㄞ', 'ㄟ', 'ㄠ', 'ㄡ', + 'ㄢ', 'ㄣ', 'ㄤ', 'ㄥ', 'ㄦ', + + 'ˉ', 'ˊ', 'ˇ', 'ˋ', '˙', +] + +#陰平聲不標號, use space key +num_tones = -5 + +bopomofo_keyboards = { + #標準注音鍵盤 + 'STANDARD': + ( + "1","q","a","z","2","w","s","x","e","d","c","r","f","v","5","t","g","b","y","h","n", + "u","j","m","8","i","k",",","9","o","l",".","0","p",";","/","-", + " ","6","3","4","7", + ), + #精業注音鍵盤 + 'GINYIEH': + ( + "2","w","s","x","3","e","d","c","r","f","v","t","g","b","6","y","h","n","u","j","m", + "-","[","'","8","i","k",",","9","o","l",".","0","p",";","/","=", + " ","q","a","z","1", + ), + #倚天注音鍵盤 + 'ETEN': + ( + "b","p","m","f","d","t","n","l","v","k","h","g","7","c",",",".","/","j",";","'","s", + "e","x","u","a","o","r","w","i","q","z","y","8","9","0","-","=", + " ","2","3","4","1", + ), + #IBM注音鍵盤 + 'IBM': + ( + "1","2","3","4","5","6","7","8","9","0","-","q","w","e","r","t","y","u","i","o","p", + "a","s","d","f","g","h","j","k","l",";","z","x","c","v","b","n", + " ","m",",",".","/", + ), +} + + +def escape_char(ch): + if ch == "'" or ch == "\\": + ch = "\\" + ch; + return "'{0}'".format(ch) + + +#generate shengmu and yunmu here +def gen_chewing_symbols(scheme): + keyboard = bopomofo_keyboards[scheme] + keyboard = keyboard[: num_tones] + items = [] + for (i, key) in enumerate(keyboard): + items.append((key, 
bopomofo[i])) + items = sorted(items, key=itemgetter(0)) + entries = [] + for (key, string) in items: + key = escape_char(key) + string = '"{0}"'.format(string) + entry = "{{{0: <5}, {1}}}".format(key, string) + entries.append(entry) + entries.append("{'\\0', NULL}") + return ",\n".join(entries) + + +#generate tones here +def gen_chewing_tones(scheme): + keyboard = bopomofo_keyboards[scheme] + keyboard = keyboard[num_tones:] + items = [] + for (i, key) in enumerate(keyboard, start=1): + items.append((key, i)); + items = sorted(items, key=itemgetter(0)) + entries = [] + for (key, tone) in items: + key = escape_char(key); + entry = "{{{0: <5}, {1}}}".format(key, tone) + entries.append(entry) + entries.append("{'\\0', 0}") + return ",\n".join(entries) + + +def get_table_content(tablename): + (scheme, part) = tablename.split('_', 1) + if part == "SYMBOLS": + return gen_chewing_symbols(scheme); + if part == "TONES": + return gen_chewing_tones(scheme); + + +### main function ### +if __name__ == "__main__": + expand_file("chewing_table.h.in", get_table_content) diff --git a/scripts/genchewingkey.py b/scripts/genchewingkey.py new file mode 100644 index 0000000..4a0bdcd --- /dev/null +++ b/scripts/genchewingkey.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +from utils import expand_file +from chewingkey import gen_initials, gen_middles, gen_finals, gen_tones + + +def get_table_content(tablename): + if tablename == 'CHEWING_INITIAL': + return gen_initials() + if tablename == 'CHEWING_MIDDLE': + return gen_middles() + if tablename == 'CHEWING_FINAL': + return gen_finals() + if tablename == 'CHEWING_TONE': + return gen_tones() + + +### main function ### +if __name__ == "__main__": + expand_file("chewing_enum.h.in", get_table_content) + diff --git a/scripts/gendoublepinyinheader.py b/scripts/gendoublepinyinheader.py new file mode 100644 index 0000000..08dd817 --- /dev/null +++ b/scripts/gendoublepinyinheader.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ + +import pinyin +from utils import expand_file + +def gen_shengmu_table(scheme): + entries = [] + #select shengmu mapping + sheng = pinyin.SHUANGPIN_SCHEMAS[scheme][0] + for c in "abcdefghijklmnopqrstuvwxyz;": + sh = sheng.get(c, "NULL") + if sh != "NULL": + sh = '"{0}"'.format(sh) + entry = '{{{0: <5}}} /* {1} */'.format(sh, c.upper()) + entries.append(entry) + return ',\n'.join(entries) + + +def gen_yunmu_table(scheme): + entries = [] + #select yunmu mapping + yun = pinyin.SHUANGPIN_SCHEMAS[scheme][1] + for c in "abcdefghijklmnopqrstuvwxyz;": + y = yun.get(c, ("NULL", "NULL")) + if len(y) == 1: + y1 = y[0] + y2 = "NULL" + else: + y1, y2 = y + if y1 != "NULL": + y1 = '"{0}"'.format(y1) + if y2 != "NULL": + y2 = '"{0}"'.format(y2) + entry = '{{{{{0: <7}, {1: <7}}}}} /* {2} */'.format(y1, y2, c.upper()) + entries.append(entry) + return ',\n'.join(entries) + + +def get_table_content(tablename): + (scheme, part) = tablename.split('_', 1) + if part == "SHENG": + return gen_shengmu_table(scheme) + if part == "YUN": + return gen_yunmu_table(scheme) + + +### main function ### +if __name__ == "__main__": + expand_file("double_pinyin_table.h.in", get_table_content) diff --git a/scripts/genpinyinheader.py b/scripts/genpinyinheader.py new file mode 100644 index 0000000..81e0538 --- /dev/null +++ b/scripts/genpinyinheader.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +from utils import expand_file +from genpinyintable import gen_content_table, \ + gen_pinyin_index, gen_bopomofo_index, \ + gen_chewing_key_table +from genspecialtable import gen_divided_table, gen_resplit_table + +def get_table_content(tablename): + if tablename == 'CONTENT_TABLE': + return gen_content_table() + if tablename == 'PINYIN_INDEX': + return gen_pinyin_index() + if tablename == 'BOPOMOFO_INDEX': + return gen_bopomofo_index() + if tablename == 'DIVIDED_TABLE': + return gen_divided_table() + if tablename == 'RESPLIT_TABLE': + return gen_resplit_table() + if tablename == 'TABLE_INDEX': + return gen_chewing_key_table() + + +### main function ### +if __name__ == "__main__": + expand_file("pinyin_parser_table.h.in", get_table_content) diff --git a/scripts/genpinyins.py b/scripts/genpinyins.py new file mode 100644 index 0000000..fef40cd --- /dev/null +++ b/scripts/genpinyins.py @@ -0,0 +1,57 @@ +#!/usr/bin/python3 +import os +from operator import itemgetter + +pinyin_dict = {} + + +def strip_tone(old_pinyin_str): + oldpinyins = old_pinyin_str.split("'") + newpinyins = [] + + for pinyin in oldpinyins: + if pinyin[-1].isdigit(): + pinyin = pinyin[:-1] + newpinyins.append(pinyin) + + new_pinyin_str = "'".join(newpinyins) + return new_pinyin_str + + +def add_pinyin_dict(pinyin, freq): + if 0 == freq: + return + if not pinyin in pinyin_dict: + pinyin_dict[pinyin] = freq + else: + pinyin_dict[pinyin] += freq + + +def load_phrase(filename): + phrasefile = open(filename, "r") + for line in phrasefile.readlines(): + line = line.rstrip(os.linesep) + (pinyin, word, token, freq) = line.split(None, 3) + pinyin = strip_tone(pinyin) + freq = int(freq) + + if len(word) in [1, 2]: + add_pinyin_dict(pinyin, freq) + + phrasefile.close() + 
load_phrase("../data/gb_char.table")
load_phrase("../data/gbk_char.table")


def save_pinyin(filename):
    """Write ``pinyin_dict`` to *filename*, one ``pinyin<TAB>freq`` per line.

    Lines are terminated with "\n" and the file is opened in text mode, so
    the platform line ending is applied exactly once.  Writing os.linesep
    here produced "\r\r\n" on Windows.
    """
    with open(filename, "w") as pinyinfile:
        pinyinfile.writelines(
            "{0}\t{1}\n".format(pinyin, freq)
            for pinyin, freq in pinyin_dict.items())


if __name__ == "__main__":
    save_pinyin("pinyins.txt")

# diff --git a/scripts/genpinyintable.py b/scripts/genpinyintable.py new file mode 100644 index 0000000..cc60034 --- /dev/null +++ b/scripts/genpinyintable.py @@ -0,0 +1,115 @@
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import operator
import bopomofo
from pinyintable import *
from chewingkey import gen_table_index


content_table = []
pinyin_index = []
bopomofo_index = []

#pinyin table
def filter_pinyin_list():
    """Distribute the tuples from ``gen_pinyin_list()`` over the three tables.

    Every tuple contributes a ``(correct, bopomofo, chewing)`` row to
    ``content_table``.  Tuples flagged ``IS_PINYIN`` also go to
    ``pinyin_index`` and tuples flagged ``IS_CHEWING`` to ``bopomofo_index``.
    Flags are stored joined with ``|``.
    """
    for (correct, wrong, bopomofo_str, flags, chewing_key) in gen_pinyin_list():
        flag_expr = '|'.join(flags)
        key_expr = "ChewingKey({0})".format(', '.join(chewing_key))
        #correct = correct.replace("v", "ü")
        content_table.append((correct, bopomofo_str, key_expr))
        # test the flag list itself rather than a substring of the joined text
        if "IS_PINYIN" in flags:
            pinyin_index.append((wrong, flag_expr, correct))
        if "IS_CHEWING" in flags:
            bopomofo_index.append((bopomofo_str, flag_expr))


def sort_all():
    """De-duplicate and sort the three module-level tables.

    Rows are sorted on the whole tuple.  Item 0 is still the primary key,
    but rows sharing it now have a fixed order; sorting on item 0 alone left
    them in set-iteration order, which varies with string hash
    randomisation.  A reserved empty row is then put at index 0 of
    ``content_table``.
    """
    global content_table, pinyin_index, bopomofo_index
    #remove duplicates, then sort
    content_table = sorted(set(content_table))
    #prepend zero item to reserve the invalid item
    content_table.insert(0, ("", "", "ChewingKey()"))
    #sort index
    pinyin_index = sorted(set(pinyin_index))
    bopomofo_index = sorted(set(bopomofo_index))


def get_sheng_yun(pinyin):
    """Split *pinyin* into a ``(shengmu, yunmu)`` pair.

    ``None`` gives ``(None, None)``, the empty string gives ``("", "")`` and
    ``"ng"`` gives ``("", "ng")``.  Otherwise a two-character prefix found in
    ``shengmu_list`` is preferred over a one-character prefix; with no such
    prefix the whole string is returned as the yunmu.
    """
    if pinyin is None:
        return None, None
    if pinyin == "":
        return "", ""
    if pinyin == "ng":
        return "", "ng"
    for i in range(2, 0, -1):
        s = pinyin[:i]
        if s in shengmu_list:
            return s, pinyin[i:]
    return "", pinyin


def _content_positions():
    """Map each pinyin in ``content_table`` to the index of its first row."""
    positions = {}
    for position, row in enumerate(content_table):
        positions.setdefault(row[0], position)
    return positions


def gen_content_table():
    """Render ``content_table`` as comma-separated C initializer rows."""
    entries = []
    for (correct, bopomofo_str, chewing_key) in content_table:
        (shengmu, yunmu) = get_sheng_yun(correct)
        entry = '{{"{0}", "{1}", "{2}", "{3}", {4}}}'.format(
            correct, shengmu, yunmu, bopomofo_str, chewing_key)
        entries.append(entry)
    return ',\n'.join(entries)


def gen_pinyin_index():
    """Render ``pinyin_index`` rows with their ``content_table`` row index.

    Raises ``KeyError`` if a correct pinyin has no ``content_table`` row
    (the linear ``list.index`` lookup used before raised ``ValueError``).
    """
    # build the lookup once instead of one linear search per row
    positions = _content_positions()
    return ',\n'.join(
        '{{"{0}", {1}, {2}}}'.format(wrong, flags, positions[correct])
        for (wrong, flags, correct) in pinyin_index)


def gen_bopomofo_index():
    """Render ``bopomofo_index`` rows with their ``content_table`` row index.

    The bopomofo string is translated through
    ``bopomofo.BOPOMOFO_PINYIN_MAP`` before the row lookup; a missing entry
    in either mapping raises ``KeyError``.
    """
    positions = _content_positions()
    entries = []
    for (bopomofo_str, flags) in bopomofo_index:
        pinyin_str = bopomofo.BOPOMOFO_PINYIN_MAP[bopomofo_str]
        entries.append('{{"{0}", {1}, {2}}}'.format(
            bopomofo_str, flags, positions[pinyin_str]))
    return ',\n'.join(entries)


def gen_chewing_key_table():
    """Return ``gen_table_index`` applied to ``content_table``."""
    return gen_table_index(content_table)


#init code
filter_pinyin_list()
sort_all()


### main function ###
if __name__ == "__main__":
    #s = gen_content_table() + gen_pinyin_index() + gen_bopomofo_index()
    s = gen_chewing_key_table()
    print(s)

# diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py new file mode 100644 index 0000000..061f9d1 --- /dev/null +++ b/scripts/genspecialtable.py @@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import operator
import pinyin
from pinyintable import get_chewing, get_shengmu_chewing
from specialtable import *

pinyin_list = sorted(pinyin.PINYIN_LIST)
shengmu_list = sorted(pinyin.SHENGMU_LIST)

divided_list = []
resplit_list = []


def sort_all():
    """Replace both module-level lists with sorted copies.

    ``divided_list`` is ordered by item 0 and ``resplit_list`` by items
    0 and 1.
    """
    global divided_list, resplit_list
    by_first = operator.itemgetter(0)
    by_first_two = operator.itemgetter(0, 1)
    divided_list = sorted(divided_list, key=by_first)
    resplit_list = sorted(resplit_list, key=by_first_two)

'''
def get_chewing_string(pinyin):
    #handle shengmu
    if pinyin not in pinyin_list:
        if pinyin in shengmu_list:
            chewing_key = get_shengmu_chewing(pinyin)
        else:
            assert False, "Un-expected pinyin string."
    else:
        chewing_key = get_chewing(pinyin)
    chewing_str = 'ChewingKey({0})'.format(', '.join(chewing_key))
    return chewing_str
'''

def gen_divided_table():
    """Render ``divided_list`` as comma-separated C initializer rows.

    Each record is a 5-tuple: pinyin key, original frequency, first key,
    second key and new frequency.
    """
    row_format = '{{"{0}", {1}, {{"{2}", "{3}"}}, {4}}}'
    rows = []
    for record in divided_list:
        (pinyin_key, orig_freq, first_key, second_key, new_freq) = record
        if orig_freq >= new_freq:
            assert orig_freq > 0, "Expected orig_freq > 0 here."
        rows.append(row_format.format(
            pinyin_key, orig_freq, first_key, second_key, new_freq))
    return ',\n'.join(rows)


def gen_resplit_table():
    """Render ``resplit_list`` as comma-separated C initializer rows.

    Each record is a 6-tuple: original first key, original second key,
    original frequency, new first key, new second key and new frequency.
    """
    row_format = '{{{{"{0}", "{1}"}}, {2}, {{"{3}", "{4}"}}, {5}}}'
    rows = []
    for record in resplit_list:
        (orig_first_key, orig_second_key, orig_freq,
         new_first_key, new_second_key, new_freq) = record
        if orig_freq >= new_freq:
            assert orig_freq > 0, "Expected orig_freq > 0 here."
        rows.append(row_format.format(
            orig_first_key, orig_second_key, orig_freq,
            new_first_key, new_second_key, new_freq))
    return ',\n'.join(rows)


#init code, load lists
divided_list = filter_divided()
resplit_list = filter_resplit()
sort_all()


### main function ###
if __name__ == "__main__":
    print('\n'.join((gen_divided_table(), gen_resplit_table())))

# diff --git a/scripts/pinyin.py b/scripts/pinyin.py new file mode 100644 index 0000000..dd0e156 --- /dev/null +++ b/scripts/pinyin.py @@ -0,0 +1,400 @@
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (c) 2007-2008 Peng Huang <shawn.p.huang@gmail.com>
# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ +N_ = lambda x : x +PINYIN_DICT = { + "a" : 1, "ai" : 2, "an" : 3, "ang" : 4, "ao" : 5, + "ba" : 6, "bai" : 7, "ban" : 8, "bang" : 9, "bao" : 10, + "bei" : 11, "ben" : 12, "beng" : 13, "bi" : 14, "bian" : 15, + "biao" : 16, "bie" : 17, "bin" : 18, "bing" : 19, "bo" : 20, + "bu" : 21, "ca" : 22, "cai" : 23, "can" : 24, "cang" : 25, + "cao" : 26, "ce" : 27, "cen" : 28, "ceng" : 29, "ci" : 30, + "cong" : 31, "cou" : 32, "cu" : 33, "cuan" : 34, "cui" : 35, + "cun" : 36, "cuo" : 37, "cha" : 38, "chai" : 39, "chan" : 40, + "chang" : 41, "chao" : 42, "che" : 43, "chen" : 44, "cheng" : 45, + "chi" : 46, "chong" : 47, "chou" : 48, "chu" : 49, "chuai" : 50, + "chuan" : 51, "chuang" : 52, "chui" : 53, "chun" : 54, "chuo" : 55, + "da" : 56, "dai" : 57, "dan" : 58, "dang" : 59, "dao" : 60, + "de" : 61, "dei" : 62, + # "den" : 63, + "deng" : 64, "di" : 65, + "dia" : 66, "dian" : 67, "diao" : 68, "die" : 69, "ding" : 70, + "diu" : 71, "dong" : 72, "dou" : 73, "du" : 74, "duan" : 75, + "dui" : 76, "dun" : 77, "duo" : 78, "e" : 79, "ei" : 80, + "en" : 81, "er" : 82, "fa" : 83, "fan" : 84, "fang" : 85, + "fei" : 86, "fen" : 87, "feng" : 88, "fo" : 89, "fou" : 90, + "fu" : 91, "ga" : 92, "gai" : 93, "gan" : 94, "gang" : 95, + "gao" : 96, "ge" : 97, "gei" : 98, "gen" : 99, "geng" : 100, + "gong" : 101, "gou" : 102, "gu" : 103, "gua" : 104, "guai" : 105, + "guan" : 106, "guang" : 107, "gui" : 108, "gun" : 109, "guo" : 110, + "ha" : 111, "hai" : 112, "han" : 113, "hang" : 114, "hao" : 115, + "he" : 116, "hei" : 117, "hen" : 118, "heng" : 119, "hong" : 120, + "hou" : 121, "hu" : 122, "hua" : 123, "huai" : 124, "huan" : 125, + "huang" : 126, "hui" : 127, "hun" : 128, "huo" : 129, "ji" : 130, + "jia" : 131, "jian" : 132, "jiang" : 133, "jiao" : 134, "jie" : 135, + "jin" : 136, "jing" : 137, "jiong" : 138, "jiu" : 139, "ju" : 140, + "juan" : 141, "jue" : 142, "jun" : 143, "ka" : 144, "kai" : 145, + "kan" : 146, "kang" : 147, "kao" : 148, "ke" : 149, + # "kei" : 150, + "ken" : 151, "keng" 
: 152, "kong" : 153, "kou" : 154, "ku" : 155, + "kua" : 156, "kuai" : 157, "kuan" : 158, "kuang" : 159, "kui" : 160, + "kun" : 161, "kuo" : 162, "la" : 163, "lai" : 164, "lan" : 165, + "lang" : 166, "lao" : 167, "le" : 168, "lei" : 169, "leng" : 170, + "li" : 171, "lia" : 172, "lian" : 173, "liang" : 174, "liao" : 175, + "lie" : 176, "lin" : 177, "ling" : 178, "liu" : 179, + "lo" : 180, + "long" : 181, "lou" : 182, "lu" : 183, "luan" : 184, + # "lue" : 185, + "lun" : 186, "luo" : 187, "lv" : 188, "lve" : 189, + "ma" : 190, + "mai" : 191, "man" : 192, "mang" : 193, "mao" : 194, "me" : 195, + "mei" : 196, "men" : 197, "meng" : 198, "mi" : 199, "mian" : 200, + "miao" : 201, "mie" : 202, "min" : 203, "ming" : 204, "miu" : 205, + "mo" : 206, "mou" : 207, "mu" : 208, "na" : 209, "nai" : 210, + "nan" : 211, "nang" : 212, "nao" : 213, "ne" : 214, "nei" : 215, + "nen" : 216, "neng" : 217, "ni" : 218, "nian" : 219, "niang" : 220, + "niao" : 221, "nie" : 222, "nin" : 223, "ning" : 224, "niu" : 225, + "ng" : 226, + "nong" : 227, "nou" : 228, "nu" : 229, "nuan" : 230, + # "nue" : 231, + "nuo" : 232, "nv" : 233, "nve" : 234, + "o" : 235, + "ou" : 236, "pa" : 237, "pai" : 238, "pan" : 239, "pang" : 240, + "pao" : 241, "pei" : 242, "pen" : 243, "peng" : 244, "pi" : 245, + "pian" : 246, "piao" : 247, "pie" : 248, "pin" : 249, "ping" : 250, + "po" : 251, "pou" : 252, "pu" : 253, "qi" : 254, "qia" : 255, + "qian" : 256, "qiang" : 257, "qiao" : 258, "qie" : 259, "qin" : 260, + "qing" : 261, "qiong" : 262, "qiu" : 263, "qu" : 264, "quan" : 265, + "que" : 266, "qun" : 267, "ran" : 268, "rang" : 269, "rao" : 270, + "re" : 271, "ren" : 272, "reng" : 273, "ri" : 274, "rong" : 275, + "rou" : 276, "ru" : 277, "ruan" : 278, "rui" : 279, "run" : 280, + "ruo" : 281, "sa" : 282, "sai" : 283, "san" : 284, "sang" : 285, + "sao" : 286, "se" : 287, "sen" : 288, "seng" : 289, "si" : 290, + "song" : 291, "sou" : 292, "su" : 293, "suan" : 294, "sui" : 295, + "sun" : 296, "suo" : 297, "sha" : 298, 
"shai" : 299, "shan" : 300, + "shang" : 301, "shao" : 302, "she" : 303, "shei" : 304, "shen" : 305, + "sheng" : 306, "shi" : 307, "shou" : 308, "shu" : 309, "shua" : 310, + "shuai" : 311, "shuan" : 312, "shuang" : 313, "shui" : 314, "shun" : 315, + "shuo" : 316, "ta" : 317, "tai" : 318, "tan" : 319, "tang" : 320, + "tao" : 321, "te" : 322, + # "tei" : 323, + "teng" : 324, "ti" : 325, + "tian" : 326, "tiao" : 327, "tie" : 328, "ting" : 329, "tong" : 330, + "tou" : 331, "tu" : 332, "tuan" : 333, "tui" : 334, "tun" : 335, + "tuo" : 336, "wa" : 337, "wai" : 338, "wan" : 339, "wang" : 340, + "wei" : 341, "wen" : 342, "weng" : 343, "wo" : 344, "wu" : 345, + "xi" : 346, "xia" : 347, "xian" : 348, "xiang" : 349, "xiao" : 350, + "xie" : 351, "xin" : 352, "xing" : 353, "xiong" : 354, "xiu" : 355, + "xu" : 356, "xuan" : 357, "xue" : 358, "xun" : 359, "ya" : 360, + "yan" : 361, "yang" : 362, "yao" : 363, "ye" : 364, "yi" : 365, + "yin" : 366, "ying" : 367, "yo" : 368, "yong" : 369, "you" : 370, + "yu" : 371, "yuan" : 372, "yue" : 373, "yun" : 374, "za" : 375, + "zai" : 376, "zan" : 377, "zang" : 378, "zao" : 379, "ze" : 380, + "zei" : 381, "zen" : 382, "zeng" : 383, "zi" : 384, "zong" : 385, + "zou" : 386, "zu" : 387, "zuan" : 388, "zui" : 389, "zun" : 390, + "zuo" : 391, "zha" : 392, "zhai" : 393, "zhan" : 394, "zhang" : 395, + "zhao" : 396, "zhe" : 397, "zhen" : 398, "zheng" : 399, "zhi" : 400, + "zhong" : 401, "zhou" : 402, "zhu" : 403, "zhua" : 404, "zhuai" : 405, + "zhuan" : 406, "zhuang" : 407, "zhui" : 408, "zhun" : 409, "zhuo" : 410, + # some weird pinyins + #~ "eng" : 411, "chua" : 412, "fe" : 413, "fiao" : 414, "liong" : 415 +} + +PINYIN_LIST = PINYIN_DICT.keys () + + +SHENGMU_DICT = { + "b" : 1, "p" : 2, "m" : 3, "f" : 4, "d" : 5, + "t" : 6, "n" : 7, "l" : 8, "g" : 9, "k" : 10, "h" : 11, + "j" : 12, "q" : 13, "x" : 14, "zh" : 15, "ch" : 16, "sh" : 17, + "r" : 18, "z" : 19, "c" : 20, "s" : 21, "y" : 22, "w" : 23 +} + +SHENGMU_LIST = SHENGMU_DICT.keys () + + 
+YUNMU_DICT = { + "a" : 1, "ai" : 2, "an" : 3, "ang" : 4, "ao" : 5, + "e" : 6, "ei" : 7, "en" : 8, "eng" : 9, "er" : 10, + "i" : 11, "ia" : 12, "ian" : 13, "iang" : 14, "iao" : 15, + "ie" : 16, "in" : 17, "ing" : 18, "iong" : 19, "iu" : 20, + "o" : 21, "ong" : 22, "ou" : 23, "u" : 24, "ua" : 25, + "uai" : 26, "uan" : 27, "uang" : 28, "ue" : 29, "ui" : 30, + "un" : 31, "uo" : 32, "v" : 33, "ve" : 34 +} + +YUNMU_LIST = YUNMU_DICT.keys () + + +MOHU_SHENGMU = { + "z" : ("z", "zh"), + "zh" : ("z", "zh"), + "c" : ("c", "ch"), + "ch" : ("c", "ch"), + "s" : ("s", "sh"), + "sh" : ("s", "sh"), + "l" : ("l", "n"), + "n" : ("l", "n") +} + +MOHU_YUNMU = { + "an" : ("an", "ang"), + "ang" : ("an", "ang"), + "en" : ("en", "eng"), + "eng" : ("en", "eng"), + "in" : ("in", "ing"), + "ing" : ("in", "ing") +} + +MSPY_SHUANGPIN_SHENGMU_DICT = { + "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "ch","j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "sh","v" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +MSPY_SHUANGPIN_YUNMU_DICT = { + "a" : ("a",), + "b" : ("ou",), + "c" : ("iao",), + "d" : ("uang", "iang"), + "e" : ("e",), + "f" : ("en",), + "g" : ("eng", "ng"), + "h" : ("ang",), + "i" : ("i",), + "j" : ("an",), + "k" : ("ao",), + "l" : ("ai",), + "m" : ("ian",), + "n" : ("in",), + "o" : ("uo", "o"), + "p" : ("un",), + "q" : ("iu",), + "r" : ("uan", "er"), + "s" : ("ong", "iong"), + "t" : ("ue",), + "u" : ("u",), + "v" : ("ui","ue"), + "w" : ("ia","ua"), + "x" : ("ie",), + "y" : ("uai", "v"), + "z" : ("ei",), + ";" : ("ing",) +} + +ZRM_SHUANGPIN_SHENGMU_DICT = { + "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "ch","j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "sh","v" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +ZRM_SHUANGPIN_YUNMU_DICT = { + 
"a" : ("a",), + "b" : ("ou",), + "c" : ("iao",), + "d" : ("uang", "iang"), + "e" : ("e",), + "f" : ("en",), + "g" : ("eng", "ng"), + "h" : ("ang",), + "i" : ("i",), + "j" : ("an",), + "k" : ("ao",), + "l" : ("ai",), + "m" : ("ian",), + "n" : ("in",), + "o" : ("uo", "o"), + "p" : ("un",), + "q" : ("iu",), + "r" : ("uan", "er"), + "s" : ("ong", "iong"), + "t" : ("ue",), + "u" : ("u",), + "v" : ("ui","v"), + "w" : ("ia","ua"), + "x" : ("ie",), + "y" : ("uai", "ing"), + "z" : ("ei",), +} + +ABC_SHUANGPIN_SHENGMU_DICT = { + "a" : "zh", "b" : "b", "c" : "c", "d" : "d", "e":"ch", "f" : "f", "g" : "g", + "h" : "h", "j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "v" : "sh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +ABC_SHUANGPIN_YUNMU_DICT = { + "a" : ("a",), + "b" : ("ou",), + "c" : ("in","uai"), + "d" : ("ia", "ua"), + "e" : ("e",), + "f" : ("en",), + "g" : ("eng", "ng"), + "h" : ("ang",), + "i" : ("i",), + "j" : ("an",), + "k" : ("ao",), + "l" : ("ai",), + "m" : ("ue","ui"), + "n" : ("un",), + "o" : ("uo", "o"), + "p" : ("uan",), + "q" : ("ei",), + "r" : ("er", "iu"), + "s" : ("ong", "iong"), + "t" : ("iang","uang"), + "u" : ("u",), + "v" : ("v","ue"), + "w" : ("ian",), + "x" : ("ie",), + "y" : ("ing",), + "z" : ("iao",), +} + +ZGPY_SHUANGPIN_SHENGMU_DICT = { + "a" : "ch", "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "sh","j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +ZGPY_SHUANGPIN_YUNMU_DICT = { + "a" : ("a", ), + "b" : ("iao", ), + "d" : ("ie", ), + "e" : ("e", ), + "f" : ("ian", ), + "g" : ("iang", "uang"), + "h" : ("ong", "iong"), + "i" : ("i", ), + "j" : ("er", "iu"), + "k" : ("ei", ), + "l" : ("uan", ), + "m" : ("un", ), + "n" : ("ue", "ui"), + "o" : ("uo", "o"), + "p" : ("ai", ), + "q" : ("ao", ), + "r" : 
("an", ), + "s" : ("ang", ), + "t" : ("eng", "ng"), + "u" : ("u", ), + "v" : ("v", ), + "w" : ("en", ), + "x" : ("ia", "ua"), + "y" : ("in", "uai"), + "z" : ("ou" ,), + ";" : ("ing", ) +} + +PYJJ_SHUANGPIN_SHENGMU_DICT = { + "a" : "'", "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "sh","j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "ch","v" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z" +} + +PYJJ_SHUANGPIN_YUNMU_DICT = { + "a" : ("a",), + "b" : ("ia","ua"), + "c" : ("uan",), + "d" : ("ao", ), + "e" : ("e",), + "f" : ("an",), + "g" : ("ang",), + "h" : ("iang","uang"), + "i" : ("i",), + "j" : ("ian",), + "k" : ("iao",), + "l" : ("in",), + "m" : ("ie",), + "n" : ("iu",), + "o" : ("uo", "o"), + "p" : ("ou",), + "q" : ("er","ing"), + "r" : ("en", ), + "s" : ("ai", ), + "t" : ("eng", "ng"), + "u" : ("u",), + "v" : ("v","ui"), + "w" : ("ei",), + "x" : ("uai","ue"), + "y" : ("ong","iong"), + "z" : ("un",), +} + +XHE_SHUANGPIN_SHENGMU_DICT = { + "b" : "b", "c" : "c", "d" : "d", "f" : "f", "g" : "g", + "h" : "h", "i" : "ch", "j" : "j", "k" : "k", "l" : "l", + "m" : "m", "n" : "n", "o" : "'", "p" : "p", "q" : "q", + "r" : "r", "s" : "s", "t" : "t", "u" : "sh", "v" : "zh", + "w" : "w", "x" : "x", "y" : "y", "z" : "z", + "a" : "'", "e" : "'" +} + +XHE_SHUANGPIN_YUNMU_DICT = { + "a" : ("a",), + "b" : ("in",), + "c" : ("ao",), + "d" : ("ai",), + "e" : ("e",), + "f" : ("en",), + "g" : ("eng", "ng"), + "h" : ("ang",), + "i" : ("i",), + "j" : ("an",), + "k" : ("uai", "ing"), + "l" : ("iang", "uang"), + "m" : ("ian",), + "n" : ("iao",), + "o" : ("uo", "o"), + "p" : ("ie",), + "q" : ("iu",), + "r" : ("uan", "er"), + "s" : ("ong", "iong"), + "t" : ("ue",), + "u" : ("u",), + "v" : ("v", "ui"), + "w" : ("ei",), + "x" : ("ia", "ua"), + "y" : ("un",), + "z" : ("ou",), +} + +SHUANGPIN_SCHEMAS = { + N_("MSPY") : (MSPY_SHUANGPIN_SHENGMU_DICT, MSPY_SHUANGPIN_YUNMU_DICT), + 
N_("ZRM") : (ZRM_SHUANGPIN_SHENGMU_DICT, ZRM_SHUANGPIN_YUNMU_DICT), + N_("ABC") : (ABC_SHUANGPIN_SHENGMU_DICT, ABC_SHUANGPIN_YUNMU_DICT), + N_("ZGPY") : (ZGPY_SHUANGPIN_SHENGMU_DICT, ZGPY_SHUANGPIN_YUNMU_DICT), + N_("PYJJ") : (PYJJ_SHUANGPIN_SHENGMU_DICT, PYJJ_SHUANGPIN_YUNMU_DICT), + N_("XHE") : (XHE_SHUANGPIN_SHENGMU_DICT, XHE_SHUANGPIN_YUNMU_DICT), +} + diff --git a/scripts/pinyin_parser_table.h.in b/scripts/pinyin_parser_table.h.in new file mode 100644 index 0000000..2f98e0e --- /dev/null +++ b/scripts/pinyin_parser_table.h.in @@ -0,0 +1,34 @@ +#ifndef PINYIN_PARSER_TABLE_H +#define PINYIN_PARSER_TABLE_H + +namespace pinyin{ + +const pinyin_index_item_t pinyin_index[] = { +@PINYIN_INDEX@ +}; + +const chewing_index_item_t chewing_index[] = { +@BOPOMOFO_INDEX@ +}; + +const content_table_item_t content_table[] = { +@CONTENT_TABLE@ +}; + +const divided_table_item_t divided_table[] = { +@DIVIDED_TABLE@ +}; + +const resplit_table_item_t resplit_table[] = { +@RESPLIT_TABLE@ +}; + +const gint chewing_key_table[CHEWING_NUMBER_OF_INITIALS * + CHEWING_NUMBER_OF_MIDDLES * + CHEWING_NUMBER_OF_FINALS] = { +@TABLE_INDEX@ +}; + +}; + +#endif diff --git a/scripts/pinyintable.py b/scripts/pinyintable.py new file mode 100644 index 0000000..bddf2dc --- /dev/null +++ b/scripts/pinyintable.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu <alexepico@gmail.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

import pinyin
import bopomofo
import chewing
import itertools
from correct import *


pinyin_list = sorted(bopomofo.PINYIN_BOPOMOFO_MAP.keys())
shengmu_list = sorted(pinyin.SHENGMU_LIST)

# (middle, final) pairs that are rewritten after the per-character pass.
_POST_PROCESS_RULES = {
    #handle "ueng"/"ong"
    ("CHEWING_U", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ONG"),
    #handle "veng"/"iong"
    ("CHEWING_V", "CHEWING_ENG"): ("CHEWING_I", "PINYIN_ONG"),
    #handle "ien"/"in"
    ("CHEWING_I", "CHEWING_EN"): ("CHEWING_ZERO_MIDDLE", "PINYIN_IN"),
    #handle "ieng"/"ing"
    ("CHEWING_I", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ING"),
}


def check_pinyin_chewing_map():
    """Print a line for every ``pinyin.PINYIN_DICT`` key missing from ``pinyin_list``.

    The key is now interpolated into the message; the arguments used to be
    passed to ``print`` separately, so a literal ``%s`` was printed.
    """
    for pinyin_key in pinyin.PINYIN_DICT:
        if pinyin_key not in pinyin_list:
            print("pinyin %s has no chewing mapping" % pinyin_key)


def get_chewing(pinyin_key):
    """Return the ``(initial, middle, final)`` constant names for *pinyin_key*.

    *pinyin_key* must be a key of ``bopomofo.PINYIN_BOPOMOFO_MAP`` (asserted).
    Each slot starts at its ``CHEWING_ZERO_*`` value and is overwritten from
    the characters of the mapped bopomofo string; the pair (middle, final)
    is finally rewritten through ``_POST_PROCESS_RULES`` when it matches.
    """
    initial, middle, final = \
        'CHEWING_ZERO_INITIAL', 'CHEWING_ZERO_MIDDLE', 'CHEWING_ZERO_FINAL'
    assert pinyin_key is not None
    assert pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP

    #handle 'w' and 'y'
    if pinyin_key[0] == 'w':
        initial = 'PINYIN_W'
    if pinyin_key[0] == 'y':
        initial = 'PINYIN_Y'

    #get chewing string
    bopomofo_str = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]

    #handle bopomofo SPECIAL_INITIAL_SET
    if pinyin_key in bopomofo.SPECIAL_INITIAL_SET:
        middle = "CHEWING_I"
    #normal process
    for char in bopomofo_str:
        if char in chewing.CHEWING_ASCII_INITIAL_MAP:
            initial = chewing.CHEWING_ASCII_INITIAL_MAP[char]
        if char in chewing.CHEWING_ASCII_MIDDLE_MAP:
            middle = chewing.CHEWING_ASCII_MIDDLE_MAP[char]
        if char in chewing.CHEWING_ASCII_FINAL_MAP:
            final = chewing.CHEWING_ASCII_FINAL_MAP[char]
        if char == "ㄜ": # merge "ㄝ" and "ㄜ"
            final = "CHEWING_E"

    if (middle, final) in _POST_PROCESS_RULES:
        (middle, final) = _POST_PROCESS_RULES[(middle, final)]

    return initial, middle, final


def gen_pinyin_list():
    """Yield the tuples of all four generators, in a fixed order.

    Each tuple is ``(correct, wrong, bopomofo string, flags, chewing key)``.
    """
    for p in itertools.chain(gen_pinyins(),
                             gen_shengmu(),
                             gen_corrects(),
                             gen_u_to_v(),
                             ):
        yield p


def gen_pinyins():
    """Yield one tuple for every key in ``pinyin_list``, with its flags."""
    #generate all pinyins in bopomofo
    for pinyin_key in pinyin_list:
        flags = []
        if pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP.keys():
            flags.append("IS_CHEWING")
        if pinyin_key in pinyin.PINYIN_LIST or \
                pinyin_key in pinyin.SHENGMU_LIST:
            flags.append("IS_PINYIN")
        if pinyin_key in shengmu_list:
            flags.append("PINYIN_INCOMPLETE")
        chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
        if chewing_key in chewing.CHEWING_ASCII_INITIAL_MAP and \
                pinyin_key not in bopomofo.SPECIAL_INITIAL_SET:
            flags.append("CHEWING_INCOMPLETE")
        yield pinyin_key, pinyin_key, chewing_key, \
            flags, get_chewing(pinyin_key)


def get_shengmu_chewing(shengmu):
    """Return the chewing key triple for a bare *shengmu*.

    The initial is ``CHEWING_<SHENGMU>`` when that name is a key of
    ``chewing.ASCII_CHEWING_INITIAL_MAP`` and ``PINYIN_<SHENGMU>`` otherwise;
    middle and final are the zero constants.
    """
    assert shengmu in shengmu_list, "Expected shengmu here."
    chewing_key = 'CHEWING_{0}'.format(shengmu.upper())
    if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP:
        initial = chewing_key
    else:
        initial = 'PINYIN_{0}'.format(shengmu.upper())
    return initial, "CHEWING_ZERO_MIDDLE", "CHEWING_ZERO_FINAL"

def gen_shengmu():
    """Yield a tuple for every shengmu that is not already in ``pinyin_list``."""
    #generate all shengmu
    for shengmu in shengmu_list:
        if shengmu in pinyin_list:
            continue
        flags = ["IS_PINYIN", "PINYIN_INCOMPLETE"]
        chewing_key = get_shengmu_chewing(shengmu)
        chewing_initial = chewing_key[0]
        if chewing_initial in chewing.ASCII_CHEWING_INITIAL_MAP:
            chewing_initial = chewing.ASCII_CHEWING_INITIAL_MAP[chewing_initial]
        yield shengmu, shengmu, chewing_initial, \
            flags, chewing_key


def gen_corrects():
    """Yield misspelt variants derived from the ``auto_correct`` pairs.

    For every ``(correct, wrong)`` pair, each pinyin key that ends with
    *correct* (and is not *correct* itself) yields a variant whose trailing
    *correct* is replaced by *wrong*.  Only the matched suffix is rewritten;
    ``str.replace`` would also have rewritten earlier occurrences.
    """
    #generate corrections
    for correct, wrong in auto_correct:
        flags = ['IS_PINYIN', 'PINYIN_CORRECT_{0}_{1}'.format(wrong.upper(),
                                                              correct.upper())]
        for pinyin_key in pinyin_list:
            #fixes partial pinyin instead of the whole pinyin
            if pinyin_key.endswith(correct) and pinyin_key != correct:
                chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
                stem = pinyin_key[:len(pinyin_key) - len(correct)]
                new_pinyin_key = stem + wrong
                yield pinyin_key, new_pinyin_key, chewing_key,\
                    flags, get_chewing(pinyin_key)


def gen_u_to_v():
    """Yield a tuple for every entry of ``auto_correct_ext``.

    The flags stored in the entry are ignored and replaced by a fixed pair.
    """
    #generate U to V
    for correct, wrong, _flags in auto_correct_ext:
        #over-ride flags
        flags = ['IS_PINYIN', 'PINYIN_CORRECT_V_U']
        pinyin_key = correct
        chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key]
        yield correct, wrong, chewing_key, flags, get_chewing(pinyin_key)

### main function ###
if __name__ == "__main__":
    #pre-check here
    check_pinyin_chewing_map()

    #dump
    for p in gen_pinyin_list():
        print (p)

# diff --git a/scripts/specials.txt b/scripts/specials.txt new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/scripts/specials.txt
# diff --git a/scripts/specialtable.py b/scripts/specialtable.py new file mode 100644 index 0000000..b6fb680 --- /dev/null +++ b/scripts/specialtable.py @@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import os
import sys
import math
import pinyin

pinyin_list = sorted(pinyin.PINYIN_LIST)
shengmu_list = sorted(pinyin.SHENGMU_LIST)
yunmu_list = sorted(pinyin.YUNMU_LIST)

# Frequency per pinyin string, or per (first, second) pair for split entries.
phrase_dict = {}


def load_phrase(filename):
    """Load ``pinyin frequency`` lines from *filename* into ``phrase_dict``.

    Entries with frequency 0 are skipped.  A pinyin containing ``'`` is
    stored under the ``(first, second)`` tuple of its two parts; anything
    else is stored under the string itself.  Later entries overwrite earlier
    ones.  Blank lines are skipped; other malformed lines raise
    ``ValueError`` as before.
    """
    with open(filename, "r") as phrasefile:
        for line in phrasefile:
            if not line.strip():
                continue
            (pinyin_str, freq) = line.split(None, 1)
            freq = int(freq)
            if 0 == freq:
                #print(pinyin_str)
                continue

            # no duplicate here
            if "'" in pinyin_str:
                (first_key, second_key) = pinyin_str.split("'")
                phrase_dict[(first_key, second_key)] = freq
            else:
                phrase_dict[pinyin_str] = freq


def gen_all_divided():
    """Yield ``(pinyin, first, second)`` for every pinyin that splits in two.

    A pinyin qualifies when it starts with a strictly shorter pinyin and the
    remainder is itself in ``pinyin_list``.
    """
    # set membership instead of a linear list scan in the inner loop
    known = set(pinyin_list)
    for pinyin_key in pinyin_list:
        for first_key in pinyin_list:
            if len(pinyin_key) <= len(first_key):
                continue
            if not pinyin_key.startswith(first_key):
                continue
            second_key = pinyin_key[len(first_key):]
            if second_key in known:
                yield pinyin_key, first_key, second_key


def filter_divided():
    """Yield the divided candidates whose two-part form is in ``phrase_dict``.

    Yields ``(pinyin, orig_freq, first, second, new_freq)``; ``orig_freq`` is
    0 when the undivided pinyin has no entry.
    """
    for (pinyin_key, first_key, second_key) in gen_all_divided():
        if (first_key, second_key) not in phrase_dict:
            continue
        orig_freq = phrase_dict.get(pinyin_key, 0)
        new_freq = phrase_dict[(first_key, second_key)]
        yield pinyin_key, orig_freq, first_key, second_key, new_freq


def gen_all_resplit():
    """Yield candidates where a trailing n/g/r can move to the next syllable.

    For a pinyin ending in ``n``, ``g`` or ``r`` whose remainder is a pinyin,
    and a yunmu that is itself a pinyin, yields
    ``(pinyin, yunmu, pinyin[:-1], last_char + yunmu)`` when the last element
    is also a pinyin.
    """
    known = set(pinyin_list)
    for pinyin_key in pinyin_list:
        if pinyin_key[-1] not in ("n", "g", "r"):
            continue
        #check first new pinyin key (does not depend on the yunmu)
        first_new_key = pinyin_key[:-1]
        if first_new_key not in known:
            continue
        for yun in yunmu_list:
            if yun not in known:
                continue
            #check second new pinyin key
            new_pinyin_key = pinyin_key[-1] + yun
            if new_pinyin_key in known:
                yield pinyin_key, yun, first_new_key, new_pinyin_key
'''
        elif pinyin_key[-1] in ["e"]:
            #check first new pinyin key
            if pinyin_key[:-1] in pinyin_list:
                yield pinyin_key, "r", pinyin_key[:-1], "er"
'''


def filter_resplit():
    """Yield the resplit candidates whose new split is in ``phrase_dict``.

    Yields ``(orig_first, orig_second, orig_freq, new_first, new_second,
    new_freq)``; ``orig_freq`` is 0 when the original split has no entry.
    """
    #do the reverse here, as libpinyin pinyin parser is different with
    #ibus-pinyin's parser: the generator's first pair is treated as the
    #new split and its second pair as the original one.
    for (new_first_key, new_second_key, orig_first_key, orig_second_key) \
            in gen_all_resplit():
        if (new_first_key, new_second_key) not in phrase_dict:
            continue
        new_freq = phrase_dict[(new_first_key, new_second_key)]
        orig_freq = phrase_dict.get((orig_first_key, orig_second_key), 0)
        yield orig_first_key, orig_second_key, orig_freq, \
            new_first_key, new_second_key, new_freq


#init code
load_phrase("pinyins.txt")
load_phrase("specials.txt")

if __name__ == "__main__":
    for p in filter_divided():
        print (p)
    for p in filter_resplit():
        print (p)

# diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..01bdbc7 --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


import os

header = '''/* This file is generated by python scripts. Don't edit this file directly.
 */
'''

def expand_file(filename, get_table_content):
    """Print the template *filename* to stdout with placeholders expanded.

    ``header`` is printed first.  A line of at least three characters that
    starts and ends with ``@`` is replaced by
    ``get_table_content(<text between the @ signs>)``; every other line is
    printed unchanged.  The input file is now closed when done.
    """
    print(header)
    with open(filename, "r") as infile:
        for line in infile:
            line = line.rstrip(os.linesep)
            if len(line) < 3 :
                print(line)
                continue
            if line[0] == '@' and line[-1] == '@':
                tablename = line[1:-1]
                print(get_table_content(tablename))
            else:
                print(line)