From 2ef0735e5fafa28a51ec04cf8e24e21c7486a040 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Wed, 2 Nov 2011 12:08:45 +0800 Subject: rename files --- scripts/genpytable.py | 159 --------------------------------------------- scripts/genspecialtable.py | 105 ------------------------------ scripts/pinyintable.py | 159 +++++++++++++++++++++++++++++++++++++++++++++ scripts/specialtable.py | 105 ++++++++++++++++++++++++++++++ 4 files changed, 264 insertions(+), 264 deletions(-) delete mode 100644 scripts/genpytable.py delete mode 100644 scripts/genspecialtable.py create mode 100644 scripts/pinyintable.py create mode 100644 scripts/specialtable.py diff --git a/scripts/genpytable.py b/scripts/genpytable.py deleted file mode 100644 index ca22aa1..0000000 --- a/scripts/genpytable.py +++ /dev/null @@ -1,159 +0,0 @@ -# -*- coding: utf-8 -*- -# vim:set et sts=4 sw=4: -# -# libpinyin - Library to deal with pinyin. -# -# Copyright (C) 2011 Peng Wu -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - -import pinyin -import bopomofo -import chewing -import itertools -from correct import * - - -pinyin_list = sorted(bopomofo.PINYIN_BOPOMOFO_MAP.keys()) -shengmu_list = sorted(pinyin.SHENGMU_DICT.keys()) - - -def check_pinyin_chewing_map(): - for pinyin_key in pinyin.PINYIN_DICT.keys(): - if pinyin_key in pinyin_list: - pass - else: - print("pinyin %s has no chewing mapping", pinyin_key) - - -def get_chewing(pinyin_key): - initial, middle, final = \ - 'CHEWING_ZERO_INITIAL', 'CHEWING_ZERO_MIDDLE', 'CHEWING_ZERO_FINAL' - assert pinyin_key != None - assert pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP - - #handle 'w' and 'y' - if pinyin_key[0] == 'w': - initial = 'PINYIN_W' - if pinyin_key[0] == 'y': - initial = 'PINYIN_Y' - - #get chewing string - bopomofo_str = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] - - #handle 'ci', 'chi', 'si', 'shi', 'zi', 'zhi', 'ri' - if pinyin_key in {'ci', 'chi', 'si', 'shi', 'zi', 'zhi', 'ri'}: - middle = "CHEWING_I" - #normal process - for char in bopomofo_str: - if char in chewing.CHEWING_ASCII_INITIAL_MAP: - initial = chewing.CHEWING_ASCII_INITIAL_MAP[char] - if char in chewing.CHEWING_ASCII_MIDDLE_MAP: - middle = chewing.CHEWING_ASCII_MIDDLE_MAP[char] - if char in chewing.CHEWING_ASCII_FINAL_MAP: - final = chewing.CHEWING_ASCII_FINAL_MAP[char] - if char == "ㄜ": # merge "ㄝ" and "ㄜ" - final = "CHEWING_E" - - post_process_rules = { - #handle "ueng"/"ong" - ("CHEWING_U", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ONG"), - #handle "veng"/"iong" - ("CHEWING_V", "CHEWING_ENG"): ("CHEWING_I", "PINYIN_ONG"), - #handle "ien"/"in" - ("CHEWING_I", "CHEWING_EN"): ("CHEWING_ZERO_MIDDLE", "PINYIN_IN"), - #handle "ieng"/"ing" - ("CHEWING_I", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ING"), - } - - if (middle, final) in post_process_rules: - (middle, final) = post_process_rules[(middle, final)] - - return initial, middle, final - - -def gen_pinyin_list(): - for p in itertools.chain(gen_pinyins(), - gen_shengmu(), - gen_corrects(), - gen_u_to_v(), - ): - yield p - - -def gen_pinyins(): - #generate all pinyins in bopomofo - for pinyin_key in pinyin_list: - flags = [] - if pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP.keys(): - flags.append("IS_CHEWING") - if pinyin_key in pinyin.PINYIN_DICT.keys(): - flags.append("IS_PINYIN") - if pinyin_key in shengmu_list: - flags.append("PINYIN_INCOMPLETE") - chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] - if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP: - flags.append("CHEWING_INCOMPLETE") - yield pinyin_key, pinyin_key, chewing_key, \ - flags, get_chewing(pinyin_key) - - -def gen_shengmu(): - #generate all shengmu - for shengmu in shengmu_list: - if shengmu in pinyin_list: - continue - flags = ["IS_PINYIN", "PINYIN_INCOMPLETE"] - chewing_key = 'CHEWING_{0}'.format(shengmu.upper()) - if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP: - initial = chewing_key - chewing_key = chewing.ASCII_CHEWING_INITIAL_MAP[chewing_key] - else: - chewing_key = 'PINYIN_{0}'.format(shengmu.upper()) - initial = chewing_key - yield shengmu, shengmu, chewing_key, \ - flags, (initial, "CHEWING_ZREO_MIDDLE", "CHEWING_ZERO_FINAL") - - -def gen_corrects(): - #generate corrections - for correct, wrong in auto_correct: - flags = ['IS_PINYIN', 'PINYIN_CORRECT_{0}_{1}'.format(wrong.upper(), - correct.upper())] - for pinyin_key in pinyin_list: - if pinyin_key.endswith(correct) and pinyin_key != correct: - chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] - new_pinyin_key = pinyin_key.replace(correct, wrong) - yield pinyin_key, new_pinyin_key, chewing_key,\ - flags, get_chewing(pinyin_key) - - -def gen_u_to_v(): - #generate U to V - for correct, wrong, flags in auto_correct_ext: - #over-ride flags - flags = ['IS_PINYIN', 'PINYIN_CORRECT_V_U'] - pinyin_key = correct - chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] - yield correct, wrong, chewing_key, flags, get_chewing(pinyin_key) - -### main function ### -if __name__ == "__main__": - #pre-check here - check_pinyin_chewing_map() - - #dump - for pinyin_key in gen_pinyin_list(): - print (pinyin_key) diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py deleted file mode 100644 index 928ca9e..0000000 --- a/scripts/genspecialtable.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- -# vim:set et sts=4 sw=4: -# -# libpinyin - Library to deal with pinyin. -# -# Copyright (C) 2011 Peng Wu -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - - -import os -import sys -import math -import pinyin - -pinyin_list = sorted(pinyin.PINYIN_LIST) -shengmu_list = sorted(pinyin.SHENGMU_LIST) -yunmu_list = sorted(pinyin.YUNMU_LIST) - -phrase_dict = {} - - -def load_phrase(filename): - phrasefile = open(filename, "r") - for line in phrasefile.readlines(): - line = line.rstrip(os.linesep) - (pinyin_str, freq) = line.split(' ', 1) - freq = int(math.floor(float(freq))) - if 0 == freq: - #print(pinyin_str) - continue - - (first_key, second_key) = pinyin_str.split("'") - if first_key[-1].isdigit(): - first_key = first_key[:-1] - if second_key[-1].isdigit(): - second_key = second_key[:-1] - phrase_dict[(first_key, second_key)] = freq - phrasefile.close() - - -def gen_all_divided(): - for pinyin_key in pinyin_list: - for first_key in pinyin_list: - if len(pinyin_key) <= len(first_key): - continue - if not pinyin_key.startswith(first_key): - continue - second_key = pinyin_key[len(first_key):] - if second_key in pinyin_list: - yield pinyin_key, first_key, second_key - - -def filter_divided(): - for (pinyin_key, first_key, second_key) in gen_all_divided(): - if not (first_key, second_key) in phrase_dict: - continue - freq = phrase_dict[(first_key, second_key)] - yield pinyin_key, first_key, second_key, freq - - -def gen_all_resplit(): - for pinyin_key in pinyin_list: - if pinyin_key[-1] in ["n", "g", "r"]: - for yun in yunmu_list: - if yun not in pinyin_list: - continue - new_pinyin_key = pinyin_key[-1] + yun - # if new_pinyin in pinyin_list: - yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key - elif pinyin_key[-1] in ["e"]: - yield pinyin_key, "r", pinyin_key[:-1], "er" - - -def filter_resplit(): - for (orig_first_key, orig_second_key, new_first_key, new_second_key) \ - in gen_all_resplit(): - if not (new_first_key, new_second_key) in phrase_dict: - continue - orig_freq = 0 - new_freq = phrase_dict[(new_first_key, new_second_key)] - if (orig_first_key, orig_second_key) in phrase_dict: - orig_freq = phrase_dict[(orig_first_key, orig_second_key)] - yield orig_first_key, orig_second_key, orig_freq, \ - new_first_key, new_second_key, new_freq - - -if __name__ == "__main__": - load_phrase("pinyin2.txt") - for p in filter_divided(): - print (p) - for p in filter_resplit(): - print (p) diff --git a/scripts/pinyintable.py b/scripts/pinyintable.py new file mode 100644 index 0000000..ca22aa1 --- /dev/null +++ b/scripts/pinyintable.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +import pinyin +import bopomofo +import chewing +import itertools +from correct import * + + +pinyin_list = sorted(bopomofo.PINYIN_BOPOMOFO_MAP.keys()) +shengmu_list = sorted(pinyin.SHENGMU_DICT.keys()) + + +def check_pinyin_chewing_map(): + for pinyin_key in pinyin.PINYIN_DICT.keys(): + if pinyin_key in pinyin_list: + pass + else: + print("pinyin %s has no chewing mapping", pinyin_key) + + +def get_chewing(pinyin_key): + initial, middle, final = \ + 'CHEWING_ZERO_INITIAL', 'CHEWING_ZERO_MIDDLE', 'CHEWING_ZERO_FINAL' + assert pinyin_key != None + assert pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP + + #handle 'w' and 'y' + if pinyin_key[0] == 'w': + initial = 'PINYIN_W' + if pinyin_key[0] == 'y': + initial = 'PINYIN_Y' + + #get chewing string + bopomofo_str = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] + + #handle 'ci', 'chi', 'si', 'shi', 'zi', 'zhi', 'ri' + if pinyin_key in {'ci', 'chi', 'si', 'shi', 'zi', 'zhi', 'ri'}: + middle = "CHEWING_I" + #normal process + for char in bopomofo_str: + if char in chewing.CHEWING_ASCII_INITIAL_MAP: + initial = chewing.CHEWING_ASCII_INITIAL_MAP[char] + if char in chewing.CHEWING_ASCII_MIDDLE_MAP: + middle = chewing.CHEWING_ASCII_MIDDLE_MAP[char] + if char in chewing.CHEWING_ASCII_FINAL_MAP: + final = chewing.CHEWING_ASCII_FINAL_MAP[char] + if char == "ㄜ": # merge "ㄝ" and "ㄜ" + final = "CHEWING_E" + + post_process_rules = { + #handle "ueng"/"ong" + ("CHEWING_U", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ONG"), + #handle "veng"/"iong" + ("CHEWING_V", "CHEWING_ENG"): ("CHEWING_I", "PINYIN_ONG"), + #handle "ien"/"in" + ("CHEWING_I", "CHEWING_EN"): ("CHEWING_ZERO_MIDDLE", "PINYIN_IN"), + #handle "ieng"/"ing" + ("CHEWING_I", "CHEWING_ENG"): ("CHEWING_ZERO_MIDDLE", "PINYIN_ING"), + } + + if (middle, final) in post_process_rules: + (middle, final) = post_process_rules[(middle, final)] + + return initial, middle, final + + +def gen_pinyin_list(): + for p in itertools.chain(gen_pinyins(), + gen_shengmu(), + gen_corrects(), + gen_u_to_v(), + ): + yield p + + +def gen_pinyins(): + #generate all pinyins in bopomofo + for pinyin_key in pinyin_list: + flags = [] + if pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP.keys(): + flags.append("IS_CHEWING") + if pinyin_key in pinyin.PINYIN_DICT.keys(): + flags.append("IS_PINYIN") + if pinyin_key in shengmu_list: + flags.append("PINYIN_INCOMPLETE") + chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] + if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP: + flags.append("CHEWING_INCOMPLETE") + yield pinyin_key, pinyin_key, chewing_key, \ + flags, get_chewing(pinyin_key) + + +def gen_shengmu(): + #generate all shengmu + for shengmu in shengmu_list: + if shengmu in pinyin_list: + continue + flags = ["IS_PINYIN", "PINYIN_INCOMPLETE"] + chewing_key = 'CHEWING_{0}'.format(shengmu.upper()) + if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP: + initial = chewing_key + chewing_key = chewing.ASCII_CHEWING_INITIAL_MAP[chewing_key] + else: + chewing_key = 'PINYIN_{0}'.format(shengmu.upper()) + initial = chewing_key + yield shengmu, shengmu, chewing_key, \ + flags, (initial, "CHEWING_ZREO_MIDDLE", "CHEWING_ZERO_FINAL") + + +def gen_corrects(): + #generate corrections + for correct, wrong in auto_correct: + flags = ['IS_PINYIN', 'PINYIN_CORRECT_{0}_{1}'.format(wrong.upper(), + correct.upper())] + for pinyin_key in pinyin_list: + if pinyin_key.endswith(correct) and pinyin_key != correct: + chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] + new_pinyin_key = pinyin_key.replace(correct, wrong) + yield pinyin_key, new_pinyin_key, chewing_key,\ + flags, get_chewing(pinyin_key) + + +def gen_u_to_v(): + #generate U to V + for correct, wrong, flags in auto_correct_ext: + #over-ride flags + flags = ['IS_PINYIN', 'PINYIN_CORRECT_V_U'] + pinyin_key = correct + chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] + yield correct, wrong, chewing_key, flags, get_chewing(pinyin_key) + +### main function ### +if __name__ == "__main__": + #pre-check here + check_pinyin_chewing_map() + + #dump + for pinyin_key in gen_pinyin_list(): + print (pinyin_key) diff --git a/scripts/specialtable.py b/scripts/specialtable.py new file mode 100644 index 0000000..928ca9e --- /dev/null +++ b/scripts/specialtable.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +import os +import sys +import math +import pinyin + +pinyin_list = sorted(pinyin.PINYIN_LIST) +shengmu_list = sorted(pinyin.SHENGMU_LIST) +yunmu_list = sorted(pinyin.YUNMU_LIST) + +phrase_dict = {} + + +def load_phrase(filename): + phrasefile = open(filename, "r") + for line in phrasefile.readlines(): + line = line.rstrip(os.linesep) + (pinyin_str, freq) = line.split(' ', 1) + freq = int(math.floor(float(freq))) + if 0 == freq: + #print(pinyin_str) + continue + + (first_key, second_key) = pinyin_str.split("'") + if first_key[-1].isdigit(): + first_key = first_key[:-1] + if second_key[-1].isdigit(): + second_key = second_key[:-1] + phrase_dict[(first_key, second_key)] = freq + phrasefile.close() + + +def gen_all_divided(): + for pinyin_key in pinyin_list: + for first_key in pinyin_list: + if len(pinyin_key) <= len(first_key): + continue + if not pinyin_key.startswith(first_key): + continue + second_key = pinyin_key[len(first_key):] + if second_key in pinyin_list: + yield pinyin_key, first_key, second_key + + +def filter_divided(): + for (pinyin_key, first_key, second_key) in gen_all_divided(): + if not (first_key, second_key) in phrase_dict: + continue + freq = phrase_dict[(first_key, second_key)] + yield pinyin_key, first_key, second_key, freq + + +def gen_all_resplit(): + for pinyin_key in pinyin_list: + if pinyin_key[-1] in ["n", "g", "r"]: + for yun in yunmu_list: + if yun not in pinyin_list: + continue + new_pinyin_key = pinyin_key[-1] + yun + # if new_pinyin in pinyin_list: + yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key + elif pinyin_key[-1] in ["e"]: + yield pinyin_key, "r", pinyin_key[:-1], "er" + + +def filter_resplit(): + for (orig_first_key, orig_second_key, new_first_key, new_second_key) \ + in gen_all_resplit(): + if not (new_first_key, new_second_key) in phrase_dict: + continue + orig_freq = 0 + new_freq = phrase_dict[(new_first_key, new_second_key)] + if (orig_first_key, orig_second_key) in phrase_dict: + orig_freq = phrase_dict[(orig_first_key, orig_second_key)] + yield orig_first_key, orig_second_key, orig_freq, \ + new_first_key, new_second_key, new_freq + + +if __name__ == "__main__": + load_phrase("pinyin2.txt") + for p in filter_divided(): + print (p) + for p in filter_resplit(): + print (p) -- cgit