From f40946027ed3bd50d5cce100564340d2b9161c0d Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 8 Sep 2015 15:51:12 +0800 Subject: merge specialtable.py and genspecialtable.py into specialtable.py --- scripts2/bopomofotable.py | 1 - scripts2/fullpinyintable.py | 100 +------------------------- scripts2/generateheader.py | 8 ++- scripts2/specialtable.py | 172 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 178 insertions(+), 103 deletions(-) create mode 100644 scripts2/specialtable.py (limited to 'scripts2') diff --git a/scripts2/bopomofotable.py b/scripts2/bopomofotable.py index e9beb1d..046a52d 100644 --- a/scripts2/bopomofotable.py +++ b/scripts2/bopomofotable.py @@ -23,7 +23,6 @@ import os from operator import itemgetter -from utils import expand_file from bopomofo import * def escape_char(ch): diff --git a/scripts2/fullpinyintable.py b/scripts2/fullpinyintable.py index 6ad05be..b8cb1a6 100644 --- a/scripts2/fullpinyintable.py +++ b/scripts2/fullpinyintable.py @@ -25,7 +25,7 @@ import itertools import chewing from pyzymap import ZHUYIN_PINYIN_MAP, ZHUYIN_LUOMA_PINYIN_MAP, ZHUYIN_SECONDARY_ZHUYIN_MAP from pyzymap import PINYIN_ZHUYIN_MAP, ZHUYIN_SPECIAL_INITIAL_SET_IN_PINYIN_FORM -from fullpinyin import PINYIN_LIST, SHENGMU_LIST, YUNMU_LIST +from fullpinyin import PINYIN_LIST, SHENGMU_LIST from options import * from utils import shuffle_all @@ -403,94 +403,6 @@ def gen_table_index_for_chewing_key(content_table): return ",\n".join(entries) -#special table -pinyin_list = sorted(PINYIN_LIST) -shengmu_list = sorted(SHENGMU_LIST) -yunmu_list = sorted(YUNMU_LIST) - -phrase_dict = {} - - -def load_phrase(filename): - phrasefile = open(filename, "r") - for line in phrasefile.readlines(): - line = line.rstrip(os.linesep) - (pinyin_str, freq) = line.split(None, 1) - freq = int(freq) - if 0 == freq: - #print(pinyin_str) - continue - - # no duplicate here - if "'" in pinyin_str: - (first_key, second_key) = pinyin_str.split("'") - phrase_dict[(first_key, second_key)] = freq - else: - phrase_dict[pinyin_str] = freq - phrasefile.close() - - -def gen_all_divided(): - for pinyin_key in pinyin_list: - for first_key in pinyin_list: - if len(pinyin_key) <= len(first_key): - continue - if not pinyin_key.startswith(first_key): - continue - second_key = pinyin_key[len(first_key):] - if second_key in pinyin_list: - yield pinyin_key, first_key, second_key - - -def filter_divided(): - for (pinyin_key, first_key, second_key) in gen_all_divided(): - if not (first_key, second_key) in phrase_dict: - continue - orig_freq = 0 - if pinyin_key in phrase_dict: - orig_freq = phrase_dict[pinyin_key] - new_freq = phrase_dict[(first_key, second_key)] - yield pinyin_key, orig_freq, first_key, second_key, new_freq - - -def gen_all_resplit(): - for pinyin_key in pinyin_list: - if pinyin_key[-1] in ["n", "g", "r"]: - for yun in yunmu_list: - if yun not in pinyin_list: - continue - #check first new pinyin key - if not pinyin_key[:-1] in pinyin_list: - continue - #check second new pinyin key - new_pinyin_key = pinyin_key[-1] + yun - if new_pinyin_key in pinyin_list: - yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key -''' - elif pinyin_key[-1] in ["e"]: - #check first new pinyin key - if pinyin_key[:-1] in pinyin_list: - yield pinyin_key, "r", pinyin_key[:-1], "er" -''' - - -def filter_resplit(): - for (orig_first_key, orig_second_key, new_first_key, new_second_key) \ - in gen_all_resplit(): - #do the reverse here, as libpinyin pinyin parser is different with - #ibus-pinyin's parser. - (orig_first_key, orig_second_key, new_first_key, new_second_key) = \ - (new_first_key, new_second_key, orig_first_key, orig_second_key) - if (new_first_key, new_second_key) not in phrase_dict: - continue - orig_freq = 0 - new_freq = phrase_dict[(new_first_key, new_second_key)] - if (orig_first_key, orig_second_key) in phrase_dict: - orig_freq = phrase_dict[(orig_first_key, orig_second_key)] - yield orig_first_key, orig_second_key, orig_freq, \ - new_first_key, new_second_key, new_freq - - #init full pinyin table code filter_pinyin_list() check_rules(hsu_correct, hsu_correct_special) @@ -498,10 +410,6 @@ check_rules(eten26_correct, eten26_correct_special) populate_more_zhuyin_index() sort_all() -#init resplit/divided table code -load_phrase("pinyins.txt") -#load_phrase("specials.txt") - ### main function ### if __name__ == "__main__": @@ -517,9 +425,3 @@ if __name__ == "__main__": s = gen_hsu_zhuyin_index() + gen_eten26_zhuyin_index() s = gen_table_index_for_chewing_key(content_table) print(s) - - #dump - for p in filter_divided(): - print (p) - for p in filter_resplit(): - print (p) diff --git a/scripts2/generateheader.py b/scripts2/generateheader.py index 901d682..dd54c29 100644 --- a/scripts2/generateheader.py +++ b/scripts2/generateheader.py @@ -20,9 +20,11 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +import os from argparse import ArgumentParser from chewing import gen_initials, gen_middles, gen_finals, gen_tones -from fullpinyintable import gen_content_table, gen_pinyin_index, gen_luoma_pinyin_index, gen_secondary_zhuyin_index, gen_zhuyin_index, gen_hsu_zhuyin_index, gen_eten26_zhuyin_index, gen_divided_table, gen_resplit_table, gen_chewing_key_table +from fullpinyintable import gen_content_table, gen_pinyin_index, gen_luoma_pinyin_index, gen_secondary_zhuyin_index, gen_zhuyin_index, gen_hsu_zhuyin_index, gen_eten26_zhuyin_index, gen_table_index_for_chewing_key +from specialtable import gen_divided_table, gen_resplit_table from doublepinyintable import gen_shengmu_table, gen_yunmu_table from bopomofotable import gen_chewing_symbols, gen_chewing_initials, gen_chewing_middles, gen_chewing_finals, gen_chewing_tones @@ -63,7 +65,7 @@ def get_table_content(tablename): if tablename == 'RESPLIT_TABLE': return gen_resplit_table() if tablename == 'TABLE_INDEX': - return gen_chewing_key_table() + return gen_table_index_for_chewing_key() #double pinyin table (scheme, part) = tablename.split('_', 1) @@ -108,6 +110,6 @@ if __name__ == "__main__": help='input file.') args = parser.parse_args() - print(args) + #print(args) expand_file(args.infile) diff --git a/scripts2/specialtable.py b/scripts2/specialtable.py new file mode 100644 index 0000000..800a7b1 --- /dev/null +++ b/scripts2/specialtable.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- +# vim:set et sts=4 sw=4: +# +# libpinyin - Library to deal with pinyin. +# +# Copyright (C) 2011 Peng Wu +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +import os +import sys +import math +import operator +from fullpinyin import PINYIN_LIST, SHENGMU_LIST, YUNMU_LIST + +pinyin_list = sorted(PINYIN_LIST) +shengmu_list = sorted(SHENGMU_LIST) +yunmu_list = sorted(YUNMU_LIST) + +phrase_dict = {} + + +def load_phrase(filename): + phrasefile = open(filename, "r") + for line in phrasefile.readlines(): + line = line.rstrip(os.linesep) + (pinyin_str, freq) = line.split(None, 1) + freq = int(freq) + if 0 == freq: + #print(pinyin_str) + continue + + # no duplicate here + if "'" in pinyin_str: + (first_key, second_key) = pinyin_str.split("'") + phrase_dict[(first_key, second_key)] = freq + else: + phrase_dict[pinyin_str] = freq + phrasefile.close() + + +#generate the list +def gen_all_divided(): + for pinyin_key in pinyin_list: + for first_key in pinyin_list: + if len(pinyin_key) <= len(first_key): + continue + if not pinyin_key.startswith(first_key): + continue + second_key = pinyin_key[len(first_key):] + if second_key in pinyin_list: + yield pinyin_key, first_key, second_key + + +def filter_divided(): + for (pinyin_key, first_key, second_key) in gen_all_divided(): + if not (first_key, second_key) in phrase_dict: + continue + orig_freq = 0 + if pinyin_key in phrase_dict: + orig_freq = phrase_dict[pinyin_key] + new_freq = phrase_dict[(first_key, second_key)] + yield pinyin_key, orig_freq, first_key, second_key, new_freq + + +def gen_all_resplit(): + for pinyin_key in pinyin_list: + if pinyin_key[-1] in ["n", "g", "r"]: + for yun in yunmu_list: + if yun not in pinyin_list: + continue + #check first new pinyin key + if not pinyin_key[:-1] in pinyin_list: + continue + #check second new pinyin key + new_pinyin_key = pinyin_key[-1] + yun + if new_pinyin_key in pinyin_list: + yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key +''' + elif pinyin_key[-1] in ["e"]: + #check first new pinyin key + if pinyin_key[:-1] in pinyin_list: + yield pinyin_key, "r", pinyin_key[:-1], "er" +''' + + +def filter_resplit(): + for (orig_first_key, orig_second_key, new_first_key, new_second_key) \ + in gen_all_resplit(): + #do the reverse here, as libpinyin pinyin parser is different with + #ibus-pinyin's parser. + (orig_first_key, orig_second_key, new_first_key, new_second_key) = \ + (new_first_key, new_second_key, orig_first_key, orig_second_key) + if (new_first_key, new_second_key) not in phrase_dict: + continue + orig_freq = 0 + new_freq = phrase_dict[(new_first_key, new_second_key)] + if (orig_first_key, orig_second_key) in phrase_dict: + orig_freq = phrase_dict[(orig_first_key, orig_second_key)] + yield orig_first_key, orig_second_key, orig_freq, \ + new_first_key, new_second_key, new_freq + + +#generate the table +divided_list = [] +resplit_list = [] + + +def sort_all(): + global divided_list, resplit_list + divided_list = sorted(divided_list, key=operator.itemgetter(0)) + resplit_list = sorted(resplit_list, key=operator.itemgetter(0, 1)) + + +def gen_divided_table(): + entries = [] + for (pinyin_key, orig_freq, first_key, second_key, new_freq) \ + in divided_list: + + if orig_freq >= new_freq: + assert orig_freq > 0, "Expected orig_freq > 0 here." + + entry = '{{"{0}", {1}, {{"{2}", "{3}"}}, {4}}}'.format \ + (pinyin_key, orig_freq, first_key, second_key, new_freq) + entries.append(entry) + return ',\n'.join(entries) + + +def gen_resplit_table(): + entries = [] + for (orig_first_key, orig_second_key, orig_freq, \ + new_first_key, new_second_key, new_freq) in resplit_list: + + if orig_freq >= new_freq: + assert orig_freq > 0, "Expected orig_freq > 0 here." + + entry = '{{{{"{0}", "{1}"}}, {2}, {{"{3}", "{4}"}}, {5}}}'.format \ + (orig_first_key, orig_second_key, orig_freq,\ + new_first_key, new_second_key, new_freq) + entries.append(entry) + return ',\n'.join(entries) + + +#init code +load_phrase("pinyins.txt") +#load_phrase("specials.txt") +divided_list = filter_divided() +resplit_list = filter_resplit() +sort_all() + + +if __name__ == "__main__": + for p in filter_divided(): + print (p) + for p in filter_resplit(): + print (p) + + s = gen_divided_table() + '\n' + gen_resplit_table() + print(s) -- cgit