# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (C) 2011 Peng Wu
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

"""Build the "divided" and "resplit" pinyin tables from phrase frequencies.

Importing this module reads ``pinyins.txt`` from the current directory and
fills ``divided_list`` and ``resplit_list``; running it as a script prints
the candidate tuples followed by the two generated tables.
"""

import os
import sys
import math
import operator
from fullpinyin import PINYIN_LIST, SHENGMU_LIST, YUNMU_LIST
from fullpinyintable import content_table


pinyin_list = sorted(PINYIN_LIST)
shengmu_list = sorted(SHENGMU_LIST)
yunmu_list = sorted(YUNMU_LIST)

# Maps either a single pinyin string or a (first_key, second_key) tuple to
# its integer frequency.  Filled by load_phrase().
phrase_dict = {}


def load_phrase(filename):
    """Read pinyin frequencies from *filename* into ``phrase_dict``.

    Every non-blank line must consist of a pinyin string and an integer
    frequency separated by whitespace.  A pinyin string containing an
    apostrophe is split on it and stored under the tuple
    ``(first_key, second_key)``; any other string is stored as-is.
    Entries whose frequency is zero are ignored.

    Raises ValueError when a line does not have two fields, when the
    frequency is not an integer, or when the pinyin string does not split
    into exactly two parts on the apostrophe.
    """
    # The context manager closes the file even if a malformed line raises.
    with open(filename, "r") as phrasefile:
        for line in phrasefile:
            fields = line.split(None, 1)
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            (pinyin_str, freq) = fields
            freq = int(freq)
            if 0 == freq:
                continue

            # The input is expected to hold no duplicates; a repeated key
            # simply overwrites the earlier frequency.
            if "'" in pinyin_str:
                (first_key, second_key) = pinyin_str.split("'")
                phrase_dict[(first_key, second_key)] = freq
            else:
                phrase_dict[pinyin_str] = freq


#generate the list
def gen_all_divided():
    """Yield ``(pinyin_key, first_key, second_key)`` for every divisible key.

    A key from ``pinyin_list`` is divisible when it can be cut into a
    proper prefix and a remainder that are both in ``pinyin_list``.
    Results follow the order of ``pinyin_list`` for both the full key and
    the prefix.
    """
    # Set lookup replaces a linear scan of the list inside the nested loop.
    known_keys = set(pinyin_list)
    for pinyin_key in pinyin_list:
        for first_key in pinyin_list:
            if len(pinyin_key) <= len(first_key):
                continue
            if not pinyin_key.startswith(first_key):
                continue
            second_key = pinyin_key[len(first_key):]
            if second_key in known_keys:
                yield pinyin_key, first_key, second_key


def filter_divided():
    """Yield the divisions whose two-key form has a recorded frequency.

    Each item is ``(pinyin_key, orig_freq, first_key, second_key,
    new_freq)`` where ``orig_freq`` is the frequency of the undivided key
    (0 when it is absent from ``phrase_dict``) and ``new_freq`` is the
    frequency of the ``(first_key, second_key)`` pair.
    """
    for (pinyin_key, first_key, second_key) in gen_all_divided():
        if (first_key, second_key) not in phrase_dict:
            continue
        orig_freq = phrase_dict.get(pinyin_key, 0)
        new_freq = phrase_dict[(first_key, second_key)]
        yield pinyin_key, orig_freq, first_key, second_key, new_freq


def gen_all_resplit():
    """Yield every way of moving a trailing n/g/r onto a following key.

    For a key ending in "n", "g" or "r" whose remainder (the key without
    its last letter) is itself in ``pinyin_list``, and for every entry
    ``yun`` of ``yunmu_list`` that is also in ``pinyin_list``, yield
    ``(pinyin_key, yun, pinyin_key[:-1], last_letter + yun)`` whenever the
    last element is in ``pinyin_list`` too.
    """
    known_keys = set(pinyin_list)
    for pinyin_key in pinyin_list:
        if pinyin_key[-1] not in ("n", "g", "r"):
            continue

        #check first new pinyin key (independent of yun, so tested once)
        shortened_key = pinyin_key[:-1]
        if shortened_key not in known_keys:
            continue

        for yun in yunmu_list:
            if yun not in known_keys:
                continue

            #check second new pinyin key
            new_pinyin_key = pinyin_key[-1] + yun
            if new_pinyin_key in known_keys:
                yield pinyin_key, yun, shortened_key, new_pinyin_key

        # Disabled rule, kept for reference:
        #   elif pinyin_key[-1] in ["e"]:
        #       #check first new pinyin key
        #       if pinyin_key[:-1] in pinyin_list:
        #           yield pinyin_key, "r", pinyin_key[:-1], "er"


def filter_resplit():
    """Yield the resplit candidates whose new form has a recorded frequency.

    Each item is ``(orig_first_key, orig_second_key, orig_freq,
    new_first_key, new_second_key, new_freq)``.  ``orig_freq`` is 0 when
    the original pair is absent from ``phrase_dict``.
    """
    # Do the reverse here, as the libpinyin pinyin parser differs from
    # ibus-pinyin's parser: the first pair produced by gen_all_resplit()
    # is treated as the *new* split and the second pair as the *original*.
    for (new_first_key, new_second_key, orig_first_key, orig_second_key) \
            in gen_all_resplit():
        if (new_first_key, new_second_key) not in phrase_dict:
            continue
        new_freq = phrase_dict[(new_first_key, new_second_key)]
        orig_freq = phrase_dict.get((orig_first_key, orig_second_key), 0)
        yield orig_first_key, orig_second_key, orig_freq, \
            new_first_key, new_second_key, new_freq


#generate the table
divided_list = []
resplit_list = []


def sort_all():
    """Replace both module-level lists with sorted lists.

    ``divided_list`` is ordered by its first field and ``resplit_list`` by
    its first two fields.  Any iterable (including a generator) may be
    bound to the globals beforehand; afterwards both are real lists.
    """
    global divided_list, resplit_list
    divided_list = sorted(divided_list, key=operator.itemgetter(0))
    resplit_list = sorted(resplit_list, key=operator.itemgetter(0, 1))


def get_chewing_key(pinyin):
    """Return the chewing key of *pinyin*, or None when it is unknown.

    The first row of ``content_table`` whose item[0] equals *pinyin* wins;
    item[4] of that row is the chewing key.
    """
    for item in content_table:
        if pinyin == item[0]:
            return item[4]
    return None


def gen_divided_table():
    """Return the divided table as brace-delimited entries joined by ",\\n".

    One entry is produced per item of ``divided_list``.
    """
    entries = []
    for (pinyin_key, orig_freq, first_key, second_key, new_freq) \
            in divided_list:
        # Sanity check only; the entry is emitted for every item.
        # NOTE(review): confirm the entry is not meant to be conditional.
        if orig_freq >= new_freq:
            assert orig_freq > 0, "Expected orig_freq > 0 here."

        entry = '{{"{0}", {1}, {2}, {{"{3}", "{4}"}}, {{{5}, {6}}}, {7}}}'.format(
            pinyin_key, get_chewing_key(pinyin_key), orig_freq,
            first_key, second_key,
            get_chewing_key(first_key), get_chewing_key(second_key),
            new_freq)
        entries.append(entry)
    return ',\n'.join(entries)


def gen_resplit_table():
    """Return the resplit table as brace-delimited entries joined by ",\\n".

    One entry is produced per item of ``resplit_list``.
    """
    entries = []
    for (orig_first_key, orig_second_key, orig_freq,
         new_first_key, new_second_key, new_freq) in resplit_list:
        # Sanity check only; the entry is emitted for every item.
        # NOTE(review): confirm the entry is not meant to be conditional.
        if orig_freq >= new_freq:
            assert orig_freq > 0, "Expected orig_freq > 0 here."

        entry = '{{{{"{0}", "{1}"}}, {{{2}, {3}}}, {4}, {{"{5}", "{6}"}}, {{{7}, {8}}}, {9}}}'.format(
            orig_first_key, orig_second_key,
            get_chewing_key(orig_first_key),
            get_chewing_key(orig_second_key), orig_freq,
            new_first_key, new_second_key,
            get_chewing_key(new_first_key),
            get_chewing_key(new_second_key), new_freq)
        entries.append(entry)
    return ',\n'.join(entries)


#init code
load_phrase("pinyins.txt")
#load_phrase("specials.txt")
divided_list = filter_divided()
resplit_list = filter_resplit()
sort_all()


if __name__ == "__main__":
    for p in filter_divided():
        print (p)
    for p in filter_resplit():
        print (p)
    s = gen_divided_table() + '\n' + gen_resplit_table()
    print(s)