From 056c8a42c2d131ec30b80bbf921314693ee381f8 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 8 Aug 2013 13:41:12 +0800 Subject: clean up scripts --- scripts/Makefile.data | 9 +--- scripts/genpinyinheader.py | 5 +- scripts/genpinyins.py | 57 --------------------- scripts/genspecialtable.py | 93 ---------------------------------- scripts/specials.txt | 0 scripts/specialtable.py | 123 --------------------------------------------- 6 files changed, 3 insertions(+), 284 deletions(-) delete mode 100644 scripts/genpinyins.py delete mode 100644 scripts/genspecialtable.py delete mode 100644 scripts/specials.txt delete mode 100644 scripts/specialtable.py (limited to 'scripts') diff --git a/scripts/Makefile.data b/scripts/Makefile.data index 49f65b4..624db75 100644 --- a/scripts/Makefile.data +++ b/scripts/Makefile.data @@ -1,14 +1,7 @@ -all: pinyins.txt - - -pinyins.txt: - python3 genpinyins.py +all: update-header: python3 genpinyinheader.py > ../src/storage/pinyin_parser_table.h python3 genbopomofoheader.py > ../src/storage/chewing_table.h python3 genchewingkey.py > ../src/storage/chewing_enum.h - - -.PHONY: pinyins.txt diff --git a/scripts/genpinyinheader.py b/scripts/genpinyinheader.py index 81e0538..283c64a 100644 --- a/scripts/genpinyinheader.py +++ b/scripts/genpinyinheader.py @@ -24,7 +24,6 @@ from utils import expand_file from genpinyintable import gen_content_table, \ gen_pinyin_index, gen_bopomofo_index, \ gen_chewing_key_table -from genspecialtable import gen_divided_table, gen_resplit_table def get_table_content(tablename): if tablename == 'CONTENT_TABLE': @@ -34,9 +33,9 @@ def get_table_content(tablename): if tablename == 'BOPOMOFO_INDEX': return gen_bopomofo_index() if tablename == 'DIVIDED_TABLE': - return gen_divided_table() + return '' if tablename == 'RESPLIT_TABLE': - return gen_resplit_table() + return '' if tablename == 'TABLE_INDEX': return gen_chewing_key_table() diff --git a/scripts/genpinyins.py b/scripts/genpinyins.py deleted file mode 100644 index fef40cd..0000000 --- a/scripts/genpinyins.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/python3 -import os -from operator import itemgetter - -pinyin_dict = {} - - -def strip_tone(old_pinyin_str): - oldpinyins = old_pinyin_str.split("'") - newpinyins = [] - - for pinyin in oldpinyins: - if pinyin[-1].isdigit(): - pinyin = pinyin[:-1] - newpinyins.append(pinyin) - - new_pinyin_str = "'".join(newpinyins) - return new_pinyin_str - - -def add_pinyin_dict(pinyin, freq): - if 0 == freq: - return - if not pinyin in pinyin_dict: - pinyin_dict[pinyin] = freq - else: - pinyin_dict[pinyin] += freq - - -def load_phrase(filename): - phrasefile = open(filename, "r") - for line in phrasefile.readlines(): - line = line.rstrip(os.linesep) - (pinyin, word, token, freq) = line.split(None, 3) - pinyin = strip_tone(pinyin) - freq = int(freq) - - if len(word) in [1, 2]: - add_pinyin_dict(pinyin, freq) - - phrasefile.close() - -load_phrase("../data/gb_char.table") -load_phrase("../data/gbk_char.table") - - -def save_pinyin(filename): - pinyinfile = open(filename, "w") - for pinyin, freq in pinyin_dict.items(): - freq = str(freq) - line = "\t".join((pinyin, freq)) - pinyinfile.writelines([line, os.linesep]) - pinyinfile.close() - - -if __name__ == "__main__": - save_pinyin("pinyins.txt") diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py deleted file mode 100644 index 061f9d1..0000000 --- a/scripts/genspecialtable.py +++ /dev/null @@ -1,93 +0,0 @@ -# -*- coding: utf-8 -*- -# vim:set et sts=4 sw=4: -# -# libpinyin - Library to deal with pinyin. -# -# Copyright (C) 2011 Peng Wu -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - - -import operator -import pinyin -from pinyintable import get_chewing, get_shengmu_chewing -from specialtable import * - -pinyin_list = sorted(pinyin.PINYIN_LIST) -shengmu_list = sorted(pinyin.SHENGMU_LIST) - -divided_list = [] -resplit_list = [] - - -def sort_all(): - global divided_list, resplit_list - divided_list = sorted(divided_list, key=operator.itemgetter(0)) - resplit_list = sorted(resplit_list, key=operator.itemgetter(0, 1)) - -''' -def get_chewing_string(pinyin): - #handle shengmu - if pinyin not in pinyin_list: - if pinyin in shengmu_list: - chewing_key = get_shengmu_chewing(pinyin) - else: - assert False, "Un-expected pinyin string." - else: - chewing_key = get_chewing(pinyin) - chewing_str = 'ChewingKey({0})'.format(', '.join(chewing_key)) - return chewing_str -''' - -def gen_divided_table(): - entries = [] - for (pinyin_key, orig_freq, first_key, second_key, new_freq) \ - in divided_list: - - if orig_freq >= new_freq: - assert orig_freq > 0, "Expected orig_freq > 0 here." - - entry = '{{"{0}", {1}, {{"{2}", "{3}"}}, {4}}}'.format \ - (pinyin_key, orig_freq, first_key, second_key, new_freq) - entries.append(entry) - return ',\n'.join(entries) - - -def gen_resplit_table(): - entries = [] - for (orig_first_key, orig_second_key, orig_freq, \ - new_first_key, new_second_key, new_freq) in resplit_list: - - if orig_freq >= new_freq: - assert orig_freq > 0, "Expected orig_freq > 0 here." - - entry = '{{{{"{0}", "{1}"}}, {2}, {{"{3}", "{4}"}}, {5}}}'.format \ - (orig_first_key, orig_second_key, orig_freq,\ - new_first_key, new_second_key, new_freq) - entries.append(entry) - return ',\n'.join(entries) - - -#init code, load lists -divided_list = filter_divided() -resplit_list = filter_resplit() -sort_all() - - -### main function ### -if __name__ == "__main__": - s = gen_divided_table() + '\n' + gen_resplit_table() - print(s) - diff --git a/scripts/specials.txt b/scripts/specials.txt deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/specialtable.py b/scripts/specialtable.py deleted file mode 100644 index b6fb680..0000000 --- a/scripts/specialtable.py +++ /dev/null @@ -1,123 +0,0 @@ -# -*- coding: utf-8 -*- -# vim:set et sts=4 sw=4: -# -# libpinyin - Library to deal with pinyin. -# -# Copyright (C) 2011 Peng Wu -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - - -import os -import sys -import math -import pinyin - -pinyin_list = sorted(pinyin.PINYIN_LIST) -shengmu_list = sorted(pinyin.SHENGMU_LIST) -yunmu_list = sorted(pinyin.YUNMU_LIST) - -phrase_dict = {} - - -def load_phrase(filename): - phrasefile = open(filename, "r") - for line in phrasefile.readlines(): - line = line.rstrip(os.linesep) - (pinyin_str, freq) = line.split(None, 1) - freq = int(freq) - if 0 == freq: - #print(pinyin_str) - continue - - # no duplicate here - if "'" in pinyin_str: - (first_key, second_key) = pinyin_str.split("'") - phrase_dict[(first_key, second_key)] = freq - else: - phrase_dict[pinyin_str] = freq - phrasefile.close() - - -def gen_all_divided(): - for pinyin_key in pinyin_list: - for first_key in pinyin_list: - if len(pinyin_key) <= len(first_key): - continue - if not pinyin_key.startswith(first_key): - continue - second_key = pinyin_key[len(first_key):] - if second_key in pinyin_list: - yield pinyin_key, first_key, second_key - - -def filter_divided(): - for (pinyin_key, first_key, second_key) in gen_all_divided(): - if not (first_key, second_key) in phrase_dict: - continue - orig_freq = 0 - if pinyin_key in phrase_dict: - orig_freq = phrase_dict[pinyin_key] - new_freq = phrase_dict[(first_key, second_key)] - yield pinyin_key, orig_freq, first_key, second_key, new_freq - - -def gen_all_resplit(): - for pinyin_key in pinyin_list: - if pinyin_key[-1] in ["n", "g", "r"]: - for yun in yunmu_list: - if yun not in pinyin_list: - continue - #check first new pinyin key - if not pinyin_key[:-1] in pinyin_list: - continue - #check second new pinyin key - new_pinyin_key = pinyin_key[-1] + yun - if new_pinyin_key in pinyin_list: - yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key -''' - elif pinyin_key[-1] in ["e"]: - #check first new pinyin key - if pinyin_key[:-1] in pinyin_list: - yield pinyin_key, "r", pinyin_key[:-1], "er" -''' - - -def filter_resplit(): - for (orig_first_key, orig_second_key, new_first_key, new_second_key) \ - in gen_all_resplit(): - #do the reverse here, as libpinyin pinyin parser is different with - #ibus-pinyin's parser. - (orig_first_key, orig_second_key, new_first_key, new_second_key) = \ - (new_first_key, new_second_key, orig_first_key, orig_second_key) - if (new_first_key, new_second_key) not in phrase_dict: - continue - orig_freq = 0 - new_freq = phrase_dict[(new_first_key, new_second_key)] - if (orig_first_key, orig_second_key) in phrase_dict: - orig_freq = phrase_dict[(orig_first_key, orig_second_key)] - yield orig_first_key, orig_second_key, orig_freq, \ - new_first_key, new_second_key, new_freq - - -#init code -load_phrase("pinyins.txt") -load_phrase("specials.txt") - -if __name__ == "__main__": - for p in filter_divided(): - print (p) - for p in filter_resplit(): - print (p) -- cgit