summary | refs | log | tree | commit | diff | stats
path: root/scripts
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2013-08-08 13:41:12 +0800
committer Peng Wu <alexepico@gmail.com> 2013-08-08 13:51:21 +0800
commit 056c8a42c2d131ec30b80bbf921314693ee381f8 (patch)
tree 2e080518f8f1a3bd9dfa1b249aa05839b01ec37a /scripts
parent 6651303b1b28c3f57b5e6eafb732b9f19a9e27ae (diff)
download libzhuyin-056c8a42c2d131ec30b80bbf921314693ee381f8.tar.gz
libzhuyin-056c8a42c2d131ec30b80bbf921314693ee381f8.tar.xz
libzhuyin-056c8a42c2d131ec30b80bbf921314693ee381f8.zip
clean up scripts
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/Makefile.data 9
-rw-r--r-- scripts/genpinyinheader.py 5
-rw-r--r-- scripts/genpinyins.py 57
-rw-r--r-- scripts/genspecialtable.py 93
-rw-r--r-- scripts/specials.txt 0
-rw-r--r-- scripts/specialtable.py 123
6 files changed, 3 insertions, 284 deletions
diff --git a/scripts/Makefile.data b/scripts/Makefile.data
index 49f65b4..624db75 100644
--- a/scripts/Makefile.data
+++ b/scripts/Makefile.data
@@ -1,14 +1,7 @@
-all: pinyins.txt
-
-
-pinyins.txt:
- python3 genpinyins.py
+all:
update-header:
python3 genpinyinheader.py > ../src/storage/pinyin_parser_table.h
python3 genbopomofoheader.py > ../src/storage/chewing_table.h
python3 genchewingkey.py > ../src/storage/chewing_enum.h
-
-
-.PHONY: pinyins.txt
diff --git a/scripts/genpinyinheader.py b/scripts/genpinyinheader.py
index 81e0538..283c64a 100644
--- a/scripts/genpinyinheader.py
+++ b/scripts/genpinyinheader.py
@@ -24,7 +24,6 @@ from utils import expand_file
from genpinyintable import gen_content_table, \
gen_pinyin_index, gen_bopomofo_index, \
gen_chewing_key_table
-from genspecialtable import gen_divided_table, gen_resplit_table
def get_table_content(tablename):
if tablename == 'CONTENT_TABLE':
@@ -34,9 +33,9 @@ def get_table_content(tablename):
if tablename == 'BOPOMOFO_INDEX':
return gen_bopomofo_index()
if tablename == 'DIVIDED_TABLE':
- return gen_divided_table()
+ return ''
if tablename == 'RESPLIT_TABLE':
- return gen_resplit_table()
+ return ''
if tablename == 'TABLE_INDEX':
return gen_chewing_key_table()
diff --git a/scripts/genpinyins.py b/scripts/genpinyins.py
deleted file mode 100644
index fef40cd..0000000
--- a/scripts/genpinyins.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/python3
-import os
-from operator import itemgetter
-
-pinyin_dict = {}
-
-
-def strip_tone(old_pinyin_str):
- oldpinyins = old_pinyin_str.split("'")
- newpinyins = []
-
- for pinyin in oldpinyins:
- if pinyin[-1].isdigit():
- pinyin = pinyin[:-1]
- newpinyins.append(pinyin)
-
- new_pinyin_str = "'".join(newpinyins)
- return new_pinyin_str
-
-
-def add_pinyin_dict(pinyin, freq):
- if 0 == freq:
- return
- if not pinyin in pinyin_dict:
- pinyin_dict[pinyin] = freq
- else:
- pinyin_dict[pinyin] += freq
-
-
-def load_phrase(filename):
- phrasefile = open(filename, "r")
- for line in phrasefile.readlines():
- line = line.rstrip(os.linesep)
- (pinyin, word, token, freq) = line.split(None, 3)
- pinyin = strip_tone(pinyin)
- freq = int(freq)
-
- if len(word) in [1, 2]:
- add_pinyin_dict(pinyin, freq)
-
- phrasefile.close()
-
-load_phrase("../data/gb_char.table")
-load_phrase("../data/gbk_char.table")
-
-
-def save_pinyin(filename):
- pinyinfile = open(filename, "w")
- for pinyin, freq in pinyin_dict.items():
- freq = str(freq)
- line = "\t".join((pinyin, freq))
- pinyinfile.writelines([line, os.linesep])
- pinyinfile.close()
-
-
-if __name__ == "__main__":
- save_pinyin("pinyins.txt")
diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py
deleted file mode 100644
index 061f9d1..0000000
--- a/scripts/genspecialtable.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# -*- coding: utf-8 -*-
-# vim:set et sts=4 sw=4:
-#
-# libpinyin - Library to deal with pinyin.
-#
-# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-
-
-import operator
-import pinyin
-from pinyintable import get_chewing, get_shengmu_chewing
-from specialtable import *
-
-pinyin_list = sorted(pinyin.PINYIN_LIST)
-shengmu_list = sorted(pinyin.SHENGMU_LIST)
-
-divided_list = []
-resplit_list = []
-
-
-def sort_all():
- global divided_list, resplit_list
- divided_list = sorted(divided_list, key=operator.itemgetter(0))
- resplit_list = sorted(resplit_list, key=operator.itemgetter(0, 1))
-
-'''
-def get_chewing_string(pinyin):
- #handle shengmu
- if pinyin not in pinyin_list:
- if pinyin in shengmu_list:
- chewing_key = get_shengmu_chewing(pinyin)
- else:
- assert False, "Un-expected pinyin string."
- else:
- chewing_key = get_chewing(pinyin)
- chewing_str = 'ChewingKey({0})'.format(', '.join(chewing_key))
- return chewing_str
-'''
-
-def gen_divided_table():
- entries = []
- for (pinyin_key, orig_freq, first_key, second_key, new_freq) \
- in divided_list:
-
- if orig_freq >= new_freq:
- assert orig_freq > 0, "Expected orig_freq > 0 here."
-
- entry = '{{"{0}", {1}, {{"{2}", "{3}"}}, {4}}}'.format \
- (pinyin_key, orig_freq, first_key, second_key, new_freq)
- entries.append(entry)
- return ',\n'.join(entries)
-
-
-def gen_resplit_table():
- entries = []
- for (orig_first_key, orig_second_key, orig_freq, \
- new_first_key, new_second_key, new_freq) in resplit_list:
-
- if orig_freq >= new_freq:
- assert orig_freq > 0, "Expected orig_freq > 0 here."
-
- entry = '{{{{"{0}", "{1}"}}, {2}, {{"{3}", "{4}"}}, {5}}}'.format \
- (orig_first_key, orig_second_key, orig_freq,\
- new_first_key, new_second_key, new_freq)
- entries.append(entry)
- return ',\n'.join(entries)
-
-
-#init code, load lists
-divided_list = filter_divided()
-resplit_list = filter_resplit()
-sort_all()
-
-
-### main function ###
-if __name__ == "__main__":
- s = gen_divided_table() + '\n' + gen_resplit_table()
- print(s)
-
diff --git a/scripts/specials.txt b/scripts/specials.txt
deleted file mode 100644
index e69de29..0000000
--- a/scripts/specials.txt
+++ /dev/null
diff --git a/scripts/specialtable.py b/scripts/specialtable.py
deleted file mode 100644
index b6fb680..0000000
--- a/scripts/specialtable.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# -*- coding: utf-8 -*-
-# vim:set et sts=4 sw=4:
-#
-# libpinyin - Library to deal with pinyin.
-#
-# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-
-
-import os
-import sys
-import math
-import pinyin
-
-pinyin_list = sorted(pinyin.PINYIN_LIST)
-shengmu_list = sorted(pinyin.SHENGMU_LIST)
-yunmu_list = sorted(pinyin.YUNMU_LIST)
-
-phrase_dict = {}
-
-
-def load_phrase(filename):
- phrasefile = open(filename, "r")
- for line in phrasefile.readlines():
- line = line.rstrip(os.linesep)
- (pinyin_str, freq) = line.split(None, 1)
- freq = int(freq)
- if 0 == freq:
- #print(pinyin_str)
- continue
-
- # no duplicate here
- if "'" in pinyin_str:
- (first_key, second_key) = pinyin_str.split("'")
- phrase_dict[(first_key, second_key)] = freq
- else:
- phrase_dict[pinyin_str] = freq
- phrasefile.close()
-
-
-def gen_all_divided():
- for pinyin_key in pinyin_list:
- for first_key in pinyin_list:
- if len(pinyin_key) <= len(first_key):
- continue
- if not pinyin_key.startswith(first_key):
- continue
- second_key = pinyin_key[len(first_key):]
- if second_key in pinyin_list:
- yield pinyin_key, first_key, second_key
-
-
-def filter_divided():
- for (pinyin_key, first_key, second_key) in gen_all_divided():
- if not (first_key, second_key) in phrase_dict:
- continue
- orig_freq = 0
- if pinyin_key in phrase_dict:
- orig_freq = phrase_dict[pinyin_key]
- new_freq = phrase_dict[(first_key, second_key)]
- yield pinyin_key, orig_freq, first_key, second_key, new_freq
-
-
-def gen_all_resplit():
- for pinyin_key in pinyin_list:
- if pinyin_key[-1] in ["n", "g", "r"]:
- for yun in yunmu_list:
- if yun not in pinyin_list:
- continue
- #check first new pinyin key
- if not pinyin_key[:-1] in pinyin_list:
- continue
- #check second new pinyin key
- new_pinyin_key = pinyin_key[-1] + yun
- if new_pinyin_key in pinyin_list:
- yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key
-'''
- elif pinyin_key[-1] in ["e"]:
- #check first new pinyin key
- if pinyin_key[:-1] in pinyin_list:
- yield pinyin_key, "r", pinyin_key[:-1], "er"
-'''
-
-
-def filter_resplit():
- for (orig_first_key, orig_second_key, new_first_key, new_second_key) \
- in gen_all_resplit():
- #do the reverse here, as libpinyin pinyin parser is different with
- #ibus-pinyin's parser.
- (orig_first_key, orig_second_key, new_first_key, new_second_key) = \
- (new_first_key, new_second_key, orig_first_key, orig_second_key)
- if (new_first_key, new_second_key) not in phrase_dict:
- continue
- orig_freq = 0
- new_freq = phrase_dict[(new_first_key, new_second_key)]
- if (orig_first_key, orig_second_key) in phrase_dict:
- orig_freq = phrase_dict[(orig_first_key, orig_second_key)]
- yield orig_first_key, orig_second_key, orig_freq, \
- new_first_key, new_second_key, new_freq
-
-
-#init code
-load_phrase("pinyins.txt")
-load_phrase("specials.txt")
-
-if __name__ == "__main__":
- for p in filter_divided():
- print (p)
- for p in filter_resplit():
- print (p)