merge specialtable.py and genspecialtable.py into specialtable.py

author: Peng Wu <alexepico@gmail.com> 2015-09-08 15:51:12 +0800
committer: Peng Wu <alexepico@gmail.com> 2015-09-08 15:53:20 +0800
commit: f40946027ed3bd50d5cce100564340d2b9161c0d (patch)
tree: 9bb15730fa7b0ad224d821ab9eb3754221fefa7a /scripts2
parent: dff380f2847a030ae1e2db4c26c2316cf70fa7ca (diff)
4 files changed, 178 insertions, 103 deletions
diff --git a/scripts2/bopomofotable.py b/scripts2/bopomofotable.py
index e9beb1d..046a52d 100644
--- a/scripts2/bopomofotable.py
+++ b/scripts2/bopomofotable.py
@@ -23,7 +23,6 @@
 
 import os
 from operator import itemgetter
-from utils import expand_file
 from bopomofo import *
 
 def escape_char(ch):
diff --git a/scripts2/fullpinyintable.py b/scripts2/fullpinyintable.py
index 6ad05be..b8cb1a6 100644
--- a/scripts2/fullpinyintable.py
+++ b/scripts2/fullpinyintable.py
@@ -25,7 +25,7 @@ import itertools
 import chewing
 from pyzymap import ZHUYIN_PINYIN_MAP, ZHUYIN_LUOMA_PINYIN_MAP, ZHUYIN_SECONDARY_ZHUYIN_MAP
 from pyzymap import PINYIN_ZHUYIN_MAP, ZHUYIN_SPECIAL_INITIAL_SET_IN_PINYIN_FORM
-from fullpinyin import PINYIN_LIST, SHENGMU_LIST, YUNMU_LIST
+from fullpinyin import PINYIN_LIST, SHENGMU_LIST
 from options import *
 from utils import shuffle_all
 
@@ -403,94 +403,6 @@ def gen_table_index_for_chewing_key(content_table):
     return ",\n".join(entries)
 
 
-#special table
-pinyin_list = sorted(PINYIN_LIST)
-shengmu_list = sorted(SHENGMU_LIST)
-yunmu_list = sorted(YUNMU_LIST)
-
-phrase_dict = {}
-
-
-def load_phrase(filename):
-    phrasefile = open(filename, "r")
-    for line in phrasefile.readlines():
-        line = line.rstrip(os.linesep)
-        (pinyin_str, freq) = line.split(None, 1)
-        freq = int(freq)
-        if 0 == freq:
-            #print(pinyin_str)
-            continue
-
-        # no duplicate here
-        if "'" in pinyin_str:
-            (first_key, second_key) = pinyin_str.split("'")
-            phrase_dict[(first_key, second_key)] = freq
-        else:
-            phrase_dict[pinyin_str] = freq
-    phrasefile.close()
-
-
-def gen_all_divided():
-    for pinyin_key in pinyin_list:
-        for first_key in pinyin_list:
-            if len(pinyin_key) <= len(first_key):
-                continue
-            if not pinyin_key.startswith(first_key):
-                continue
-            second_key = pinyin_key[len(first_key):]
-            if second_key in pinyin_list:
-                yield pinyin_key, first_key, second_key
-
-
-def filter_divided():
-    for (pinyin_key, first_key, second_key) in gen_all_divided():
-        if not (first_key, second_key) in phrase_dict:
-            continue
-        orig_freq = 0
-        if pinyin_key in phrase_dict:
-            orig_freq = phrase_dict[pinyin_key]
-        new_freq = phrase_dict[(first_key, second_key)]
-        yield pinyin_key, orig_freq, first_key, second_key, new_freq
-
-
-def gen_all_resplit():
-    for pinyin_key in pinyin_list:
-        if pinyin_key[-1] in ["n", "g", "r"]:
-            for yun in yunmu_list:
-                if yun not in pinyin_list:
-                    continue
-                #check first new pinyin key
-                if not pinyin_key[:-1] in pinyin_list:
-                    continue
-                #check second new pinyin key
-                new_pinyin_key = pinyin_key[-1] + yun
-                if new_pinyin_key in pinyin_list:
-                    yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key
-'''
-        elif pinyin_key[-1] in ["e"]:
-            #check first new pinyin key
-            if pinyin_key[:-1] in pinyin_list:
-                yield pinyin_key, "r", pinyin_key[:-1], "er"
-'''
-
-
-def filter_resplit():
-    for (orig_first_key, orig_second_key, new_first_key, new_second_key) \
-    in gen_all_resplit():
-        #do the reverse here, as libpinyin pinyin parser is different with
-        #ibus-pinyin's parser.
-        (orig_first_key, orig_second_key, new_first_key, new_second_key) = \
-            (new_first_key, new_second_key, orig_first_key, orig_second_key)
-        if (new_first_key, new_second_key) not in phrase_dict:
-            continue
-        orig_freq = 0
-        new_freq = phrase_dict[(new_first_key, new_second_key)]
-        if (orig_first_key, orig_second_key) in phrase_dict:
-            orig_freq = phrase_dict[(orig_first_key, orig_second_key)]
-        yield orig_first_key, orig_second_key, orig_freq, \
-        new_first_key, new_second_key, new_freq
-
-
 #init full pinyin table code
 filter_pinyin_list()
 check_rules(hsu_correct, hsu_correct_special)
@@ -498,10 +410,6 @@ check_rules(eten26_correct, eten26_correct_special)
 populate_more_zhuyin_index()
 sort_all()
 
-#init resplit/divided table code
-load_phrase("pinyins.txt")
-#load_phrase("specials.txt")
-
 
 ### main function ###
 if __name__ == "__main__":
@@ -517,9 +425,3 @@ if __name__ == "__main__":
     s = gen_hsu_zhuyin_index() + gen_eten26_zhuyin_index()
     s = gen_table_index_for_chewing_key(content_table)
     print(s)
-
-    #dump
-    for p in filter_divided():
-        print (p)
-    for p in filter_resplit():
-        print (p)
diff --git a/scripts2/generateheader.py b/scripts2/generateheader.py
index 901d682..dd54c29 100644
--- a/scripts2/generateheader.py
+++ b/scripts2/generateheader.py
@@ -20,9 +20,11 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 
 
+import os
 from argparse import ArgumentParser
 from chewing import gen_initials, gen_middles, gen_finals, gen_tones
-from fullpinyintable import gen_content_table, gen_pinyin_index, gen_luoma_pinyin_index, gen_secondary_zhuyin_index, gen_zhuyin_index, gen_hsu_zhuyin_index, gen_eten26_zhuyin_index, gen_divided_table, gen_resplit_table, gen_chewing_key_table
+from fullpinyintable import gen_content_table, gen_pinyin_index, gen_luoma_pinyin_index, gen_secondary_zhuyin_index, gen_zhuyin_index, gen_hsu_zhuyin_index, gen_eten26_zhuyin_index, gen_table_index_for_chewing_key
+from specialtable import gen_divided_table, gen_resplit_table
 from doublepinyintable import gen_shengmu_table, gen_yunmu_table
 from bopomofotable import gen_chewing_symbols, gen_chewing_initials, gen_chewing_middles, gen_chewing_finals, gen_chewing_tones
 
@@ -63,7 +65,7 @@ def get_table_content(tablename):
     if tablename == 'RESPLIT_TABLE':
         return gen_resplit_table()
     if tablename == 'TABLE_INDEX':
-        return gen_chewing_key_table()
+        return gen_table_index_for_chewing_key()
 
     #double pinyin table
     (scheme, part) = tablename.split('_', 1)
@@ -108,6 +110,6 @@ if __name__ == "__main__":
                         help='input file.')
 
     args = parser.parse_args()
-    print(args)
+    #print(args)
     expand_file(args.infile)
 
diff --git a/scripts2/specialtable.py b/scripts2/specialtable.py
new file mode 100644
index 0000000..800a7b1
--- /dev/null
+++ b/scripts2/specialtable.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+# vim:set et sts=4 sw=4:
+#
+# libpinyin - Library to deal with pinyin.
+#
+# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+
+import os
+import sys
+import math
+import operator
+from fullpinyin import PINYIN_LIST, SHENGMU_LIST, YUNMU_LIST
+
+pinyin_list = sorted(PINYIN_LIST)
+shengmu_list = sorted(SHENGMU_LIST)
+yunmu_list = sorted(YUNMU_LIST)
+
+phrase_dict = {}
+
+
+def load_phrase(filename):
+    phrasefile = open(filename, "r")
+    for line in phrasefile.readlines():
+        line = line.rstrip(os.linesep)
+        (pinyin_str, freq) = line.split(None, 1)
+        freq = int(freq)
+        if 0 == freq:
+            #print(pinyin_str)
+            continue
+
+        # no duplicate here
+        if "'" in pinyin_str:
+            (first_key, second_key) = pinyin_str.split("'")
+            phrase_dict[(first_key, second_key)] = freq
+        else:
+            phrase_dict[pinyin_str] = freq
+    phrasefile.close()
+
+
+#generate the list
+def gen_all_divided():
+    for pinyin_key in pinyin_list:
+        for first_key in pinyin_list:
+            if len(pinyin_key) <= len(first_key):
+                continue
+            if not pinyin_key.startswith(first_key):
+                continue
+            second_key = pinyin_key[len(first_key):]
+            if second_key in pinyin_list:
+                yield pinyin_key, first_key, second_key
+
+
+def filter_divided():
+    for (pinyin_key, first_key, second_key) in gen_all_divided():
+        if not (first_key, second_key) in phrase_dict:
+            continue
+        orig_freq = 0
+        if pinyin_key in phrase_dict:
+            orig_freq = phrase_dict[pinyin_key]
+        new_freq = phrase_dict[(first_key, second_key)]
+        yield pinyin_key, orig_freq, first_key, second_key, new_freq
+
+
+def gen_all_resplit():
+    for pinyin_key in pinyin_list:
+        if pinyin_key[-1] in ["n", "g", "r"]:
+            for yun in yunmu_list:
+                if yun not in pinyin_list:
+                    continue
+                #check first new pinyin key
+                if not pinyin_key[:-1] in pinyin_list:
+                    continue
+                #check second new pinyin key
+                new_pinyin_key = pinyin_key[-1] + yun
+                if new_pinyin_key in pinyin_list:
+                    yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key
+'''
+        elif pinyin_key[-1] in ["e"]:
+            #check first new pinyin key
+            if pinyin_key[:-1] in pinyin_list:
+                yield pinyin_key, "r", pinyin_key[:-1], "er"
+'''
+
+
+def filter_resplit():
+    for (orig_first_key, orig_second_key, new_first_key, new_second_key) \
+    in gen_all_resplit():
+        #do the reverse here, as libpinyin pinyin parser is different with
+        #ibus-pinyin's parser.
+        (orig_first_key, orig_second_key, new_first_key, new_second_key) = \
+            (new_first_key, new_second_key, orig_first_key, orig_second_key)
+        if (new_first_key, new_second_key) not in phrase_dict:
+            continue
+        orig_freq = 0
+        new_freq = phrase_dict[(new_first_key, new_second_key)]
+        if (orig_first_key, orig_second_key) in phrase_dict:
+            orig_freq = phrase_dict[(orig_first_key, orig_second_key)]
+        yield orig_first_key, orig_second_key, orig_freq, \
+        new_first_key, new_second_key, new_freq
+
+
+#generate the table
+divided_list = []
+resplit_list = []
+
+
+def sort_all():
+    global divided_list, resplit_list
+    divided_list = sorted(divided_list, key=operator.itemgetter(0))
+    resplit_list = sorted(resplit_list, key=operator.itemgetter(0, 1))
+
+
+def gen_divided_table():
+    entries = []
+    for (pinyin_key, orig_freq, first_key, second_key, new_freq) \
+            in divided_list:
+
+        if orig_freq >= new_freq:
+            assert orig_freq > 0, "Expected orig_freq > 0 here."
+
+        entry = '{{"{0}", {1}, {{"{2}", "{3}"}}, {4}}}'.format \
+            (pinyin_key, orig_freq, first_key, second_key, new_freq)
+        entries.append(entry)
+    return ',\n'.join(entries)
+
+
+def gen_resplit_table():
+    entries = []
+    for (orig_first_key, orig_second_key, orig_freq, \
+        new_first_key, new_second_key, new_freq) in resplit_list:
+
+        if orig_freq >= new_freq:
+            assert orig_freq > 0, "Expected orig_freq > 0 here."
+
+        entry = '{{{{"{0}", "{1}"}}, {2}, {{"{3}", "{4}"}}, {5}}}'.format \
+            (orig_first_key, orig_second_key, orig_freq,\
+                 new_first_key, new_second_key, new_freq)
+        entries.append(entry)
+    return ',\n'.join(entries)
+
+
+#init code
+load_phrase("pinyins.txt")
+#load_phrase("specials.txt")
+divided_list = filter_divided()
+resplit_list = filter_resplit()
+sort_all()
+
+
+if __name__ == "__main__":
+    for p in filter_divided():
+        print (p)
+    for p in filter_resplit():
+        print (p)
+
+    s = gen_divided_table() + '\n' + gen_resplit_table()
+    print(s)
author	Peng Wu <alexepico@gmail.com>	2015-09-08 15:51:12 +0800
committer	Peng Wu <alexepico@gmail.com>	2015-09-08 15:53:20 +0800
commit	f40946027ed3bd50d5cce100564340d2b9161c0d (patch)
tree	9bb15730fa7b0ad224d821ab9eb3754221fefa7a /scripts2
parent	dff380f2847a030ae1e2db4c26c2316cf70fa7ca (diff)