write gen special (work in progress)

author: Peng Wu <alexepico@gmail.com> 2011-10-26 15:23:24 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-10-26 15:23:24 +0800
commit: 746e6b11a88ce3b120b052253f5773f361c03b16 (patch)
tree: 5fded825a1559a0f5139653cb5db79c61d55483c /scripts
parent: b5f85c8f94097e34a91e6a74cc4f0e3736aa2c2f (diff)
download: libpinyin-746e6b11a88ce3b120b052253f5773f361c03b16.tar.gz
libpinyin-746e6b11a88ce3b120b052253f5773f361c03b16.tar.xz
libpinyin-746e6b11a88ce3b120b052253f5773f361c03b16.zip
2 files changed, 66 insertions, 11 deletions
diff --git a/scripts/genpytable.py b/scripts/genpytable.py
index faf90f9..f64f6bf 100644
--- a/scripts/genpytable.py
+++ b/scripts/genpytable.py
@@ -81,7 +81,7 @@ def get_chewing(pinyin_key):
 
     return initial, middle, final
 
-def get_pinyin_list():
+def gen_pinyin_list():
     for p in itertools.chain(gen_pinyins(),
                              gen_shengmu(),
                              gen_corrects(),
@@ -145,5 +145,5 @@ if __name__ == "__main__":
     check_pinyin_chewing_map()
 
     #dump
-    for pinyin_key in get_pinyin_list():
+    for pinyin_key in gen_pinyin_list():
         print (pinyin_key)
diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py
index bc92d0b..bb3f061 100644
--- a/scripts/genspecialtable.py
+++ b/scripts/genspecialtable.py
@@ -19,25 +19,80 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 
+
+import os
 import sys
+import math
 import pinyin
 
 pinyin_list = sorted(pinyin.PINYIN_LIST)
 shengmu_list = sorted(pinyin.SHENGMU_LIST)
 yunmu_list = sorted(pinyin.YUNMU_LIST)
 
-def get_all_special():
-    for p in pinyin_list:
-        if p[-1] in ["n", "g", "r"]:
+phrase_dict = {}
+
+
+def load_phrase(filename):
+    phrasefile = open(filename, "r")
+    for line in phrasefile.readlines():
+        line = line.rstrip(os.linesep)
+        (pinyin_str, freq) = line.split(' ', 1)
+        freq = int(math.floor(float(freq)))
+        if 0 == freq:
+            #print(pinyin_str)
+            continue
+        (first_key, second_key) = pinyin_str.split("'")
+        phrase_dict[(first_key, second_key)] = freq
+    phrasefile.close()
+
+
+def gen_all_divided():
+    for pinyin_key in pinyin_list:
+        for first_key in pinyin_list:
+            if len(pinyin_key) <= len(first_key):
+                continue
+            if not pinyin_key.startswith(first_key):
+                continue
+            second_key = pinyin_key[len(first_key):]
+            if second_key in pinyin_list:
+                yield pinyin_key, first_key, second_key
+
+
+def filter_divided():
+    for (pinyin_key, first_key, second_key) in gen_all_divided():
+        if not (first_key, second_key) in phrase_dict:
+            continue
+        freq = phrase_dict[(first_key, second_key)]
+        yield pinyin_key, first_key, second_key, freq
+
+
+def gen_all_special():
+    for pinyin_key in pinyin_list:
+        if pinyin_key[-1] in ["n", "g", "r"]:
             for yun in yunmu_list:
                 if yun not in pinyin_list:
                     continue
-                new_pinyin = p[-1] + yun
+                new_pinyin_key = pinyin_key[-1] + yun
                 # if new_pinyin in pinyin_list:
-                yield p, yun, p[:-1], new_pinyin
-        elif p[-1] in ["e"]:
-            yield p, "r", p[:-1], "er"
+                yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key
+        elif pinyin_key[-1] in ["e"]:
+            yield pinyin_key, "r", pinyin_key[:-1], "er"
+
+
+def filter_special():
+    for (orig_first_key, orig_second_key, new_first_key, new_second_key) \
+    in gen_all_special():
+        if not (new_first_key, new_second_key) in phrase_dict:
+            continue
+        orig_freq = 0
+        new_freq = phrase_dict[(new_first_key, new_second_key)]
+        if (orig_first_key, orig_second_key) in phrase_dict:
+            orig_freq = phrase_dict[(orig_first_key, orig_second_key)]
+        yield orig_first_key, orig_second_key, orig_freq, \
+        new_first_key, new_second_key, new_freq
+
 
 if __name__ == "__main__":
-    for pinyins in get_all_special():
-        print (pinyins)
+    load_phrase("pinyin2.txt")
+    for p in filter_special():
+        print (p)
author	Peng Wu <alexepico@gmail.com>	2011-10-26 15:23:24 +0800
committer	Peng Wu <alexepico@gmail.com>	2011-10-26 15:23:24 +0800
commit	746e6b11a88ce3b120b052253f5773f361c03b16 (patch)
tree	5fded825a1559a0f5139653cb5db79c61d55483c /scripts
parent	b5f85c8f94097e34a91e6a74cc4f0e3736aa2c2f (diff)
download	libpinyin-746e6b11a88ce3b120b052253f5773f361c03b16.tar.gz libpinyin-746e6b11a88ce3b120b052253f5773f361c03b16.tar.xz libpinyin-746e6b11a88ce3b120b052253f5773f361c03b16.zip