summaryrefslogtreecommitdiffstats
path: root/scripts2
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2015-09-06 14:29:10 +0800
committer Peng Wu <alexepico@gmail.com> 2015-09-06 14:29:10 +0800
commit 98074b17447f8aeca00ab037ae6b0c3c02a2a90b (patch)
tree f7d6fadf5bc010e22805713eb58bc166fd0ddd57 /scripts2
parent 62ea43a74061fdee78620b77e07f1c224fde0ef1 (diff)
downloadlibpinyin-98074b17447f8aeca00ab037ae6b0c3c02a2a90b.tar.gz
libpinyin-98074b17447f8aeca00ab037ae6b0c3c02a2a90b.tar.xz
libpinyin-98074b17447f8aeca00ab037ae6b0c3c02a2a90b.zip
import distill.py from genpinyins.py
Diffstat (limited to 'scripts2')
-rw-r--r-- scripts2/tools/distill.py 57
1 files changed, 57 insertions, 0 deletions
diff --git a/scripts2/tools/distill.py b/scripts2/tools/distill.py
new file mode 100644
index 0000000..8502605
--- /dev/null
+++ b/scripts2/tools/distill.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python3
+import os
+from operator import itemgetter
+
+# Accumulated frequency for each pinyin key; filled by add_pinyin_dict().
+pinyin_dict = {}
+
+
+def strip_tone(old_pinyin_str):
+    """Return *old_pinyin_str* with each syllable's trailing tone digit removed.
+
+    Syllables are separated by an apostrophe ("'").  At most one trailing
+    digit is dropped per syllable, and the syllables are re-joined with
+    apostrophes.  Empty syllables (empty input, doubled or leading/trailing
+    apostrophes) are passed through unchanged instead of raising IndexError.
+    """
+    return "'".join(
+        # The slice [-1:] is "" for an empty syllable, so isdigit() is simply
+        # False there, whereas indexing with [-1] would raise IndexError.
+        syllable[:-1] if syllable[-1:].isdigit() else syllable
+        for syllable in old_pinyin_str.split("'")
+    )
+
+
+def add_pinyin_dict(pinyin, freq):
+    """Add *freq* to the running total kept for *pinyin* in ``pinyin_dict``.
+
+    A frequency of zero is ignored, so it never creates an entry by itself.
+    """
+    if freq == 0:
+        return
+    # A missing key starts from 0, which covers both the "new" and the
+    # "already present" case in one statement.
+    pinyin_dict[pinyin] = pinyin_dict.get(pinyin, 0) + freq
+
+
+def load_phrase(filename):
+    """Accumulate pinyin frequencies from the phrase table *filename*.
+
+    Every non-blank line is split on whitespace into four fields: pinyin,
+    word, token and frequency.  The pinyin has its tone digits stripped and,
+    when the word is one or two characters long, the integer frequency is
+    added to ``pinyin_dict`` via ``add_pinyin_dict``.
+
+    Raises ValueError if a non-blank line has fewer than four fields or its
+    frequency field is not an integer.
+    """
+    # NOTE(review): the file is opened with the platform default encoding;
+    # confirm the data tables are stored in that encoding.
+    # The with-block closes the file even when a malformed line raises.
+    with open(filename, "r") as phrasefile:
+        # Iterate lazily instead of loading every line with readlines().
+        for line in phrasefile:
+            fields = line.split(None, 3)
+            if not fields:
+                # Blank line (e.g. at the end of the file): nothing to parse.
+                continue
+            (pinyin, word, token, freq) = fields
+            pinyin = strip_tone(pinyin)
+            # int() ignores the trailing newline left in the last field.
+            freq = int(freq)
+
+            if len(word) in (1, 2):
+                add_pinyin_dict(pinyin, freq)
+
+# These calls sit at module level (outside the __main__ guard), so the tables
+# are loaded on import as well as when the file is run as a script.
+# The paths are relative, so they resolve against the current working directory.
+load_phrase("../../data/gb_char.table")
+load_phrase("../../data/gbk_char.table")
+
+
+def save_pinyin(filename):
+    """Write every ``pinyin_dict`` entry to *filename*, one per line.
+
+    Each line has the form "<pinyin><TAB><frequency>".  Entries are written
+    in the dictionary's iteration order; an existing file is overwritten.
+    """
+    # The with-block closes (and flushes) the file even if a write fails.
+    with open(filename, "w") as pinyinfile:
+        # Write "\n" rather than os.linesep: text mode already translates
+        # "\n" to the platform line ending, so os.linesep would come out as
+        # "\r\r\n" on Windows.
+        pinyinfile.writelines(
+            "\t".join((pinyin, str(freq))) + "\n"
+            for pinyin, freq in pinyin_dict.items()
+        )
+
+
+# When run as a script, dump the accumulated pinyin frequencies to pinyins.txt.
+if __name__ == "__main__":
+ save_pinyin("pinyins.txt")