summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2016-07-26 17:53:18 +0800
committerPeng Wu <alexepico@gmail.com>2016-10-09 14:10:51 +0800
commitd3a8368acbf3327a82915f12fb92a814df3ed6a5 (patch)
tree056610c7925020dae43339fc0acb77a44078598e /tools
parent5fee2b5938c1671595517edd5678f08078de3345 (diff)
downloadtrainer-d3a8368acbf3327a82915f12fb92a814df3ed6a5.tar.gz
trainer-d3a8368acbf3327a82915f12fb92a814df3ed6a5.tar.xz
trainer-d3a8368acbf3327a82915f12fb92a814df3ed6a5.zip
write mergepartialopengram.py
Diffstat (limited to 'tools')
-rw-r--r--tools/mergepartialopengram.py51
1 files changed, 51 insertions, 0 deletions
diff --git a/tools/mergepartialopengram.py b/tools/mergepartialopengram.py
new file mode 100644
index 0000000..4c971a7
--- /dev/null
+++ b/tools/mergepartialopengram.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python3
+import os
+
+from distill import strip_tone
+from filteropengram import Untouched, words_dict, load_opengram_dictionary
+
+
+'''
+merge partial opengram into merged_gb*_opengram.txt
+'''
+
+def meet_word(outfile, word):
+ if not word in words_dict:
+ return
+
+ (status, pinyins) = words_dict[word]
+ assert Untouched == status
+
+ for (pinyin, freq) in pinyins:
+ freq = str(freq)
+ oneline = "\t".join((word, pinyin, freq))
+ outfile.writelines([oneline, os.linesep])
+
+ del words_dict[word]
+
+
+def merge_core_dictionary(infilename, outfilename):
+ infile = open(infilename, "r")
+ outfile = open(outfilename, "w")
+
+ for oneline in infile.readlines():
+ oneline = oneline.rstrip(os.linesep)
+ (word, pinyin, freq) = oneline.split(None, 2)
+ freq = int(freq)
+ meet_word(outfile, word)
+
+ outfile.close()
+ infile.close()
+
+
+if __name__ == "__main__":
+ print('Loading partial opengram dictionary')
+ load_opengram_dictionary("partial_opengram.txt")
+
+ print('Merging partial opengram dictionary')
+ merge_core_dictionary("merged_gb_char.txt", "merged_gb_char_opengram.txt")
+ merge_core_dictionary("merged_gb_phrase.txt", "merged_gb_phrase_opengram.txt")
+ merge_core_dictionary("merged_gbk_char.txt", "merged_gbk_char_opengram.txt")
+
+ print('Check remained phrases')
+ assert 0 == len(words_dict)