diff options
author | Peng Wu <alexepico@gmail.com> | 2016-07-26 17:53:18 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2016-10-09 14:10:51 +0800 |
commit | d3a8368acbf3327a82915f12fb92a814df3ed6a5 (patch) | |
tree | 056610c7925020dae43339fc0acb77a44078598e /tools | |
parent | 5fee2b5938c1671595517edd5678f08078de3345 (diff) | |
download | trainer-d3a8368acbf3327a82915f12fb92a814df3ed6a5.tar.gz trainer-d3a8368acbf3327a82915f12fb92a814df3ed6a5.tar.xz trainer-d3a8368acbf3327a82915f12fb92a814df3ed6a5.zip |
write mergepartialopengram.py
Diffstat (limited to 'tools')
-rw-r--r-- | tools/mergepartialopengram.py | 51 |
1 files changed, 51 insertions, 0 deletions
#!/usr/bin/python3
"""Merge partial opengram into merged_gb*_opengram.txt.

For every word listed in a core dictionary file, the (pinyin, freq) entries
held for that word in ``words_dict`` are written to the corresponding
``*_opengram.txt`` file, and the word is then removed from ``words_dict``.
"""
import os

from distill import strip_tone
from filteropengram import Untouched, words_dict, load_opengram_dictionary


def meet_word(outfile, word):
    """Write the ``words_dict`` entries for *word* to *outfile*, then drop them.

    Words that are not present in ``words_dict`` are ignored.  Each entry is
    written as one tab-separated ``word<TAB>pinyin<TAB>freq`` line.

    Args:
        outfile: writable text-mode file object.
        word: the word read from the core dictionary.
    """
    if word not in words_dict:
        return

    (status, pinyins) = words_dict[word]
    assert Untouched == status, "unexpected status for word: " + word

    for (pinyin, freq) in pinyins:
        # Use "\n" rather than os.linesep: the file is opened in text mode,
        # which already translates "\n" to the platform line ending.
        outfile.write("\t".join((word, pinyin, str(freq))) + "\n")

    # Remove the word so the final check can detect words never merged.
    del words_dict[word]


def merge_core_dictionary(infilename, outfilename):
    """Emit the opengram entries for every word listed in *infilename*.

    Each input line must hold three whitespace-separated fields
    (word, pinyin, freq).  Only the word is used for the lookup; the lines
    written to *outfilename* come from ``words_dict`` via ``meet_word``.

    Args:
        infilename: path of the core dictionary to read.
        outfilename: path of the opengram file to (over)write.

    Raises:
        ValueError: if a line has fewer than three fields or its frequency
            column is not an integer.
    """
    # The context manager closes both files even if a line fails to parse.
    with open(infilename, "r") as infile, open(outfilename, "w") as outfile:
        for line in infile:
            (word, _pinyin, freq) = line.split(None, 2)
            # The value is unused, but the conversion rejects malformed
            # input early; int() tolerates the trailing newline.
            int(freq)
            meet_word(outfile, word)


if __name__ == "__main__":
    print('Loading partial opengram dictionary')
    load_opengram_dictionary("partial_opengram.txt")

    print('Merging partial opengram dictionary')
    for (core_name, opengram_name) in (
            ("merged_gb_char.txt", "merged_gb_char_opengram.txt"),
            ("merged_gb_phrase.txt", "merged_gb_phrase_opengram.txt"),
            ("merged_gbk_char.txt", "merged_gbk_char_opengram.txt"),
    ):
        merge_core_dictionary(core_name, opengram_name)

    print('Check remained phrases')
    # Every loaded word should have been consumed by one of the merges.
    assert 0 == len(words_dict), \
        "{} words were not merged".format(len(words_dict))