From d3a8368acbf3327a82915f12fb92a814df3ed6a5 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 26 Jul 2016 17:53:18 +0800 Subject: write mergepartialopengram.py --- tools/mergepartialopengram.py | 51 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 tools/mergepartialopengram.py diff --git a/tools/mergepartialopengram.py b/tools/mergepartialopengram.py new file mode 100644 index 0000000..4c971a7 --- /dev/null +++ b/tools/mergepartialopengram.py @@ -0,0 +1,51 @@ +#!/usr/bin/python3 +import os + +from distill import strip_tone +from filteropengram import Untouched, words_dict, load_opengram_dictionary + + +''' +merge partial opengram into merged_gb*_opengram.txt +''' + +def meet_word(outfile, word): + if not word in words_dict: + return + + (status, pinyins) = words_dict[word] + assert Untouched == status + + for (pinyin, freq) in pinyins: + freq = str(freq) + oneline = "\t".join((word, pinyin, freq)) + outfile.writelines([oneline, os.linesep]) + + del words_dict[word] + + +def merge_core_dictionary(infilename, outfilename): + infile = open(infilename, "r") + outfile = open(outfilename, "w") + + for oneline in infile.readlines(): + oneline = oneline.rstrip(os.linesep) + (word, pinyin, freq) = oneline.split(None, 2) + freq = int(freq) + meet_word(outfile, word) + + outfile.close() + infile.close() + + +if __name__ == "__main__": + print('Loading partial opengram dictionary') + load_opengram_dictionary("partial_opengram.txt") + + print('Merging partial opengram dictionary') + merge_core_dictionary("merged_gb_char.txt", "merged_gb_char_opengram.txt") + merge_core_dictionary("merged_gb_phrase.txt", "merged_gb_phrase_opengram.txt") + merge_core_dictionary("merged_gbk_char.txt", "merged_gbk_char_opengram.txt") + + print('Check remained phrases') + assert 0 == len(words_dict) -- cgit