diff options
author | Peng Wu <alexepico@gmail.com> | 2016-07-26 17:31:36 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2016-10-09 14:09:02 +0800 |
commit | 5fee2b5938c1671595517edd5678f08078de3345 (patch) | |
tree | dbe7ebeaaa9449b862121f74016d1891b3629387 | |
parent | b1161515e611d261a51b2fbb8232cacb497fb54b (diff) | |
download | trainer-5fee2b5938c1671595517edd5678f08078de3345.tar.gz trainer-5fee2b5938c1671595517edd5678f08078de3345.tar.xz trainer-5fee2b5938c1671595517edd5678f08078de3345.zip |
write filteropengram.py
-rw-r--r-- | tools/filteropengram.py | 132 |
1 files changed, 132 insertions, 0 deletions
#!/usr/bin/python3
# Source: tools/filteropengram.py (new file, mode 100644,
# index 0000000..8c79c94), reformatted from the collapsed diff text.
'''
filter out the already existing libpinyin phrases from opengram dictionary.

The opengram dictionary is loaded into ``words_dict``; every (word, pinyin)
pair found in the libpinyin dictionaries is then removed from it.  Words that
were never seen in libpinyin are saved to one file, words that were only
partly covered are saved to another, and fully covered words are dropped.
'''
import os

from distill import strip_tone


(
    # not in libpinyin, move to opengram.txt
    Untouched,
    # only partial information in libpinyin, save to partial_opengram.txt
    Partial,
    # already in libpinyin, do nothing
    Complete
) = range(3, 6)

# key: word, value: (status, pinyins)
# pinyins: list of (pinyin, freq)
words_dict = {}


def add_words_dict(word, pinyin, freq):
    '''Record one opengram entry in ``words_dict``.

    word   -- the phrase.
    pinyin -- its reading; must already be tone-less.
    freq   -- its frequency (stored as given).

    Raises AssertionError when the pinyin still carries tones, when the word
    has already been touched by filter_out(), or when the same
    (word, pinyin) pair is added twice.
    '''
    # assume all tones are already removed
    assert pinyin == strip_tone(pinyin)

    if word not in words_dict:
        words_dict[word] = (Untouched, [(pinyin, freq)])
        return

    (status, pinyins) = words_dict[word]
    # loading must be finished before any filtering starts
    assert Untouched == status

    # every reading of a word may appear only once
    for (oldpinyin, _oldfreq) in pinyins:
        assert oldpinyin != pinyin

    pinyins.append((pinyin, freq))


def filter_out(word, pinyin):
    '''Remove the (word, pinyin) pair from ``words_dict``.

    Words that are not in ``words_dict`` are ignored.  Otherwise the word is
    marked Partial, or Complete once no reading is left.  A message is
    printed when the word is known but this particular reading is not.
    '''
    if word not in words_dict:
        return

    (_status, pinyins) = words_dict[word]

    # Build the surviving list first instead of deleting while iterating,
    # which would skip the element following each deletion.
    remaining = [item for item in pinyins if item[0] != pinyin]

    if len(remaining) == len(pinyins):
        print('Missing {0} and {1} in opengram'.format(word, pinyin))

    # keep the same list object, only its content changes
    pinyins[:] = remaining

    status = Partial if pinyins else Complete
    words_dict[word] = (status, pinyins)


def _read_entries(infilename):
    '''Yield (word, pinyin, freq) for every non-blank line of a dictionary.

    Each line holds three whitespace-separated fields; the third one is
    converted with int(), so a malformed line raises ValueError.
    '''
    # NOTE(review): the file is decoded with the locale's default encoding;
    # confirm the dictionaries' encoding and pass it explicitly if needed.
    with open(infilename, "r") as infile:
        for oneline in infile:
            oneline = oneline.rstrip(os.linesep)
            if not oneline.strip():
                # blank lines carry no entry
                continue
            (word, pinyin, freq) = oneline.split(None, 2)
            yield (word, pinyin, int(freq))


def load_opengram_dictionary(infilename):
    '''Load every entry of the opengram dictionary into ``words_dict``.'''
    for (word, pinyin, freq) in _read_entries(infilename):
        add_words_dict(word, pinyin, freq)


def filter_core_dictionary(infilename):
    '''Filter out every (word, pinyin) pair listed in a libpinyin dictionary.

    The frequency column is parsed (and therefore validated) but not used.
    '''
    for (word, pinyin, _freq) in _read_entries(infilename):
        filter_out(word, pinyin)


def _save_entries(outfilename, wanted_status):
    '''Write the entries of all words having wanted_status, tab-separated.'''
    # Text mode already translates "\n" to the platform line ending, so
    # os.linesep must not be written here (it would give "\r\r\n" on Windows).
    with open(outfilename, "w") as outfile:
        for (word, (status, pinyins)) in words_dict.items():
            if status != wanted_status:
                continue
            for (pinyin, freq) in pinyins:
                outfile.write("\t".join((word, pinyin, str(freq))) + "\n")


def save_opengram_dictionary(outfilename):
    '''Save the words that libpinyin does not contain at all.'''
    _save_entries(outfilename, Untouched)


def save_partial_dictionary(outfilename):
    '''Save the remaining readings of words only partly in libpinyin.'''
    _save_entries(outfilename, Partial)


def main():
    '''Run the whole pipeline on the fixed file names in the current dir.'''
    print('Loading opengram dictionary')
    load_opengram_dictionary("dict.full")

    print('Filtering libpinyin dictionary')
    filter_core_dictionary("merged_gb_char.txt")
    filter_core_dictionary("merged_gb_phrase.txt")
    filter_core_dictionary("merged_gbk_char.txt")

    print('Saving opengram dictionary')
    save_opengram_dictionary("opengram.txt")
    print('Saving paritial dictionary')
    save_partial_dictionary("partial_opengram.txt")


if __name__ == "__main__":
    main()