#!/usr/bin/python3
# Recovered from commit 5fee2b5938c1671595517edd5678f08078de3345
# (Peng Wu, 2016-07-26, "write filteropengram.py"), which created
# tools/filteropengram.py.
"""Filter out the already existing libpinyin phrases from opengram dictionary.

The opengram dictionary is loaded into ``words_dict``; every (word, pinyin)
pair that also appears in the libpinyin dictionaries is then removed.  Words
that libpinyin never mentioned are written to ``opengram.txt``; words that
libpinyin mentioned but that still have pinyins left over are written to
``partial_opengram.txt``.
"""

import os

from distill import strip_tone


(
    # not in libpinyin, move to opengram.txt
    Untouched,
    # only partial information in libpinyin, save to partial_opengram.txt
    Partial,
    # already in libpinyin, do nothing
    Complete,
) = range(3, 6)

# key: word, value: (status, pinyins)
# pinyins: list of (pinyin, freq)
words_dict = {}


def add_words_dict(word, pinyin, freq):
    """Record one opengram entry in ``words_dict``.

    A new word is stored with status ``Untouched``; an additional pinyin for
    a known word is appended to that word's pinyin list.

    Raises:
        AssertionError: if ``pinyin`` still carries tones, if the word has
            already been touched by ``filter_out``, or if the same
            (word, pinyin) pair is added twice.
    """
    # assume all tones are already removed
    assert pinyin == strip_tone(pinyin), \
        'pinyin {0} of {1} still carries tones'.format(pinyin, word)

    if word not in words_dict:
        words_dict[word] = (Untouched, [(pinyin, freq)])
        return

    (status, pinyins) = words_dict[word]
    assert Untouched == status, \
        '{0} was already filtered'.format(word)
    assert all(oldpinyin != pinyin for (oldpinyin, _) in pinyins), \
        'duplicated entry {0} {1}'.format(word, pinyin)

    pinyins.append((pinyin, freq))


def filter_out(word, pinyin):
    """Remove the (word, pinyin) pair known to libpinyin from ``words_dict``.

    Words that are not in ``words_dict`` are ignored.  Otherwise the word
    becomes ``Partial`` while it still has pinyins left, or ``Complete``
    once its pinyin list is empty.  A pinyin that is not listed for the word
    is reported on stdout.
    """
    if word not in words_dict:
        return

    (_, pinyins) = words_dict[word]

    # Build the filtered list first: deleting from a list while iterating
    # over it skips the element that follows each deletion.
    remaining = [item for item in pinyins if item[0] != pinyin]

    if len(remaining) == len(pinyins):
        print('Missing {0} and {1} in opengram'.format(word, pinyin))

    # Update in place so the list object stored in words_dict stays the same.
    pinyins[:] = remaining

    status = Partial if pinyins else Complete
    words_dict[word] = (status, pinyins)


def _read_entries(infilename):
    """Yield (word, pinyin, freq) from a whitespace-separated dictionary file.

    Each non-blank line must hold three columns: word, pinyin and an integer
    frequency.  The file is read as UTF-8 so the result does not depend on
    the locale, and it is closed even when a consumer fails mid-way.

    Raises:
        ValueError: if a line has fewer than three columns or the third
            column is not an integer.
    """
    with open(infilename, "r", encoding="utf-8") as infile:
        for oneline in infile:
            oneline = oneline.rstrip("\n")
            # A blank (e.g. trailing) line carries no entry.
            if not oneline.strip():
                continue
            (word, pinyin, freq) = oneline.split(None, 2)
            yield (word, pinyin, int(freq))


def load_opengram_dictionary(infilename):
    """Load every entry of the opengram dictionary into ``words_dict``."""
    for (word, pinyin, freq) in _read_entries(infilename):
        add_words_dict(word, pinyin, freq)


def filter_core_dictionary(infilename):
    """Filter out every entry of one libpinyin dictionary file.

    The frequency column is still parsed (and therefore validated) by the
    reader, but it plays no part in the filtering.
    """
    for (word, pinyin, _freq) in _read_entries(infilename):
        filter_out(word, pinyin)


def _save_entries(outfilename, wanted_status):
    """Write the remaining pinyins of all words whose status matches.

    Each output line is ``word<TAB>pinyin<TAB>freq``.
    """
    with open(outfilename, "w", encoding="utf-8") as outfile:
        for (word, (status, pinyins)) in words_dict.items():
            if status != wanted_status:
                continue
            # Text mode already maps "\n" to the platform line ending;
            # writing os.linesep here would double the "\r" on Windows.
            outfile.writelines(
                "\t".join((word, pinyin, str(freq))) + "\n"
                for (pinyin, freq) in pinyins
            )


def save_opengram_dictionary(outfilename):
    """Save the words libpinyin never mentioned (status ``Untouched``)."""
    _save_entries(outfilename, Untouched)


def save_partial_dictionary(outfilename):
    """Save the leftover pinyins of words libpinyin only partly covers."""
    _save_entries(outfilename, Partial)


def main():
    """Run the load, filter and save pipeline on the fixed file names."""
    print('Loading opengram dictionary')
    load_opengram_dictionary("dict.full")

    print('Filtering libpinyin dictionary')
    filter_core_dictionary("merged_gb_char.txt")
    filter_core_dictionary("merged_gb_phrase.txt")
    filter_core_dictionary("merged_gbk_char.txt")

    print('Saving opengram dictionary')
    save_opengram_dictionary("opengram.txt")
    print('Saving paritial dictionary')
    save_partial_dictionary("partial_opengram.txt")


if __name__ == "__main__":
    main()