summary refs log tree commit diff stats
path: root/tools
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2016-07-26 17:31:36 +0800
committer Peng Wu <alexepico@gmail.com> 2016-10-09 14:09:02 +0800
commit 5fee2b5938c1671595517edd5678f08078de3345 (patch)
tree dbe7ebeaaa9449b862121f74016d1891b3629387 /tools
parent b1161515e611d261a51b2fbb8232cacb497fb54b (diff)
download trainer-5fee2b5938c1671595517edd5678f08078de3345.tar.gz
trainer-5fee2b5938c1671595517edd5678f08078de3345.tar.xz
trainer-5fee2b5938c1671595517edd5678f08078de3345.zip
write filteropengram.py
Diffstat (limited to 'tools')
-rw-r--r-- tools/filteropengram.py 132
1 files changed, 132 insertions, 0 deletions
diff --git a/tools/filteropengram.py b/tools/filteropengram.py
new file mode 100644
index 0000000..8c79c94
--- /dev/null
+++ b/tools/filteropengram.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python3
+import os
+
+from distill import strip_tone
+
+
"""
Filter out the phrases that already exist in libpinyin from the opengram
dictionary.
"""

# Status of a word, relative to what libpinyin already provides.
# Untouched: not in libpinyin, move to opengram.txt
Untouched = 3
# Partial: only partial information in libpinyin, save to partial_opengram.txt
Partial = 4
# Complete: already in libpinyin, do nothing
Complete = 5

# Maps each word to a (status, pinyins) tuple, where pinyins is a list of
# (pinyin, freq) tuples.
words_dict = dict()
+
+
def add_words_dict(word, pinyin, freq):
    """Record one (pinyin, freq) pronunciation for `word` in words_dict.

    A word seen for the first time is stored with status Untouched; for a
    word that is already present the new pair is appended to its list.

    The asserts require that `pinyin` equals strip_tone(pinyin), that an
    existing word still has status Untouched, and that the same pinyin has
    not been recorded for this word before.
    """
    # assume all tones are already removed
    assert pinyin == strip_tone(pinyin)

    entry = words_dict.get(word)
    if entry is None:
        # First occurrence of this word.
        words_dict[word] = (Untouched, [(pinyin, freq)])
        return

    known_status, known_pinyins = entry
    assert Untouched == known_status

    # A (word, pinyin) pair must not be listed twice.
    for known_pinyin, _known_freq in known_pinyins:
        assert known_pinyin != pinyin

    known_pinyins.append((pinyin, freq))
+
+
def filter_out(word, pinyin):
    """Drop a (word, pinyin) pair that libpinyin already has from words_dict.

    Words that are not in words_dict are ignored.  Otherwise every stored
    (pinyin, freq) entry whose pinyin equals `pinyin` is removed, and the
    word's status becomes Partial, or Complete once no pinyin is left.
    A message is printed when the word has no entry with that pinyin.
    """
    if word not in words_dict:
        return

    (_, pinyins) = words_dict[word]

    remaining = [item for item in pinyins if item[0] != pinyin]

    if len(remaining) == len(pinyins):
        print('Missing {0} and {1} in opengram'.format(word, pinyin))

    # Rebuild the list instead of deleting while iterating over it: deleting
    # inside the loop skips the element that follows each removed one.  The
    # slice assignment keeps the same list object that words_dict refers to.
    pinyins[:] = remaining

    status = Partial if pinyins else Complete
    words_dict[word] = (status, pinyins)
+
+
def load_opengram_dictionary(infilename):
    """Load the opengram dictionary file `infilename` into words_dict.

    Every non-blank line is split on whitespace into ``word pinyin freq``;
    freq is converted to int and the triple is passed to add_words_dict.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            freq field is not an integer.
    """
    # The context manager closes the file even when a line fails to parse.
    with open(infilename, "r") as infile:
        # Iterate lazily instead of materializing the file with readlines().
        for oneline in infile:
            oneline = oneline.rstrip(os.linesep)
            if not oneline.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            (word, pinyin, freq) = oneline.split(None, 2)
            add_words_dict(word, pinyin, int(freq))
+
+
def filter_core_dictionary(infilename):
    """Apply filter_out to every entry of the dictionary file `infilename`.

    Every non-blank line is split on whitespace into ``word pinyin freq``
    and the (word, pinyin) pair is passed to filter_out.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            freq field is not an integer.
    """
    # The context manager closes the file even when a line fails to parse.
    with open(infilename, "r") as infile:
        # Iterate lazily instead of materializing the file with readlines().
        for oneline in infile:
            oneline = oneline.rstrip(os.linesep)
            if not oneline.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            (word, pinyin, freq) = oneline.split(None, 2)
            # The frequency itself is not used here; the conversion is kept
            # only so that a malformed line is still rejected.
            int(freq)
            filter_out(word, pinyin)
+
+
def _save_words_with_status(outfilename, wanted_status):
    """Write all entries of words whose status equals `wanted_status`.

    One tab-separated ``word pinyin freq`` line is written per (pinyin, freq)
    pair of each matching word in words_dict.
    """
    # The context manager closes the file even if writing fails.
    with open(outfilename, "w") as outfile:
        for word, (status, pinyins) in words_dict.items():
            if status != wanted_status:
                continue
            # The file is opened in text mode, which already translates "\n"
            # to the platform line ending; writing os.linesep here would
            # yield "\r\r\n" on Windows.
            outfile.writelines(
                "\t".join((word, pinyin, str(freq))) + "\n"
                for (pinyin, freq) in pinyins
            )


def save_opengram_dictionary(outfilename):
    """Save the words with status Untouched to `outfilename`."""
    _save_words_with_status(outfilename, Untouched)


def save_partial_dictionary(outfilename):
    """Save the words with status Partial to `outfilename`."""
    _save_words_with_status(outfilename, Partial)
+
+
if __name__ == "__main__":
    # Step 1: read the complete opengram dictionary.
    print('Loading opengram dictionary')
    load_opengram_dictionary("dict.full")

    # Step 2: filter out every entry listed in the libpinyin tables.
    print('Filtering libpinyin dictionary')
    for core_table in (
        "merged_gb_char.txt",
        "merged_gb_phrase.txt",
        "merged_gbk_char.txt",
    ):
        filter_core_dictionary(core_table)

    # Step 3: write the untouched words and the partially covered words.
    print('Saving opengram dictionary')
    save_opengram_dictionary("opengram.txt")
    print('Saving paritial dictionary')
    save_partial_dictionary("partial_opengram.txt")