summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2016-07-26 14:03:39 +0800
committer Peng Wu <alexepico@gmail.com> 2016-10-09 14:06:42 +0800
commit b1161515e611d261a51b2fbb8232cacb497fb54b (patch)
tree e3919c0cfa797a029e71297e6500af54b56a162d
parent c798526a5c4138e09afd10620d645a7c133ef7fe (diff)
download trainer-b1161515e611d261a51b2fbb8232cacb497fb54b.tar.gz
trainer-b1161515e611d261a51b2fbb8232cacb497fb54b.tar.xz
trainer-b1161515e611d261a51b2fbb8232cacb497fb54b.zip
write convertopengram.py
-rw-r--r-- tools/convertopengram.py 68
1 files changed, 68 insertions, 0 deletions
diff --git a/tools/convertopengram.py b/tools/convertopengram.py
new file mode 100644
index 0000000..64dd895
--- /dev/null
+++ b/tools/convertopengram.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+import os
+from argparse import ArgumentParser
+
+from distill import strip_tone
+
+'''
+Convert the opengram dictionary file format
+to the libpinyin input file format,
+the same format as utils/storage/gen_pinyin_table.cpp.
+'''
+
+# minimum frequency written for any pinyin entry (lower values are raised to it)
+minimum = 3
+
+# default total frequency shared by all the pinyins of one word
+total_frequency = 100
+
+
+def handle_pinyin(outfile, word, num, pinyin):
+    """Write one "word<TAB>pinyin<TAB>frequency" record to outfile.
+
+    outfile -- writable text-mode file object.
+    word    -- the dictionary word the pronunciation belongs to.
+    num     -- number of pronunciations listed for word; a pronunciation
+               without an explicit share receives total_frequency / num.
+    pinyin  -- one pronunciation token, either "py" or "py:NN%".
+
+    The frequency is truncated to an integer and never written below
+    the module-level minimum.
+
+    Raises ValueError when strip_tone() would alter the token (the
+    opengram dictionary is expected to be toneless), when the share
+    does not end with "%", or when the share is not a number.
+    """
+    # no tones in opengram dictionary
+    # Raise instead of assert so the check survives "python -O".
+    if strip_tone(pinyin) != pinyin:
+        raise ValueError("unexpected tone in pinyin: %r" % (pinyin,))
+
+    if ":" not in pinyin:
+        # No explicit share: split the total evenly.
+        freq = total_frequency / num
+    else:
+        (pinyin, share) = pinyin.split(":", 1)
+        if not share.endswith("%"):
+            raise ValueError("frequency share must end with '%%': %r"
+                             % (share,))
+        # The share is a percentage, so divide by 100; otherwise the
+        # pronunciations of one word would add up to 100 times
+        # total_frequency instead of total_frequency.
+        freq = total_frequency * float(share.rstrip("%")) / 100
+
+    freq = max(int(freq), minimum)
+    # Text-mode files translate "\n" on their own; writing os.linesep
+    # here would come out as "\r\r\n" on Windows.
+    outfile.write("\t".join((word, pinyin, str(freq))) + "\n")
+
+
+def handle_line(outfile, line):
+    """Convert one dictionary line of the form "word py1 [py2 ...]".
+
+    outfile -- writable text-mode file object passed on to handle_pinyin.
+    line    -- one whitespace-separated line of the opengram dictionary.
+
+    Every pronunciation on the line is handed to handle_pinyin together
+    with the word and the number of pronunciations on the line.
+    Lines without any pronunciation (blank lines, a bare word) are
+    skipped.
+    """
+    fields = line.split()
+    # Nothing to convert without at least a word and one pronunciation;
+    # unpacking such a line into (word, pinyins) would raise ValueError.
+    if len(fields) < 2:
+        return
+
+    word = fields[0]
+    pinyin_list = fields[1:]
+    num = len(pinyin_list)
+    for pinyin in pinyin_list:
+        handle_pinyin(outfile, word, num, pinyin)
+
+
+def handle_file(infilename, outfilename):
+    """Convert the dictionary file infilename into outfilename.
+
+    infilename  -- path of the opengram dictionary to read.
+    outfilename -- path of the output file; it is opened with mode "w",
+                   so existing content is overwritten.
+
+    Each input line is passed to handle_line with its line ending
+    removed.
+    """
+    # "with" closes both files even when a malformed line raises.
+    with open(infilename, "r") as infile, \
+            open(outfilename, "w") as outfile:
+        # Iterate the file object directly instead of readlines() so
+        # the whole dictionary is not held in memory at once.
+        for oneline in infile:
+            # Strip the line ending explicitly rather than via
+            # os.linesep, whose value differs between platforms.
+            handle_line(outfile, oneline.rstrip("\r\n"))
+
+
+if __name__ == "__main__":
+    # Command line: two positional arguments, the dictionary to read
+    # and the file to write.
+    cli = ArgumentParser(description='convert opengram dictionary.')
+    cli.add_argument('infile', help='input file')
+    cli.add_argument('outfile', help='output file')
+    options = cli.parse_args()
+    # Echo the parsed arguments before converting.
+    print(options)
+
+    handle_file(options.infile, options.outfile)