diff options
author | Peng Wu <alexepico@gmail.com> | 2016-07-26 14:03:39 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2016-10-09 14:06:42 +0800 |
commit | b1161515e611d261a51b2fbb8232cacb497fb54b (patch) | |
tree | e3919c0cfa797a029e71297e6500af54b56a162d /tools | |
parent | c798526a5c4138e09afd10620d645a7c133ef7fe (diff) | |
download | trainer-b1161515e611d261a51b2fbb8232cacb497fb54b.tar.gz trainer-b1161515e611d261a51b2fbb8232cacb497fb54b.tar.xz trainer-b1161515e611d261a51b2fbb8232cacb497fb54b.zip |
write convertopengram.py
Diffstat (limited to 'tools')
-rw-r--r-- | tools/convertopengram.py | 68 |
1 files changed, 68 insertions, 0 deletions
#!/usr/bin/python3
"""Convert an opengram dictionary file to the libpinyin input file format.

The output uses the same format as utils/storage/gen_pinyin_table.cpp:
one ``word<TAB>pinyin<TAB>frequency`` record per line.
"""

import os
from argparse import ArgumentParser

from distill import strip_tone

# minimum pinyin frequency
minimum = 3

# default pinyin total frequency
total_frequency = 100


def handle_pinyin(outfile, word, num, pinyin):
    """Write one ``word<TAB>pinyin<TAB>frequency`` record to *outfile*.

    Args:
        outfile: writable text-mode file object.
        word: the dictionary word the pinyin belongs to.
        num: how many pinyin entries the word has; an entry without an
            explicit percentage receives ``total_frequency / num``.
        pinyin: either a bare pinyin string or ``"<pinyin>:<number>%"``.

    Raises:
        AssertionError: if ``strip_tone`` changes the entry, or if the
            part after ``":"`` does not end with ``"%"``.
        ValueError: if the percentage is not a valid number.
    """
    # no tones in opengram dictionary
    assert strip_tone(pinyin) == pinyin, \
        "unexpected tone in pinyin %r" % (pinyin,)

    if ":" not in pinyin:
        # No explicit weight: share the total evenly between all entries.
        freq = total_frequency / num
    else:
        pinyin, percent = pinyin.split(":", 1)
        assert percent.endswith("%"), \
            "expected a percentage after ':' but got %r" % (percent,)
        # NOTE(review): the percentage is multiplied in without dividing
        # by 100, so "50%" yields total_frequency * 50. That scale differs
        # from the even-split branch above; kept as-is, please confirm it
        # is intended.
        freq = total_frequency * float(percent.rstrip("%"))

    # Truncate to an integer and never go below the minimum frequency.
    freq = max(int(freq), minimum)

    # The file is opened in text mode, so "\n" is already translated to the
    # platform line ending; writing os.linesep would give "\r\r\n" on Windows.
    outfile.write("\t".join((word, pinyin, str(freq))) + "\n")


def handle_line(outfile, line):
    """Convert one dictionary line: a word followed by its pinyin entries.

    Args:
        outfile: writable text-mode file object.
        line: whitespace-separated ``word pinyin [pinyin ...]``.

    Raises:
        ValueError: if the line does not hold a word and at least one
            pinyin entry.
    """
    fields = line.split()
    if len(fields) < 2:
        raise ValueError(
            "expected a word followed by at least one pinyin: %r" % (line,))

    word, pinyin_list = fields[0], fields[1:]
    num = len(pinyin_list)
    for pinyin in pinyin_list:
        handle_pinyin(outfile, word, num, pinyin)


def handle_file(infilename, outfilename):
    """Convert the dictionary *infilename* and write it to *outfilename*.

    Blank lines in the input are skipped. Both files are closed even when
    a line fails to convert.
    """
    # NOTE(review): both files use the platform default encoding, as
    # before; if the dictionary is always UTF-8, consider passing
    # encoding="utf-8" explicitly.
    with open(infilename, "r") as infile, \
            open(outfilename, "w") as outfile:
        # Iterate lazily instead of loading the whole dictionary at once.
        for oneline in infile:
            oneline = oneline.rstrip(os.linesep)
            if not oneline.strip():
                continue
            handle_line(outfile, oneline)


def main():
    """Parse the command line and run the conversion."""
    parser = ArgumentParser(description='convert opengram dictionary.')
    parser.add_argument('infile', help='input file')
    parser.add_argument('outfile', help='output file')
    args = parser.parse_args()
    print(args)

    handle_file(args.infile, args.outfile)


if __name__ == "__main__":
    main()