summaryrefslogtreecommitdiffstats
path: root/tools/convertopengram.py
blob: 64dd89552a8537f0945ea6d7571390d6a23ad921 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/python3
import os
from argparse import ArgumentParser

from distill import strip_tone

'''
convert the opengram dictionary file format
to libpinyin input file format,
the same format as utils/storage/gen_pinyin_table.cpp .
'''

#minimum pinyin frequency
minimum = 3

#default pinyin total frequency
total_frequency = 100


def handle_pinyin(outfile, word, num, pinyin):
    # no tones in opengram dictionary
    stripped = strip_tone(pinyin)
    assert stripped == pinyin

    freq = 0
    if not ":" in pinyin:
        freq = total_frequency / num
    else:
        (py, freq) = pinyin.split(":", 1)
        assert freq.endswith("%")
        freq = freq.rstrip("%")
        freq = float(freq)
        freq = total_frequency * freq
        pinyin = py

    freq = int(freq)
    freq = max(freq, minimum)
    freq = str(freq)
    oneline = "\t".join((word, pinyin, freq))
    outfile.writelines([oneline, os.linesep])


def handle_line(outfile, line):
    (word, pinyins) = line.split(None, 1)
    pinyin_list = pinyins.split(None)
    num = len(pinyin_list)
    for pinyin in pinyin_list:
        handle_pinyin(outfile, word, num, pinyin)


def handle_file(infilename, outfilename):
    infile = open(infilename, "r")
    outfile = open(outfilename, "w")
    for oneline in infile.readlines():
        oneline = oneline.rstrip(os.linesep)
        handle_line(outfile, oneline)
    outfile.close()
    infile.close()


if __name__ == "__main__":
    parser = ArgumentParser(description='convert opengram dictionary.')
    parser.add_argument('infile', help='input file')
    parser.add_argument('outfile', help='output file')
    args = parser.parse_args()
    print(args)

    handle_file(args.infile, args.outfile)