summaryrefslogtreecommitdiffstats
path: root/scripts/genpinyins.py
blob: fef40cd196e08b475c1d1cedb7670d128bef5067 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/python3
import os
from operator import itemgetter

pinyin_dict = {}


def strip_tone(old_pinyin_str):
    oldpinyins = old_pinyin_str.split("'")
    newpinyins = []

    for pinyin in oldpinyins:
        if pinyin[-1].isdigit():
            pinyin = pinyin[:-1]
        newpinyins.append(pinyin)

    new_pinyin_str = "'".join(newpinyins)
    return new_pinyin_str


def add_pinyin_dict(pinyin, freq):
    if 0 == freq:
        return
    if not pinyin in pinyin_dict:
        pinyin_dict[pinyin] = freq
    else:
        pinyin_dict[pinyin] += freq


def load_phrase(filename):
    phrasefile = open(filename, "r")
    for line in phrasefile.readlines():
        line = line.rstrip(os.linesep)
        (pinyin, word, token, freq) = line.split(None, 3)
        pinyin = strip_tone(pinyin)
        freq = int(freq)

        if len(word) in [1, 2]:
            add_pinyin_dict(pinyin, freq)

    phrasefile.close()

load_phrase("../data/gb_char.table")
load_phrase("../data/gbk_char.table")


def save_pinyin(filename):
    pinyinfile = open(filename, "w")
    for pinyin, freq in pinyin_dict.items():
        freq = str(freq)
        line = "\t".join((pinyin, freq))
        pinyinfile.writelines([line, os.linesep])
    pinyinfile.close()


if __name__ == "__main__":
    save_pinyin("pinyins.txt")