summaryrefslogtreecommitdiffstats
path: root/tools/distill.py
blob: 97d939bc81933cd3230dc65645386a9ed05b8efa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/python3
import os
from operator import itemgetter
from argparse import ArgumentParser

# Distinct words collected by add_words_set().
words_set = set()
# Summed frequency keyed by (word, pinyin) tuples; filled by add_words_dict().
words_dict = {}


def add_words_set(word):
    """Record *word* in the module-level ``words_set``.

    Words shorter than two characters are ignored.
    """
    if len(word) >= 2:
        # set.add() leaves the set unchanged when the word is already there.
        words_set.add(word)

def strip_tone(old_pinyin_str):
    """Return *old_pinyin_str* with one trailing digit removed per syllable.

    The input is split on apostrophes; every piece that ends in a digit
    loses that single last character, and the pieces are joined back with
    apostrophes.  Pieces that do not end in a digit are kept unchanged.

    Empty pieces (an empty input string, doubled or trailing apostrophes)
    are passed through untouched instead of raising ``IndexError``.
    """
    return "'".join(
        # Slice [-1:] rather than index [-1] so an empty piece yields ""
        # (whose isdigit() is False) instead of failing.
        syllable[:-1] if syllable[-1:].isdigit() else syllable
        for syllable in old_pinyin_str.split("'")
    )


def add_words_dict(word, pinyin, freq):
    """Accumulate *freq* in ``words_dict`` under (word, toneless pinyin).

    The pinyin is passed through strip_tone() before being used in the key,
    so entries that differ only by their trailing digits are summed together.
    """
    key = (word, strip_tone(pinyin))
    if key in words_dict:
        words_dict[key] += freq
    else:
        words_dict[key] = freq


def load_phrase(filename):
    """Load one phrase table into ``words_set`` and ``words_dict``.

    Each non-blank line must hold four whitespace-separated fields:
    pinyin, word, token and an integer frequency.  The word is handed to
    add_words_set() and the (word, pinyin, freq) triple to add_words_dict();
    the token field is read but not used.

    Raises:
        OSError: if *filename* cannot be opened.
        ValueError: if a non-blank line has fewer than four fields or the
            frequency is not an integer.
    """
    # The with-block closes the file even when a line fails to parse.
    with open(filename, "r") as phrasefile:
        # Iterate lazily instead of reading the whole table into memory.
        for oneline in phrasefile:
            oneline = oneline.strip()
            if not oneline:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            (pinyin, word, _token, freq) = oneline.split(None, 3)
            add_words_set(word)
            add_words_dict(word, pinyin, int(freq))


# Both lists are rebuilt by sort_words():
#   words_list    -- the sorted contents of words_set
#   oldwords_list -- (word, pinyin, freq) tuples from words_dict, sorted by word
words_list, oldwords_list = [], []


def sort_words():
    """Rebuild the global ``words_list`` and ``oldwords_list`` in sorted order.

    ``words_list`` becomes the sorted contents of ``words_set``.
    ``oldwords_list`` becomes a list of (word, pinyin, freq) tuples taken
    from ``words_dict`` and sorted by word only; the sort is stable, so
    entries sharing a word keep their dictionary iteration order.
    """
    global words_list, oldwords_list

    words_list = sorted(words_set)

    triples = [
        (word, pinyin, freq)
        for (word, pinyin), freq in words_dict.items()
    ]
    oldwords_list = sorted(triples, key=itemgetter(0))


def save_words_list(filename):
    """Write every entry of the global ``words_list`` to *filename*, one per line.

    The file is opened in text mode and truncated if it already exists.

    Raises:
        OSError: if *filename* cannot be opened for writing.
    """
    # The with-block guarantees the file is flushed and closed on error too.
    with open(filename, 'w') as wordsfile:
        # Write "\n", not os.linesep: text mode already translates "\n" to
        # the platform line ending, so os.linesep would give "\r\r\n" on
        # Windows.
        wordsfile.writelines(word + "\n" for word in words_list)


def save_words_dict(filename):
    """Write the global ``oldwords_list`` to *filename* as tab-separated lines.

    Each line holds word, pinyin and frequency, in that order.  The file is
    opened in text mode and truncated if it already exists.

    Raises:
        OSError: if *filename* cannot be opened for writing.
    """
    # The with-block guarantees the file is flushed and closed on error too.
    with open(filename, 'w') as wordsfile:
        for (word, pinyin, freq) in oldwords_list:
            oneline = "\t".join((word, pinyin, str(freq)))
            # Write "\n", not os.linesep: text mode already translates "\n"
            # to the platform line ending, so os.linesep would give
            # "\r\r\n" on Windows.
            wordsfile.write(oneline + "\n")


if __name__ == "__main__":
    # Tables read when no input files are given on the command line.
    default_tables = [
        'gb_char.table',
        'gbk_char.table',
        'opengram.table',
        'merged.table',
    ]

    parser = ArgumentParser(description='distill dictionaries.')
    parser.add_argument(
        'inputs',
        type=str,
        nargs='*',
        help='dictionaries',
        default=default_tables,
    )
    args = parser.parse_args()
    print(args)

    # Load every table, then sort and write both outputs to the current
    # working directory.
    for filename in args.inputs:
        load_phrase(filename)

    sort_words()

    save_words_list("words.txt")
    save_words_dict("oldwords.txt")