summaryrefslogtreecommitdiffstats
path: root/tools/filteropengram.py
blob: 8c79c941aaf0cc25d5bdcbee888f66ed186ac553 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/python3
import os

from distill import strip_tone


'''
Filter out the phrases that already exist in libpinyin from the opengram dictionary.
'''

# Status of each opengram word relative to the libpinyin dictionaries.

# not in libpinyin, move to opengram.txt
Untouched = 3
# only partial information in libpinyin, save to partial_opengram.txt
Partial = 4
# already in libpinyin, do nothing
Complete = 5

# Global table filled while loading and updated while filtering.
# key: word, value: (status, pinyins)
# pinyins: list of (pinyin, freq)
words_dict = dict()


def add_words_dict(word, pinyin, freq):
    """Record one (pinyin, freq) reading for ``word`` in ``words_dict``.

    A word seen for the first time is stored with status ``Untouched`` and
    a one-element reading list.  For a word that is already stored, the new
    reading is appended to its existing list.

    The assertions check that ``pinyin`` is unchanged by ``strip_tone``
    (i.e. tones are assumed to be already removed), that the stored status
    is still ``Untouched``, and that the same pinyin is not added twice.
    """
    # assume all tones are already removed
    assert pinyin == strip_tone(pinyin)

    entry = words_dict.get(word)
    if entry is None:
        # First reading for this word.
        words_dict[word] = (Untouched, [(pinyin, freq)])
        return

    current_status, readings = entry
    assert Untouched == current_status

    # Each pinyin may appear only once per word.
    assert all(known != pinyin for known, _ in readings)

    # The tuple in words_dict holds this same list, so appending is enough.
    readings.append((pinyin, freq))


def filter_out(word, pinyin):
    """Remove the reading ``pinyin`` of ``word`` from ``words_dict``.

    Words that are not in ``words_dict`` are ignored.  Otherwise every stored
    reading whose pinyin equals ``pinyin`` is removed and the word is marked
    ``Partial``; when no readings remain it is marked ``Complete``.  A
    message is printed when the word is stored but has no such reading (the
    word is still marked ``Partial`` in that case).
    """
    if word not in words_dict:
        return

    (_, pinyins) = words_dict[word]

    # Build the surviving readings first instead of deleting while iterating:
    # deleting during iteration skips the element right after each removed
    # entry, so adjacent duplicates would be left behind.
    remaining = [item for item in pinyins if item[0] != pinyin]
    found = len(remaining) != len(pinyins)

    # Slice assignment keeps the same list object that words_dict refers to.
    pinyins[:] = remaining

    if not found:
        print('Missing {0} and {1} in opengram'.format(word, pinyin))

    status = Partial if pinyins else Complete
    words_dict[word] = (status, pinyins)


def load_opengram_dictionary(infilename):
    """Read the opengram dictionary file into ``words_dict``.

    Each non-blank line is split on whitespace into three fields: word,
    pinyin and an integer frequency, which are passed to
    ``add_words_dict``.  Blank lines are skipped.

    Raises ValueError when a line does not have three fields or the
    frequency is not an integer.
    """
    # The context manager closes the file even when a line fails to parse.
    with open(infilename, "r") as infile:
        # Iterate lazily rather than loading the whole file with readlines().
        for oneline in infile:
            oneline = oneline.rstrip(os.linesep)
            if not oneline.strip():
                # Tolerate empty lines (e.g. a trailing one at end of file).
                continue
            (word, pinyin, freq) = oneline.split(None, 2)
            add_words_dict(word, pinyin, int(freq))


def filter_core_dictionary(infilename):
    """Apply ``filter_out`` to every entry of one libpinyin dictionary file.

    Each non-blank line is split on whitespace into three fields: word,
    pinyin and an integer frequency.  Only word and pinyin are passed on;
    the frequency is parsed solely to validate the line.  Blank lines are
    skipped.

    Raises ValueError when a line does not have three fields or the
    frequency is not an integer.
    """
    # The context manager closes the file even when a line fails to parse.
    with open(infilename, "r") as infile:
        # Iterate lazily rather than loading the whole file with readlines().
        for oneline in infile:
            oneline = oneline.rstrip(os.linesep)
            if not oneline.strip():
                # Tolerate empty lines (e.g. a trailing one at end of file).
                continue
            (word, pinyin, freq) = oneline.split(None, 2)
            # The value is unused, but malformed frequencies must still fail.
            int(freq)
            filter_out(word, pinyin)


def save_opengram_dictionary(outfilename):
    """Write all readings of words still marked ``Untouched`` to a file.

    One line is written per (word, pinyin, freq) reading, with the three
    fields separated by tabs.  Words with any other status are skipped.
    """
    # The context manager closes the file even if a write fails.
    with open(outfilename, "w") as outfile:
        for word, (status, pinyins) in words_dict.items():
            if Untouched != status:
                continue
            for (pinyin, freq) in pinyins:
                oneline = "\t".join((word, pinyin, str(freq)))
                # Text mode already translates "\n" to the platform line
                # ending; writing os.linesep would give "\r\r\n" on Windows.
                outfile.write(oneline + "\n")


def save_partial_dictionary(outfilename):
    """Write the remaining readings of words marked ``Partial`` to a file.

    One line is written per (word, pinyin, freq) reading, with the three
    fields separated by tabs.  Words with any other status are skipped.
    """
    # The context manager closes the file even if a write fails.
    with open(outfilename, "w") as outfile:
        for word, (status, pinyins) in words_dict.items():
            if Partial != status:
                continue
            for (pinyin, freq) in pinyins:
                oneline = "\t".join((word, pinyin, str(freq)))
                # Text mode already translates "\n" to the platform line
                # ending; writing os.linesep would give "\r\r\n" on Windows.
                outfile.write(oneline + "\n")


if __name__ == "__main__":
    # Load every opengram entry first, then strike out the readings that the
    # libpinyin tables already contain, and finally write both output files.
    print('Loading opengram dictionary')
    load_opengram_dictionary("dict.full")

    print('Filtering libpinyin dictionary')
    for core_table in (
        "merged_gb_char.txt",
        "merged_gb_phrase.txt",
        "merged_gbk_char.txt",
    ):
        filter_core_dictionary(core_table)

    print('Saving opengram dictionary')
    save_opengram_dictionary("opengram.txt")
    print('Saving partial dictionary')
    save_partial_dictionary("partial_opengram.txt")