1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
#!/usr/bin/python3
import os
from distill import strip_tone
'''
Filter the phrases that already exist in libpinyin out of the opengram dictionary.
'''
# Status of an opengram word relative to the libpinyin dictionaries.
Untouched = 3  # not in libpinyin, move to opengram.txt
Partial = 4    # only partial information in libpinyin, save to partial_opengram.txt
Complete = 5   # already in libpinyin, do nothing

# Maps each word to a (status, pinyins) pair,
# where pinyins is a list of (pinyin, freq) tuples.
words_dict = {}
def add_words_dict(word, pinyin, freq):
    """Register one (pinyin, freq) reading of word in words_dict.

    A word seen for the first time is stored with status Untouched. A word
    that is already present must still be Untouched and must not already
    carry the same pinyin; both conditions are checked with assertions.
    """
    # The pinyin is expected to arrive with its tones already removed.
    assert pinyin == strip_tone(pinyin)

    if word in words_dict:
        (status, pinyins) = words_dict[word]
        assert Untouched == status

        # Every reading of a word must be unique.
        for (oldpinyin, oldfreq) in pinyins:
            assert oldpinyin != pinyin

        pinyins.append((pinyin, freq))
    else:
        words_dict[word] = (Untouched, [(pinyin, freq)])
def filter_out(word, pinyin):
    """Drop the reading (word, pinyin) from words_dict.

    Words that are not in words_dict are ignored. Otherwise every entry of
    the word whose pinyin equals ``pinyin`` is removed, and the word is
    marked Partial, or Complete once no readings are left. A message is
    printed when the word is present but none of its readings matches.
    """
    if word not in words_dict:
        return

    (status, pinyins) = words_dict[word]

    remaining = [item for item in pinyins if item[0] != pinyin]
    found = len(remaining) != len(pinyins)

    # Deleting from the list while enumerating it skips the element that
    # follows each deletion. Filter first, then slice-assign so the list
    # object already stored in words_dict is still updated in place.
    pinyins[:] = remaining

    if not found:
        print('Missing {0} and {1} in opengram'.format(word, pinyin))

    # The word was touched either way; it is Complete once nothing remains.
    status = Partial if pinyins else Complete
    words_dict[word] = (status, pinyins)
def load_opengram_dictionary(infilename):
    """Load every "word pinyin freq" line of infilename into words_dict.

    Each line is split on whitespace into three fields, freq is converted
    to int, and the entry is registered through add_words_dict. A line
    with fewer than three fields or a non-integer freq raises ValueError.
    """
    # NOTE(review): no encoding is passed, so open() uses the locale
    # default - confirm that it matches the dictionary files.
    # The "with" block closes the file even when a malformed line raises.
    with open(infilename, "r") as infile:
        # Iterate the file lazily instead of loading it with readlines().
        for oneline in infile:
            oneline = oneline.rstrip(os.linesep)
            (word, pinyin, freq) = oneline.split(None, 2)
            add_words_dict(word, pinyin, int(freq))
def filter_core_dictionary(infilename):
    """Apply filter_out to every "word pinyin freq" line of infilename.

    Each line is split on whitespace into three fields. Only word and
    pinyin are passed on; freq is still converted to int so that a
    malformed line raises ValueError instead of being silently accepted.
    """
    # NOTE(review): no encoding is passed, so open() uses the locale
    # default - confirm that it matches the dictionary files.
    # The "with" block closes the file even when a malformed line raises.
    with open(infilename, "r") as infile:
        # Iterate the file lazily instead of loading it with readlines().
        for oneline in infile:
            oneline = oneline.rstrip(os.linesep)
            (word, pinyin, freq) = oneline.split(None, 2)
            int(freq)  # validation only; the value itself is not used
            filter_out(word, pinyin)
def save_opengram_dictionary(outfilename):
    """Write the readings of every word still marked Untouched.

    One "word<TAB>pinyin<TAB>freq" line is written to outfilename for each
    (pinyin, freq) pair of each Untouched word in words_dict.
    """
    # The "with" block closes the file even if a write fails part-way.
    with open(outfilename, "w") as outfile:
        for word, (status, pinyins) in words_dict.items():
            if Untouched != status:
                continue
            for (pinyin, freq) in pinyins:
                oneline = "\t".join((word, pinyin, str(freq)))
                # Text mode already translates "\n" into the platform line
                # ending; writing os.linesep would give "\r\r\n" on Windows.
                outfile.write(oneline + "\n")
def save_partial_dictionary(outfilename):
    """Write the remaining readings of every word marked Partial.

    One "word<TAB>pinyin<TAB>freq" line is written to outfilename for each
    (pinyin, freq) pair of each Partial word in words_dict.
    """
    # The "with" block closes the file even if a write fails part-way.
    with open(outfilename, "w") as outfile:
        for word, (status, pinyins) in words_dict.items():
            if Partial != status:
                continue
            for (pinyin, freq) in pinyins:
                oneline = "\t".join((word, pinyin, str(freq)))
                # Text mode already translates "\n" into the platform line
                # ending; writing os.linesep would give "\r\r\n" on Windows.
                outfile.write(oneline + "\n")
if __name__ == "__main__":
    # Load the opengram entries, filter them against the libpinyin tables,
    # then write the untouched and the partially covered words.
    print('Loading opengram dictionary')
    load_opengram_dictionary("dict.full")

    print('Filtering libpinyin dictionary')
    for core_table in ("merged_gb_char.txt",
                       "merged_gb_phrase.txt",
                       "merged_gbk_char.txt"):
        filter_core_dictionary(core_table)

    print('Saving opengram dictionary')
    save_opengram_dictionary("opengram.txt")

    # Progress message spelling corrected (was 'paritial').
    print('Saving partial dictionary')
    save_partial_dictionary("partial_opengram.txt")
|