1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
#!/usr/bin/python3
import os
from operator import itemgetter
# Distinct words collected by add_words_set().
words_set = set()
# Maps (word, toneless pinyin) -> accumulated frequency; filled by add_words_dict().
words_dict = {}
def add_words_set(word):
    """Record *word* in the module-level ``words_set``.

    Words whose length is below 2 are ignored.  Adding a word that is
    already present leaves the set unchanged.
    """
    if len(word) >= 2:
        # set.add() is a no-op for existing members, so no membership
        # test is required before inserting.
        words_set.add(word)
def strip_tone(old_pinyin_str):
    """Return *old_pinyin_str* with the trailing tone digit of each syllable removed.

    Syllables are separated by apostrophes.  A syllable whose last
    character is a digit loses that single character; every other
    syllable is kept unchanged.  Empty syllables (an empty input string,
    or two consecutive apostrophes) are passed through as-is.

    Args:
        old_pinyin_str: apostrophe-separated pinyin, e.g. ``"ni3'hao3"``.

    Returns:
        The apostrophe-joined pinyin without tone digits, e.g. ``"ni'hao"``.
    """
    # Slice with [-1:] instead of indexing with [-1]: the slice of an empty
    # syllable is "" (whose isdigit() is False) rather than an IndexError.
    return "'".join(
        syllable[:-1] if syllable[-1:].isdigit() else syllable
        for syllable in old_pinyin_str.split("'")
    )
def add_words_dict(word, pinyin, freq):
    """Accumulate *freq* for a (word, pinyin) pair in ``words_dict``.

    The pinyin is passed through strip_tone() first, so the dictionary
    key is ``(word, toneless pinyin)``.  The first occurrence of a key
    stores *freq*; later occurrences add to the stored value.
    """
    entry = (word, strip_tone(pinyin))
    try:
        words_dict[entry] += freq
    except KeyError:
        # First time this pair is seen.
        words_dict[entry] = freq
def load_phrase(filename):
    """Load one phrase table into ``words_set`` and ``words_dict``.

    Every non-blank line is split on whitespace into four fields, in
    order: pinyin, word, token and frequency.  The token is not used.
    The word is passed to add_words_set() and the (word, pinyin,
    frequency) triple to add_words_dict().

    Args:
        filename: path of the table file to read.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line has fewer than four fields or its
            frequency field is not an integer.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm that matches the encoding of the table files.
    with open(filename, "r") as phrasefile:
        # Iterate the file lazily instead of materialising readlines().
        for oneline in phrasefile:
            if not oneline.strip():
                # A blank (e.g. trailing) line carries no record.
                continue
            # No explicit newline stripping is needed: int() ignores the
            # surrounding whitespace left on the last field.
            pinyin, word, _token, freq = oneline.split(None, 3)
            add_words_set(word)
            add_words_dict(word, pinyin, int(freq))
# Populate words_set and words_dict from both character tables.
load_phrase("gb_char.table")
load_phrase("gbk_char.table")

# Sorted list of the distinct words that were collected.
words_list = sorted(words_set)

# Flatten words_dict into (word, pinyin, freq) records ordered by word.
# The sort is stable, so records sharing a word keep the dictionary's
# iteration order.
oldwords_list = sorted(
    [(entry_word, entry_pinyin, total)
     for (entry_word, entry_pinyin), total in words_dict.items()],
    key=itemgetter(0),
)
def save_words_list(filename):
    """Write every entry of ``words_list`` to *filename*, one per line.

    Args:
        filename: path of the output file; it is created or truncated.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    # The context manager closes the file even if a write fails.
    with open(filename, "w") as wordsfile:
        # Use "\n", not os.linesep: a text-mode file already translates
        # "\n" to the platform line separator, so writing os.linesep would
        # produce "\r\r\n" on Windows.
        wordsfile.writelines(word + "\n" for word in words_list)
def save_words_dict(filename):
    """Write ``oldwords_list`` to *filename* as tab-separated records.

    Each line has the form ``word<TAB>pinyin<TAB>freq``.

    Args:
        filename: path of the output file; it is created or truncated.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    # The context manager closes the file even if a write fails.
    with open(filename, "w") as wordsfile:
        for word, pinyin, freq in oldwords_list:
            # Use "\n", not os.linesep: a text-mode file already translates
            # "\n" to the platform line separator, so writing os.linesep
            # would produce "\r\r\n" on Windows.
            wordsfile.write("\t".join((word, pinyin, str(freq))) + "\n")
if __name__ == "__main__":
    # Emit both output files: the plain word list first, then the
    # per-(word, pinyin) frequency table.
    for writer, target in (
        (save_words_list, "words.txt"),
        (save_words_dict, "oldwords.txt"),
    ):
        writer(target)
|