summaryrefslogtreecommitdiffstats
path: root/tools/merge.py
blob: 80ae80f319b8a4db5959fbd85d00d7623fc0ee73 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/python3
import os
from argparse import ArgumentParser


# Minimum number of input dictionaries a word must appear in before its
# lines are moved out of those dictionaries and into the merged one.
threshold = 3

# Minimum rescaled pinyin frequency kept in the merged dictionary.
# Un-merged word/pinyin/freq lines are left untouched; only the merged
# word/pinyin/freq tuples are affected.
# Try to use 2 to maximize the number of phrases in the merged table.
minimum = 2

# Total frequency that the pinyin frequencies of each merged word are
# rescaled to.
default = 100

# Maps each word to the number of loaded dictionary files it appears in.
words_dict = {}


def load_recognized_words(filename):
    """Count the words of one dictionary file into ``words_dict``.

    Every non-blank line must hold three whitespace-separated fields:
    word, pinyin and frequency.  Each distinct word of the file raises its
    counter in ``words_dict`` by exactly one, however many lines (pinyin
    variants) the word has in this file.

    Args:
        filename: path of the dictionary file to scan.

    Raises:
        OSError: if the file cannot be opened or read.
        ValueError: if a non-blank line has fewer than three fields.
    """
    print(filename)

    words = set()
    # The context manager closes the file even when a malformed line raises.
    with open(filename, "r") as wordfile:
        for oneline in wordfile:
            fields = oneline.split(None, 2)

            # Skip empty and whitespace-only lines.
            if not fields:
                continue

            # Only the word matters here; the unpack still validates that
            # the line has all three fields.
            word, _pinyin, _freq = fields
            words.add(word)

    for word in words:
        words_dict[word] = words_dict.get(word, 0) + 1


# Maps each merged word to the list of (pinyin, freq) pairs collected from
# every filtered dictionary file.
merged_words_dict = {}


def filter_recognized_words(filename):
    """Move widely shared words out of one dictionary file.

    The file is read line by line (word, pinyin, frequency separated by
    whitespace).  A line whose word is counted at least ``threshold`` times
    in ``words_dict`` is removed from the file and its (pinyin, freq) pair
    is appended to ``merged_words_dict[word]``.  All other non-blank lines
    are kept verbatim.  The file is then rewritten in place with only the
    kept lines.

    Args:
        filename: path of the dictionary file to filter in place.

    Raises:
        OSError: if the file cannot be read or rewritten.
        ValueError: if a non-blank line has fewer than three fields or its
            frequency is not an integer.  The file is not rewritten then.
    """
    print(filename)
    kept = []

    #loading
    with open(filename, "r") as wordfile:
        for oneline in wordfile:
            # Text mode already normalizes line endings to "\n".
            oneline = oneline.rstrip("\n")

            fields = oneline.split(None, 2)

            # Skip empty and whitespace-only lines.
            if not fields:
                continue

            word, pinyin, freq = fields
            freq = int(freq)

            occurs = words_dict.get(word)
            if occurs is None or occurs < threshold:
                kept.append(oneline)
                continue

            merged_words_dict.setdefault(word, []).append((pinyin, freq))

    #saving
    # Write "\n", not os.linesep: a text-mode file translates "\n" to the
    # platform line ending itself, so os.linesep would yield "\r\r\n" on
    # Windows.
    with open(filename, "w") as wordfile:
        wordfile.writelines(line + "\n" for line in kept)


def save_merged_words(filename):
    """Write the words collected in ``merged_words_dict`` to a file.

    For every merged word the frequencies of identical pinyins are summed
    and then rescaled so that the pinyins of one word share a total of
    ``default``.  A pinyin whose rescaled frequency is below ``minimum`` is
    left out.  Each output line is ``word<TAB>pinyin<TAB>freq``.

    Args:
        filename: path of the merged dictionary to create or overwrite.

    Raises:
        OSError: if the file cannot be written.
    """
    print(filename)

    with open(filename, "w") as wordfile:
        for word, pairs in merged_words_dict.items():
            # Sum the frequencies reported for the same pinyin.
            pinyins = {}
            for pinyin, freq in pairs:
                pinyins[pinyin] = pinyins.get(pinyin, 0) + freq

            freqsum = sum(pinyins.values())

            for pinyin, freq in pinyins.items():
                if freqsum:
                    freq = int(default * freq / freqsum)
                else:
                    # All frequencies are zero: share the total evenly
                    # instead of dividing by zero, which would abort the
                    # run after the input files were already rewritten.
                    freq = int(default / len(pinyins))

                if freq < minimum:
                    continue

                # Write "\n", not os.linesep: text mode translates it to
                # the platform line ending on its own.
                wordfile.write('\t'.join((word, pinyin, str(freq))) + "\n")


def main():
    """Parse the command line and run the load, filter and save passes."""
    parser = ArgumentParser(description='merge dictionaries.')
    parser.add_argument(
        '-o', '--output', action='store',
        help='merged dictionary', default='merged.txt',
    )
    parser.add_argument('inputs', type=str, nargs='+', help='dictionaries')

    args = parser.parse_args()
    print(args)

    # Pass 1: count in how many input dictionaries each word occurs.
    for path in args.inputs:
        load_recognized_words(path)

    # Pass 2: strip the shared words from every input dictionary.
    for path in args.inputs:
        filter_recognized_words(path)

    # Pass 3: write the collected shared words to the merged dictionary.
    save_merged_words(args.output)
    print('done')


if __name__ == "__main__":
    main()