summaryrefslogtreecommitdiffstats
path: root/tools/mergepartialopengram.py
blob: 4c971a73e09cc9362e1427c3b5e645b0ea8edbc2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/python3
import os

from distill import strip_tone
from filteropengram import Untouched, words_dict, load_opengram_dictionary


'''
merge partial opengram into merged_gb*_opengram.txt
'''

def meet_word(outfile, word):
    if not word in words_dict:
        return

    (status, pinyins) = words_dict[word]
    assert Untouched == status

    for (pinyin, freq) in pinyins:
        freq = str(freq)
        oneline = "\t".join((word, pinyin, freq))
        outfile.writelines([oneline, os.linesep])

    del words_dict[word]


def merge_core_dictionary(infilename, outfilename):
    infile = open(infilename, "r")
    outfile = open(outfilename, "w")

    for oneline in infile.readlines():
        oneline = oneline.rstrip(os.linesep)
        (word, pinyin, freq) = oneline.split(None, 2)
        freq = int(freq)
        meet_word(outfile, word)

    outfile.close()
    infile.close()


if __name__ == "__main__":
    print('Loading partial opengram dictionary')
    load_opengram_dictionary("partial_opengram.txt")

    print('Merging partial opengram dictionary')
    merge_core_dictionary("merged_gb_char.txt", "merged_gb_char_opengram.txt")
    merge_core_dictionary("merged_gb_phrase.txt", "merged_gb_phrase_opengram.txt")
    merge_core_dictionary("merged_gbk_char.txt", "merged_gbk_char_opengram.txt")

    print('Check remained phrases')
    assert 0 == len(words_dict)