generate.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

#!/usr/bin/python3
import os
import os.path
import subprocess
import utils
from myconfig import MyConfig


config = MyConfig()

#change cwd to the libpinyin utils/training directory
libpinyin_dir = config.getToolsDir()
libpinyin_sub_dir = os.path.join(libpinyin_dir, 'utils', 'training')
os.chdir(libpinyin_sub_dir)
#chdir done

def handleError(error):
    sys.exit(error)


#Note: all file passed here should be trained.
def generateOneText(infile, modelfile, reportfile):
    #begin processing
    cmdline = ['./gen_k_mixture_model', '--maximum-occurs-allowed', \
                   config.getMaximumOccurs(), \
                   '--maximum-increase-rates-allowed', \
                   config.getMaximumIncreaseRates(), \
                   '--k-mixture-model-file', \
                   modelfile, infile]
    subprocess = Popen(cmdline, shell=False, stderr=PIPE, \
                           close_fds=True)

    lines = subprocess.stderr.readlines()
    if lines:
        print('found error report')
        with open(reportfile, 'ab') as f:
            f.writelines(lines)
        f.close()

    os.waitpid(subprocess.pid, 0);
    #end processing


#Note: should check the corpus file size, and skip the too small text file.
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    textnum, modelnum, aggmodelsize = 0, 0, 0
    #begin processing
    indexfile = open(indexpath, 'r')
    for i, oneline in enumerate(indexfile.readlines()):
        #remove trailing '\n'
        oneline = oneline.rstrip(os.linesep)
        (title, textpath) = oneline.split('#')
        infile = config.getTextDir() + textpath
        infilesize = utils.get_file_length(infile)
        if infilesize < config.getMinimumFileSize():
            print("Skipping " + title + '#' + textpath)
            continue
        aggmodelsize += infilesize
        modeldir = os.path.join(config.getModelDir(), subdir, indexname)
        os.makedirs(modeldir, exist_ok=True)
        modelfile = os.path.join(modeldir, \
                                 config.getCandidateModelName(modelnum))
        reportfile = modelfile + config.getReportPostfix()
        print("Proccessing " + title + '#' + textpath)
        generateOneText(infile, modelfile, reportfile)
        print("Processed " + title + '#' + textpath)
        if aggmodelsize > config.getCandidateModelSize():
            modelnum++
            modeldir = os.path.join(config.getModelDir(), subdir, indexname)
            modelfile = os.path.join(modeldir, \
                                         config.getCandidateModelName(modelnum))
            reportfile = modelfile + config.getReportPostfix()
            if os.access(modelfile, os.F_OK):
                os.unlink(modelfile)
            if os.access(reportfile, os.F_OK):
                os.unlink(reportfile)
            #save current process in status file
            pass
    indexfile.close()
    #end processing


def walkThroughIndex(path):
    for root, dirs, files in os.walk(path, topdown=True, onerror=handleError):
        for onefile in files:
            filepath = os.path.join(root, onefile)
            indexpostfix = config.getIndexPostfix()
            if onefile.endswith(indexpostfix):
                subdir = os.path.relpath(root, path)
                indexname = onefile[:-len(indexpostfix)]
                handleOneIndex(filepath, subdir, indexname)
            elif onefile.endswith(config.getStatusPostfix()):
                pass
            else:
                print('Unexpected file:' + filepath)

if __name__ == '__main__':
    pass