1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
import os
class MyConfig:
    ''' My Configuration

    Central collection of constants (directories, file names, postfixes,
    thresholds) exposed through getter methods.  All values are hard-coded
    here; nothing is read from disk or from the environment.
    '''

    # Epoch number per named stage.
    # NOTE(review): presumably bumped to invalidate results of a stage and
    # everything after it -- confirm against the callers of getEpochs().
    m_current_epoch = {
        'SegmentEpoch': 1,
        'GenerateEpoch': 1,
        'EstimateEpoch': 1,
        'PruneEpoch': 1,
        'EvaluateEpoch': 1,
        'PrepareEpoch': 2,
        'PopulateEpoch': 3,
        'PartialWordThresholdEpoch': 4,
    }

    # Root directory; the text/model/final/words directories hang off it.
    m_trainer_dir = '/media/data/Program/trainer'

    m_tools_dir = '/media/data/Program/trainer/tools/libpinyin'

    m_evals_dir = '/media/data/Program/trainer/evals/libpinyin'

    def getEpochs(self):
        '''Return the epoch table itself (the stored dict, not a copy).'''
        return self.m_current_epoch

    def getBaseDir(self):
        '''Return the trainer root directory.'''
        return self.m_trainer_dir

    def getTextDir(self):
        '''Return the 'texts' sub-directory of the trainer root.'''
        return self.m_trainer_dir + os.sep + 'texts'

    def getModelDir(self):
        '''Return the 'models' sub-directory of the trainer root.'''
        return self.m_trainer_dir + os.sep + 'models'

    def getFinalModelDir(self):
        '''Return the 'finals' sub-directory of the trainer root.'''
        return self.m_trainer_dir + os.sep + 'finals'

    def getToolsDir(self):
        '''Return the tools directory.'''
        return self.m_tools_dir

    def getEvalsDir(self):
        '''Return the evals directory.'''
        return self.m_evals_dir

    def getEstimatesModel(self):
        '''Return the path of 'data/estimates.db' under the tools directory.'''
        estimates_model = self.m_tools_dir + '/data/estimates.db'
        return estimates_model

    def getEstimateIndex(self):
        '''Return the file name of the estimate index.'''
        return 'estimate.index'

    def getSortedEstimateIndex(self):
        '''Return the file name of the sorted estimate index.'''
        return 'estimate.sorted.index'

    def getInMemoryFileSystem(self):
        '''Return the mount point used as in-memory file system.'''
        return '/dev/shm'

    def getEvalsTextFileName(self):
        '''Return the file name of the evaluation text.'''
        return 'evals2.text'

    def getMinimumFileSize(self):
        '''Return the minimum file size as an integer.

        The size is derived from a character count of 1,200: three units
        per character plus half a unit per character.
        NOTE(review): presumably bytes of UTF-8 text plus some slack for
        punctuation/whitespace -- confirm.
        '''
        #about 1,200 Chinese characters
        minimum_chinese_characters = 1200
        # Floor division keeps the result an int on Python 3 as well;
        # plain '/' would yield 4200.0 there.
        minimum_file_size = minimum_chinese_characters * 3 + \
            minimum_chinese_characters // 2
        return minimum_file_size

    #the trained corpus size of model candidates
    def getCandidateModelSize(self):
        '''Return the candidate model corpus size (a float, 11.9 * 1024 * 1024).'''
        candidate_model_size = 11.9 * 1024 * 1024 * 1
        return candidate_model_size

    def getModelPostfix(self):
        '''Return the postfix of model files.'''
        return '.db'

    def getCandidateModelName(self, index):
        '''Return the candidate model file name for the given index.

        index is substituted into "model-candidates-{0}.db" via str.format.
        '''
        candidate_model_name = "model-candidates-{0}.db"
        return candidate_model_name.format(index)

    def getMaximumOccursAllowed(self):
        '''Return the maximum number of occurrences allowed.'''
        return 20

    def getMaximumIncreaseRatesAllowed(self):
        '''Return the maximum increase rate allowed (a float).'''
        return 3.

    def getReportPostfix(self):
        '''Return the postfix of report files.'''
        return '.report'

    def getSegmentPostfix(self):
        '''Return the postfix of segmented files.'''
        return '.segmented'

    def getSegmentReportPostfix(self):
        '''Return the postfix of segment report files.'''
        return '.segment.report'

    #For both index page, item page and binary model file
    def getStatusPostfix(self):
        '''Return the postfix of status files.'''
        return '.status'

    def getIndexPostfix(self):
        '''Return the postfix of index files.'''
        return '.index'

    def getTextPostfix(self):
        '''Return the postfix of text files.'''
        return '.text'

    def getFinalModelFileName(self):
        '''Return the file name of the final model.'''
        return 'interpolation2.text'

    def getFinalStatusFileName(self):
        '''Return the file name of the final status file.'''
        return 'cwd.status'

    #
    # Word Recognizer Configuration
    #

    def getWordRecognizerDir(self):
        '''Return the 'words' sub-directory of the trainer root.'''
        return self.m_trainer_dir + os.sep + 'words'

    def getNgramFileName(self, length):
        '''Return the n-gram database file name, e.g. '2-gram.db' for 2.'''
        return str(length) + '-gram.db'

    def getWordSep(self):
        '''Return the word separator (a single space).'''
        return " "

    def getMaximumCombineNumber(self):
        '''Return the maximum combine number N (must be at least 2).'''
        N = 5
        # Sanity check on the constant above for whoever edits it.
        assert N >= 2, 'at least bi-gram'
        return N

    def getMinimumOccurrence(self):
        '''Return the minimum word occurrence.'''
        return 3 # minimum word occurrence

    def getPartialWordThreshold(self):
        '''Return the partial word threshold (0.30).'''
        # NOTE(review): the original comment read "the last 10% in position"
        # while the value is 0.30 -- confirm which one is intended.
        return 0.30 # the last 10% in position

    def getNewWordThreshold(self):
        '''Return the new word threshold (0.30 / 2).'''
        # NOTE(review): the original comment read "the last 5% in position"
        # while the value is 0.15 -- confirm which one is intended.
        return 0.30 / 2 # the last 5% in position

    def getMaximumIteration(self):
        '''Return the maximum number of iterations.'''
        return 20 # roughly around N

    def getWordsListFileName(self):
        '''Return the file name of the words list.'''
        return "words.txt"

    def getWordsWithPinyinFileName(self):
        '''Return the file name of the words-with-pinyin list.'''
        return "oldwords.txt"
|