summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-07-21 18:20:08 +0800
committer Peng Wu <alexepico@gmail.com> 2011-07-21 18:20:08 +0800
commit abead147cd1eea3ff7ba8b34055374a8ad812464 (patch)
tree eb3d5c457a5aaf89b2437baa9ee9cd3b01d55145
parent eea694d78b09818af1d3bba432bfa97262db8c2a (diff)
download trainer-abead147cd1eea3ff7ba8b34055374a8ad812464.tar.gz
trainer-abead147cd1eea3ff7ba8b34055374a8ad812464.tar.xz
trainer-abead147cd1eea3ff7ba8b34055374a8ad812464.zip
begin to write generate tool
-rw-r--r-- generate.py 29
-rw-r--r-- lib/myconfig.py 11
2 files changed, 40 insertions, 0 deletions
diff --git a/generate.py b/generate.py
new file mode 100644
index 0000000..8cb0be8
--- /dev/null
+++ b/generate.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python3
+from myconfig import MyConfig
+
+
+config = MyConfig()
+
+#change cwd to the libpinyin utils/training directory
+libpinyin_dir = config.getToolsDir()
+libpinyin_sub_dir = os.path.join(libpinyin_dir, 'utils', 'training')
+os.chdir(libpinyin_sub_dir)
+#chdir done
+
+def handleError(error):
+ sys.exit(error)
+
+
+#Note: all files passed here should already be trained.
+def generateOneText(infile, modelfile):
+ pass
+
+#Note: should check the corpus file size, and skip text files that are too small.
+def handleOneIndex(indexpath):
+ pass
+
+def walkThroughIndex(path):
+ pass
+
+if __name__ == '__main__':
+ pass
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 4ed631f..bea11a3 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -46,6 +46,17 @@ class MyConfig:
def getMinimumFileSize(self):
return self.m_minimum_file_size
+ m_candidate_model_size = 11.9 * 1024 * 1024
+
+ #the trained corpus size of model candidates
+ def getCandidateModelSize(self):
+ return self.m_candidate_model_size
+
+ m_candidate_model_name = "model-candidates-{0}.db"
+
+ def getCandidateModelName(self, index):
+ return self.m_candidate_model_name.format(index)
+
m_segment_postfix = '.segmented'
def getSegmentPostfix(self):