begin to write generate tool

author: Peng Wu <alexepico@gmail.com> 2011-07-21 18:20:08 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-07-21 18:20:08 +0800
commit: abead147cd1eea3ff7ba8b34055374a8ad812464 (patch)
tree: eb3d5c457a5aaf89b2437baa9ee9cd3b01d55145
parent: eea694d78b09818af1d3bba432bfa97262db8c2a (diff)
2 files changed, 40 insertions, 0 deletions
diff --git a/generate.py b/generate.py
new file mode 100644
index 0000000..8cb0be8
--- /dev/null
+++ b/generate.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python3
+from myconfig import MyConfig
+
+
+config = MyConfig()
+
+#change cwd to the libpinyin utils/training directory
+libpinyin_dir = config.getToolsDir()
+libpinyin_sub_dir = os.path.join(libpinyin_dir, 'utils', 'training')
+os.chdir(libpinyin_sub_dir)
+#chdir done
+
+def handleError(error):
+    sys.exit(error)
+
+
+#Note: all files passed here should be trained.
+def generateOneText(infile, modelfile):
+    pass
+
+#Note: should check the corpus file size, and skip text files that are too small.
+def handleOneIndex(indexpath):
+    pass
+
+def walkThroughIndex(path):
+    pass
+
+if __name__ == '__main__':
+    pass
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 4ed631f..bea11a3 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -46,6 +46,17 @@ class MyConfig:
     def getMinimumFileSize(self):
         return self.m_minimum_file_size
 
+    m_candidate_model_size = 11.9 * 1024 * 1024
+
+    #the trained corpus size of model candidates
+    def getCandidateModelSize(self):
+        return self.m_candidate_model_size
+
+    m_candidate_model_name = "model-candidates-{0}.db"
+
+    def getCandidateModelName(self, index):
+        return self.m_candidate_model_name.format(index)
+        
     m_segment_postfix = '.segmented'
 
     def getSegmentPostfix(self):
author	Peng Wu <alexepico@gmail.com>	2011-07-21 18:20:08 +0800
committer	Peng Wu <alexepico@gmail.com>	2011-07-21 18:20:08 +0800
commit	abead147cd1eea3ff7ba8b34055374a8ad812464 (patch)
tree	eb3d5c457a5aaf89b2437baa9ee9cd3b01d55145
parent	eea694d78b09818af1d3bba432bfa97262db8c2a (diff)