write generate.py in progress

author: Peng Wu <alexepico@gmail.com> 2011-07-23 17:13:39 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-07-23 17:13:39 +0800
commit: 4e18ee54f475ee6ca588d9504c627a4dfa409645 (patch)
tree: d85bba43c17df8465642af0115cb6abe5c111c03 /generate.py
parent: abead147cd1eea3ff7ba8b34055374a8ad812464 (diff)
download: trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.tar.gz
trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.tar.xz
trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.zip
1 files changed, 76 insertions, 5 deletions
diff --git a/generate.py b/generate.py
index 8cb0be8..06b2f92 100644
--- a/generate.py
+++ b/generate.py
@@ -1,4 +1,8 @@
 #!/usr/bin/python3
+import os
+import os.path
+import subprocess
+import utils
 from myconfig import MyConfig
 
 
@@ -15,15 +19,82 @@ def handleError(error):
 
 
 #Note: all file passed here should be trained.
-def generateOneText(infile, modelfile):
-    pass
+def generateOneText(infile, modelfile, reportfile):
+    #Run ./gen_k_mixture_model on infile; append its stderr to reportfile.
+    #argv entries must be strings, so config values are passed through str()
+    cmdline = ['./gen_k_mixture_model', '--maximum-occurs-allowed',
+                   str(config.getMaximumOccurs()),
+                   '--maximum-increase-rates-allowed',
+                   str(config.getMaximumIncreaseRates()),
+                   '--k-mixture-model-file',
+                   modelfile, infile]
+    #Use a distinct local name: rebinding 'subprocess' would hide the module.
+    proc = subprocess.Popen(cmdline, shell=False, close_fds=True,
+                            stderr=subprocess.PIPE)
+    #Drain stderr before waiting so a full pipe cannot block the child.
+    lines = proc.stderr.readlines()
+    if lines:
+        print('found error report')
+        with open(reportfile, 'ab') as f:
+            f.writelines(lines)
+    proc.stderr.close()
+    proc.wait()
+
 
 #Note: should check the corpus file size, and skip the too small text file.
-def handleOneIndex(indexpath):
-    pass
+def handleOneIndex(indexpath, subdir, indexname):
+    #Feed each text listed in an index file into a candidate model file.
+    #Each index line is 'title#textpath'; texts below the configured minimum
+    #size are skipped, and a new model starts once the batch grows too big.
+    print(indexpath, subdir, indexname)
+
+    modelnum, aggmodelsize = 0, 0
+    modeldir = os.path.join(config.getModelDir(), subdir, indexname)
+    #the with-block closes the index even if a text fails half-way
+    with open(indexpath, 'r') as indexfile:
+        for oneline in indexfile:
+            #remove trailing '\n'
+            oneline = oneline.rstrip(os.linesep)
+            (title, textpath) = oneline.split('#')
+            infile = config.getTextDir() + textpath
+            infilesize = utils.get_file_length(infile)
+            if infilesize < config.getMinimumFileSize():
+                print("Skipping " + title + '#' + textpath)
+                continue
+            aggmodelsize += infilesize
+            os.makedirs(modeldir, exist_ok=True)
+            modelfile = os.path.join(modeldir,
+                                     config.getCandidateModelName(modelnum))
+            reportfile = modelfile + config.getReportPostfix()
+            print("Proccessing " + title + '#' + textpath)
+            generateOneText(infile, modelfile, reportfile)
+            print("Processed " + title + '#' + textpath)
+            if aggmodelsize <= config.getCandidateModelSize():
+                continue
+            #batch is full: start the next model with a fresh size counter
+            modelnum += 1
+            aggmodelsize = 0
+            nextmodel = os.path.join(modeldir,
+                                     config.getCandidateModelName(modelnum))
+            for stale in (nextmodel, nextmodel + config.getReportPostfix()):
+                if os.access(stale, os.F_OK):
+                    os.unlink(stale)
+            #TODO: save current progress in status file
+
 
 def walkThroughIndex(path):
-    pass
+    #Walk path top-down and hand every index file to handleOneIndex.
+    for curdir, _subdirs, names in os.walk(path, True, handleError):
+        for name in names:
+            fullname = os.path.join(curdir, name)
+            suffix = config.getIndexPostfix()
+            if name.endswith(suffix):
+                relroot = os.path.relpath(curdir, path)
+                handleOneIndex(fullname, relroot, name[:-len(suffix)])
+                continue
+            #status files are expected here; anything else is reported
+            if not name.endswith(config.getStatusPostfix()):
+                print('Unexpected file:' + fullname)
 
 if __name__ == '__main__':
     pass
author	Peng Wu <alexepico@gmail.com>	2011-07-23 17:13:39 +0800
committer	Peng Wu <alexepico@gmail.com>	2011-07-23 17:13:39 +0800
commit	4e18ee54f475ee6ca588d9504c627a4dfa409645 (patch)
tree	d85bba43c17df8465642af0115cb6abe5c111c03 /generate.py
parent	abead147cd1eea3ff7ba8b34055374a8ad812464 (diff)
download	trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.tar.gz trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.tar.xz trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.zip