summaryrefslogtreecommitdiffstats
path: root/generate.py
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-07-23 17:13:39 +0800
committerPeng Wu <alexepico@gmail.com>2011-07-23 17:13:39 +0800
commit4e18ee54f475ee6ca588d9504c627a4dfa409645 (patch)
treed85bba43c17df8465642af0115cb6abe5c111c03 /generate.py
parentabead147cd1eea3ff7ba8b34055374a8ad812464 (diff)
downloadtrainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.tar.gz
trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.tar.xz
trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.zip
write generate.py in progress
Diffstat (limited to 'generate.py')
-rw-r--r--generate.py81
1 files changed, 76 insertions, 5 deletions
diff --git a/generate.py b/generate.py
index 8cb0be8..06b2f92 100644
--- a/generate.py
+++ b/generate.py
@@ -1,4 +1,8 @@
#!/usr/bin/python3
+import os
+import os.path
+import subprocess
+import utils
from myconfig import MyConfig
@@ -15,15 +19,82 @@ def handleError(error):
#Note: all files passed here should be trained.
-def generateOneText(infile, modelfile):
- pass
+def generateOneText(infile, modelfile, reportfile):
+    """Run ./gen_k_mixture_model on one text and record its error output.
+
+    infile is passed to the tool as its input file and modelfile as the
+    value of --k-mixture-model-file.  Anything the tool writes to stderr
+    is appended (in binary mode) to reportfile; when stderr is empty the
+    report file is not touched.  Returns None.
+    """
+    #begin processing
+    #argv entries must be strings, so convert the config values in case
+    #the getters return numbers.
+    cmdline = ['./gen_k_mixture_model',
+               '--maximum-occurs-allowed',
+               str(config.getMaximumOccurs()),
+               '--maximum-increase-rates-allowed',
+               str(config.getMaximumIncreaseRates()),
+               '--k-mixture-model-file',
+               modelfile, infile]
+    #only the subprocess module is imported, so qualify Popen and PIPE,
+    #and use a local name that does not shadow the module.
+    child = subprocess.Popen(cmdline, shell=False,
+                             stderr=subprocess.PIPE, close_fds=True)
+
+    #communicate() reads stderr until EOF and then waits for the child,
+    #so no separate waitpid call is needed.
+    errors = child.communicate()[1]
+    if errors:
+        print('found error report')
+        with open(reportfile, 'ab') as f:
+            f.write(errors)
+    #end processing
+
+#Note: should check the corpus file size, and skip text files that are too small.
-def handleOneIndex(indexpath):
- pass
+def handleOneIndex(indexpath, subdir, indexname):
+    """Generate candidate models for the texts listed in one index file.
+
+    Every line of the index at indexpath must have the form
+    'title#textpath'; a line that does not split into exactly two parts
+    on '#' raises ValueError.  Texts whose size, as reported by
+    utils.get_file_length, is below config.getMinimumFileSize() are
+    skipped.  The remaining texts are fed to generateOneText using the
+    current candidate model under
+    config.getModelDir()/subdir/indexname.  Once the accumulated text
+    size exceeds config.getCandidateModelSize() the next candidate model
+    is started and any stale model/report file for it is removed.
+    """
+    print(indexpath, subdir, indexname)
+
+    modelnum, aggmodelsize = 0, 0
+    #the model directory depends only on the index, not on each text.
+    modeldir = os.path.join(config.getModelDir(), subdir, indexname)
+    #begin processing
+    #the with statement closes the index even if a text fails.
+    with open(indexpath, 'r') as indexfile:
+        for oneline in indexfile:
+            #remove trailing '\n'
+            oneline = oneline.rstrip(os.linesep)
+            (title, textpath) = oneline.split('#')
+            infile = config.getTextDir() + textpath
+            infilesize = utils.get_file_length(infile)
+            if infilesize < config.getMinimumFileSize():
+                print("Skipping " + title + '#' + textpath)
+                continue
+            aggmodelsize += infilesize
+            #created lazily so an index with no usable text leaves no dir.
+            os.makedirs(modeldir, exist_ok=True)
+            modelfile = os.path.join(modeldir,
+                                     config.getCandidateModelName(modelnum))
+            reportfile = modelfile + config.getReportPostfix()
+            print("Proccessing " + title + '#' + textpath)
+            generateOneText(infile, modelfile, reportfile)
+            print("Processed " + title + '#' + textpath)
+            if aggmodelsize > config.getCandidateModelSize():
+                #Python has no ++ operator.
+                modelnum += 1
+                #NOTE(review): restart the size count for the new model;
+                #without this every later text would open a new model.
+                aggmodelsize = 0
+                #remove leftovers so the next model starts from scratch.
+                modelfile = os.path.join(
+                    modeldir, config.getCandidateModelName(modelnum))
+                reportfile = modelfile + config.getReportPostfix()
+                if os.access(modelfile, os.F_OK):
+                    os.unlink(modelfile)
+                if os.access(reportfile, os.F_OK):
+                    os.unlink(reportfile)
+            #TODO: save current process in status file
+    #end processing
+
def walkThroughIndex(path):
- pass
+    """Walk the tree below path and process every index file in it.
+
+    Files ending with config.getIndexPostfix() are passed to
+    handleOneIndex together with their directory relative to path and
+    their name without the postfix.  Files ending with
+    config.getStatusPostfix() are ignored; any other file is reported.
+    """
+    for root, dirs, files in os.walk(path, topdown=True, onerror=handleError):
+        for name in files:
+            fullpath = os.path.join(root, name)
+            postfix = config.getIndexPostfix()
+            if name.endswith(postfix):
+                relativedir = os.path.relpath(root, path)
+                handleOneIndex(fullpath, relativedir, name[:-len(postfix)])
+                continue
+            #status files are expected here and need no handling.
+            if not name.endswith(config.getStatusPostfix()):
+                print('Unexpected file:' + fullpath)
#Script entry point: still a placeholder, nothing runs when executed directly.
if __name__ == '__main__':
pass