diff options
author | Peng Wu <alexepico@gmail.com> | 2011-07-23 17:13:39 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-07-23 17:13:39 +0800 |
commit | 4e18ee54f475ee6ca588d9504c627a4dfa409645 (patch) | |
tree | d85bba43c17df8465642af0115cb6abe5c111c03 /generate.py | |
parent | abead147cd1eea3ff7ba8b34055374a8ad812464 (diff) | |
download | trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.tar.gz trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.tar.xz trainer-4e18ee54f475ee6ca588d9504c627a4dfa409645.zip |
write generate.py in progress
Diffstat (limited to 'generate.py')
-rw-r--r-- | generate.py | 81 |
1 files changed, 76 insertions, 5 deletions
#!/usr/bin/python3
# generate.py as of this commit, reconstructed from the added/context lines of
# the diff (diff --git a/generate.py b/generate.py, index 8cb0be8..06b2f92).
import os
import os.path
import subprocess
import utils
from myconfig import MyConfig

# NOTE(review): the diff does not show the file's lines between the imports
# and this point.  The second hunk header names `def handleError(error):`, so
# that function lives in the omitted span; the module-level `config` object
# used below is presumably created there as well -- confirm against the file.


#Note: all file passed here should be trained.
def generateOneText(infile, modelfile, reportfile):
    """Run ./gen_k_mixture_model on one text file.

    infile     -- path of the text file handed to the tool as its last argument.
    modelfile  -- path passed via --k-mixture-model-file.
    reportfile -- path that receives whatever the tool wrote to stderr;
                  the data is appended, and the file is only touched when
                  stderr was non-empty.

    Returns None.  The tool's exit status is not inspected.
    """
    # Popen only accepts string arguments; the return types of the config
    # getters are not visible here, so convert defensively (str() of a str
    # is a no-op).
    cmdline = [
        './gen_k_mixture_model',
        '--maximum-occurs-allowed', str(config.getMaximumOccurs()),
        '--maximum-increase-rates-allowed',
        str(config.getMaximumIncreaseRates()),
        '--k-mixture-model-file', modelfile,
        infile,
    ]

    # Use the module-qualified names (only `import subprocess` is in scope)
    # and keep the child in `proc` so the module name is not shadowed.
    proc = subprocess.Popen(cmdline, shell=False, stderr=subprocess.PIPE,
                            close_fds=True)

    # communicate() drains stderr to EOF, closes the pipe and reaps the
    # child, replacing the manual readlines() + os.waitpid() sequence.
    _, errdata = proc.communicate()

    if errdata:
        print('found error report')
        with open(reportfile, 'ab') as f:
            f.write(errdata)


#Note: should check the corpus file size, and skip the too small text file.
def _removeIfExists(path):
    """Delete *path*; a path that does not exist is silently ignored."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass


def handleOneIndex(indexpath, subdir, indexname):
    """Generate candidate models for every text listed in one index file.

    indexpath -- path of the index file; each line has the form
                 ``title#textpath``.
    subdir    -- directory of the index file relative to the walked root.
    indexname -- index file name with the index postfix removed.

    Texts smaller than config.getMinimumFileSize() are skipped.  The others
    are passed to generateOneText() with a model file under
    <model dir>/<subdir>/<indexname>/.  Once the accumulated size of the
    texts fed into the current model exceeds config.getCandidateModelSize(),
    the next model number is selected and any stale model/report file with
    that number is removed.

    Raises ValueError when a line does not contain exactly one '#'.
    """
    print(indexpath, subdir, indexname)

    modelnum, aggmodelsize = 0, 0
    # Loop-invariant: every model of this index lives in the same directory.
    modeldir = os.path.join(config.getModelDir(), subdir, indexname)

    # `with` guarantees the index file is closed even if a text fails.
    with open(indexpath, 'r') as indexfile:
        for oneline in indexfile:
            #remove trailing '\n'
            oneline = oneline.rstrip(os.linesep)
            (title, textpath) = oneline.split('#')
            infile = config.getTextDir() + textpath
            infilesize = utils.get_file_length(infile)
            if infilesize < config.getMinimumFileSize():
                print("Skipping " + title + '#' + textpath)
                continue

            aggmodelsize += infilesize
            # Created lazily so an index whose texts are all skipped leaves
            # no empty directory behind.
            os.makedirs(modeldir, exist_ok=True)
            modelfile = os.path.join(
                modeldir, config.getCandidateModelName(modelnum))
            reportfile = modelfile + config.getReportPostfix()

            print("Proccessing " + title + '#' + textpath)
            generateOneText(infile, modelfile, reportfile)
            print("Processed " + title + '#' + textpath)

            if aggmodelsize > config.getCandidateModelSize():
                # Current model is full: move on to the next one.  Python has
                # no `++` operator, and the running size must restart at zero,
                # otherwise every later text would open yet another model.
                modelnum += 1
                aggmodelsize = 0
                nextmodel = os.path.join(
                    modeldir, config.getCandidateModelName(modelnum))
                # Clear leftovers so the next model starts from scratch.
                _removeIfExists(nextmodel)
                _removeIfExists(nextmodel + config.getReportPostfix())

            # TODO: save current progress in status file


def walkThroughIndex(path):
    """Walk *path* recursively and process every index file found.

    Files ending with config.getIndexPostfix() are handed to
    handleOneIndex(); files ending with config.getStatusPostfix() are
    ignored; anything else is reported as unexpected.  Errors raised by
    os.walk() are forwarded to handleError.
    """
    indexpostfix = config.getIndexPostfix()
    statuspostfix = config.getStatusPostfix()

    for root, dirs, files in os.walk(path, topdown=True, onerror=handleError):
        for onefile in files:
            filepath = os.path.join(root, onefile)
            if onefile.endswith(indexpostfix):
                subdir = os.path.relpath(root, path)
                # Explicit end index: `[:-0]` would yield '' if the postfix
                # were empty.
                indexname = onefile[:len(onefile) - len(indexpostfix)]
                handleOneIndex(filepath, subdir, indexname)
            elif onefile.endswith(statuspostfix):
                # Status files are not handled yet.
                continue
            else:
                print('Unexpected file:' + filepath)


if __name__ == '__main__':
    pass