diff options
-rw-r--r-- | docs/fileformat | 2 | ||||
-rw-r--r-- | generate.py | 81 | ||||
-rw-r--r-- | lib/myconfig.py | 10 | ||||
-rwxr-xr-x | segment.py | 4 |
4 files changed, 89 insertions, 8 deletions
diff --git a/docs/fileformat b/docs/fileformat index bd94abe..89d5e1f 100644 --- a/docs/fileformat +++ b/docs/fileformat @@ -32,7 +32,7 @@ Status File Format <number>.text.status will be generated, like: {'GenerateEpoch': 2}. 2. For <items>.index, <items>.index.status will be generated, like: - {'GenerateEpoch': 2}. + {'GenerateEpoch': 2, 'GenerateIndexEnd':10, 'GenerateTextEnd':1000}. 3. The generated K Mixture Model files are placed in 'models': 1. The model files are placed in the same sub-directory as <items>.index; 2. Each model files are named as 'model-candidates-0.db', etc. diff --git a/generate.py b/generate.py index 8cb0be8..06b2f92 100644 --- a/generate.py +++ b/generate.py @@ -1,4 +1,8 @@ #!/usr/bin/python3 +import os +import os.path +import subprocess +import utils from myconfig import MyConfig @@ -15,15 +19,82 @@ def handleError(error): #Note: all file passed here should be trained. -def generateOneText(infile, modelfile): - pass +def generateOneText(infile, modelfile, reportfile): + #begin processing + cmdline = ['./gen_k_mixture_model', '--maximum-occurs-allowed', \ + str(config.getMaximumOccurs()), \ + '--maximum-increase-rates-allowed', \ + str(config.getMaximumIncreaseRates()), \ + '--k-mixture-model-file', \ + modelfile, infile] + proc = subprocess.Popen(cmdline, shell=False, stderr=subprocess.PIPE, \ + close_fds=True) + + lines = proc.stderr.readlines() + if lines: + print('found error report') + with open(reportfile, 'ab') as f: + f.writelines(lines) + f.close() + + proc.wait() + #end processing + #Note: should check the corpus file size, and skip the too small text file. 
-def handleOneIndex(indexpath): - pass +def handleOneIndex(indexpath, subdir, indexname): + print(indexpath, subdir, indexname) + + textnum, modelnum, aggmodelsize = 0, 0, 0 + #begin processing + indexfile = open(indexpath, 'r') + for i, oneline in enumerate(indexfile.readlines()): + #remove trailing '\n' + oneline = oneline.rstrip(os.linesep) + (title, textpath) = oneline.split('#') + infile = config.getTextDir() + textpath + infilesize = utils.get_file_length(infile) + if infilesize < config.getMinimumFileSize(): + print("Skipping " + title + '#' + textpath) + continue + aggmodelsize += infilesize + modeldir = os.path.join(config.getModelDir(), subdir, indexname) + os.makedirs(modeldir, exist_ok=True) + modelfile = os.path.join(modeldir, \ + config.getCandidateModelName(modelnum)) + reportfile = modelfile + config.getReportPostfix() + print("Proccessing " + title + '#' + textpath) + generateOneText(infile, modelfile, reportfile) + print("Processed " + title + '#' + textpath) + if aggmodelsize > config.getCandidateModelSize(): + modelnum += 1 + modeldir = os.path.join(config.getModelDir(), subdir, indexname) + modelfile = os.path.join(modeldir, \ + config.getCandidateModelName(modelnum)) + reportfile = modelfile + config.getReportPostfix() + if os.access(modelfile, os.F_OK): + os.unlink(modelfile) + if os.access(reportfile, os.F_OK): + os.unlink(reportfile) + #save current process in status file + pass + indexfile.close() + #end processing + def walkThroughIndex(path): - pass + for root, dirs, files in os.walk(path, topdown=True, onerror=handleError): + for onefile in files: + filepath = os.path.join(root, onefile) + indexpostfix = config.getIndexPostfix() + if onefile.endswith(indexpostfix): + subdir = os.path.relpath(root, path) + indexname = onefile[:-len(indexpostfix)] + handleOneIndex(filepath, subdir, indexname) + elif onefile.endswith(config.getStatusPostfix()): + pass + else: + print('Unexpected file:' + filepath) if __name__ == '__main__': pass diff --git 
a/lib/myconfig.py b/lib/myconfig.py index bea11a3..38a3e53 100644 --- a/lib/myconfig.py +++ b/lib/myconfig.py @@ -56,6 +56,16 @@ class MyConfig: def getCandidateModelName(self, index): return self.m_candidate_model_name.format(index) + + m_maximum_occurs_allowed = 20 + + def getMaximumOccurs(self): + return self.m_maximum_occurs_allowed + + m_maximum_increase_rates_allowed = 3. + + def getMaximumIncreaseRates(self): + return self.m_maximum_increase_rates_allowed m_segment_postfix = '.segmented' @@ -57,9 +57,9 @@ def handleOneIndex(indexpath): #begin processing indexfile = open(indexpath, 'r') for oneline in indexfile.readlines(): - (title, textpath) = oneline.split('#') #remove tailing '\n' - textpath = textpath.rstrip(os.linesep) + oneline = oneline.rstrip(os.linesep) + (title, textpath) = oneline.split('#') infile = config.getTextDir() + textpath outfile = config.getTextDir() + textpath + config.getSegmentPostfix() reportfile = config.getTextDir() + textpath + \ |