summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- docs/fileformat 2
-rw-r--r-- generate.py 81
-rw-r--r-- lib/myconfig.py 10
-rwxr-xr-x segment.py 4
4 files changed, 89 insertions, 8 deletions
diff --git a/docs/fileformat b/docs/fileformat
index bd94abe..89d5e1f 100644
--- a/docs/fileformat
+++ b/docs/fileformat
@@ -32,7 +32,7 @@ Status File Format
<number>.text.status will be generated, like:
{'GenerateEpoch': 2}.
2. For <items>.index, <items>.index.status will be generated, like:
- {'GenerateEpoch': 2}.
+ {'GenerateEpoch': 2, 'GenerateIndexEnd':10, 'GenerateTextEnd':1000}.
3. The generated K Mixture Model files are placed in 'models':
1. The model files are placed in the same sub-directory as <items>.index;
2. Each model file is named like 'model-candidates-0.db', etc.
diff --git a/generate.py b/generate.py
index 8cb0be8..06b2f92 100644
--- a/generate.py
+++ b/generate.py
@@ -1,4 +1,8 @@
#!/usr/bin/python3
+import os
+import os.path
+import subprocess
+import utils
from myconfig import MyConfig
@@ -15,15 +19,82 @@ def handleError(error):
#Note: all files passed here should be trained.
-def generateOneText(infile, modelfile):
- pass
+def generateOneText(infile, modelfile, reportfile):
+    #Run ./gen_k_mixture_model on one text file.
+    #  infile: input text file, passed as the last command line argument.
+    #  modelfile: passed as the value of --k-mixture-model-file.
+    #  reportfile: whatever the tool prints on stderr is appended here;
+    #  the file is only opened when there is something to write.
+    #begin processing
+    #every argv entry has to be a string, so the numeric limits read
+    #from config are converted explicitly
+    cmdline = ['./gen_k_mixture_model',
+               '--maximum-occurs-allowed',
+               str(config.getMaximumOccurs()),
+               '--maximum-increase-rates-allowed',
+               str(config.getMaximumIncreaseRates()),
+               '--k-mixture-model-file',
+               modelfile, infile]
+    #the local is deliberately not called 'subprocess': that would shadow
+    #the module and break the Popen/PIPE lookups
+    child = subprocess.Popen(cmdline, shell=False,
+                             stderr=subprocess.PIPE, close_fds=True)
+
+    #communicate() drains stderr, closes the pipe and waits for the child
+    _, errdata = child.communicate()
+    if errdata:
+        print('found error report')
+        with open(reportfile, 'ab') as f:
+            f.write(errdata)
+    #end processing
+
#Note: should check the corpus file size, and skip text files that are too small.
-def handleOneIndex(indexpath):
- pass
+def handleOneIndex(indexpath, subdir, indexname):
+    #Generate candidate model files for the texts listed in one index file.
+    #  indexpath: index file to read; every line is '<title>#<textpath>'.
+    #  subdir, indexname: joined onto config.getModelDir() to form the
+    #  directory that receives the model and report files.
+    #Texts smaller than config.getMinimumFileSize() are skipped.
+    print(indexpath, subdir, indexname)
+
+    #textnum and the loop counter i are not used yet; presumably they are
+    #reserved for the status file bookkeeping below -- TODO confirm
+    textnum, modelnum, aggmodelsize = 0, 0, 0
+    modeldir = os.path.join(config.getModelDir(), subdir, indexname)
+    #begin processing
+    with open(indexpath, 'r') as indexfile:
+        for i, oneline in enumerate(indexfile):
+            #remove trailing '\n'
+            oneline = oneline.rstrip(os.linesep)
+            (title, textpath) = oneline.split('#')
+            infile = config.getTextDir() + textpath
+            infilesize = utils.get_file_length(infile)
+            if infilesize < config.getMinimumFileSize():
+                print("Skipping " + title + '#' + textpath)
+                continue
+            aggmodelsize += infilesize
+            os.makedirs(modeldir, exist_ok=True)
+            modelfile = os.path.join(modeldir, \
+                                     config.getCandidateModelName(modelnum))
+            reportfile = modelfile + config.getReportPostfix()
+            print("Proccessing " + title + '#' + textpath)
+            generateOneText(infile, modelfile, reportfile)
+            print("Processed " + title + '#' + textpath)
+            if aggmodelsize > config.getCandidateModelSize():
+                #the current candidate is full: switch to the next one and
+                #restart the size count, otherwise every following text
+                #would open yet another candidate
+                modelnum += 1
+                aggmodelsize = 0
+                modelfile = os.path.join(modeldir, \
+                                         config.getCandidateModelName(modelnum))
+                reportfile = modelfile + config.getReportPostfix()
+                #remove any existing files carrying the next candidate's names
+                if os.access(modelfile, os.F_OK):
+                    os.unlink(modelfile)
+                if os.access(reportfile, os.F_OK):
+                    os.unlink(reportfile)
+            #save current process in status file
+            #TODO: not implemented yet
+    #end processing
+
def walkThroughIndex(path):
- pass
+    #Walk the tree below path and hand every index file to handleOneIndex,
+    #together with its directory relative to path and its name without
+    #the index postfix. Status files are ignored, anything else is reported.
+    for dirpath, _dirnames, filenames in os.walk(path, topdown=True, onerror=handleError):
+        for name in filenames:
+            fullpath = os.path.join(dirpath, name)
+            suffix = config.getIndexPostfix()
+            if name.endswith(suffix):
+                relativedir = os.path.relpath(dirpath, path)
+                stem = name[:-len(suffix)]
+                handleOneIndex(fullpath, relativedir, stem)
+                continue
+            if name.endswith(config.getStatusPostfix()):
+                #status files need no handling here
+                continue
+            print('Unexpected file:' + fullpath)
#Script entry point; it currently does nothing when the file is run directly.
if __name__ == '__main__':
pass
diff --git a/lib/myconfig.py b/lib/myconfig.py
index bea11a3..38a3e53 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -56,6 +56,16 @@ class MyConfig:
#Return the candidate model file name for index, built by formatting the
#m_candidate_model_name template with it.
def getCandidateModelName(self, index):
return self.m_candidate_model_name.format(index)
+
+ m_maximum_occurs_allowed = 20
+
+    def getMaximumOccurs(self):
+        #Return the configured m_maximum_occurs_allowed value.
+        limit = self.m_maximum_occurs_allowed
+        return limit
+
+ m_maximum_increase_rates_allowed = 3.
+
+    def getMaximumIncreaseRates(self):
+        #Return the configured m_maximum_increase_rates_allowed value.
+        #The attribute name is plural ('rates'); reading the singular
+        #spelling raises AttributeError because no such attribute exists.
+        return self.m_maximum_increase_rates_allowed
m_segment_postfix = '.segmented'
diff --git a/segment.py b/segment.py
index b65bfd7..ef7a53b 100755
--- a/segment.py
+++ b/segment.py
@@ -57,9 +57,9 @@ def handleOneIndex(indexpath):
#begin processing
indexfile = open(indexpath, 'r')
for oneline in indexfile.readlines():
- (title, textpath) = oneline.split('#')
#remove trailing '\n'
- textpath = textpath.rstrip(os.linesep)
+ oneline = oneline.rstrip(os.linesep)
+ (title, textpath) = oneline.split('#')
infile = config.getTextDir() + textpath
outfile = config.getTextDir() + textpath + config.getSegmentPostfix()
reportfile = config.getTextDir() + textpath + \