4 files changed, 89 insertions, 8 deletions
diff --git a/docs/fileformat b/docs/fileformat
index bd94abe..89d5e1f 100644
--- a/docs/fileformat
+++ b/docs/fileformat
@@ -32,7 +32,7 @@ Status File Format
           <number>.text.status will be generated, like:
           {'GenerateEpoch': 2}.
        2. For <items>.index, <items>.index.status will be generated, like:
-          {'GenerateEpoch': 2}.
+          {'GenerateEpoch': 2, 'GenerateIndexEnd':10, 'GenerateTextEnd':1000}.
        3. The generated K Mixture Model files are placed in 'models':
           1. The model files are placed in the same sub-directory as <items>.index;
           2. Each model files are named as 'model-candidates-0.db', etc.
diff --git a/generate.py b/generate.py
index 8cb0be8..06b2f92 100644
--- a/generate.py
+++ b/generate.py
@@ -1,4 +1,8 @@
 #!/usr/bin/python3
+import os
+import os.path
+import subprocess
+import utils
 from myconfig import MyConfig
 
 
@@ -15,15 +19,82 @@ def handleError(error):
 
 
 #Note: all file passed here should be trained.
-def generateOneText(infile, modelfile):
-    pass
+def generateOneText(infile, modelfile, reportfile):
+    """Run gen_k_mixture_model on infile; append its stderr to reportfile."""
+    #Popen only accepts str/bytes/path arguments, so stringify the numbers.
+    cmdline = ['./gen_k_mixture_model',
+               '--maximum-occurs-allowed',
+               str(config.getMaximumOccurs()),
+               '--maximum-increase-rates-allowed',
+               str(config.getMaximumIncreaseRates()),
+               '--k-mixture-model-file', modelfile, infile]
+    #use a distinct local name so the 'subprocess' module is not shadowed.
+    proc = subprocess.Popen(cmdline, shell=False,
+                            stderr=subprocess.PIPE, close_fds=True)
+
+    #stderr is read to EOF first, then the child is reaped with wait().
+    lines = proc.stderr.readlines()
+    if lines:
+        print('found error report')
+        with open(reportfile, 'ab') as f:
+            f.writelines(lines)
+    proc.wait()
+
 
 #Note: should check the corpus file size, and skip the too small text file.
-def handleOneIndex(indexpath):
-    pass
+def handleOneIndex(indexpath, subdir, indexname):
+    """Generate candidate models for every text listed in one index file.
+
+    Each index line is 'title#textpath'; texts smaller than the minimum
+    file size are skipped.
+    """
+    print(indexpath, subdir, indexname)
+    modelnum, aggmodelsize = 0, 0
+    modeldir = os.path.join(config.getModelDir(), subdir, indexname)
+    with open(indexpath, 'r') as indexfile:
+        for oneline in indexfile:
+            #remove trailing '\n'
+            oneline = oneline.rstrip(os.linesep)
+            (title, textpath) = oneline.split('#')
+            infile = config.getTextDir() + textpath
+            infilesize = utils.get_file_length(infile)
+            if infilesize < config.getMinimumFileSize():
+                print("Skipping " + title + '#' + textpath)
+                continue
+            aggmodelsize += infilesize
+            os.makedirs(modeldir, exist_ok=True)
+            modelfile = os.path.join(modeldir,
+                                     config.getCandidateModelName(modelnum))
+            reportfile = modelfile + config.getReportPostfix()
+            print("Proccessing " + title + '#' + textpath)
+            generateOneText(infile, modelfile, reportfile)
+            print("Processed " + title + '#' + textpath)
+            if aggmodelsize > config.getCandidateModelSize():
+                #roll over: the next model starts with an empty size budget
+                modelnum += 1
+                aggmodelsize = 0
+                modelfile = os.path.join(modeldir,
+                                         config.getCandidateModelName(modelnum))
+                reportfile = modelfile + config.getReportPostfix()
+                for stale in (modelfile, reportfile):
+                    if os.access(stale, os.F_OK):
+                        os.unlink(stale)
+                #TODO: save current progress in status file
+
 
 def walkThroughIndex(path):
-    pass
+    """Walk path and call handleOneIndex on every index file found."""
+    indexpostfix = config.getIndexPostfix()
+    statuspostfix = config.getStatusPostfix()
+    walker = os.walk(path, topdown=True, onerror=handleError)
+    for root, _dirs, files in walker:
+        for onefile in files:
+            filepath = os.path.join(root, onefile)
+            if onefile.endswith(indexpostfix):
+                indexname = onefile[:-len(indexpostfix)]
+                handleOneIndex(filepath, os.path.relpath(root, path), indexname)
+            elif not onefile.endswith(statuspostfix):
+                print('Unexpected file:' + filepath)
 
 if __name__ == '__main__':
     pass
diff --git a/lib/myconfig.py b/lib/myconfig.py
index bea11a3..38a3e53 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -56,6 +56,16 @@ class MyConfig:
 
     def getCandidateModelName(self, index):
         return self.m_candidate_model_name.format(index)
+
+    m_maximum_occurs_allowed = 20  # value for --maximum-occurs-allowed
+
+    def getMaximumOccurs(self):
+        return self.m_maximum_occurs_allowed
+
+    m_maximum_increase_rates_allowed = 3.  # for --maximum-increase-rates-allowed
+
+    def getMaximumIncreaseRates(self):
+        return self.m_maximum_increase_rates_allowed
         
     m_segment_postfix = '.segmented'
 
diff --git a/segment.py b/segment.py
index b65bfd7..ef7a53b 100755
--- a/segment.py
+++ b/segment.py
@@ -57,9 +57,9 @@ def handleOneIndex(indexpath):
     #begin processing
     indexfile = open(indexpath, 'r')
     for oneline in indexfile.readlines():
-        (title, textpath) = oneline.split('#')
         #remove tailing '\n'
-        textpath = textpath.rstrip(os.linesep)
+        oneline = oneline.rstrip(os.linesep)
+        (title, textpath) = oneline.split('#')
         infile = config.getTextDir() + textpath
         outfile = config.getTextDir() + textpath + config.getSegmentPostfix()
         reportfile = config.getTextDir() + textpath + \