begin to write populate.py

author: Peng Wu <alexepico@gmail.com> 2013-01-10 16:40:35 +0800
committer: Peng Wu <alexepico@gmail.com> 2013-01-10 16:40:35 +0800
commit: a2862954e48a36cd117bbbe26dd235e7c21fd930 (patch)
tree: ba5bdf7fb73bbd5436df0c430861a0065d86806d /populate.py
parent: 2a3f09ba2e2aa0e99cb45b2ced390e7def0c157a (diff)
download: trainer-a2862954e48a36cd117bbbe26dd235e7c21fd930.tar.gz
trainer-a2862954e48a36cd117bbbe26dd235e7c21fd930.tar.xz
trainer-a2862954e48a36cd117bbbe26dd235e7c21fd930.zip
1 files changed, 178 insertions, 0 deletions
diff --git a/populate.py b/populate.py
new file mode 100644
index 0000000..8d82914
--- /dev/null
+++ b/populate.py
@@ -0,0 +1,178 @@
+#!/usr/bin/python3
+import os
+import os.path
+import sqlite3
+from argparse import ArgumentParser
+import utils
+from myconfig import MyConfig
+
+
+#SQL statements for the ngram and bigram tables.
+#Every ? is a positional placeholder bound through the sqlite3 parameter
+#tuple. It must stay unquoted: SQLite reads a quoted "?" as the literal
+#one-character string, so nothing would be bound to it.
+
+#look up the frequency stored for one word sequence: (words,)
+SELECT_NGRAM_DML = '''
+Select freq from ngram where words = ?;
+'''
+
+#add a new word sequence with its frequency: (words, freq)
+INSERT_NGRAM_DML = '''
+Insert into ngram(words, freq) values(?, ?);
+'''
+
+#overwrite the frequency of an existing word sequence: (freq, words)
+UPDATE_NGRAM_DML = '''
+Update ngram set freq = ? where words = ?;
+'''
+
+#fetch every stored word sequence together with its frequency
+SELECT_ALL_DML = '''
+Select words, freq from ngram;
+'''
+
+#add one bigram row: (prefix, postfix, freq)
+INSERT_BIGRAM_DML = '''
+Insert into bigram(prefix, postfix, freq) values (?, ?, ?);
+'''
+
+#shared configuration object, read by the functions below
+config = MyConfig()
+
+#maximum combine number: handleOneIndex runs one pass per length in 1..N
+N = config.getMaximumCombineNumber()
+
+#change cwd to the word recognizer directory (created first if missing);
+#this runs at import time, before the command line is parsed
+words_dir = config.getWordRecognizerDir()
+os.makedirs(words_dir, exist_ok=True)
+os.chdir(words_dir)
+#chdir done
+
+
+def handleError(error):
+    """Abort the script; passed to os.walk as its onerror callback.
+
+    error becomes the exit status/message, exactly as sys.exit(error)
+    would treat it.
+
+    Raises SystemExit unconditionally.
+    """
+    #sys is not imported by this script, so sys.exit() would fail here
+    #with NameError; raising SystemExit is what sys.exit() does itself
+    raise SystemExit(error)
+
+
+def handleOneDocument(infile, conn, length):
+    """Slide a window of `length` words over one segmented document.
+
+    infile is the document path without postfix; the status file and the
+    segmented file are located by appending the configured postfixes.
+    conn is the sqlite3 connection of the current pass; it is not used
+    yet because the database update is still to be written.
+    length is the number of consecutive words in each window.
+
+    Returns False when the document already carries the 'Populate' epoch,
+    True once the document has been scanned.
+    Raises utils.EpochError when the 'Segment' epoch is missing.
+    """
+    print(infile, length)
+
+    infilestatuspath = infile + config.getStatusPostfix()
+    infilestatus = utils.load_status(infilestatuspath)
+    if not utils.check_epoch(infilestatus, 'Segment'):
+        raise utils.EpochError('Please segment first.\n')
+    if utils.check_epoch(infilestatus, 'Populate'):
+        return False
+
+    sep = config.getWordSep()
+
+    #train
+    words = []
+
+    #the with statement closes the file even when a malformed line raises
+    with open(infile + config.getSegmentPostfix(), 'r') as docfile:
+        #iterate lazily instead of loading every line with readlines()
+        for oneline in docfile:
+            oneline = oneline.rstrip(os.linesep)
+
+            if len(oneline) == 0:
+                continue
+
+            #each line is "<token> <word>"; only the first space splits
+            (token, word) = oneline.split(" ", 1)
+            token = int(token)
+
+            #token 0 restarts the window and its own word is dropped
+            if 0 == token:
+                words = []
+            else:
+                words.append(word)
+
+            if len(words) < length:
+                continue
+
+            #keep only the most recent `length` words
+            if len(words) > length:
+                words.pop(0)
+
+            assert len(words) == length
+
+            #do sqlite training
+            #words_str is the key intended for the ngram table; unused so far
+            words_str = sep + sep.join(words) + sep
+            print(words)
+
+    #sign epoch
+    #utils.sign_epoch(infilestatus, 'Populate')
+    #utils.store_status(infilestatuspath, infilestatus)
+    return True
+
+def handleOnePass(indexpath, workdir, length):
+    """Process every document listed in one index file for one length.
+
+    indexpath is the index file; each of its lines is "title#textpath".
+    workdir is the directory holding the n-gram database of this index.
+    length is the n-gram length of this pass; it selects the database
+    file name through config.getNgramFileName(length).
+
+    Documents whose segmented file is smaller than the configured minimum
+    size are skipped. Exceptions raised by handleOneDocument propagate.
+    """
+    print(indexpath, workdir, length)
+
+    filename = config.getNgramFileName(length)
+    filepath = workdir + os.sep + filename
+
+    conn = sqlite3.connect(filepath)
+    try:
+        #begin processing
+        #the with statement closes the index file on errors as well
+        with open(indexpath, 'r') as indexfile:
+            for oneline in indexfile:
+                oneline = oneline.rstrip(os.linesep)
+                (title, textpath) = oneline.split('#')
+                infile = config.getTextDir() + textpath
+                infilesize = utils.get_file_length(
+                    infile + config.getSegmentPostfix())
+                if infilesize < config.getMinimumFileSize():
+                    print("Skipping " + title + '#' + textpath)
+                    continue
+
+                #process one document
+                handleOneDocument(infile, conn, length)
+
+        #commit only after the whole index went through without error
+        conn.commit()
+    finally:
+        #release the database on every path, including failures
+        conn.close()
+
+def handleBigramPass(indexpath, workdir):
+    """Placeholder for the bigram pass; it does nothing yet.
+
+    handleOneIndex calls it with the index file path and the work
+    directory after all n-gram passes have run.
+    """
+    pass
+
+
+def handleOneIndex(indexpath, subdir, indexname):
+    """Run all n-gram passes and the bigram pass for one index file.
+
+    indexpath is the index file, subdir its directory relative to the
+    index root, and indexname the file name without the index postfix.
+
+    Returns None; it returns early when the index already carries the
+    'Populate' epoch.
+    Raises utils.EpochError when the 'Prepare' epoch is missing.
+    """
+    print(indexpath, subdir, indexname)
+
+    indexstatuspath = indexpath + config.getStatusPostfix()
+    indexstatus = utils.load_status(indexstatuspath)
+
+    prepared = utils.check_epoch(indexstatus, 'Prepare')
+    if not prepared:
+        raise utils.EpochError('Please prepare first.\n')
+
+    populated = utils.check_epoch(indexstatus, 'Populate')
+    if populated:
+        return
+
+    #<word recognizer dir>/<subdir>/<indexname>
+    parts = (config.getWordRecognizerDir(), subdir, indexname)
+    workdir = os.sep.join(parts)
+    print(workdir)
+
+    #one pass per n-gram length, from 1 up to N inclusive
+    for length in range(1, N + 1):
+        handleOnePass(indexpath, workdir, length)
+
+    handleBigramPass(indexpath, workdir)
+
+    #sign epoch
+    #utils.sign_epoch(indexstatus, 'Populate')
+    #utils.store_status(indexstatuspath, indexstatus)
+    
+
+
+def walkThroughIndex(path):
+    """Walk the tree under path and handle every index file found.
+
+    Files ending with the index postfix go to handleOneIndex, files
+    ending with the status postfix are skipped silently, and any other
+    file is reported on stdout. Errors from os.walk go to handleError.
+    """
+    for root, dirs, files in os.walk(path, topdown=True, onerror=handleError):
+        for onefile in files:
+            filepath = os.path.join(root, onefile)
+            indexpostfix = config.getIndexPostfix()
+
+            if onefile.endswith(indexpostfix):
+                #directory of the index relative to the walk root
+                subdir = os.path.relpath(root, path)
+                #file name with the index postfix cut off
+                indexname = onefile[:-len(indexpostfix)]
+                handleOneIndex(filepath, subdir, indexname)
+                continue
+
+            #anything that is neither an index nor a status file is reported
+            if not onefile.endswith(config.getStatusPostfix()):
+                print('Unexpected file:' + filepath)
+
+
+if __name__ == '__main__':
+    #the index directory defaults to <text dir>/index
+    default_indexdir = os.path.join(config.getTextDir(), 'index')
+
+    parser = ArgumentParser(description='Populate n-gram.')
+    parser.add_argument('--indexdir', action='store',
+                        help='index directory',
+                        default=default_indexdir)
+
+    args = parser.parse_args()
+    print(args)
+
+    #handle every index file below the chosen directory
+    walkThroughIndex(args.indexdir)
+    print('done')
author	Peng Wu <alexepico@gmail.com>	2013-01-10 16:40:35 +0800
committer	Peng Wu <alexepico@gmail.com>	2013-01-10 16:40:35 +0800
commit	a2862954e48a36cd117bbbe26dd235e7c21fd930 (patch)
tree	ba5bdf7fb73bbd5436df0c430861a0065d86806d /populate.py
parent	2a3f09ba2e2aa0e99cb45b2ced390e7def0c157a (diff)
download	trainer-a2862954e48a36cd117bbbe26dd235e7c21fd930.tar.gz trainer-a2862954e48a36cd117bbbe26dd235e7c21fd930.tar.xz trainer-a2862954e48a36cd117bbbe26dd235e7c21fd930.zip