#!/usr/bin/python3
"""Populate n-gram sqlite databases from segmented text documents.

Reconstructed from commit a2862954e48a36cd117bbbe26dd235e7c21fd930
("begin to write populate.py", Peng Wu, 2013-01-10).

The script walks an index directory, and for every index file runs one pass
per n-gram length (1..N).  Each pass opens a per-length sqlite database and
slides a window of ``length`` words over every listed document.

NOTE(review): this is work in progress in the original commit -- the n-grams
are only printed, nothing is written to the database yet, the bigram pass is
a stub, and the 'Populate' epoch is never signed.
"""
import os
import os.path
import sqlite3
import sys
from argparse import ArgumentParser
import utils
from myconfig import MyConfig


# NOTE: the placeholders must be bare ``?``.  A quoted "?" is not a bind
# parameter in SQLite, so binding values to it would fail.
SELECT_NGRAM_DML = '''
Select freq from ngram where words = ?;
'''

INSERT_NGRAM_DML = '''
Insert into ngram(words, freq) values(?, ?);
'''

UPDATE_NGRAM_DML = '''
Update ngram set freq = ? where words = ?;
'''

SELECT_ALL_DML = '''
Select words, freq from ngram;
'''

INSERT_BIGRAM_DML = '''
Insert into bigram(prefix, postfix, freq) values (?, ?, ?);
'''

config = MyConfig()

# maximum combine number (upper bound of the n-gram lengths that are processed)
N = config.getMaximumCombineNumber()

# change cwd to the word recognizer directory
words_dir = config.getWordRecognizerDir()
os.makedirs(words_dir, exist_ok=True)
os.chdir(words_dir)
# chdir done


def handleError(error):
    """Abort the program; used as the ``onerror`` callback of ``os.walk``."""
    sys.exit(error)


def handleOneDocument(infile, conn, length):
    """Slide a window of ``length`` words over one segmented document.

    Args:
        infile: document path without postfix; the status file and the
            segmented file are located by appending the configured postfixes.
        conn: sqlite connection of the current pass (not used yet).
        length: number of consecutive words per n-gram.

    Returns:
        False when the document's status already has the 'Populate' epoch,
        True after the document has been processed.

    Raises:
        utils.EpochError: the document's status lacks the 'Segment' epoch.
    """
    print(infile, length)

    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(infilestatus, 'Populate'):
        return False

    sep = config.getWordSep()

    # train
    words = []
    # ``with`` guarantees the file is closed even if a line is malformed.
    with open(infile + config.getSegmentPostfix(), 'r') as docfile:
        for oneline in docfile:
            oneline = oneline.rstrip(os.linesep)

            if len(oneline) == 0:
                continue

            # each line is "<token> <word>"; a token of 0 resets the window
            (token, word) = oneline.split(" ", 1)
            token = int(token)

            if 0 == token:
                words = []
            else:
                words.append(word)

            # window not full yet
            if len(words) < length:
                continue

            # drop the oldest word so the window keeps exactly `length` words
            if len(words) > length:
                words.pop(0)

            assert len(words) == length

            # do sqlite training
            # TODO: store words_str through `conn`; for now only print.
            words_str = sep + sep.join(words) + sep
            print(words)

    # sign epoch
    # TODO: disabled in the original commit; re-enable once training is
    # implemented:
    #   utils.sign_epoch(infilestatus, 'Populate')
    #   utils.store_status(infilestatuspath, infilestatus)
    return True


def handleOnePass(indexpath, workdir, length):
    """Run one n-gram pass of the given ``length`` over every indexed document.

    Args:
        indexpath: path of the index file; each non-empty line is
            ``title#textpath``.
        workdir: directory holding the per-length sqlite database.
        length: number of consecutive words per n-gram.
    """
    print(indexpath, workdir, length)

    filename = config.getNgramFileName(length)
    filepath = os.path.join(workdir, filename)

    conn = sqlite3.connect(filepath)
    try:
        # begin processing
        with open(indexpath, 'r') as indexfile:
            for oneline in indexfile:
                oneline = oneline.rstrip(os.linesep)
                # skip blank lines, consistent with handleOneDocument
                if not oneline:
                    continue

                (title, textpath) = oneline.split('#')
                infile = config.getTextDir() + textpath
                infilesize = utils.get_file_length(
                    infile + config.getSegmentPostfix())
                if infilesize < config.getMinimumFileSize():
                    print("Skipping " + title + '#' + textpath)
                    continue

                # process one document
                handleOneDocument(infile, conn, length)

        # only commit when the whole pass succeeded
        conn.commit()
    finally:
        # always release the database, even when a document fails
        conn.close()


def handleBigramPass(indexpath, workdir):
    """Placeholder for the bigram pass; not implemented yet."""
    pass


def handleOneIndex(indexpath, subdir, indexname):
    """Run all n-gram passes (1..N) and the bigram pass for one index file.

    Args:
        indexpath: path of the index file.
        subdir: directory of the index file relative to the index root.
        indexname: index file name with the index postfix removed.

    Raises:
        utils.EpochError: the index status lacks the 'Prepare' epoch.
    """
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Prepare'):
        raise utils.EpochError('Please prepare first.\n')
    if utils.check_epoch(indexstatus, 'Populate'):
        return

    workdir = os.path.join(config.getWordRecognizerDir(), subdir, indexname)
    print(workdir)
    # sqlite3.connect() does not create missing parent directories
    os.makedirs(workdir, exist_ok=True)

    for i in range(1, N + 1):
        handleOnePass(indexpath, workdir, i)

    handleBigramPass(indexpath, workdir)

    # sign epoch
    # TODO: disabled in the original commit; re-enable once the passes are
    # implemented:
    #   utils.sign_epoch(indexstatus, 'Populate')
    #   utils.store_status(indexstatuspath, indexstatus)


def walkThroughIndex(path):
    """Walk ``path`` recursively and process every index file found.

    Status files are skipped silently; any other file is reported as
    unexpected.  An ``os.walk`` error terminates the program via handleError.
    """
    # loop-invariant lookups
    indexpostfix = config.getIndexPostfix()
    statuspostfix = config.getStatusPostfix()

    for root, dirs, files in os.walk(path, topdown=True, onerror=handleError):
        for onefile in files:
            filepath = os.path.join(root, onefile)
            if onefile.endswith(indexpostfix):
                subdir = os.path.relpath(root, path)
                # explicit end index stays correct for an empty postfix
                indexname = onefile[:len(onefile) - len(indexpostfix)]
                handleOneIndex(filepath, subdir, indexname)
            elif onefile.endswith(statuspostfix):
                pass
            else:
                print('Unexpected file:' + filepath)


def main():
    """Parse the command line and populate n-grams for the index directory."""
    parser = ArgumentParser(description='Populate n-gram.')
    parser.add_argument('--indexdir', action='store',
                        help='index directory',
                        default=os.path.join(config.getTextDir(), 'index'))

    args = parser.parse_args()
    print(args)
    walkThroughIndex(args.indexdir)
    print('done')


if __name__ == '__main__':
    main()