diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-16 11:24:12 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-16 11:24:12 +0800 |
commit | 19fd47bf825fb85ca4f83563156ffa8436a54792 (patch) | |
tree | 5461e1ca2776f9af823161e0b02d67417dc2f522 /populatebigram.py | |
parent | 4fb5560fc42f501dece0f3803a36cf419af996fd (diff) | |
download | trainer-19fd47bf825fb85ca4f83563156ffa8436a54792.tar.gz trainer-19fd47bf825fb85ca4f83563156ffa8436a54792.tar.xz trainer-19fd47bf825fb85ca4f83563156ffa8436a54792.zip |
write populatebigram.py
Diffstat (limited to 'populatebigram.py')
-rw-r--r-- | populatebigram.py | 128 |
1 files changed, 128 insertions, 0 deletions
diff --git a/populatebigram.py b/populatebigram.py new file mode 100644 index 0000000..e6940e1 --- /dev/null +++ b/populatebigram.py @@ -0,0 +1,128 @@ +#!/usr/bin/sqlite3 +import os +import os.path +import sqlite3 +from argparse import ArgumentParser +import utils +from myconfig import MyConfig + + +CREATE_BIGRAM_DDL = ''' +CREATE TABLE bigram ( + prefix TEXT NOT NULL, + postfix TEXT NOT NULL, + freq INTEGER NOT NULL + ); +''' + +SELECT_ALL_NGRAM_DML = ''' +SELECT words, freq FROM ngram; +''' + +DELETE_BIGRAM_DML = ''' +DELETE FROM bigram; +''' + +INSERT_BIGRAM_DML = ''' +INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?); +''' + + +config = MyConfig() + +#change cwd to the word recognizer directory +words_dir = config.getWordRecognizerDir() +os.chdir(words_dir) +#chdir done + + +def handleError(error): + sys.exit(error) + + +def createBigramSqlite(indexpath, workdir): + print(indexpath, workdir, 'create bigram') + length = 2 + + filename = config.getNgramFileName(length) + filepath = workdir + os.sep + filename + print(filepath) + + if os.access(filepath, os.F_OK): + os.unlink(filepath) + + conn = sqlite3.connect(filepath) + cur = conn.cursor() + cur.execute(CREATE_BIGRAM_DDL) + conn.commit() + if conn: + conn.close() + + +def handleBigramPass(indexpath, workdir): + print(indexpath, workdir, 'bigram pass') + length = 2 + + sep = config.getWordSep() + + filename = config.getNgramFileName(length) + filepath = workdir + os.sep + filename + + #begin processing + conn = sqlite3.connect(filepath) + cur = conn.cursor() + + cur.execute(DELETE_BIGRAM_DML) + rows = cur.execute(SELECT_ALL_NGRAM_DML).fetchall() + for row in rows: + (words_str, freq) = row + + words = words_str.strip(sep).split(sep, 1) + assert len(words) == length + + (prefix, postfix) = words + + cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq)) + #print(prefix, postfix, freq) + + conn.commit() + + if conn: + conn.close() + + +def handleOneIndex(indexpath, subdir, indexname, fast): + print(indexpath, subdir, indexname) + + indexstatuspath = indexpath + config.getStatusPostfix() + indexstatus = utils.load_status(indexstatuspath) + if not utils.check_epoch(indexstatus, 'PartialWord'): + raise utils.EpochError('Please do partial word first.\n') + if utils.check_epoch(indexstatus, 'PopulateBigram'): + return + + workdir = config.getWordRecognizerDir() + os.sep + \ + subdir + os.sep + indexname + print(workdir) + + createBigramSqlite(indexpath, workdir) + handleBigramPass(indexpath, workdir) + + #sign epoch + utils.sign_epoch(indexstatus, 'PopulateBigram') + utils.store_status(indexstatuspath, indexstatus) + + +def walkThroughIndex(path): + for root, dirs, files in os.walk(path, topdown=True, onerror=handleError): + for onefile in files: + filepath = os.path.join(root, onefile) + indexpostfix = config.getIndexPostfix() + if onefile.endswith(indexpostfix): + subdir = os.path.relpath(root, path) + indexname = onefile[:-len(indexpostfix)] + handleOneIndex(filepath, subdir, indexname) + elif onefile.endswith(config.getStatusPostfix()): + pass + else: + print('Unexpected file:' + filepath) |