From ee5956baedf9713896c576925648768e360af92c Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Tue, 22 Jan 2013 11:40:13 +0800
Subject: begin to write newword.py

---
 newword.py        | 116 ++++++++++++++++++++++++++++++++++++++++++++++
 populatebigram.py | 134 ------------------------------------------------------
 2 files changed, 116 insertions(+), 134 deletions(-)
 create mode 100644 newword.py
 delete mode 100644 populatebigram.py

diff --git a/newword.py b/newword.py
new file mode 100644
index 0000000..434a27f
--- /dev/null
+++ b/newword.py
@@ -0,0 +1,116 @@
+#!/usr/bin/sqlite3
+import os
+import os.path
+import sqlite3
+from argparse import ArgumentParser
+import utils
+from myconfig import MyConfig
+from dirwalk import walkIndex
+
+
+config = MyConfig()
+
+#change cwd to the word recognizer directory
+words_dir = config.getWordRecognizerDir()
+os.chdir(words_dir)
+#chdir done
+
+
+############################################################
+#                  Create Bigram Database                  #
+############################################################
+
+
+CREATE_BIGRAM_DDL = '''
+CREATE TABLE bigram (
+    prefix TEXT NOT NULL,
+    postfix TEXT NOT NULL,
+    freq INTEGER NOT NULL
+    );
+'''
+
+CREATE_BIGRAM_PREFIX_INDEX_DDL = '''
+CREATE INDEX bigram_prefix_index on bigram(prefix);
+'''
+
+CREATE_BIGRAM_POSTFIX_INDEX_DDL = '''
+CREATE INDEX bigram_postfix_index on bigram(postfix);
+'''
+
+SELECT_ALL_NGRAM_DML = '''
+SELECT words, freq FROM ngram;
+'''
+
+INSERT_BIGRAM_DML = '''
+INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?);
+'''
+
+
+def createBigramSqlite(workdir):
+    print(workdir, 'create bigram')
+
+    filename = config.getBigramFileName()
+    filepath = workdir + os.sep + filename
+    print(filepath)
+
+    if os.access(filepath, os.F_OK):
+        os.unlink(filepath)
+
+    conn = sqlite3.connect(filepath)
+    cur = conn.cursor()
+    cur.execute(CREATE_BIGRAM_DDL)
+    cur.execute(CREATE_BIGRAM_PREFIX_INDEX_DDL)
+    cur.execute(CREATE_BIGRAM_POSTFIX_INDEX_DDL)
+    conn.commit()
+    if conn:
+        conn.close()
+
+
+def populateBigramSqlite(workdir):
+    print(workdir, 'populate bigram')
+
+    sep = config.getWordSep()
+
+    filename = config.getBigramFileName()
+    filepath = workdir + os.sep + filename
+
+    bigram_conn = sqlite3.connect(filepath)
+    bigram_cur = bigram_conn.cursor()
+
+    length = 2
+    filename = config.getNgramFileName(length)
+    filepath = workdir + os.sep + filename
+
+    ngram_conn = sqlite3.connect(filepath)
+    ngram_cur = ngram_conn.cursor()
+
+    #begin processing
+    rows = ngram_cur.execute(SELECT_ALL_NGRAM_DML).fetchall()
+    for row in rows:
+        (words_str, freq) = row
+
+        words = words_str.strip(sep).split(sep, 1)
+        assert len(words) == length
+
+        (prefix, postfix) = words
+
+        bigram_cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq))
+        #print(prefix, postfix, freq)
+
+    bigram_conn.commit()
+    ngram_conn.commit()
+
+    if bigram_conn:
+        bigram_conn.close()
+    if ngram_conn:
+        ngram_conn.close()
+
+
+############################################################
+#                    Get Threshold Pass                    #
+############################################################
+
+
+############################################################
+#                      Get Word Pass                       #
+############################################################
diff --git a/populatebigram.py b/populatebigram.py
deleted file mode 100644
index 036e1b7..0000000
--- a/populatebigram.py
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/usr/bin/sqlite3
-import os
-import os.path
-import sqlite3
-from argparse import ArgumentParser
-import utils
-from myconfig import MyConfig
-from dirwalk import walkIndex
-
-CREATE_BIGRAM_DDL = '''
-CREATE TABLE bigram (
-    prefix TEXT NOT NULL,
-    postfix TEXT NOT NULL,
-    freq INTEGER NOT NULL
-    );
-'''
-
-CREATE_BIGRAM_PREFIX_INDEX_DDL = '''
-CREATE INDEX bigram_prefix_index on bigram(prefix);
-'''
-
-CREATE_BIGRAM_POSTFIX_INDEX_DDL = '''
-CREATE INDEX bigram_postfix_index on bigram(postfix);
-'''
-
-SELECT_ALL_NGRAM_DML = '''
-SELECT words, freq FROM ngram;
-'''
-
-INSERT_BIGRAM_DML = '''
-INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?);
-'''
-
-
-config = MyConfig()
-
-#change cwd to the word recognizer directory
-words_dir = config.getWordRecognizerDir()
-os.chdir(words_dir)
-#chdir done
-
-
-def createBigramSqlite(indexpath, workdir):
-    print(indexpath, workdir, 'create bigram')
-
-    filename = config.getBigramFileName()
-    filepath = workdir + os.sep + filename
-    print(filepath)
-
-    if os.access(filepath, os.F_OK):
-        os.unlink(filepath)
-
-    conn = sqlite3.connect(filepath)
-    cur = conn.cursor()
-    cur.execute(CREATE_BIGRAM_DDL)
-    cur.execute(CREATE_BIGRAM_PREFIX_INDEX_DDL)
-    cur.execute(CREATE_BIGRAM_POSTFIX_INDEX_DDL)
-    conn.commit()
-    if conn:
-        conn.close()
-
-
-def handleBigramPass(indexpath, workdir):
-    print(indexpath, workdir, 'bigram pass')
-
-    sep = config.getWordSep()
-
-    filename = config.getBigramFileName()
-    filepath = workdir + os.sep + filename
-
-    bigram_conn = sqlite3.connect(filepath)
-    bigram_cur = bigram_conn.cursor()
-
-    length = 2
-    filename = config.getNgramFileName(length)
-    filepath = workdir + os.sep + filename
-
-    ngram_conn = sqlite3.connect(filepath)
-    ngram_cur = ngram_conn.cursor()
-
-    #begin processing
-    rows = ngram_cur.execute(SELECT_ALL_NGRAM_DML).fetchall()
-    for row in rows:
-        (words_str, freq) = row
-
-        words = words_str.strip(sep).split(sep, 1)
-        assert len(words) == length
-
-        (prefix, postfix) = words
-
-        bigram_cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq))
-        #print(prefix, postfix, freq)
-
-    bigram_conn.commit()
-    ngram_conn.commit()
-
-    if bigram_conn:
-        bigram_conn.close()
-    if ngram_conn:
-        ngram_conn.close()
-
-
-def handleOneIndex(indexpath, subdir, indexname):
-    print(indexpath, subdir, indexname)
-
-    indexstatuspath = indexpath + config.getStatusPostfix()
-    indexstatus = utils.load_status(indexstatuspath)
-    if not utils.check_epoch(indexstatus, 'PartialWord'):
-        raise utils.EpochError('Please do partial word first.\n')
-    if utils.check_epoch(indexstatus, 'PopulateBigram'):
-        return
-
-    workdir = config.getWordRecognizerDir() + os.sep + \
-        subdir + os.sep + indexname
-    print(workdir)
-
-    createBigramSqlite(indexpath, workdir)
-    handleBigramPass(indexpath, workdir)
-
-    #sign epoch
-    utils.sign_epoch(indexstatus, 'PopulateBigram')
-    utils.store_status(indexstatuspath, indexstatus)
-
-
-if __name__ == '__main__':
-    parser = ArgumentParser(description='Populate bi-gram.')
-    parser.add_argument('--indexdir', action='store', \
-                        help='index directory', \
-                        default=config.getTextIndexDir())
-
-    args = parser.parse_args()
-    print(args)
-    walkIndex(handleOneIndex, args.indexdir)
-    print('done')
--
cgit