diff options
Diffstat (limited to 'newword.py')
-rw-r--r-- | newword.py | 116 |
1 files changed, 116 insertions, 0 deletions
#!/usr/bin/python3
"""Build the bigram SQLite database used by the new-word passes.

Importing this module changes the current working directory to the word
recognizer directory reported by ``MyConfig``.
"""
import os
import os.path
import sqlite3
from argparse import ArgumentParser
import utils
from myconfig import MyConfig
from dirwalk import walkIndex


config = MyConfig()

#change cwd to the word recognizer directory
words_dir = config.getWordRecognizerDir()
os.chdir(words_dir)
#chdir done


############################################################
#                Create Bigram Database                    #
############################################################


CREATE_BIGRAM_DDL = '''
CREATE TABLE bigram (
        prefix TEXT NOT NULL,
        postfix TEXT NOT NULL,
        freq INTEGER NOT NULL
        );
'''

CREATE_BIGRAM_PREFIX_INDEX_DDL = '''
CREATE INDEX bigram_prefix_index on bigram(prefix);
'''

CREATE_BIGRAM_POSTFIX_INDEX_DDL = '''
CREATE INDEX bigram_postfix_index on bigram(postfix);
'''

SELECT_ALL_NGRAM_DML = '''
SELECT words, freq FROM ngram;
'''

INSERT_BIGRAM_DML = '''
INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?);
'''


def createBigramSqlite(workdir):
    """Create an empty bigram database inside *workdir*.

    The database file name comes from ``config.getBigramFileName()``.
    Any existing file at that path is removed first, then the ``bigram``
    table and its prefix/postfix indexes are created and committed.

    Args:
        workdir: directory in which the bigram database file is created.

    Raises:
        sqlite3.Error: if the database cannot be created.
        OSError: if an existing database file cannot be removed.
    """
    print(workdir, 'create bigram')

    filename = config.getBigramFileName()
    filepath = os.path.join(workdir, filename)
    print(filepath)

    # Start from a clean file; a missing file is the normal first-run case.
    try:
        os.unlink(filepath)
    except FileNotFoundError:
        pass

    conn = sqlite3.connect(filepath)
    try:
        cur = conn.cursor()
        cur.execute(CREATE_BIGRAM_DDL)
        cur.execute(CREATE_BIGRAM_PREFIX_INDEX_DDL)
        cur.execute(CREATE_BIGRAM_POSTFIX_INDEX_DDL)
        conn.commit()
    finally:
        # Close even when a DDL statement fails so the handle is not leaked.
        conn.close()


def populateBigramSqlite(workdir):
    """Fill the bigram database in *workdir* from the 2-gram database.

    Every ``(words, freq)`` row of the ``ngram`` table in the file named by
    ``config.getNgramFileName(2)`` is split on the configured word separator
    into a prefix and a postfix, and inserted into the ``bigram`` table of
    the file named by ``config.getBigramFileName()``. The inserts are
    committed once, after all rows have been processed.

    Args:
        workdir: directory holding both the ngram and the bigram database.

    Raises:
        ValueError: if a ``words`` value does not split into exactly two
            words.
        sqlite3.Error: if either database cannot be read or written.
    """
    print(workdir, 'populate bigram')

    sep = config.getWordSep()

    filename = config.getBigramFileName()
    filepath = os.path.join(workdir, filename)

    bigram_conn = sqlite3.connect(filepath)
    try:
        bigram_cur = bigram_conn.cursor()

        length = 2
        filename = config.getNgramFileName(length)
        filepath = os.path.join(workdir, filename)

        ngram_conn = sqlite3.connect(filepath)
        try:
            ngram_cur = ngram_conn.cursor()

            #begin processing
            # Iterate the cursor directly instead of fetchall() so the whole
            # ngram table is never held in memory at once.
            for (words_str, freq) in ngram_cur.execute(SELECT_ALL_NGRAM_DML):
                # Drop leading/trailing separators, then split once so that
                # the text is cut at its first inner separator.
                words = words_str.strip(sep).split(sep, 1)
                if len(words) != length:
                    raise ValueError(
                        'expected %d words in %r' % (length, words_str))

                (prefix, postfix) = words

                bigram_cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq))
                #print(prefix, postfix, freq)

            bigram_conn.commit()
        finally:
            # The ngram database is only read, so there is nothing to commit.
            ngram_conn.close()
    finally:
        bigram_conn.close()


############################################################
#                  Get Threshold Pass                      #
############################################################


############################################################
#                    Get Word Pass                         #
############################################################