author     Peng Wu <alexepico@gmail.com>    2013-01-22 11:40:13 +0800
committer  Peng Wu <alexepico@gmail.com>    2013-01-22 11:40:13 +0800
commit     ee5956baedf9713896c576925648768e360af92c (patch)
tree       06e86fd2e5d7380810f3d4406250ae95ec7b07be /newword.py
parent     2677abe7965caf3067806813795e3102c6661105 (diff)
begin to write newword.py
Diffstat (limited to 'newword.py')
-rw-r--r--  newword.py  116
1 file changed, 116 insertions, 0 deletions
diff --git a/newword.py b/newword.py
new file mode 100644
index 0000000..434a27f
--- /dev/null
+++ b/newword.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python3
+import os
+import os.path
+import sqlite3
+from argparse import ArgumentParser
+import utils
+from myconfig import MyConfig
+from dirwalk import walkIndex
+
+
+config = MyConfig()
+
+#change cwd to the word recognizer directory
+words_dir = config.getWordRecognizerDir()
+os.chdir(words_dir)
+#chdir done
+
+
+############################################################
+# Create Bigram Database #
+############################################################
+
+
+CREATE_BIGRAM_DDL = '''
+CREATE TABLE bigram (
+ prefix TEXT NOT NULL,
+ postfix TEXT NOT NULL,
+ freq INTEGER NOT NULL
+ );
+'''
+
+CREATE_BIGRAM_PREFIX_INDEX_DDL = '''
+CREATE INDEX bigram_prefix_index on bigram(prefix);
+'''
+
+CREATE_BIGRAM_POSTFIX_INDEX_DDL = '''
+CREATE INDEX bigram_postfix_index on bigram(postfix);
+'''
+
+SELECT_ALL_NGRAM_DML = '''
+SELECT words, freq FROM ngram;
+'''
+
+INSERT_BIGRAM_DML = '''
+INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?);
+'''
+
+
+def createBigramSqlite(workdir):
+ print(workdir, 'create bigram')
+
+ filename = config.getBigramFileName()
+ filepath = workdir + os.sep + filename
+ print(filepath)
+
+ if os.access(filepath, os.F_OK):
+ os.unlink(filepath)
+
+ conn = sqlite3.connect(filepath)
+ cur = conn.cursor()
+ cur.execute(CREATE_BIGRAM_DDL)
+ cur.execute(CREATE_BIGRAM_PREFIX_INDEX_DDL)
+ cur.execute(CREATE_BIGRAM_POSTFIX_INDEX_DDL)
+ conn.commit()
+ if conn:
+ conn.close()
+
+
+def populateBigramSqlite(workdir):
+ print(workdir, 'populate bigram')
+
+ sep = config.getWordSep()
+
+ filename = config.getBigramFileName()
+ filepath = workdir + os.sep + filename
+
+ bigram_conn = sqlite3.connect(filepath)
+ bigram_cur = bigram_conn.cursor()
+
+ length = 2
+ filename = config.getNgramFileName(length)
+ filepath = workdir + os.sep + filename
+
+ ngram_conn = sqlite3.connect(filepath)
+ ngram_cur = ngram_conn.cursor()
+
+ #begin processing
+ rows = ngram_cur.execute(SELECT_ALL_NGRAM_DML).fetchall()
+ for row in rows:
+ (words_str, freq) = row
+
+ words = words_str.strip(sep).split(sep, 1)
+ assert len(words) == length
+
+ (prefix, postfix) = words
+
+ bigram_cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq))
+ #print(prefix, postfix, freq)
+
+ bigram_conn.commit()
+ ngram_conn.commit()
+
+ if bigram_conn:
+ bigram_conn.close()
+ if ngram_conn:
+ ngram_conn.close()
+
+
+############################################################
+# Get Threshold Pass #
+############################################################
+
+
+############################################################
+# Get Word Pass #
+############################################################
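This commit stops after the bigram-database helpers: the threshold and word passes above are still empty stubs, and the imported ArgumentParser, utils, and walkIndex are not used yet. As a rough sketch only (not part of this commit; the driver name and command-line interface below are assumptions), the two functions defined so far might be exercised like this:

def handleOneDir(workdir):
    # Hypothetical per-directory handler (name is illustrative):
    # recreate and fill the bigram database for one work directory.
    createBigramSqlite(workdir)
    populateBigramSqlite(workdir)


if __name__ == '__main__':
    parser = ArgumentParser(description='Build the bigram database for new word recognition.')
    parser.add_argument('workdir', action='store',
                        help='work directory containing the 2-gram sqlite file')
    args = parser.parse_args()
    handleOneDir(args.workdir)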