summary | refs | log | tree | commit | diff | stats
path: root/populate.py
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2013-01-11 11:55:57 +0800
committer: Peng Wu <alexepico@gmail.com> 2013-01-11 12:18:50 +0800
commit: 69ae77b35c57c6d74a5ad54fdafc24623aae2ac8 (patch)
tree: 82bd88bd11522da63fb4e0723ffca511e256a455 /populate.py
parent: ec12c5a071dccb3a0fa18364b9ca1be91fc655b8 (diff)
download: trainer-69ae77b35c57c6d74a5ad54fdafc24623aae2ac8.tar.gz
trainer-69ae77b35c57c6d74a5ad54fdafc24623aae2ac8.tar.xz
trainer-69ae77b35c57c6d74a5ad54fdafc24623aae2ac8.zip
write populate.py
Diffstat (limited to 'populate.py')
-rw-r--r-- populate.py 40
1 files changed, 34 insertions, 6 deletions
diff --git a/populate.py b/populate.py
index cbbffd0..28855e6 100644
--- a/populate.py
+++ b/populate.py
@@ -15,10 +15,14 @@ UPDATE_NGRAM_DML = '''
UPDATE ngram SET freq = freq + 1 WHERE words = ?;
'''
-SELECT_ALL_DML = '''
+SELECT_ALL_NGRAM_DML = '''
SELECT words, freq FROM ngram;
'''
+DELETE_BIGRAM_DML = '''
+DELETE FROM bigram;
+'''
+
INSERT_BIGRAM_DML = '''
INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?);
'''
@@ -81,6 +85,7 @@ def handleOneDocument(infile, conn, length):
#do sqlite training
words_str = sep + sep.join(words) + sep
+ #print(words_str)
rowcount = cur.execute(UPDATE_NGRAM_DML, (words_str,)).rowcount
#print(rowcount)
@@ -90,8 +95,8 @@ def handleOneDocument(infile, conn, length):
docfile.close()
#sign epoch
- #utils.sign_epoch(infilestatus, 'Populate')
- #utils.store_status(infilestatuspath, infilestatus)
+ utils.sign_epoch(infilestatus, 'Populate')
+ utils.store_status(infilestatuspath, infilestatus)
return True
def handleOnePass(indexpath, workdir, length):
@@ -125,7 +130,30 @@ def handleOnePass(indexpath, workdir, length):
conn.close()
def handleBigramPass(indexpath, workdir):
- pass
+ print(indexpath, workdir, 'bigram')
+ length = 2
+
+ sep = config.getWordSep()
+
+ filename = config.getNgramFileName(length)
+ filepath = workdir + os.sep + filename
+
+ #begin processing
+ conn = sqlite3.connect(filepath)
+ cur = conn.cursor()
+
+ cur.execute(DELETE_BIGRAM_DML)
+ rows = cur.execute(SELECT_ALL_NGRAM_DML).fetchall()
+ for row in rows:
+ (words_str, freq) = row
+
+ words = words_str.strip(sep).split(sep, 1)
+ assert len(words) == length
+
+ (prefix, postfix) = words
+
+ cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq))
+ #print(prefix, postfix, freq)
def handleOneIndex(indexpath, subdir, indexname):
@@ -148,8 +176,8 @@ def handleOneIndex(indexpath, subdir, indexname):
handleBigramPass(indexpath, workdir)
#sign epoch
- #utils.sign_epoch(indexstatus, 'Populate')
- #utils.store_status(indexstatuspath, indexstatus)
+ utils.sign_epoch(indexstatus, 'Populate')
+ utils.store_status(indexstatuspath, indexstatus)