diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-16 11:48:12 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-16 12:23:36 +0800 |
commit | 4e36f08dce2034fcf9a813313c1653e5ae725e58 (patch) | |
tree | 5f6090ddc22c6265e194f44976218a65cbaa64bc /populatebigram.py | |
parent | aaef9f0739a6b1db53f26baaaed28be053fa3bf6 (diff) | |
download | trainer-4e36f08dce2034fcf9a813313c1653e5ae725e58.tar.gz trainer-4e36f08dce2034fcf9a813313c1653e5ae725e58.tar.xz trainer-4e36f08dce2034fcf9a813313c1653e5ae725e58.zip |
improves populatebigram.py
Diffstat (limited to 'populatebigram.py')
-rw-r--r-- | populatebigram.py | 49 |
1 files changed, 32 insertions, 17 deletions
diff --git a/populatebigram.py b/populatebigram.py index e6940e1..cad54ca 100644 --- a/populatebigram.py +++ b/populatebigram.py @@ -19,10 +19,6 @@ SELECT_ALL_NGRAM_DML = ''' SELECT words, freq FROM ngram; ''' -DELETE_BIGRAM_DML = ''' -DELETE FROM bigram; -''' - INSERT_BIGRAM_DML = ''' INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?); ''' @@ -42,9 +38,8 @@ def handleError(error): def createBigramSqlite(indexpath, workdir): print(indexpath, workdir, 'create bigram') - length = 2 - filename = config.getNgramFileName(length) + filename = config.getBigramFileName() filepath = workdir + os.sep + filename print(filepath) @@ -61,19 +56,24 @@ def createBigramSqlite(indexpath, workdir): def handleBigramPass(indexpath, workdir): print(indexpath, workdir, 'bigram pass') - length = 2 sep = config.getWordSep() + filename = config.getBigramFileName() + filepath = workdir + os.sep + filename + + bigram_conn = sqlite3.connect(filepath) + bigram_cur = bigram_conn.cursor() + + length = 2 filename = config.getNgramFileName(length) filepath = workdir + os.sep + filename - #begin processing - conn = sqlite3.connect(filepath) - cur = conn.cursor() + ngram_conn = sqlite3.connect(filepath) + ngram_cur = ngram_conn.cursor() - cur.execute(DELETE_BIGRAM_DML) - rows = cur.execute(SELECT_ALL_NGRAM_DML).fetchall() + #begin processing + rows = ngram_cur.execute(SELECT_ALL_NGRAM_DML).fetchall() for row in rows: (words_str, freq) = row @@ -82,16 +82,19 @@ def handleBigramPass(indexpath, workdir): (prefix, postfix) = words - cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq)) + bigram_cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq)) #print(prefix, postfix, freq) - conn.commit() + bigram_conn.commit() + ngram_conn.commit() - if conn: - conn.close() + if bigram_conn: + bigram_conn.close() + if ngram_conn: + ngram_conn.close() -def handleOneIndex(indexpath, subdir, indexname, fast): +def handleOneIndex(indexpath, subdir, indexname): print(indexpath, subdir, indexname) indexstatuspath = indexpath + config.getStatusPostfix() @@ -126,3 +129,15 @@ def walkThroughIndex(path): pass else: print('Unexpected file:' + filepath) + + +if __name__ == '__main__': + parser = ArgumentParser(description='Populate bi-gram.') + parser.add_argument('--indexdir', action='store', \ + help='index directory', \ + default=os.path.join(config.getTextDir(), 'index')) + + args = parser.parse_args() + print(args) + walkThroughIndex(args.indexdir) + print('done') |