diff options
Diffstat (limited to 'populate.py')
-rw-r--r-- | populate.py | 31 |
1 files changed, 27 insertions, 4 deletions
Add pruning of low-frequency n-grams to populate.py.

* PRUNE_NGRAM_DML deletes every ngram row whose freq is <= a threshold.
* handleOneDocument now receives a cursor created once per pass in
  handleOnePass instead of creating its own cursor per document.
* pruneNgramTable opens the n-gram database for the given length, runs
  PRUNE_NGRAM_DML with config.getPruneMinimumOccurrence() and commits.
* handleOneIndex prunes after each populate pass in both branches.

Review note: the last hunk originally added a second
"handleOnePass(indexpath, workdir, i)" call to the non-fast branch, which
repeated the populate pass and never pruned; it now calls pruneNgramTable,
mirroring the fast branch.  The post-image blob id on the "index" line still
refers to the revision before this correction.  Indentation and blank lines
were reconstructed from the hunk line counts and should be confirmed against
the repository before applying.

diff --git a/populate.py b/populate.py
index bca7f01..6b5105a 100644
--- a/populate.py
+++ b/populate.py
@@ -16,6 +16,9 @@ UPDATE_NGRAM_DML = '''
 UPDATE ngram SET freq = freq + 1
 WHERE words = ?;
 '''
+PRUNE_NGRAM_DML = '''
+DELETE FROM ngram WHERE freq <= ?;
+'''
 
 
 config = MyConfig()
@@ -28,7 +31,7 @@ os.chdir(words_dir)
 #chdir done
 
 
-def handleOneDocument(infile, conn, length):
+def handleOneDocument(infile, cur, length):
     print(infile, length)
 
     infilestatuspath = infile + config.getStatusPostfix()
@@ -44,8 +47,6 @@ def handleOneDocument(infile, conn, length):
     docfile = open(infile + config.getSegmentPostfix(), 'r')
     words = []
 
-    cur = conn.cursor()
-
     for oneline in docfile.readlines():
         oneline = oneline.rstrip(os.linesep)
 
@@ -95,6 +96,7 @@ def handleOnePass(indexpath, workdir, length):
     filepath = workdir + os.sep + filename
 
     conn = sqlite3.connect(filepath)
+    cur = conn.cursor()
 
     #begin processing
     indexfile = open(indexpath, 'r')
@@ -109,7 +111,7 @@ def handleOnePass(indexpath, workdir, length):
             continue
 
         #process one document
-        handleOneDocument(infile, conn, length)
+        handleOneDocument(infile, cur, length)
 
     indexfile.close()
 
@@ -119,6 +121,25 @@ def handleOnePass(indexpath, workdir, length):
         conn.close()
 
 
+def pruneNgramTable(indexpath, workdir, length):
+    print(indexpath, workdir, length, 'prune')
+
+    threshold = config.getPruneMinimumOccurrence()
+
+    filename = config.getNgramFileName(length)
+    filepath = workdir + os.sep + filename
+
+    conn = sqlite3.connect(filepath)
+    cur = conn.cursor()
+
+    rowcount = cur.execute(PRUNE_NGRAM_DML, (threshold, )).rowcount
+    #print(rowcount)
+
+    conn.commit()
+    if conn:
+        conn.close()
+
+
 def handleOneIndex(indexpath, subdir, indexname, fast):
     print(indexpath, subdir, indexname)
 
@@ -143,10 +164,12 @@ def handleOneIndex(indexpath, subdir, indexname, fast):
             shmfilepath = shmdir + os.sep + filename
             utils.copyfile(filepath, shmfilepath)
             handleOnePass(indexpath, shmdir, i)
+            pruneNgramTable(indexpath, shmdir, i)
             utils.copyfile(shmfilepath, filepath)
             os.unlink(shmfilepath)
         else:
             handleOnePass(indexpath, workdir, i)
+            pruneNgramTable(indexpath, workdir, i)
 
     #sign epoch
     utils.sign_epoch(indexstatus, 'Populate')