summary | refs | log | tree | commit | diff | stats
path: root/populate.py
diff options
context:
space:
mode:
Diffstat (limited to 'populate.py')
-rw-r--r-- populate.py 31
1 files changed, 27 insertions, 4 deletions
diff --git a/populate.py b/populate.py
index bca7f01..6b5105a 100644
--- a/populate.py
+++ b/populate.py
@@ -16,6 +16,9 @@ UPDATE_NGRAM_DML = '''
UPDATE ngram SET freq = freq + 1 WHERE words = ?;
'''
+# Deletes every ngram row whose freq is less than or equal to the bound parameter.
+PRUNE_NGRAM_DML = '''
+DELETE FROM ngram WHERE freq <= ?;
+'''
config = MyConfig()
@@ -28,7 +31,7 @@ os.chdir(words_dir)
#chdir done
-def handleOneDocument(infile, conn, length):
+def handleOneDocument(infile, cur, length):
print(infile, length)
infilestatuspath = infile + config.getStatusPostfix()
@@ -44,8 +47,6 @@ def handleOneDocument(infile, conn, length):
docfile = open(infile + config.getSegmentPostfix(), 'r')
words = []
- cur = conn.cursor()
-
for oneline in docfile.readlines():
oneline = oneline.rstrip(os.linesep)
@@ -95,6 +96,7 @@ def handleOnePass(indexpath, workdir, length):
filepath = workdir + os.sep + filename
conn = sqlite3.connect(filepath)
+ cur = conn.cursor()
#begin processing
indexfile = open(indexpath, 'r')
@@ -109,7 +111,7 @@ def handleOnePass(indexpath, workdir, length):
continue
#process one document
- handleOneDocument(infile, conn, length)
+ handleOneDocument(infile, cur, length)
indexfile.close()
@@ -119,6 +121,25 @@ def handleOnePass(indexpath, workdir, length):
conn.close()
+def pruneNgramTable(indexpath, workdir, length):
+    """Delete low-frequency rows from the n-gram table of the given length.
+
+    Opens the SQLite file named by config.getNgramFileName(length) inside
+    workdir and executes PRUNE_NGRAM_DML with the value returned by
+    config.getPruneMinimumOccurrence(), so rows whose freq is less than or
+    equal to that value are removed. The deletion is committed before the
+    connection is closed.
+
+    indexpath is only echoed in the progress message; it is not read here.
+    """
+    print(indexpath, workdir, length, 'prune')
+
+    threshold = config.getPruneMinimumOccurrence()
+
+    filename = config.getNgramFileName(length)
+    filepath = workdir + os.sep + filename
+
+    conn = sqlite3.connect(filepath)
+    try:
+        cur = conn.cursor()
+        cur.execute(PRUNE_NGRAM_DML, (threshold, ))
+        conn.commit()
+    finally:
+        # Release the database handle even if the DELETE or the commit raises.
+        conn.close()
+
+
def handleOneIndex(indexpath, subdir, indexname, fast):
print(indexpath, subdir, indexname)
@@ -143,10 +164,12 @@ def handleOneIndex(indexpath, subdir, indexname, fast):
shmfilepath = shmdir + os.sep + filename
utils.copyfile(filepath, shmfilepath)
handleOnePass(indexpath, shmdir, i)
+ pruneNgramTable(indexpath, shmdir, i)
utils.copyfile(shmfilepath, filepath)
os.unlink(shmfilepath)
else:
handleOnePass(indexpath, workdir, i)
+ handleOnePass(indexpath, workdir, i)
#sign epoch
utils.sign_epoch(indexstatus, 'Populate')