From 2677abe7965caf3067806813795e3102c6661105 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 22 Jan 2013 11:26:05 +0800 Subject: add prune pass to populate.py --- lib/myconfig.py | 5 ++++- partialwordthreshold.py | 2 +- populate.py | 31 +++++++++++++++++++++++++++---- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/lib/myconfig.py b/lib/myconfig.py index 8874fe0..0ccd55c 100644 --- a/lib/myconfig.py +++ b/lib/myconfig.py @@ -132,7 +132,10 @@ class MyConfig: assert N >= 2, 'at least bi-gram' return N - def getMinimumOccurrence(self): + def getPruneMinimumOccurrence(self): + return 1 # prune minimum occurrence + + def getWordMinimumOccurrence(self): return 3 # minimum word occurrence def getNgramMinimumOccurrence(self): diff --git a/partialwordthreshold.py b/partialwordthreshold.py index 69c2043..4729168 100644 --- a/partialwordthreshold.py +++ b/partialwordthreshold.py @@ -47,7 +47,7 @@ def computeThreshold(conn): freq = getWordFrequency(conn, word) - if freq < config.getMinimumOccurrence(): + if freq < config.getWordMinimumOccurrence(): continue wordswithfreq.append((word, freq)) diff --git a/populate.py b/populate.py index bca7f01..6b5105a 100644 --- a/populate.py +++ b/populate.py @@ -16,6 +16,9 @@ UPDATE_NGRAM_DML = ''' UPDATE ngram SET freq = freq + 1 WHERE words = ?; ''' +PRUNE_NGRAM_DML = ''' +DELETE FROM ngram WHERE freq <= ?; +''' config = MyConfig() @@ -28,7 +31,7 @@ os.chdir(words_dir) #chdir done -def handleOneDocument(infile, conn, length): +def handleOneDocument(infile, cur, length): print(infile, length) infilestatuspath = infile + config.getStatusPostfix() @@ -44,8 +47,6 @@ def handleOneDocument(infile, conn, length): docfile = open(infile + config.getSegmentPostfix(), 'r') words = [] - cur = conn.cursor() - for oneline in docfile.readlines(): oneline = oneline.rstrip(os.linesep) @@ -95,6 +96,7 @@ def handleOnePass(indexpath, workdir, length): filepath = workdir + os.sep + filename conn = sqlite3.connect(filepath) + cur = 
conn.cursor() #begin processing indexfile = open(indexpath, 'r') @@ -109,7 +111,7 @@ def handleOnePass(indexpath, workdir, length): continue #process one document - handleOneDocument(infile, conn, length) + handleOneDocument(infile, cur, length) indexfile.close() @@ -119,6 +121,25 @@ def handleOnePass(indexpath, workdir, length): conn.close() +def pruneNgramTable(indexpath, workdir, length): + print(indexpath, workdir, length, 'prune') + + threshold = config.getPruneMinimumOccurrence() + + filename = config.getNgramFileName(length) + filepath = workdir + os.sep + filename + + conn = sqlite3.connect(filepath) + cur = conn.cursor() + + rowcount = cur.execute(PRUNE_NGRAM_DML, (threshold, )).rowcount + #print(rowcount) + + conn.commit() + if conn: + conn.close() + + def handleOneIndex(indexpath, subdir, indexname, fast): print(indexpath, subdir, indexname) @@ -143,10 +164,12 @@ def handleOneIndex(indexpath, subdir, indexname, fast): shmfilepath = shmdir + os.sep + filename utils.copyfile(filepath, shmfilepath) handleOnePass(indexpath, shmdir, i) + pruneNgramTable(indexpath, shmdir, i) utils.copyfile(shmfilepath, filepath) os.unlink(shmfilepath) else: handleOnePass(indexpath, workdir, i) + pruneNgramTable(indexpath, workdir, i) #sign epoch utils.sign_epoch(indexstatus, 'Populate') -- cgit