add prune pass to populate.py

author: Peng Wu <alexepico@gmail.com> 2013-01-22 11:26:05 +0800
committer: Peng Wu <alexepico@gmail.com> 2013-01-22 11:26:05 +0800
commit: 2677abe7965caf3067806813795e3102c6661105 (patch)
tree: c0c522e623508752df2d729d4bde4c3486cb7f3d
parent: ce9f818a468ebdbfdfd1b8b2d46f881d8fe4d5b7 (diff)
download: trainer-2677abe7965caf3067806813795e3102c6661105.tar.gz
trainer-2677abe7965caf3067806813795e3102c6661105.tar.xz
trainer-2677abe7965caf3067806813795e3102c6661105.zip
3 files changed, 32 insertions, 6 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 8874fe0..0ccd55c 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -132,7 +132,10 @@ class MyConfig:
         assert N >= 2, 'at least bi-gram'
         return N
 
-    def getMinimumOccurrence(self):
+    def getPruneMinimumOccurrence(self):
+        return 1 # prune threshold: populate.py deletes n-grams with freq <= this
+
+    def getWordMinimumOccurrence(self):
         return 3 # minimum word occurrence
 
     def getNgramMinimumOccurrence(self):
diff --git a/partialwordthreshold.py b/partialwordthreshold.py
index 69c2043..4729168 100644
--- a/partialwordthreshold.py
+++ b/partialwordthreshold.py
@@ -47,7 +47,7 @@ def computeThreshold(conn):
 
         freq = getWordFrequency(conn, word)
 
-        if freq < config.getMinimumOccurrence():
+        if freq < config.getWordMinimumOccurrence():
             continue
 
         wordswithfreq.append((word, freq))
diff --git a/populate.py b/populate.py
index bca7f01..6b5105a 100644
--- a/populate.py
+++ b/populate.py
@@ -16,6 +16,9 @@ UPDATE_NGRAM_DML = '''
 UPDATE ngram SET freq = freq + 1 WHERE words = ?;
 '''
 
+PRUNE_NGRAM_DML = '''
+DELETE FROM ngram WHERE freq <= ?;
+'''
 
 config = MyConfig()
 
@@ -28,7 +31,7 @@ os.chdir(words_dir)
 #chdir done
 
 
-def handleOneDocument(infile, conn, length):
+def handleOneDocument(infile, cur, length):
     print(infile, length)
 
     infilestatuspath = infile + config.getStatusPostfix()
@@ -44,8 +47,6 @@ def handleOneDocument(infile, conn, length):
     docfile = open(infile + config.getSegmentPostfix(), 'r')
     words = []
 
-    cur = conn.cursor()
-
     for oneline in docfile.readlines():
         oneline = oneline.rstrip(os.linesep)
 
@@ -95,6 +96,7 @@ def handleOnePass(indexpath, workdir, length):
     filepath = workdir + os.sep + filename
 
     conn = sqlite3.connect(filepath)
+    cur = conn.cursor()
 
     #begin processing
     indexfile = open(indexpath, 'r')
@@ -109,7 +111,7 @@ def handleOnePass(indexpath, workdir, length):
             continue
 
         #process one document
-        handleOneDocument(infile, conn, length)
+        handleOneDocument(infile, cur, length)
 
     indexfile.close()
 
@@ -119,6 +121,25 @@ def handleOnePass(indexpath, workdir, length):
         conn.close()
 
 
+def pruneNgramTable(indexpath, workdir, length):
+    '''Delete rows with freq <= the prune threshold from one n-gram table.
+
+    Opens workdir + os.sep + config.getNgramFileName(length); indexpath is
+    only echoed in the progress line.
+    '''
+    print(indexpath, workdir, length, 'prune')
+
+    threshold = config.getPruneMinimumOccurrence()
+    filepath = workdir + os.sep + config.getNgramFileName(length)
+
+    conn = sqlite3.connect(filepath)
+    try:
+        conn.execute(PRUNE_NGRAM_DML, (threshold, ))
+        conn.commit()
+    finally:
+        conn.close()  # release the handle even if the DELETE/commit fails
+
+
 def handleOneIndex(indexpath, subdir, indexname, fast):
     print(indexpath, subdir, indexname)
 
@@ -143,10 +164,12 @@ def handleOneIndex(indexpath, subdir, indexname, fast):
             shmfilepath = shmdir + os.sep + filename
             utils.copyfile(filepath, shmfilepath)
             handleOnePass(indexpath, shmdir, i)
+            pruneNgramTable(indexpath, shmdir, i)
             utils.copyfile(shmfilepath, filepath)
             os.unlink(shmfilepath)
         else:
             handleOnePass(indexpath, workdir, i)
+            pruneNgramTable(indexpath, workdir, i)
 
     #sign epoch
     utils.sign_epoch(indexstatus, 'Populate')
author	Peng Wu <alexepico@gmail.com>	2013-01-22 11:26:05 +0800
committer	Peng Wu <alexepico@gmail.com>	2013-01-22 11:26:05 +0800
commit	2677abe7965caf3067806813795e3102c6661105 (patch)
tree	c0c522e623508752df2d729d4bde4c3486cb7f3d
parent	ce9f818a468ebdbfdfd1b8b2d46f881d8fe4d5b7 (diff)
download	trainer-2677abe7965caf3067806813795e3102c6661105.tar.gz trainer-2677abe7965caf3067806813795e3102c6661105.tar.xz trainer-2677abe7965caf3067806813795e3102c6661105.zip