summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2013-01-22 11:26:05 +0800
committer Peng Wu <alexepico@gmail.com> 2013-01-22 11:26:05 +0800
commit 2677abe7965caf3067806813795e3102c6661105 (patch)
tree c0c522e623508752df2d729d4bde4c3486cb7f3d
parent ce9f818a468ebdbfdfd1b8b2d46f881d8fe4d5b7 (diff)
download trainer-2677abe7965caf3067806813795e3102c6661105.tar.gz
trainer-2677abe7965caf3067806813795e3102c6661105.tar.xz
trainer-2677abe7965caf3067806813795e3102c6661105.zip
add prune pass to populate.py
-rw-r--r-- lib/myconfig.py 5
-rw-r--r-- partialwordthreshold.py 2
-rw-r--r-- populate.py 31
3 files changed, 32 insertions, 6 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 8874fe0..0ccd55c 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -132,7 +132,10 @@ class MyConfig:
assert N >= 2, 'at least bi-gram'
return N
- def getMinimumOccurrence(self):
+ def getPruneMinimumOccurrence(self):
+ return 1 # prune minimum occurrence
+
+ def getWordMinimumOccurrence(self):
return 3 # minimum word occurrence
def getNgramMinimumOccurrence(self):
diff --git a/partialwordthreshold.py b/partialwordthreshold.py
index 69c2043..4729168 100644
--- a/partialwordthreshold.py
+++ b/partialwordthreshold.py
@@ -47,7 +47,7 @@ def computeThreshold(conn):
freq = getWordFrequency(conn, word)
- if freq < config.getMinimumOccurrence():
+ if freq < config.getWordMinimumOccurrence():
continue
wordswithfreq.append((word, freq))
diff --git a/populate.py b/populate.py
index bca7f01..6b5105a 100644
--- a/populate.py
+++ b/populate.py
@@ -16,6 +16,9 @@ UPDATE_NGRAM_DML = '''
UPDATE ngram SET freq = freq + 1 WHERE words = ?;
'''
+PRUNE_NGRAM_DML = '''
+DELETE FROM ngram WHERE freq <= ?;
+'''
config = MyConfig()
@@ -28,7 +31,7 @@ os.chdir(words_dir)
#chdir done
-def handleOneDocument(infile, conn, length):
+def handleOneDocument(infile, cur, length):
print(infile, length)
infilestatuspath = infile + config.getStatusPostfix()
@@ -44,8 +47,6 @@ def handleOneDocument(infile, conn, length):
docfile = open(infile + config.getSegmentPostfix(), 'r')
words = []
- cur = conn.cursor()
-
for oneline in docfile.readlines():
oneline = oneline.rstrip(os.linesep)
@@ -95,6 +96,7 @@ def handleOnePass(indexpath, workdir, length):
filepath = workdir + os.sep + filename
conn = sqlite3.connect(filepath)
+ cur = conn.cursor()
#begin processing
indexfile = open(indexpath, 'r')
@@ -109,7 +111,7 @@ def handleOnePass(indexpath, workdir, length):
continue
#process one document
- handleOneDocument(infile, conn, length)
+ handleOneDocument(infile, cur, length)
indexfile.close()
@@ -119,6 +121,25 @@ def handleOnePass(indexpath, workdir, length):
conn.close()
+def pruneNgramTable(indexpath, workdir, length):
+ print(indexpath, workdir, length, 'prune')
+
+ threshold = config.getPruneMinimumOccurrence()
+
+ filename = config.getNgramFileName(length)
+ filepath = workdir + os.sep + filename
+
+ conn = sqlite3.connect(filepath)
+ cur = conn.cursor()
+
+ rowcount = cur.execute(PRUNE_NGRAM_DML, (threshold, )).rowcount
+ #print(rowcount)
+
+ conn.commit()
+ if conn:
+ conn.close()
+
+
def handleOneIndex(indexpath, subdir, indexname, fast):
print(indexpath, subdir, indexname)
@@ -143,10 +164,12 @@ def handleOneIndex(indexpath, subdir, indexname, fast):
shmfilepath = shmdir + os.sep + filename
utils.copyfile(filepath, shmfilepath)
handleOnePass(indexpath, shmdir, i)
+ pruneNgramTable(indexpath, shmdir, i)
utils.copyfile(shmfilepath, filepath)
os.unlink(shmfilepath)
else:
handleOnePass(indexpath, workdir, i)
+ pruneNgramTable(indexpath, workdir, i)
#sign epoch
utils.sign_epoch(indexstatus, 'Populate')