summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2013-01-23 13:26:34 +0800
committer Peng Wu <alexepico@gmail.com> 2013-01-23 14:02:07 +0800
commit d64d827209ec59a8886b5fb8bde949813d602cae (patch)
tree ac73665cbc0308a9325261383ec587315285b5a7
parent 014f1784618f2079dcf2404eb3e0e32dffedce74 (diff)
download trainer-d64d827209ec59a8886b5fb8bde949813d602cae.tar.gz
trainer-d64d827209ec59a8886b5fb8bde949813d602cae.tar.xz
trainer-d64d827209ec59a8886b5fb8bde949813d602cae.zip
write computeThreshold
-rw-r--r-- lib/myconfig.py 6
-rw-r--r-- newword.py 55
2 files changed, 51 insertions, 10 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index bcbc3de..161088e 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -147,6 +147,9 @@ class MyConfig:
def getNewWordThreshold(self):
return 0.10 / 2 # the first 5% in position
+ def getMinimumEntropy(self):
+ return 1. # entropy floor: computeThreshold skips words scoring below it
+
def getMaximumIteration(self):
return 20 # roughly around N
@@ -161,3 +164,6 @@ class MyConfig:
def getPartialWordFileName(self):
return "partialword.txt"
+
+ def getNewWordFileName(self):
+ return "newword.txt" # NOTE(review): no caller in this diff; presumably the new-word output file -- confirm
diff --git a/newword.py b/newword.py
index 05c67aa..5646c5f 100644
--- a/newword.py
+++ b/newword.py
@@ -3,7 +3,9 @@ import os
import os.path
import sqlite3
from argparse import ArgumentParser
+from operator import itemgetter
from math import log
+from sys import float_info
import utils
from myconfig import MyConfig
from dirwalk import walkIndex
@@ -112,17 +114,21 @@ def populateBigramSqlite(workdir):
############################################################
def computeEntropy(freqs):
- print(freqs)
+ #print(freqs)
totalfreq = sum(freqs)
freqs = [ freq / float(totalfreq) for freq in freqs ]
- assert 1 == sum(freqs)
+ assert abs(1 - sum(freqs)) < len(freqs) * float_info.epsilon # normalised sum need not be exactly 1 in floating point
- entropy = sum([ - freq * log(freq) for freq in freqs ])
- print(entropy)
+ entropy = - sum([ freq * log(freq) for freq in freqs ]) # math.log with one argument: natural logarithm
return entropy
+############################################################
+# Get Threshold Pass #
+############################################################
+
+
SELECT_PREFIX_DML = '''
SELECT prefix, freq FROM bigram WHERE postfix = ? ;
'''
@@ -133,8 +139,6 @@ SELECT postfix, freq FROM bigram WHERE prefix = ? ;
def computePrefixEntropy(cur, word):
- print('prefix', word)
-
rows = cur.execute(SELECT_PREFIX_DML, (word, )).fetchall()
if 0 == len(rows):
return 0.
@@ -149,7 +153,6 @@ def computePrefixEntropy(cur, word):
def computePostfixEntropy(cur, word):
- print('postfix', word)
rows = cur.execute(SELECT_POSTFIX_DML, (word, )).fetchall()
if 0 == len(rows):
@@ -164,9 +167,41 @@ def computePostfixEntropy(cur, word):
return computeEntropy(freqs)
-############################################################
-# Get Threshold Pass #
-############################################################
+def computeThreshold(cur, tag):
+    """Return the entropy of the word ranked at the getNewWordThreshold() cut.
+
+    cur -- cursor handed to computePrefixEntropy / computePostfixEntropy
+    tag -- "prefix" or "postfix", selecting which entropy is computed
+
+    Every non-empty line of config.getWordsListFileName() is treated as one
+    word; words whose entropy is below config.getMinimumEntropy() are ignored.
+    Raises ValueError for any other tag and IndexError when no word qualifies.
+    """
+    # resolve the tag once, before any file or database work
+    if "prefix" == tag:
+        computeWordEntropy = computePrefixEntropy
+    elif "postfix" == tag:
+        computeWordEntropy = computePostfixEntropy
+    else:
+        # raising a bare string is itself a TypeError on Python 3
+        raise ValueError("invalid tag value.")
+
+    wordswithentropy = []
+    # the with-block closes the word list even if a query raises
+    with open(config.getWordsListFileName(), "r") as wordlistfile:
+        for oneline in wordlistfile:
+            word = oneline.rstrip(os.linesep)
+            if len(word) == 0:
+                continue
+
+            entropy = computeWordEntropy(cur, word)
+            #print(word, entropy)
+            if entropy < config.getMinimumEntropy():
+                continue
+
+            wordswithentropy.append((word, entropy))
+
+    #ascending sort
+    wordswithentropy.sort(key=itemgetter(1))
+    # clamp to 1: with a short list the product truncates to 0, and index -0
+    # is index 0, i.e. the lowest entropy instead of the highest
+    pos = max(1, int(len(wordswithentropy) * config.getNewWordThreshold()))
+    (word, threshold) = wordswithentropy[-pos]
+    print(word, threshold)
+    return threshold
############################################################