diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-23 13:26:34 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-23 14:02:07 +0800 |
commit | d64d827209ec59a8886b5fb8bde949813d602cae (patch) | |
tree | ac73665cbc0308a9325261383ec587315285b5a7 | |
parent | 014f1784618f2079dcf2404eb3e0e32dffedce74 (diff) | |
download | trainer-d64d827209ec59a8886b5fb8bde949813d602cae.tar.gz trainer-d64d827209ec59a8886b5fb8bde949813d602cae.tar.xz trainer-d64d827209ec59a8886b5fb8bde949813d602cae.zip |
write computeThreshold
-rw-r--r-- | lib/myconfig.py | 6 | ||||
-rw-r--r-- | newword.py | 55 |
2 files changed, 51 insertions, 10 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py index bcbc3de..161088e 100644 --- a/lib/myconfig.py +++ b/lib/myconfig.py @@ -147,6 +147,9 @@ class MyConfig: def getNewWordThreshold(self): return 0.10 / 2 # the first 5% in position + def getMinimumEntropy(self): + return 1. + def getMaximumIteration(self): return 20 # roughly around N @@ -161,3 +164,6 @@ class MyConfig: def getPartialWordFileName(self): return "partialword.txt" + + def getNewWordFileName(self): + return "newword.txt" diff --git a/newword.py b/newword.py --- a/newword.py +++ b/newword.py @@ -3,7 +3,9 @@ import os import os.path import sqlite3 from argparse import ArgumentParser +from operator import itemgetter from math import log +from sys import float_info import utils from myconfig import MyConfig from dirwalk import walkIndex @@ -112,17 +114,21 @@ def populateBigramSqlite(workdir): ############################################################ def computeEntropy(freqs): - print(freqs) + #print(freqs) totalfreq = sum(freqs) freqs = [ freq / float(totalfreq) for freq in freqs ] - assert 1 == sum(freqs) + assert abs(1 - sum(freqs)) < len(freqs) * float_info.epsilon - entropy = sum([ - freq * log(freq) for freq in freqs ]) - print(entropy) + entropy = - sum([ freq * log(freq) for freq in freqs ]) return entropy +############################################################ +# Get Threshold Pass # +############################################################ + + SELECT_PREFIX_DML = ''' SELECT prefix, freq FROM bigram WHERE postfix = ? ; ''' @@ -133,8 +139,6 @@ SELECT postfix, freq FROM bigram WHERE prefix = ? ; def computePrefixEntropy(cur, word): - print('prefix', word) - rows = cur.execute(SELECT_PREFIX_DML, (word, )).fetchall() if 0 == len(rows): return 0. 
@@ -149,7 +153,6 @@ def computePrefixEntropy(cur, word): def computePostfixEntropy(cur, word): - print('postfix', word) rows = cur.execute(SELECT_POSTFIX_DML, (word, )).fetchall() if 0 == len(rows): @@ -164,9 +167,41 @@ def computePostfixEntropy(cur, word): return computeEntropy(freqs) -############################################################ -# Get Threshold Pass # -############################################################ +def computeThreshold(cur, tag): + wordswithentropy = [] + wordlistfile = open(config.getWordsListFileName(), "r") + + for oneline in wordlistfile.readlines(): + oneline = oneline.rstrip(os.linesep) + + if len(oneline) == 0: + continue + + word = oneline + + entropy = 0. + if "prefix" == tag: + entropy = computePrefixEntropy(cur, word) + elif "postfix" == tag: + entropy = computePostfixEntropy(cur, word) + else: + raise "invalid tag value." + + #print(word, entropy) + + if entropy < config.getMinimumEntropy(): + continue + + wordswithentropy.append((word, entropy)) + + wordlistfile.close() + + #ascending sort + wordswithentropy.sort(key=itemgetter(1)) + pos = int(len(wordswithentropy) * config.getNewWordThreshold()) + (word, threshold) = wordswithentropy[-pos] + print(word, threshold) + return threshold ############################################################ |