From 78c8376ec429ba31f3e70b3ae0524751d15b8deb Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Wed, 23 Jan 2013 14:18:00 +0800
Subject: write filterPartialWord

---
 lib/myconfig.py |   6 ++--
 newword.py      | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/lib/myconfig.py b/lib/myconfig.py
index 161088e..2e54d14 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -142,13 +142,13 @@ class MyConfig:
         return 9 # minimum word occurrence in n-gram table
 
     def getPartialWordThreshold(self):
-        return 0.10 # the first 10% in position
+        return 0.50 # the first 50% in position
 
     def getNewWordThreshold(self):
-        return 0.10 / 2 # the first 5% in position
+        return 0.50 # the first 50% in position, subject verb object.
 
     def getMinimumEntropy(self):
-        return 1.
+        return 0.3
 
     def getMaximumIteration(self):
         return 20 # roughly around N
diff --git a/newword.py b/newword.py
index 5646c5f..9633d76 100644
--- a/newword.py
+++ b/newword.py
@@ -167,7 +167,9 @@ def computePostfixEntropy(cur, word):
     return computeEntropy(freqs)
 
 
-def computeThreshold(cur, tag):
+def computeThreshold(conn, tag):
+    cur = conn.cursor()
+
     wordswithentropy = []
     wordlistfile = open(config.getWordsListFileName(), "r")
 
@@ -196,14 +198,108 @@ def computeThreshold(cur, tag):
 
     wordlistfile.close()
 
+    conn.commit()
+
     #ascending sort
     wordswithentropy.sort(key=itemgetter(1))
     pos = int(len(wordswithentropy) * config.getNewWordThreshold())
     (word, threshold) = wordswithentropy[-pos]
-    print(word, threshold)
+    print(word, tag, threshold)
     return threshold
 
 
 ############################################################
 #                      Get Word Pass                       #
 ############################################################
+
+def filterPartialWord(workdir, conn, prethres, postthres):
+    """Write the partial words that pass both entropy thresholds.
+
+    Reads the partial word file under workdir and skips every word whose
+    prefix entropy is below prethres or whose postfix entropy is below
+    postthres.  Each remaining word is printed and written once, one per
+    line, to the new word file under workdir.
+    """
+    words_set = set([])
+    cur = conn.cursor()
+
+    filename = workdir + os.sep + config.getPartialWordFileName()
+    partialwordfile = open(filename, "r")
+
+    filename = workdir + os.sep + config.getNewWordFileName()
+    newwordfile = open(filename, "w")
+
+    for oneline in partialwordfile.readlines():
+        oneline = oneline.rstrip(os.linesep)
+
+        if len(oneline) == 0:
+            continue
+
+        (word, prefix, postfix, freq) = oneline.split(None, 3)
+
+        if word in words_set:
+            continue
+
+        entropy = computePrefixEntropy(cur, word)
+        if entropy < prethres:
+            continue
+        entropy = computePostfixEntropy(cur, word)
+        if entropy < postthres:
+            continue
+
+        print(word)
+        newwordfile.writelines([word, os.linesep])
+        words_set.add(word)
+
+    newwordfile.close()
+    partialwordfile.close()
+    conn.commit()
+
+
+############################################################
+#                       Handle Index                       #
+############################################################
+
+def handleOneIndex(indexpath, subdir, indexname):
+    """Compute the new word thresholds for one index and filter its words.
+
+    Raises utils.EpochError unless the 'PartialWord' epoch checks out and
+    returns at once when the 'NewWord' epoch already does.  The prefix
+    and postfix thresholds are stored in the index status before the
+    partial words are filtered; the 'NewWord' epoch is signed last.
+    """
+    print(indexpath, subdir, indexname)
+
+    indexstatuspath = indexpath + config.getStatusPostfix()
+    indexstatus = utils.load_status(indexstatuspath)
+    if not utils.check_epoch(indexstatus, 'PartialWord'):
+        raise utils.EpochError('Please partial word first.\n')
+    if utils.check_epoch(indexstatus, 'NewWord'):
+        return
+
+    workdir = config.getWordRecognizerDir() + os.sep + \
+        subdir + os.sep + indexname
+    print(workdir)
+
+    filename = config.getBigramFileName()
+    filepath = workdir + os.sep + filename
+
+    # open the database inside workdir, not one relative to the cwd
+    conn = sqlite3.connect(filepath)
+
+    prethres = computeThreshold(conn, "prefix")
+    indexstatus['NewWordPrefixThreshold'] = prethres
+    postthres = computeThreshold(conn, "postfix")
+    indexstatus['NewWordPostfixThreshold'] = postthres
+
+    utils.store_status(indexstatuspath, indexstatus)
+
+    filterPartialWord(workdir, conn, prethres, postthres)
+
+    conn.commit()
+    if conn:
+        conn.close()
+
+    #sign the same epoch that the early return above checks
+    utils.sign_epoch(indexstatus, 'NewWord')
+    utils.store_status(indexstatuspath, indexstatus)
-- 
cgit