summaryrefslogtreecommitdiffstats
path: root/partialword.py
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2013-01-24 09:11:19 +0800
committerPeng Wu <alexepico@gmail.com>2013-01-24 09:27:17 +0800
commitb7ca5611982fba8113fee325183141f65be684b2 (patch)
treee7145d33a298fcec182f1565995e548879ad584a /partialword.py
parentb0ed4df63c8e123772da05795dbacde0fcbba1ff (diff)
downloadtrainer-b7ca5611982fba8113fee325183141f65be684b2.tar.gz
trainer-b7ca5611982fba8113fee325183141f65be684b2.tar.xz
trainer-b7ca5611982fba8113fee325183141f65be684b2.zip
merge partialwordthreshold.py into partialword.py
Diffstat (limited to 'partialword.py')
-rw-r--r--partialword.py99
1 files changed, 87 insertions, 12 deletions
diff --git a/partialword.py b/partialword.py
index 1e7e51c..f67ac97 100644
--- a/partialword.py
+++ b/partialword.py
@@ -2,10 +2,91 @@
import os
import sqlite3
from argparse import ArgumentParser
+from operator import itemgetter
import utils
from myconfig import MyConfig
from dirwalk import walkIndex
+config = MyConfig()
+
+#change cwd to the word recognizer directory
+words_dir = config.getWordRecognizerDir()
+os.chdir(words_dir)
+#chdir done
+
+############################################################
+# Get Threshold #
+############################################################
+
+SELECT_WORD_DML = '''
+SELECT freq from ngram where words = ?;
+'''
+
+def getWordFrequency(conn, word):
+ sep = config.getWordSep()
+ word_str = sep + word + sep
+
+ cur = conn.cursor()
+ row = cur.execute(SELECT_WORD_DML, (word_str, )).fetchone()
+
+ if None == row:
+ return 0
+ else:
+ freq = row[0]
+ return freq
+
+
+def computeThreshold(conn):
+ wordswithfreq = []
+ wordlistfile = open(config.getWordsListFileName(), "r")
+
+ for oneline in wordlistfile.readlines():
+ oneline = oneline.rstrip(os.linesep)
+
+ if len(oneline) == 0:
+ continue
+
+ word = oneline
+
+ freq = getWordFrequency(conn, word)
+
+ if freq < config.getWordMinimumOccurrence():
+ continue
+
+ wordswithfreq.append((word, freq))
+
+ wordlistfile.close()
+
+ #ascending sort
+ wordswithfreq.sort(key=itemgetter(1))
+ pos = int(len(wordswithfreq) * config.getPartialWordThreshold())
+ (word, threshold) = wordswithfreq[-pos]
+ print(word, threshold)
+ return threshold
+
+
+def getThreshold(workdir):
+ print(workdir, 'threshold')
+
+ length = 1
+ filename = config.getNgramFileName(length)
+ filepath = workdir + os.sep + filename
+
+ conn = sqlite3.connect(filepath)
+
+ threshold = computeThreshold(conn)
+
+ conn.commit()
+ if conn:
+ conn.close()
+
+ return threshold
+
+
+############################################################
+# Get Partial Word #
+############################################################
+
SELECT_PARTIAL_WORD_DML = '''
SELECT words, freq FROM ngram WHERE freq > ?;
'''
@@ -42,16 +123,9 @@ SELECT_MERGE_HIGH_NGRAM_DML = '''
SELECT words, freq FROM ngram_fts WHERE words MATCH ?;
'''
-config = MyConfig()
-
#maximum combine number
N = config.getMaximumCombineNumber()
-#change cwd to the word recognizer directory
-words_dir = config.getWordRecognizerDir()
-os.chdir(words_dir)
-#chdir done
-
#load existing words
words_set = set([])
@@ -263,18 +337,19 @@ def handleOneIndex(indexpath, subdir, indexname):
indexstatuspath = indexpath + config.getStatusPostfix()
indexstatus = utils.load_status(indexstatuspath)
- if not utils.check_epoch(indexstatus, 'PartialWordThreshold'):
- raise utils.EpochError \
- ('Please partial word threshold estimate first.\n')
+ if not utils.check_epoch(indexstatus, 'Populate'):
+ raise utils.EpochError('Please populate first.\n')
if utils.check_epoch(indexstatus, 'PartialWord'):
return
- threshold = indexstatus['PartialWordThreshold']
-
workdir = config.getWordRecognizerDir() + os.sep + \
subdir + os.sep + indexname
print(workdir)
+ threshold = getThreshold(workdir)
+ indexstatus['PartialWordThreshold'] = threshold
+ utils.store_status(indexstatuspath, indexstatus)
+
recognizePartialWord(workdir, threshold)
#sign epoch