summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2013-01-24 09:11:19 +0800
committer Peng Wu <alexepico@gmail.com> 2013-01-24 09:27:17 +0800
commit b7ca5611982fba8113fee325183141f65be684b2 (patch)
tree e7145d33a298fcec182f1565995e548879ad584a
parent b0ed4df63c8e123772da05795dbacde0fcbba1ff (diff)
download trainer-b7ca5611982fba8113fee325183141f65be684b2.tar.gz
trainer-b7ca5611982fba8113fee325183141f65be684b2.tar.xz
trainer-b7ca5611982fba8113fee325183141f65be684b2.zip
merge partialwordthreshold.py into partialword.py
-rw-r--r-- partialword.py 99
-rw-r--r-- partialwordthreshold.py 107
2 files changed, 87 insertions, 119 deletions
diff --git a/partialword.py b/partialword.py
index 1e7e51c..f67ac97 100644
--- a/partialword.py
+++ b/partialword.py
@@ -2,10 +2,91 @@
import os
import sqlite3
from argparse import ArgumentParser
+from operator import itemgetter
import utils
from myconfig import MyConfig
from dirwalk import walkIndex
+config = MyConfig()
+
+#change cwd to the word recognizer directory
+words_dir = config.getWordRecognizerDir()
+os.chdir(words_dir)
+#chdir done
+
+############################################################
+# Get Threshold #
+############################################################
+
+SELECT_WORD_DML = '''
+SELECT freq from ngram where words = ?;
+'''
+
+def getWordFrequency(conn, word):
+ sep = config.getWordSep()
+ word_str = sep + word + sep
+
+ cur = conn.cursor()
+ row = cur.execute(SELECT_WORD_DML, (word_str, )).fetchone()
+
+ if None == row:
+ return 0
+ else:
+ freq = row[0]
+ return freq
+
+
+def computeThreshold(conn):
+ wordswithfreq = []
+ wordlistfile = open(config.getWordsListFileName(), "r")
+
+ for oneline in wordlistfile.readlines():
+ oneline = oneline.rstrip(os.linesep)
+
+ if len(oneline) == 0:
+ continue
+
+ word = oneline
+
+ freq = getWordFrequency(conn, word)
+
+ if freq < config.getWordMinimumOccurrence():
+ continue
+
+ wordswithfreq.append((word, freq))
+
+ wordlistfile.close()
+
+ #ascending sort
+ wordswithfreq.sort(key=itemgetter(1))
+ pos = int(len(wordswithfreq) * config.getPartialWordThreshold())
+ (word, threshold) = wordswithfreq[-pos]
+ print(word, threshold)
+ return threshold
+
+
+def getThreshold(workdir):
+ print(workdir, 'threshold')
+
+ length = 1
+ filename = config.getNgramFileName(length)
+ filepath = workdir + os.sep + filename
+
+ conn = sqlite3.connect(filepath)
+
+ threshold = computeThreshold(conn)
+
+ conn.commit()
+ if conn:
+ conn.close()
+
+ return threshold
+
+
+############################################################
+# Get Partial Word #
+############################################################
+
SELECT_PARTIAL_WORD_DML = '''
SELECT words, freq FROM ngram WHERE freq > ?;
'''
@@ -42,16 +123,9 @@ SELECT_MERGE_HIGH_NGRAM_DML = '''
SELECT words, freq FROM ngram_fts WHERE words MATCH ?;
'''
-config = MyConfig()
-
#maximum combine number
N = config.getMaximumCombineNumber()
-#change cwd to the word recognizer directory
-words_dir = config.getWordRecognizerDir()
-os.chdir(words_dir)
-#chdir done
-
#load existing words
words_set = set([])
@@ -263,18 +337,19 @@ def handleOneIndex(indexpath, subdir, indexname):
indexstatuspath = indexpath + config.getStatusPostfix()
indexstatus = utils.load_status(indexstatuspath)
- if not utils.check_epoch(indexstatus, 'PartialWordThreshold'):
- raise utils.EpochError \
- ('Please partial word threshold estimate first.\n')
+ if not utils.check_epoch(indexstatus, 'Populate'):
+ raise utils.EpochError('Please populate first.\n')
if utils.check_epoch(indexstatus, 'PartialWord'):
return
- threshold = indexstatus['PartialWordThreshold']
-
workdir = config.getWordRecognizerDir() + os.sep + \
subdir + os.sep + indexname
print(workdir)
+ threshold = getThreshold(workdir)
+ indexstatus['PartialWordThreshold'] = threshold
+ utils.store_status(indexstatuspath, indexstatus)
+
recognizePartialWord(workdir, threshold)
#sign epoch
diff --git a/partialwordthreshold.py b/partialwordthreshold.py
deleted file mode 100644
index 4729168..0000000
--- a/partialwordthreshold.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/python3
-import os
-import sqlite3
-from argparse import ArgumentParser
-from operator import itemgetter
-import utils
-from myconfig import MyConfig
-from dirwalk import walkIndex
-
-SELECT_WORD_DML = '''
-SELECT freq from ngram where words = ?;
-'''
-
-config = MyConfig()
-
-#change cwd to the word recognizer directory
-words_dir = config.getWordRecognizerDir()
-os.chdir(words_dir)
-#chdir done
-
-
-def getWordFrequency(conn, word):
- sep = config.getWordSep()
- word_str = sep + word + sep
-
- cur = conn.cursor()
- row = cur.execute(SELECT_WORD_DML, (word_str, )).fetchone()
-
- if None == row:
- return 0
- else:
- freq = row[0]
- return freq
-
-
-def computeThreshold(conn):
- wordswithfreq = []
- wordlistfile = open(config.getWordsListFileName(), "r")
-
- for oneline in wordlistfile.readlines():
- oneline = oneline.rstrip(os.linesep)
-
- if len(oneline) == 0:
- continue
-
- word = oneline
-
- freq = getWordFrequency(conn, word)
-
- if freq < config.getWordMinimumOccurrence():
- continue
-
- wordswithfreq.append((word, freq))
-
- wordlistfile.close()
-
- #ascending sort
- wordswithfreq.sort(key=itemgetter(1))
- pos = int(len(wordswithfreq) * config.getPartialWordThreshold())
- (word, threshold) = wordswithfreq[-pos]
- print(word, threshold)
- return threshold
-
-
-def handleOneIndex(indexpath, subdir, indexname):
- print(indexpath, subdir, indexname)
-
- indexstatuspath = indexpath + config.getStatusPostfix()
- indexstatus = utils.load_status(indexstatuspath)
- if not utils.check_epoch(indexstatus, 'Populate'):
- raise utils.EpochError('Please populate first.\n')
- if utils.check_epoch(indexstatus, 'PartialWordThreshold'):
- return
-
- workdir = config.getWordRecognizerDir() + os.sep + \
- subdir + os.sep + indexname
- print(workdir)
-
- length = 1
-
- filename = config.getNgramFileName(length)
- filepath = workdir + os.sep + filename
-
- conn = sqlite3.connect(filepath)
-
- threshold = computeThreshold(conn)
- indexstatus['PartialWordThreshold'] = threshold
-
- conn.commit()
- if conn:
- conn.close()
-
- #sign epoch
- utils.sign_epoch(indexstatus, 'PartialWordThreshold')
- utils.store_status(indexstatuspath, indexstatus)
-
-
-if __name__ == '__main__':
- parser = ArgumentParser(description='Partial word threshold.')
- parser.add_argument('--indexdir', action = 'store', \
- help='index directory', \
- default=config.getTextIndexDir())
-
- args = parser.parse_args()
- print(args)
- walkIndex(handleOneIndex, args.indexdir)
- print('done')