diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-24 09:11:19 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-24 09:27:17 +0800 |
commit | b7ca5611982fba8113fee325183141f65be684b2 (patch) | |
tree | e7145d33a298fcec182f1565995e548879ad584a | |
parent | b0ed4df63c8e123772da05795dbacde0fcbba1ff (diff) | |
download | trainer-b7ca5611982fba8113fee325183141f65be684b2.tar.gz trainer-b7ca5611982fba8113fee325183141f65be684b2.tar.xz trainer-b7ca5611982fba8113fee325183141f65be684b2.zip |
merge partialwordthreshold.py into partialword.py
-rw-r--r-- | partialword.py | 99 |
-rw-r--r-- | partialwordthreshold.py | 107 |
2 files changed, 87 insertions, 119 deletions
diff --git a/partialword.py b/partialword.py index 1e7e51c..f67ac97 100644 --- a/partialword.py +++ b/partialword.py @@ -2,10 +2,91 @@ import os import sqlite3 from argparse import ArgumentParser +from operator import itemgetter import utils from myconfig import MyConfig from dirwalk import walkIndex +config = MyConfig() + +#change cwd to the word recognizer directory +words_dir = config.getWordRecognizerDir() +os.chdir(words_dir) +#chdir done + +############################################################ +# Get Threshold # +############################################################ + +SELECT_WORD_DML = ''' +SELECT freq from ngram where words = ?; +''' + +def getWordFrequency(conn, word): + sep = config.getWordSep() + word_str = sep + word + sep + + cur = conn.cursor() + row = cur.execute(SELECT_WORD_DML, (word_str, )).fetchone() + + if None == row: + return 0 + else: + freq = row[0] + return freq + + +def computeThreshold(conn): + wordswithfreq = [] + wordlistfile = open(config.getWordsListFileName(), "r") + + for oneline in wordlistfile.readlines(): + oneline = oneline.rstrip(os.linesep) + + if len(oneline) == 0: + continue + + word = oneline + + freq = getWordFrequency(conn, word) + + if freq < config.getWordMinimumOccurrence(): + continue + + wordswithfreq.append((word, freq)) + + wordlistfile.close() + + #ascending sort + wordswithfreq.sort(key=itemgetter(1)) + pos = int(len(wordswithfreq) * config.getPartialWordThreshold()) + (word, threshold) = wordswithfreq[-pos] + print(word, threshold) + return threshold + + +def getThreshold(workdir): + print(workdir, 'threshold') + + length = 1 + filename = config.getNgramFileName(length) + filepath = workdir + os.sep + filename + + conn = sqlite3.connect(filepath) + + threshold = computeThreshold(conn) + + conn.commit() + if conn: + conn.close() + + return threshold + + +############################################################ +# Get Partial Word # +############################################################ + 
SELECT_PARTIAL_WORD_DML = ''' SELECT words, freq FROM ngram WHERE freq > ?; ''' @@ -42,16 +123,9 @@ SELECT_MERGE_HIGH_NGRAM_DML = ''' SELECT words, freq FROM ngram_fts WHERE words MATCH ?; ''' -config = MyConfig() - #maximum combine number N = config.getMaximumCombineNumber() -#change cwd to the word recognizer directory -words_dir = config.getWordRecognizerDir() -os.chdir(words_dir) -#chdir done - #load existing words words_set = set([]) @@ -263,18 +337,19 @@ def handleOneIndex(indexpath, subdir, indexname): indexstatuspath = indexpath + config.getStatusPostfix() indexstatus = utils.load_status(indexstatuspath) - if not utils.check_epoch(indexstatus, 'PartialWordThreshold'): - raise utils.EpochError \ - ('Please partial word threshold estimate first.\n') + if not utils.check_epoch(indexstatus, 'Populate'): + raise utils.EpochError('Please populate first.\n') if utils.check_epoch(indexstatus, 'PartialWord'): return - threshold = indexstatus['PartialWordThreshold'] - workdir = config.getWordRecognizerDir() + os.sep + \ subdir + os.sep + indexname print(workdir) + threshold = getThreshold(workdir) + indexstatus['PartialWordThreshold'] = threshold + utils.store_status(indexstatuspath, indexstatus) + recognizePartialWord(workdir, threshold) #sign epoch diff --git a/partialwordthreshold.py b/partialwordthreshold.py deleted file mode 100644 index 4729168..0000000 --- a/partialwordthreshold.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/python3 -import os -import sqlite3 -from argparse import ArgumentParser -from operator import itemgetter -import utils -from myconfig import MyConfig -from dirwalk import walkIndex - -SELECT_WORD_DML = ''' -SELECT freq from ngram where words = ?; -''' - -config = MyConfig() - -#change cwd to the word recognizer directory -words_dir = config.getWordRecognizerDir() -os.chdir(words_dir) -#chdir done - - -def getWordFrequency(conn, word): - sep = config.getWordSep() - word_str = sep + word + sep - - cur = conn.cursor() - row = 
cur.execute(SELECT_WORD_DML, (word_str, )).fetchone() - - if None == row: - return 0 - else: - freq = row[0] - return freq - - -def computeThreshold(conn): - wordswithfreq = [] - wordlistfile = open(config.getWordsListFileName(), "r") - - for oneline in wordlistfile.readlines(): - oneline = oneline.rstrip(os.linesep) - - if len(oneline) == 0: - continue - - word = oneline - - freq = getWordFrequency(conn, word) - - if freq < config.getWordMinimumOccurrence(): - continue - - wordswithfreq.append((word, freq)) - - wordlistfile.close() - - #ascending sort - wordswithfreq.sort(key=itemgetter(1)) - pos = int(len(wordswithfreq) * config.getPartialWordThreshold()) - (word, threshold) = wordswithfreq[-pos] - print(word, threshold) - return threshold - - -def handleOneIndex(indexpath, subdir, indexname): - print(indexpath, subdir, indexname) - - indexstatuspath = indexpath + config.getStatusPostfix() - indexstatus = utils.load_status(indexstatuspath) - if not utils.check_epoch(indexstatus, 'Populate'): - raise utils.EpochError('Please populate first.\n') - if utils.check_epoch(indexstatus, 'PartialWordThreshold'): - return - - workdir = config.getWordRecognizerDir() + os.sep + \ - subdir + os.sep + indexname - print(workdir) - - length = 1 - - filename = config.getNgramFileName(length) - filepath = workdir + os.sep + filename - - conn = sqlite3.connect(filepath) - - threshold = computeThreshold(conn) - indexstatus['PartialWordThreshold'] = threshold - - conn.commit() - if conn: - conn.close() - - #sign epoch - utils.sign_epoch(indexstatus, 'PartialWordThreshold') - utils.store_status(indexstatuspath, indexstatus) - - -if __name__ == '__main__': - parser = ArgumentParser(description='Partial word threshold.') - parser.add_argument('--indexdir', action = 'store', \ - help='index directory', \ - default=config.getTextIndexDir()) - - args = parser.parse_args() - print(args) - walkIndex(handleOneIndex, args.indexdir) - print('done') |