diff options
| author | Peng Wu <alexepico@gmail.com> | 2013-01-15 11:07:36 +0800 |
|---|---|---|
| committer | Peng Wu <alexepico@gmail.com> | 2013-01-15 11:07:36 +0800 |
| commit | da2ed34e34891a2f74ee9c330dd56c246a76b67d (patch) | |
| tree | c8f6d5b5965ce3e039bc1df1e5ae15b63e1e704c | |
| parent | 8e20db6052bc76b3ae53476248fd946581f0273e (diff) | |
| download | trainer-da2ed34e34891a2f74ee9c330dd56c246a76b67d.tar.gz trainer-da2ed34e34891a2f74ee9c330dd56c246a76b67d.tar.xz trainer-da2ed34e34891a2f74ee9c330dd56c246a76b67d.zip | |
partialwordthreshold.py
| -rw-r--r-- | lib/myconfig.py | 11 | ||||
| -rw-r--r-- | partialwordthreshold.py | 128 |
2 files changed, 137 insertions, 2 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 4202b94..73a019b 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -11,6 +11,7 @@ class MyConfig:
         'EvaluateEpoch': 1, \
         'PrepareEpoch': 2, \
         'PopulateEpoch': 3, \
+        'PartialWordThresholdEpoch': 4, \
     }
 
     def getEpochs(self):
@@ -129,11 +130,17 @@ class MyConfig:
     def getMinimumOccurrence(self):
         return 3 # minimum word occurrence
 
-    def getPartialWordThresHold(self):
+    def getPartialWordThreshold(self):
         return 0.10 # the last 10% in position
 
-    def getNewWordThresHold(self):
+    def getNewWordThreshold(self):
         return 0.10 / 2 # the last 5% in position
 
     def getMaximumIteration(self):
         return 20 # roughly around N
+
+    def getWordsListFileName(self):
+        return "words.txt"
+
+    def getWordsWithPinyinFileName(self):
+        return "oldwords.txt"
diff --git a/partialwordthreshold.py b/partialwordthreshold.py
new file mode 100644
index 0000000..955e5c6
--- /dev/null
+++ b/partialwordthreshold.py
@@ -0,0 +1,169 @@
+#!/usr/bin/python3
+"""Compute the partial word frequency threshold of every populated index.
+
+The index directory is walked; for each index file the 1-gram sqlite
+database of that index is opened and a frequency threshold is derived
+from the words listed in the words list file.
+"""
+import os
+import sys
+import sqlite3
+from argparse import ArgumentParser
+from operator import itemgetter
+import utils
+from myconfig import MyConfig
+
+
+SELECT_WORD_DML = '''
+SELECT freq from ngram where words = ?;
+'''
+
+config = MyConfig()
+
+#change cwd to the word recognizer directory
+words_dir = config.getWordRecognizerDir()
+os.makedirs(words_dir, exist_ok=True)
+os.chdir(words_dir)
+#chdir done
+
+
+def handleError(error):
+    """Abort the program with the error reported by os.walk."""
+    sys.exit(error)
+
+
+def getWordFrequency(conn, word):
+    """Return the frequency stored for word in the ngram table.
+
+    conn is an open sqlite3 connection.  The lookup key is the word
+    wrapped in the configured word separator; 0 is returned when the
+    table has no such row.
+    """
+    sep = config.getWordSep()
+    word_str = sep + word + sep
+
+    cur = conn.cursor()
+    #the parameters must be a sequence: a bare string would be bound
+    #one character per placeholder
+    row = cur.execute(SELECT_WORD_DML, (word_str,)).fetchone()
+
+    if row is None:
+        return 0
+
+    #single column row, "(freq) = row" would keep the whole tuple
+    (freq,) = row
+    return freq
+
+
+def computeThreshold(conn):
+    """Return the frequency at the partial word threshold position.
+
+    The words of the words list file that occur at least
+    config.getMinimumOccurrence() times are sorted by ascending
+    frequency; the frequency of the entry located at
+    config.getPartialWordThreshold() of the list length is returned.
+    When no word qualifies the minimum occurrence itself is returned.
+    """
+    minimum = config.getMinimumOccurrence()
+    wordswithfreq = []
+
+    with open(config.getWordsListFileName(), "r") as wordlistfile:
+        for oneline in wordlistfile:
+            word = oneline.rstrip('\r\n')
+
+            if len(word) == 0:
+                continue
+
+            freq = getWordFrequency(conn, word)
+
+            if freq < minimum:
+                continue
+
+            wordswithfreq.append((word, freq))
+
+    if not wordswithfreq:
+        #nothing to rank: no kept word can be rarer than the minimum
+        return minimum
+
+    #ascending sort
+    wordswithfreq.sort(key=itemgetter(1))
+    #the ratio is a float while list indices must be integers
+    pos = int(len(wordswithfreq) * config.getPartialWordThreshold())
+    pos = min(pos, len(wordswithfreq) - 1)
+    (word, threshold) = wordswithfreq[pos]
+
+    return threshold
+
+
+def handleOneIndex(filepath, subdir, indexname):
+    """Compute the partial word threshold of one index.
+
+    filepath is the path of the index file, subdir the directory of
+    the index relative to the walked root and indexname the file name
+    without the index postfix.  Raises utils.EpochError when the
+    'Populate' epoch check fails; returns early when utils.check_epoch
+    already reports 'PartialWordThreshold'.
+    """
+    print(filepath, subdir, indexname)
+
+    #the status file name is the index path plus the status postfix
+    indexstatuspath = filepath + config.getStatusPostfix()
+    indexstatus = utils.load_status(indexstatuspath)
+    if not utils.check_epoch(indexstatus, 'Populate'):
+        raise utils.EpochError('Please populate first.\n')
+    if utils.check_epoch(indexstatus, 'PartialWordThreshold'):
+        return
+
+    workdir = config.getWordRecognizerDir() + os.sep + \
+        subdir + os.sep + indexname
+    print(workdir)
+
+    length = 1
+
+    filename = config.getNgramFileName(length)
+    dbpath = workdir + os.sep + filename
+
+    conn = sqlite3.connect(dbpath)
+    try:
+        threshold = computeThreshold(conn)
+    finally:
+        #only SELECTs are issued, so close without committing
+        conn.close()
+
+    print(threshold)
+    indexstatus['PartialWordThreshold'] = threshold
+
+    #TODO: the threshold is not stored until the epoch gets signed
+    #sign epoch
+    #utils.sign_epoch(indexstatus, 'PartialWordThreshold')
+    #utils.store_status(indexstatuspath, indexstatus)
+
+
+def walkThroughIndex(path):
+    """Run handleOneIndex on every index file found below path."""
+    indexpostfix = config.getIndexPostfix()
+    statuspostfix = config.getStatusPostfix()
+
+    for root, dirs, files in os.walk(path, topdown=True, onerror=handleError):
+        for onefile in files:
+            filepath = os.path.join(root, onefile)
+            if onefile.endswith(indexpostfix):
+                subdir = os.path.relpath(root, path)
+                indexname = onefile[:-len(indexpostfix)]
+                handleOneIndex(filepath, subdir, indexname)
+            elif onefile.endswith(statuspostfix):
+                pass
+            else:
+                print('Unexpected file:' + filepath)
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description='Partial word threshold.')
+    parser.add_argument('--indexdir', action='store', \
+                        help='index directory', \
+                        default=os.path.join(config.getTextDir(), 'index'))
+
+    args = parser.parse_args()
+    print(args)
+    walkThroughIndex(args.indexdir)
+    print('done')
|
