summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2013-01-15 11:07:36 +0800
committerPeng Wu <alexepico@gmail.com>2013-01-15 11:07:36 +0800
commitda2ed34e34891a2f74ee9c330dd56c246a76b67d (patch)
treec8f6d5b5965ce3e039bc1df1e5ae15b63e1e704c
parent8e20db6052bc76b3ae53476248fd946581f0273e (diff)
downloadtrainer-da2ed34e34891a2f74ee9c330dd56c246a76b67d.tar.gz
trainer-da2ed34e34891a2f74ee9c330dd56c246a76b67d.tar.xz
trainer-da2ed34e34891a2f74ee9c330dd56c246a76b67d.zip
partialwordthreshold.py
-rw-r--r--lib/myconfig.py11
-rw-r--r--partialwordthreshold.py128
2 files changed, 137 insertions, 2 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 4202b94..73a019b 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -11,6 +11,7 @@ class MyConfig:
'EvaluateEpoch': 1, \
'PrepareEpoch': 2, \
'PopulateEpoch': 3, \
+ 'PartialWordThresholdEpoch': 4, \
}
def getEpochs(self):
@@ -129,11 +130,17 @@ class MyConfig:
def getMinimumOccurrence(self):
return 3 # minimum word occurrence
- def getPartialWordThresHold(self):
+ def getPartialWordThreshold(self):
return 0.10 # the last 10% in position
- def getNewWordThresHold(self):
+ def getNewWordThreshold(self):
return 0.10 / 2 # the last 5% in position
def getMaximumIteration(self):
return 20 # roughly around N
+
+ def getWordsListFileName(self):
+ return "words.txt"
+
+ def getWordsWithPinyinFileName(self):
+ return "oldwords.txt"
diff --git a/partialwordthreshold.py b/partialwordthreshold.py
new file mode 100644
index 0000000..955e5c6
--- /dev/null
+++ b/partialwordthreshold.py
@@ -0,0 +1,128 @@
+#!/usr/bin/python3
import os
import sqlite3
import sys
from argparse import ArgumentParser
from operator import itemgetter

import utils
from myconfig import MyConfig
+
+
# Parameterized query returning the ``freq`` column of the ``ngram`` row
# whose ``words`` column equals the single bound value.
SELECT_WORD_DML = '''
SELECT freq from ngram where words = ?;
'''

config = MyConfig()

# Work from inside the word recognizer directory (created on demand), so
# relative file names used later in this script resolve against it.
words_dir = config.getWordRecognizerDir()
os.makedirs(words_dir, exist_ok=True)
os.chdir(words_dir)
+
+
def handleError(error):
    """Abort the program, reporting ``error``.

    Used as the ``onerror`` callback of ``os.walk`` in ``walkThroughIndex``,
    so a directory that cannot be listed stops the run instead of being
    skipped silently.

    Args:
        error: the object handed to ``sys.exit``; a non-integer value is
            printed to stderr and the process exits with status 1.

    Raises:
        SystemExit: always.
    """
    # Relies on the module-level ``import sys``.
    sys.exit(error)
+
+
def getWordFrequency(conn, word):
    """Return the frequency stored for ``word`` in the ``ngram`` table.

    The lookup key is ``word`` wrapped in the configured word separator on
    both sides (``sep + word + sep``).

    Args:
        conn: an open sqlite3 connection to an n-gram database.
        word: the bare word to look up.

    Returns:
        The ``freq`` value of the matching row, or 0 when no row matches.
    """
    sep = config.getWordSep()
    word_str = sep + word + sep

    cur = conn.cursor()
    # The parameters must be a sequence: a bare string would be read as one
    # binding per character and rejected for any key longer than one char.
    row = cur.execute(SELECT_WORD_DML, (word_str,)).fetchone()

    if row is None:
        return 0

    # fetchone() yields a 1-tuple; unpack it so callers get the bare value
    # and can compare it against integers.
    (freq,) = row
    return freq
+
+
def computeThreshold(conn):
    """Pick the partial-word threshold entry from the words list.

    Reads the words list file (one word per line, empty lines skipped),
    looks up each word's frequency through ``getWordFrequency`` and drops
    words below ``config.getMinimumOccurrence()``.  The remaining
    ``(word, freq)`` pairs are sorted by ascending frequency and the entry
    located at the ``config.getPartialWordThreshold()`` fraction of the
    list is selected.

    Args:
        conn: an open sqlite3 connection to the 1-gram database.

    Returns:
        The selected ``(word, freq)`` pair.
        NOTE(review): the whole pair is returned, as the original code did;
        confirm whether callers only want the frequency (element 1).

    Raises:
        ValueError: if no word reaches the minimum occurrence, so there is
            nothing to pick a threshold from.
    """
    minimum = config.getMinimumOccurrence()
    wordswithfreq = []

    # The context manager closes the file even if a lookup fails midway.
    with open(config.getWordsListFileName(), "r") as wordlistfile:
        for oneline in wordlistfile:
            word = oneline.rstrip(os.linesep)
            if not word:
                continue

            freq = getWordFrequency(conn, word)
            if freq < minimum:
                continue

            wordswithfreq.append((word, freq))

    if not wordswithfreq:
        raise ValueError('No word reaches the minimum occurrence.')

    # ascending sort
    wordswithfreq.sort(key=itemgetter(1))

    # A list index must be an integer; truncate the fractional position and
    # keep it inside the list even for a ratio of 1.0.
    pos = int(len(wordswithfreq) * config.getPartialWordThreshold())
    pos = min(pos, len(wordswithfreq) - 1)
    threshold = wordswithfreq[pos]

    return threshold
+
+
def handleOneIndex(filepath, subdir, indexname):
    """Compute the partial-word threshold for a single index.

    Loads the status stored next to the index file, requires the
    'Populate' epoch, and returns early when the 'PartialWordThreshold'
    epoch is already recorded.  Otherwise it opens the 1-gram database in
    the matching word recognizer work directory and computes the threshold
    with ``computeThreshold``.

    Args:
        filepath: path of the index file, as built by ``walkThroughIndex``.
        subdir: directory of the index relative to the index root.
        indexname: index file name with the index postfix removed.

    Raises:
        utils.EpochError: if the 'Populate' epoch check fails.
    """
    # The body refers to the index file as ``indexpath``; bind that name to
    # the parameter so it is defined, and keep the database path separate.
    indexpath = filepath
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Populate'):
        raise utils.EpochError('Please populate first.\n')
    if utils.check_epoch(indexstatus, 'PartialWordThreshold'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    length = 1

    filename = config.getNgramFileName(length)
    dbpath = workdir + os.sep + filename

    conn = sqlite3.connect(dbpath)
    try:
        threshold = computeThreshold(conn)
        print(threshold)
        indexstatus['PartialWordThreshold'] = threshold

        conn.commit()
    finally:
        # Close the connection even when the computation fails.
        conn.close()

    # TODO: epoch signing is still disabled, so the status updated above is
    # not written back to disk yet.
    #sign epoch
    #utils.sign_epoch(indexstatus, 'PartialWordThreshold')
    #utils.store_status(indexstatuspath, indexstatus)
+
+
def walkThroughIndex(path):
    """Walk ``path`` top-down and process every index file found.

    Files ending with the index postfix are passed to ``handleOneIndex``
    together with their directory relative to ``path`` and their name
    without the postfix.  Status files are ignored; any other file is
    reported on stdout.  Directory listing errors go to ``handleError``.
    """
    for root, _dirs, files in os.walk(path, topdown=True, onerror=handleError):
        for onefile in files:
            filepath = os.path.join(root, onefile)
            indexpostfix = config.getIndexPostfix()

            if onefile.endswith(indexpostfix):
                indexname = onefile[:-len(indexpostfix)]
                handleOneIndex(filepath, os.path.relpath(root, path), indexname)
                continue

            # Status files live next to the indexes and are expected here.
            if not onefile.endswith(config.getStatusPostfix()):
                print('Unexpected file:' + filepath)
+
+
if __name__ == '__main__':
    # Command line entry point: walk the index directory (by default the
    # 'index' folder under the configured text directory).
    parser = ArgumentParser(description='Partial word threshold.')
    default_indexdir = os.path.join(config.getTextDir(), 'index')
    parser.add_argument(
        '--indexdir',
        action='store',
        help='index directory',
        default=default_indexdir,
    )

    args = parser.parse_args()
    print(args)
    walkThroughIndex(args.indexdir)
    print('done')