summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2013-01-23 14:18:00 +0800
committer Peng Wu <alexepico@gmail.com> 2013-01-23 14:18:00 +0800
commit 78c8376ec429ba31f3e70b3ae0524751d15b8deb (patch)
tree 416b9bd3485cc547c5f0c063e91f428625506290
parent d64d827209ec59a8886b5fb8bde949813d602cae (diff)
download trainer-78c8376ec429ba31f3e70b3ae0524751d15b8deb.tar.gz
trainer-78c8376ec429ba31f3e70b3ae0524751d15b8deb.tar.xz
trainer-78c8376ec429ba31f3e70b3ae0524751d15b8deb.zip
write filterPartialWord
-rw-r--r-- lib/myconfig.py 6
-rw-r--r-- newword.py 85
2 files changed, 86 insertions, 5 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 161088e..2e54d14 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -142,13 +142,13 @@ class MyConfig:
return 9 # minimum word occurrence in n-gram table
def getPartialWordThreshold(self):
- return 0.10 # the first 10% in position
+ return 0.50 # the first 50% in position
def getNewWordThreshold(self):
- return 0.10 / 2 # the first 5% in position
+ return 0.50 # the first 50% in position, subject verb object.
def getMinimumEntropy(self):
- return 1.
+ return 0.3
def getMaximumIteration(self):
return 20 # roughly around N
diff --git a/newword.py b/newword.py
index 5646c5f..9633d76 100644
--- a/newword.py
+++ b/newword.py
@@ -167,7 +167,9 @@ def computePostfixEntropy(cur, word):
return computeEntropy(freqs)
-def computeThreshold(cur, tag):
+def computeThreshold(conn, tag):
+ cur = conn.cursor()
+
wordswithentropy = []
wordlistfile = open(config.getWordsListFileName(), "r")
@@ -196,14 +198,93 @@ def computeThreshold(cur, tag):
wordlistfile.close()
+ conn.commit()
+
#ascending sort
wordswithentropy.sort(key=itemgetter(1))
pos = int(len(wordswithentropy) * config.getNewWordThreshold())
(word, threshold) = wordswithentropy[-pos]
- print(word, threshold)
+ print(word, tag, threshold)
return threshold
############################################################
# Get Word Pass #
############################################################
+
+def filterPartialWord(workdir, conn, prethres, postthres):
+    """Write the partial words that pass both entropy thresholds.
+
+    Reads the partial word file in workdir line by line.  Each non-empty
+    line must split on whitespace into at least four fields
+    (word, prefix, postfix, freq); a shorter line raises ValueError.
+    A word is written to the new word file in workdir, and printed, when
+    its prefix entropy is not below prethres and its postfix entropy is
+    not below postthres.  Each accepted word is written only once.
+    conn is committed after both files have been closed.
+    """
+    words_set = set()
+    cur = conn.cursor()
+
+    partialpath = workdir + os.sep + config.getPartialWordFileName()
+    newwordpath = workdir + os.sep + config.getNewWordFileName()
+
+    # The with-statement closes both files even when a malformed line or a
+    # database error raises in the middle of the loop.
+    with open(partialpath, "r") as partialwordfile, \
+            open(newwordpath, "w") as newwordfile:
+        for oneline in partialwordfile:
+            oneline = oneline.rstrip(os.linesep)
+
+            if len(oneline) == 0:
+                continue
+
+            (word, prefix, postfix, freq) = oneline.split(None, 3)
+
+            # only accepted words are remembered, so a rejected word that
+            # appears again is evaluated again.
+            if word in words_set:
+                continue
+
+            entropy = computePrefixEntropy(cur, word)
+            if entropy < prethres:
+                continue
+            entropy = computePostfixEntropy(cur, word)
+            if entropy < postthres:
+                continue
+
+            print(word)
+            newwordfile.writelines([word, os.linesep])
+            words_set.add(word)
+
+    conn.commit()
+
+
+
+
+############################################################
+# Handle Index #
+############################################################
+
+def handleOneIndex(indexpath, subdir, indexname):
+    """Run the new word pass for one index.
+
+    Loads the status stored next to indexpath, raises utils.EpochError
+    when the 'PartialWord' epoch has not been reached, and returns
+    immediately when the 'NewWord' epoch is already signed.  Otherwise it
+    computes the prefix and postfix thresholds from the bigram database
+    in the index work directory, records them in the status, filters the
+    partial words with them and finally signs the 'NewWord' epoch.
+    """
+    print(indexpath, subdir, indexname)
+
+    indexstatuspath = indexpath + config.getStatusPostfix()
+    indexstatus = utils.load_status(indexstatuspath)
+    if not utils.check_epoch(indexstatus, 'PartialWord'):
+        raise utils.EpochError('Please partial word first.\n')
+    if utils.check_epoch(indexstatus, 'NewWord'):
+        return
+
+    workdir = config.getWordRecognizerDir() + os.sep + \
+        subdir + os.sep + indexname
+    print(workdir)
+
+    filepath = workdir + os.sep + config.getBigramFileName()
+
+    # connect to the database inside workdir; the bare file name would
+    # open (or silently create) a database in the current directory.
+    conn = sqlite3.connect(filepath)
+    try:
+        prethres = computeThreshold(conn, "prefix")
+        indexstatus['NewWordPrefixThreshold'] = prethres
+        postthres = computeThreshold(conn, "postfix")
+        indexstatus['NewWordPostfixThreshold'] = postthres
+
+        utils.store_status(indexstatuspath, indexstatus)
+
+        filterPartialWord(workdir, conn, prethres, postthres)
+
+        conn.commit()
+    finally:
+        # release the connection even when a step above fails.
+        conn.close()
+
+    #sign epoch
+    # must be the same epoch name the guard above checks, otherwise the
+    # guard is never satisfied by this function's own output.
+    # NOTE(review): utils.sign_epoch/check_epoch are not visible here --
+    # confirm 'NewWord' is the epoch name they expect.
+    utils.sign_epoch(indexstatus, 'NewWord')
+    utils.store_status(indexstatuspath, indexstatus)