diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-21 13:47:45 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-21 13:47:45 +0800 |
commit | 37d59988dba4d4c8b7aecd0a586eb34ef84e7943 (patch) | |
tree | 435fd6316a9e87083feaca036aead03092f626fc /partialword.py | |
parent | 29f0dbebd820a93aa0020fa7b989166ff0bd1971 (diff) | |
download | trainer-37d59988dba4d4c8b7aecd0a586eb34ef84e7943.tar.gz trainer-37d59988dba4d4c8b7aecd0a586eb34ef84e7943.tar.xz trainer-37d59988dba4d4c8b7aecd0a586eb34ef84e7943.zip |
write recognizePartialWord
Diffstat (limited to 'partialword.py')
-rw-r--r-- | partialword.py | 84 |
1 files changed, 78 insertions, 6 deletions
diff --git a/partialword.py b/partialword.py index 27ed127..847ba35 100644 --- a/partialword.py +++ b/partialword.py @@ -72,11 +72,9 @@ def load_words(filename): wordlistfile.close() -load_words(config.getWordsListFileName()) -#print(words_set) - - def createNgramTableClone(conn): + print("creating ngram fts table...") + cur = conn.cursor() cur.execute(CREATE_NGRAM_FTS_DDL) @@ -86,6 +84,7 @@ def createNgramTableClone(conn): def dropNgramTableClone(conn): + print("dropping ngram fts table...") cur = conn.cursor() cur.execute(DROP_NGRAM_FTS_DML) @@ -150,7 +149,7 @@ def doCombineWord(high_cur, low_cur, words): while middle != '': merged_words_str = left + merged_str + right - print(matched_words_str), print(merged_words_str) + #print(matched_words_str), print(merged_words_str) assert len(matched_words_str) == len(merged_words_str) + 1 #do combine @@ -172,4 +171,77 @@ def doCombineWord(high_cur, low_cur, words): def recognizePartialWord(workdir, threshold): - pass + print(workdir, threshold) + + iternum = 0 + maxIter = config.getMaximumIteration() + + filename = config.getPartialWordFileName() + filepath = workdir + os.sep + filename + partialwordfile = open(filepath, "w") + + while iternum < maxIter : + print("pass ", iternum) + + length = 2 + filename = config.getNgramFileName(length) + filepath = workdir + os.sep + filename + + conn = sqlite3.connect(filepath) + partial_words_list = getPartialWordList(conn, threshold) + conn.close() + + changed_num = 0 + for item in partial_words_list: + (merged_word, prefix, postfix, freq) = item + if merged_word in words_set : + continue + changed_num = changed_num + 1 + + if 0 == changed_num: + break; + + for item in partial_words_list: + item = [str(x) for x in item] + oneline = "\t".join(item) + partialwordfile.writelines([oneline, os.linesep]) + + for i in range(N, 1, -1): + #high n-gram + filename = config.getNgramFileName(i) + filepath = workdir + os.sep + filename + high_conn = sqlite3.connect(filepath) + high_cur = high_conn.cursor() + + #low n-gram + filename = config.getNgramFileName(i - 1) + filepath = workdir + os.sep + filename + low_conn = sqlite3.connect(filepath) + low_cur = low_conn.cursor() + + dropNgramTableClone(high_conn) + createNgramTableClone(high_conn) + + for item in partial_words_list: + (merged_word, prefix, postfix, freq) = item + if merged_word in words_set : + continue + doCombineWord(high_cur, low_cur, (prefix, postfix)) + + high_conn.commit() + low_conn.commit() + + dropNgramTableClone(high_conn) + + high_conn.close() + low_conn.close() + + iternum = iternum + 1 + + partialwordfile.close() + print(workdir, 'done') + + +load_words(config.getWordsListFileName()) +#print(words_set) + |