diff options
| author | Peng Wu <alexepico@gmail.com> | 2013-01-21 12:55:46 +0800 |
|---|---|---|
| committer | Peng Wu <alexepico@gmail.com> | 2013-01-21 12:57:28 +0800 |
| commit | 29f0dbebd820a93aa0020fa7b989166ff0bd1971 (patch) | |
| tree | df85514d06d29993ceca0b267faa001f82271d24 /partialword.py | |
| parent | 3efea35a1fb53c728a60ed13e4f3051c0bd23c01 (diff) | |
| download | trainer-29f0dbebd820a93aa0020fa7b989166ff0bd1971.tar.gz trainer-29f0dbebd820a93aa0020fa7b989166ff0bd1971.tar.xz trainer-29f0dbebd820a93aa0020fa7b989166ff0bd1971.zip | |
write doCombineWord
Diffstat (limited to 'partialword.py')
| -rw-r--r-- | partialword.py | 50 |
1 file changed, 45 insertions, 5 deletions
```diff
diff --git a/partialword.py b/partialword.py
index a30609e..27ed127 100644
--- a/partialword.py
+++ b/partialword.py
@@ -14,7 +14,6 @@ SELECT words, freq FROM ngram WHERE freq > ?;
 UPDATE_LOW_NGRAM_DML = '''
 UPDATE ngram SET freq = freq + ? WHERE words = ?;
 '''
-#assert rowcount <= 1
 
 INSERT_LOW_NGRAM_DML = '''
 INSERT INTO ngram (words, freq) VALUES (?, ?);
@@ -24,7 +23,6 @@ INSERT INTO ngram (words, freq) VALUES (?, ?);
 DELETE_HIGH_NGRAM_DML = '''
 DELETE FROM ngram WHERE words = ?;
 '''
-#assert rowcount <= 1
 
 
 #sqlite full text search section
@@ -115,7 +113,7 @@ def getPartialWordList(conn, threshold):
     return words_list
 
 
-def getMatchedItems(conn, words):
+def getMatchedItems(cur, words):
     print(words)
     (prefix, postfix) = words
 
@@ -124,12 +122,54 @@ def getMatchedItems(conn, words):
     words_str = '"' + sep + prefix + sep + postfix + sep + '"'
     print(words_str)
 
-    cur = conn.cursor()
     rows = cur.execute(SELECT_MERGE_HIGH_NGRAM_DML, (words_str, )).fetchall()
     for row in rows:
         (words, freq) = row
         matched_list.append((words, freq))
 
-    conn.commit()
 
     return matched_list
+
+
+def doCombineWord(high_cur, low_cur, words):
+    print(words)
+    (prefix, postfix) = words
+
+    sep = config.getWordSep()
+
+    matched_items = getMatchedItems(high_cur, words)
+    words_str = sep + prefix + sep + postfix + sep
+    print(words_str)
+
+    for item in matched_items:
+        (matched_words_str, matched_freq) = item
+        assert words_str in matched_words_str
+        merged_str = sep + prefix + postfix + sep
+
+        (left, middle, right) = matched_words_str.partition(words_str)
+        while middle != '':
+            merged_words_str = left + merged_str + right
+
+            print(matched_words_str), print(merged_words_str)
+            assert len(matched_words_str) == len(merged_words_str) + 1
+
+            #do combine
+            rowcount = low_cur.execute(UPDATE_LOW_NGRAM_DML, \
+                                       (matched_freq, merged_words_str)).rowcount
+            #print(rowcount)
+            assert rowcount <= 1
+
+            if 0 == rowcount:
+                low_cur.execute(INSERT_LOW_NGRAM_DML, \
+                                (merged_words_str, matched_freq))
+
+            rowcount = high_cur.execute(DELETE_HIGH_NGRAM_DML, \
+                                        (merged_words_str, )).rowcount
+            assert rowcount <= 1
+
+            (partial_left, middle, right) = right.partition(words_str)
+            left = left + middle + partial_left
+
+
+def recognizePartialWord(workdir, threshold):
+    pass
```
