diff options
| author | Peng Wu <alexepico@gmail.com> | 2013-01-16 11:26:45 +0800 |
|---|---|---|
| committer | Peng Wu <alexepico@gmail.com> | 2013-01-16 11:26:45 +0800 |
| commit | aaef9f0739a6b1db53f26baaaed28be053fa3bf6 (patch) | |
| tree | b60ead98dc89c17f1e6e4be92624d69ada701d31 | |
| parent | 19fd47bf825fb85ca4f83563156ffa8436a54792 (diff) | |
| download | trainer-aaef9f0739a6b1db53f26baaaed28be053fa3bf6.tar.gz trainer-aaef9f0739a6b1db53f26baaaed28be053fa3bf6.tar.xz trainer-aaef9f0739a6b1db53f26baaaed28be053fa3bf6.zip | |
clean bigram handle
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | lib/myconfig.py | 2 |
| -rw-r--r-- | populate.py | 45 |
| -rw-r--r-- | prepare.py | 11 |
3 files changed, 2 insertions, 56 deletions
```diff
diff --git a/lib/myconfig.py b/lib/myconfig.py
index a649c82..c0b3ad2 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -12,6 +12,8 @@ class MyConfig:
         'PrepareEpoch': 2, \
         'PopulateEpoch': 3, \
         'PartialWordThresholdEpoch': 4, \
+        'PartialWordEpoch': 5, \
+        'PopulateBigramEpoch': 6, \
         }
 
     def getEpochs(self):
diff --git a/populate.py b/populate.py
index 3097d0e..918fc6e 100644
--- a/populate.py
+++ b/populate.py
@@ -15,17 +15,6 @@ UPDATE_NGRAM_DML = '''
 UPDATE ngram SET freq = freq + 1 WHERE words = ?;
 '''
 
-SELECT_ALL_NGRAM_DML = '''
-SELECT words, freq FROM ngram;
-'''
-
-DELETE_BIGRAM_DML = '''
-DELETE FROM bigram;
-'''
-
-INSERT_BIGRAM_DML = '''
-INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?);
-'''
 
 config = MyConfig()
 
@@ -130,37 +119,6 @@ def handleOnePass(indexpath, workdir, length):
     if conn:
         conn.close()
 
-def handleBigramPass(indexpath, workdir):
-    print(indexpath, workdir, 'bigram')
-    length = 2
-
-    sep = config.getWordSep()
-
-    filename = config.getNgramFileName(length)
-    filepath = workdir + os.sep + filename
-
-    #begin processing
-    conn = sqlite3.connect(filepath)
-    cur = conn.cursor()
-
-    cur.execute(DELETE_BIGRAM_DML)
-    rows = cur.execute(SELECT_ALL_NGRAM_DML).fetchall()
-    for row in rows:
-        (words_str, freq) = row
-
-        words = words_str.strip(sep).split(sep, 1)
-        assert len(words) == length
-
-        (prefix, postfix) = words
-
-        cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq))
-        #print(prefix, postfix, freq)
-
-    conn.commit()
-
-    if conn:
-        conn.close()
-
 def handleOneIndex(indexpath, subdir, indexname, fast):
     print(indexpath, subdir, indexname)
 
@@ -191,12 +149,9 @@ def handleOneIndex(indexpath, subdir, indexname, fast):
         else:
             handleOnePass(indexpath, workdir, i)
 
-    handleBigramPass(indexpath, workdir)
-
     #sign epoch
     utils.sign_epoch(indexstatus, 'Populate')
     utils.store_status(indexstatuspath, indexstatus)
-
 
 
 def walkThroughIndex(path, fast):
diff --git a/prepare.py b/prepare.py
--- a/prepare.py
+++ b/prepare.py
@@ -18,13 +18,6 @@ CREATE_NGRAM_INDEX_DDL = '''
 CREATE UNIQUE INDEX ngram_index on ngram(words);
 '''
 
-CREATE_BIGRAM_DDL = '''
-CREATE TABLE bigram (
-    prefix TEXT NOT NULL,
-    postfix TEXT NOT NULL,
-    freq INTEGER NOT NULL
-    );
-'''
 
 config = MyConfig()
 
@@ -62,10 +55,6 @@ def createSqliteDatabases(onedir):
         cur.execute(CREATE_NGRAM_DDL)
         cur.execute(CREATE_NGRAM_INDEX_DDL)
 
-        #special case for bi-gram
-        if 2 == i:
-            cur.execute(CREATE_BIGRAM_DDL)
-
         conn.commit()
 
         if conn:
```
