1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
|
#!/usr/bin/python3
import os
import os.path
import sqlite3
from argparse import ArgumentParser
import utils
from myconfig import MyConfig
from dirwalk import walkIndex
# Schema of the output table: each row stores a prefix word, a postfix word
# and the frequency of that pair.
CREATE_BIGRAM_DDL = '''
CREATE TABLE bigram (
prefix TEXT NOT NULL,
postfix TEXT NOT NULL,
freq INTEGER NOT NULL
);
'''
# Index on the prefix column of the bigram table.
CREATE_BIGRAM_PREFIX_INDEX_DDL = '''
CREATE INDEX bigram_prefix_index on bigram(prefix);
'''
# Index on the postfix column of the bigram table.
CREATE_BIGRAM_POSTFIX_INDEX_DDL = '''
CREATE INDEX bigram_postfix_index on bigram(postfix);
'''
# Reads every (words, freq) row from the ngram table of the input database.
SELECT_ALL_NGRAM_DML = '''
SELECT words, freq FROM ngram;
'''
# Parameterized insert of one (prefix, postfix, freq) row into bigram.
INSERT_BIGRAM_DML = '''
INSERT INTO bigram(prefix, postfix, freq) VALUES (?, ?, ?);
'''
# Shared configuration object; the functions below read file names, the word
# separator and directory locations from it.
config = MyConfig()
# Change cwd to the word recognizer directory.
# NOTE: this runs at module import time, so merely importing this module
# changes the working directory of the whole process.
words_dir = config.getWordRecognizerDir()
os.chdir(words_dir)
# chdir done
def createBigramSqlite(indexpath, workdir):
    """Create a fresh, empty bigram database inside *workdir*.

    A database file left over from a previous run is deleted first, then the
    ``bigram`` table and its prefix/postfix indexes are created and
    committed.

    Args:
        indexpath: Path of the index being processed; only echoed to stdout.
        workdir: Directory in which the bigram database file is created.

    Raises:
        OSError: If a stale database file exists but cannot be removed.
        sqlite3.Error: If the database cannot be opened or the schema
            statements fail.
    """
    print(indexpath, workdir, 'create bigram')
    filename = config.getBigramFileName()
    filepath = os.path.join(workdir, filename)
    print(filepath)

    # Start from scratch. Attempting the removal directly avoids the
    # check-then-delete race of testing for existence first.
    try:
        os.unlink(filepath)
    except FileNotFoundError:
        pass

    conn = sqlite3.connect(filepath)
    try:
        cur = conn.cursor()
        cur.execute(CREATE_BIGRAM_DDL)
        cur.execute(CREATE_BIGRAM_PREFIX_INDEX_DDL)
        cur.execute(CREATE_BIGRAM_POSTFIX_INDEX_DDL)
        conn.commit()
    finally:
        # Always release the database handle, even when a statement fails.
        conn.close()
def handleBigramPass(indexpath, workdir):
    """Copy every 2-gram row into the bigram database as a split word pair.

    Each ``words`` value read from the ``ngram`` table of the 2-gram database
    is stripped of leading/trailing word separators and split at the first
    remaining separator into ``prefix`` and ``postfix``; the pair is inserted
    into the ``bigram`` table together with its frequency. All inserts are
    committed together after the last row has been processed.

    Args:
        indexpath: Path of the index being processed; only echoed to stdout.
        workdir: Directory holding both the bigram and the 2-gram database.

    Raises:
        AssertionError: If a ``words`` value does not split into exactly two
            parts.
        sqlite3.Error: If either database cannot be read or written.
    """
    print(indexpath, workdir, 'bigram pass')
    length = 2
    sep = config.getWordSep()

    bigram_path = os.path.join(workdir, config.getBigramFileName())
    ngram_path = os.path.join(workdir, config.getNgramFileName(length))

    bigram_conn = sqlite3.connect(bigram_path)
    try:
        ngram_conn = sqlite3.connect(ngram_path)
        try:
            bigram_cur = bigram_conn.cursor()
            ngram_cur = ngram_conn.cursor()

            # begin processing
            # Iterate the cursor directly so rows are streamed instead of
            # loading the whole ngram table into memory.
            for words_str, freq in ngram_cur.execute(SELECT_ALL_NGRAM_DML):
                words = words_str.strip(sep).split(sep, 1)
                assert len(words) == length
                prefix, postfix = words
                bigram_cur.execute(INSERT_BIGRAM_DML, (prefix, postfix, freq))

            # Only the bigram database was modified; the ngram database was
            # merely read, so it needs no commit.
            bigram_conn.commit()
        finally:
            ngram_conn.close()
    finally:
        # Closing without a commit discards a partially written batch.
        bigram_conn.close()
def handleOneIndex(indexpath, subdir, indexname):
    """Run the bigram population stage for a single index.

    Loads the status stored next to *indexpath*, raises if the 'PartialWord'
    epoch check fails, and returns early if the 'PopulateBigram' epoch check
    already passes. Otherwise the bigram database is created and filled in
    the index's work directory, and the 'PopulateBigram' epoch is signed and
    stored.

    Args:
        indexpath: Path of the index file; the status path is derived from it.
        subdir: Sub-directory component of the work directory.
        indexname: Index name, the last component of the work directory.

    Raises:
        utils.EpochError: If the 'PartialWord' epoch check fails.
    """
    print(indexpath, subdir, indexname)

    status_path = indexpath + config.getStatusPostfix()
    status = utils.load_status(status_path)

    # The partial word stage has to be recorded before this one may run.
    partial_word_done = utils.check_epoch(status, 'PartialWord')
    if not partial_word_done:
        raise utils.EpochError('Please do partial word first.\n')

    # Nothing to do when this stage is already recorded.
    if utils.check_epoch(status, 'PopulateBigram'):
        return

    workdir = os.sep.join((config.getWordRecognizerDir(), subdir, indexname))
    print(workdir)

    createBigramSqlite(indexpath, workdir)
    handleBigramPass(indexpath, workdir)

    # sign epoch
    utils.sign_epoch(status, 'PopulateBigram')
    utils.store_status(status_path, status)
if __name__ == '__main__':
    # Command-line entry point: process every index found under --indexdir.
    arg_parser = ArgumentParser(description='Populate bi-gram.')
    arg_parser.add_argument(
        '--indexdir',
        action='store',
        help='index directory',
        default=config.getTextIndexDir(),
    )

    options = arg_parser.parse_args()
    print(options)

    # walkIndex is handed handleOneIndex as the per-index callback.
    walkIndex(handleOneIndex, options.indexdir)
    print('done')
|