diff options
Diffstat (limited to 'data/db/android/create_db.py')
-rwxr-xr-x | data/db/android/create_db.py | 83 |
1 files changed, 83 insertions, 0 deletions
#!/usr/bin/env python
"""Emit the SQL script that builds the Android pinyin phrase tables.

Usage: create_db.py PHRASE_FILE > script.sql

The script reads a UTF-16 phrase list and prints, on standard output, the
PRAGMA / CREATE TABLE / INSERT statements for the ``py_phrase_N`` tables.
Nothing is written to a database directly.
"""
from pydict import *
from id import *
from valid_hanzi import *
import sys

# Number of py_phrase_N tables that are created (N = 0 .. 15).
_TABLE_COUNT = 16

# A frequency only starts a new rank when the previous rank's frequency is
# below (1 - _RANK_TOLERANCE) times it; closer frequencies share a rank.
_RANK_TOLERANCE = 0.001


def _emit(statement):
    """Write one line of SQL to standard output, encoded as UTF-8."""
    data = (statement + u"\n").encode("utf-8")
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout itself accepts encoded bytes.
    getattr(sys.stdout, "buffer", sys.stdout).write(data)


def get_sheng_yun(pinyin):
    """Split one pinyin syllable into its (shengmu, yunmu) parts.

    Returns ``(None, None)`` for ``None``.  The syllable ``"ng"`` is mapped
    to ``("", "en")``.  Otherwise the longest prefix (two characters, then
    one) found in ``SHENGMU_DICT`` is taken as the shengmu and the rest as
    the yunmu; when no prefix matches, the shengmu is the empty string.
    """
    if pinyin is None:
        return None, None
    if pinyin == "ng":
        return "", "en"
    # Try the two-letter prefix before the one-letter prefix so that the
    # longer match wins.
    for length in (2, 1):
        prefix = pinyin[:length]
        if prefix in SHENGMU_DICT:
            return prefix, pinyin[len(prefix):]
    return "", pinyin


def read_phrases(filename):
    """Yield ``(hanzi, freq, pinyin)`` for every usable line of *filename*.

    The file is decoded as UTF-16.  Each line holds four fields separated
    by single spaces: the phrase, its frequency, a flag (ignored here) and
    the whitespace-separated pinyin syllables.  Phrases that contain a
    character not in ``valid_hanzi`` are skipped, as are blank lines.

    Raises ValueError for a non-blank line with fewer than four fields or
    a frequency that is not a number.
    """
    # Binary mode: the bytes are UTF-16, so no newline translation may be
    # applied before decoding.
    with open(filename, "rb") as handle:
        text = handle.read().decode("utf16")

    for line in text.strip().split(u"\n"):
        if not line.strip():
            continue
        hanzi, freq, _flag, pinyin = line.split(u" ", 3)
        if any(char not in valid_hanzi for char in hanzi):
            continue
        yield hanzi, float(freq), pinyin.split()


def _rank_records(records):
    """Replace each record's frequency by an integer rank.

    Returns ``(hanzi, rank, pinyin)`` tuples ordered from the highest
    frequency to the lowest; records with equal frequency keep their input
    order.  Ranks are assigned while walking from the lowest frequency
    upwards, and the rank is bumped only when the frequency exceeds the
    last rank's frequency by more than the relative tolerance.
    """
    # Stable sort, so ties stay in input order even with reverse=True.
    ordered = sorted(records, key=lambda record: record[1], reverse=True)

    ranked = []
    rank = 0
    max_freq = 0.0
    for hanzi, freq, pinyin in reversed(ordered):
        # A zero frequency cannot be divided by; it keeps the current rank.
        if freq != 0 and max_freq / freq < 1 - _RANK_TOLERANCE:
            max_freq = freq
            rank += 1
        ranked.append((hanzi, rank, pinyin))
    ranked.reverse()
    return ranked


def create_db(filename):
    """Print the complete SQL script for the phrases in *filename*.

    The output consists of three PRAGMA statements, one CREATE TABLE per
    ``py_phrase_N`` table (table N has N + 1 pairs of ``sK``/``yK`` integer
    columns), one INSERT per phrase wrapped in BEGIN/COMMIT, and a final
    VACUUM.  A phrase of length L is inserted into ``py_phrase_(L - 1)``.
    """
    _emit(u"PRAGMA synchronous = NORMAL;")
    _emit(u"PRAGMA temp_store = MEMORY;")
    _emit(u"PRAGMA default_cache_size = 5000;")

    create_sql = u"CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s);"
    for index in range(_TABLE_COUNT):
        columns = []
        for position in range(index + 1):
            columns.append(u"s%d INTEGER" % position)
            columns.append(u"y%d INTEGER" % position)
        _emit(create_sql % (index, u",".join(columns)))

    _emit(u"BEGIN;")
    insert_sql = u"INSERT INTO py_phrase_%d VALUES (%s);"
    for hanzi, rank, pinyin in _rank_records(read_phrases(filename)):
        ids = []
        for syllable in pinyin:
            sheng, yun = get_sheng_yun(syllable)
            ids.append(pinyin_id[sheng])
            ids.append(pinyin_id[yun])
        # Double any single quote so the phrase stays a valid SQL literal.
        phrase = hanzi.replace(u"'", u"''")
        values = u"'%s', %d, %s" % (phrase, rank, u",".join(map(str, ids)))
        _emit(insert_sql % (len(hanzi) - 1, values))
    _emit(u"COMMIT;")
    _emit(u"VACUUM;")


def main():
    """Command-line entry point: create_db.py PHRASE_FILE."""
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s PHRASE_FILE\n" % sys.argv[0])
        sys.exit(1)
    create_db(sys.argv[1])


if __name__ == "__main__":
    main()