diff options
| author | Peng Huang <shawn.p.huang@gmail.com> | 2009-10-05 11:54:20 +0800 |
|---|---|---|
| committer | Peng Huang <shawn.p.huang@gmail.com> | 2009-10-05 11:54:20 +0800 |
| commit | 81070b09ca9a8bec2ab76006aa049f460904e23a (patch) | |
| tree | a01abcd4766aa6fbb825851a67cfda71804063cb /scripts/create_db.py | |
| parent | d866b6b936220d6f3f95a24a0d3c762186134ba6 (diff) | |
| download | ibus-libpinyin-81070b09ca9a8bec2ab76006aa049f460904e23a.tar.gz ibus-libpinyin-81070b09ca9a8bec2ab76006aa049f460904e23a.tar.xz ibus-libpinyin-81070b09ca9a8bec2ab76006aa049f460904e23a.zip | |
Move all scripts to $top_srcdir/scripts
Diffstat (limited to 'scripts/create_db.py')
| -rw-r--r-- | scripts/create_db.py | 94 |
1 files changed, 94 insertions, 0 deletions
"""Rebuild the pinyin phrase database ``py.db`` as ``py-new.db``.

Origin: scripts/create_db.py

The source database holds every phrase in one ``py_phrase`` table.  This
script splits it into 16 tables, ``py_phrase_0`` .. ``py_phrase_15``, where
``py_phrase_N`` stores phrases of N + 1 syllables (phrases longer than 16
syllables are truncated to 16 and stored in ``py_phrase_15``).  Each table
has the columns ``phrase``, ``freq`` and, per syllable ``k``, an initial
column ``s<k>`` and a final column ``y<k>`` holding integer pinyin ids.

While copying, the original frequency is replaced by its rank: rows are read
in ascending ``freq`` order and the new value is incremented every time a
larger original frequency is encountered.

``SHENGMU_DICT``, ``ID_PINYIN_DICT`` and ``pinyin_id`` are expected to be
provided by the star imports from ``pydict`` / ``id``.
"""

import sqlite3
from pydict import *
from id import *
import sys

# Number of per-length tables; the last one also receives longer phrases.
TABLE_COUNT = 16

con1 = sqlite3.connect("py.db")
con2 = sqlite3.connect("py-new.db")
con2.execute("PRAGMA synchronous = NORMAL;")
con2.execute("PRAGMA temp_store = MEMORY;")
con2.execute("PRAGMA default_cache_size = 5000;")

create_sql = "CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s)"

for i in range(TABLE_COUNT):
    # Table i needs one (s, y) column pair for each of its i + 1 syllables.
    columns = ",".join(
        "s%d INTEGER,y%d INTEGER" % (j, j) for j in range(i + 1))
    con2.execute(create_sql % (i, columns))
con2.commit()


def get_sheng_yun(pinyin):
    """Split a pinyin syllable into its (initial, final) parts.

    Returns ``(None, None)`` for ``None`` and ``("", "en")`` for the special
    syllable ``"ng"``.  Otherwise the longest prefix (two characters, then
    one) found in ``SHENGMU_DICT`` is taken as the initial and the rest as
    the final; when no prefix matches, the initial is the empty string.
    """
    if pinyin is None:
        return None, None
    if pinyin == "ng":
        return "", "en"
    # Try the two-letter initials before the single-letter ones.
    for length in (2, 1):
        prefix = pinyin[:length]
        if prefix in SHENGMU_DICT:
            return prefix, pinyin[len(prefix):]
    return "", pinyin


def encode_pinyin(pinyin):
    """Return the integer id of an initial or final.

    ``None`` and the empty string map to 0; every other value is looked up
    in ``pinyin_id`` (a missing key propagates the lookup error).
    """
    if pinyin is None or pinyin == "":
        return 0
    return pinyin_id[pinyin]


insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);"
con2.commit()
new_freq = 0
freq = 0

print("INSERTING")
for r in con1.execute("SELECT * FROM py_phrase ORDER BY freq"):
    # Row layout used below: r[0] syllable count, r[1:5] ids of the first
    # four syllables, r[5] apostrophe-separated remaining syllables,
    # r[10] phrase text, r[11] original frequency.
    ylen = r[0]
    phrase = r[10]
    if r[11] > freq:
        # A new, larger original frequency starts the next rank.
        freq = r[11]
        new_freq += 1

    pys = [ID_PINYIN_DICT[pid] for pid in r[1:1 + min(ylen, 4)]]
    if ylen > 4:
        # NOTE(review): encode() followed by split("'") relies on Python 2
        # str semantics; on Python 3 this would have to be r[5].split("'")
        # -- confirm the target interpreter before changing it.
        pys += r[5].encode("utf8").split("'")

    # Phrases with 16 or more syllables all go to the last table.
    table = min(ylen - 1, TABLE_COUNT - 1)
    pys = pys[0:TABLE_COUNT]

    sheng_yun = []
    for py in pys:
        sheng_yun.extend(get_sheng_yun(py))

    column = [phrase, new_freq] + [encode_pinyin(part) for part in sheng_yun]

    placeholders = ",".join(["?"] * len(column))
    con2.execute(insert_sql % (table, placeholders), column)

print("Remove duplicate")
# NOTE(review): the inner SELECT yields a single rowid per duplicated group,
# so exactly one row is deleted per group; a group with three or more
# identical rows keeps duplicates -- confirm whether that is intended.
dedup_sql = ("DELETE FROM py_phrase_%d WHERE rowid IN "
             "(SELECT rowid FROM (SELECT count() as count, rowid "
             "FROM py_phrase_%d GROUP by %s,phrase) WHERE count > 1)")
for i in range(TABLE_COUNT):
    group_columns = ",".join("s%d,y%d" % (j, j) for j in range(i + 1))
    con2.execute(dedup_sql % (i, i, group_columns))
con2.commit()
print("VACUUM")
con2.execute("VACUUM;")
con2.commit()

# Index creation is currently disabled; kept for reference.
# con2.execute("create index index_0_0 on py_phrase_0(s0, y0)")
#
# for i in xrange(1, 16):
#     con2.execute("create index index_%d_0 on py_phrase_%d(s0, y0, s1, y1)" % (i, i))
#     con2.execute("create index index_%d_1 on py_phrase_%d(s0, s1, y1)" % (i, i))
#
# con2.execute("vacuum")
# con2.commit()
