diff options
author | Peng Wu <alexepico@gmail.com> | 2012-06-06 10:29:27 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-06-06 10:29:27 +0800 |
commit | 928a7cce363dee8b42452bc0cf99c6070c6d4626 (patch) | |
tree | 7bb825b1b58a8271f30f4302fa08c2866db30528 | |
parent | eb35544148080f975b82ba279653cf9e4e7c589a (diff) | |
download | ibus-libpinyin-928a7cce363dee8b42452bc0cf99c6070c6d4626.tar.gz ibus-libpinyin-928a7cce363dee8b42452bc0cf99c6070c6d4626.tar.xz ibus-libpinyin-928a7cce363dee8b42452bc0cf99c6070c6d4626.zip |
clean up scripts
-rwxr-xr-x | data/db/android/create_db.py | 83 | ||||
-rw-r--r-- | scripts/Makefile | 9 | ||||
-rw-r--r-- | scripts/create_db.py | 94 | ||||
-rw-r--r-- | scripts/create_index.py | 21 | ||||
-rw-r--r-- | scripts/create_unique_index.py | 27 |
5 files changed, 0 insertions, 234 deletions
diff --git a/data/db/android/create_db.py b/data/db/android/create_db.py deleted file mode 100755 index 4fff1d0..0000000 --- a/data/db/android/create_db.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -from pydict import * -from id import * -from valid_hanzi import * -import sys - -def get_sheng_yun(pinyin): - if pinyin == None: - return None, None - if pinyin == "ng": - return "", "en" - for i in xrange(2, 0, -1): - t = pinyin[:i] - if t in SHENGMU_DICT: - return t, pinyin[len(t):] - return "", pinyin - -def read_phrases(filename): - buf = file(filename).read() - buf = unicode(buf, "utf16") - buf = buf.strip() - for l in buf.split(u'\n'): - hanzi, freq, flag, pinyin = l.split(u' ', 3) - freq = float(freq) - pinyin = pinyin.split() - if any(map(lambda c: c not in valid_hanzi, hanzi)): - continue - yield hanzi, freq, pinyin - -def create_db(filename): - # import sqlite3 - # con = sqlite3.connect("main.db") - # con.execute ("PRAGMA synchronous = NORMAL;") - # con.execute ("PRAGMA temp_store = MEMORY;") - # con.execute ("PRAGMA default_cache_size = 5000;") - print "PRAGMA synchronous = NORMAL;" - print "PRAGMA temp_store = MEMORY;" - print "PRAGMA default_cache_size = 5000;" - - - sql = "CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s);" - for i in range(0, 16): - column = [] - for j in range(0, i + 1): - column.append ("s%d INTEGER" % j) - column.append ("y%d INTEGER" % j) - print sql % (i, ",".join(column)) - # con.execute(sql % (i, column)) - # con.commit() - - records = list(read_phrases(filename)) - records.sort(lambda a, b: 1 if a[1] > b[1] else -1) - records_new = [] - i = 0 - max_freq = 0.0 - for hanzi, freq, pinyin in records: - if max_freq / freq < 1 - 0.001: - max_freq = freq - i = i + 1 - records_new.append((hanzi, i, pinyin)) - records_new.reverse() - - print "BEGIN;" - insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);" - for hanzi, freq, pinyin in records_new: - columns = [] - for py in pinyin: - s, y = get_sheng_yun(py) - s, y = pinyin_id[s], pinyin_id[y] - columns.append(s) - columns.append(y) - values = "'%s', %d, %s" % (hanzi.encode("utf8"), freq, ",".join(map(str,columns))) - - sql = insert_sql % (len(hanzi) - 1, values) - print sql - print "COMMIT;" - print "VACUUM;" - -def main(): - create_db(sys.argv[1]) - -if __name__ == "__main__": - main() diff --git a/scripts/Makefile b/scripts/Makefile deleted file mode 100644 index 046dd70..0000000 --- a/scripts/Makefile +++ /dev/null @@ -1,9 +0,0 @@ - -all: - -py-new.db: py.db create_db.py - $(RM) py-new.db - python create_db.py - -index: - python create_index.py diff --git a/scripts/create_db.py b/scripts/create_db.py deleted file mode 100644 index e4826ae..0000000 --- a/scripts/create_db.py +++ /dev/null @@ -1,94 +0,0 @@ -import sqlite3 -from pydict import * -from id import * -import sys - -con1 = sqlite3.connect("py.db") -con2 = sqlite3.connect("py-new.db") -con2.execute ("PRAGMA synchronous = NORMAL;") -con2.execute ("PRAGMA temp_store = MEMORY;") -con2.execute ("PRAGMA default_cache_size = 5000;") - -sql = "CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s)" - -for i in range(0, 16): - column= [] - for j in range(0, i + 1): - column.append ("s%d INTEGER" % j) - column.append ("y%d INTEGER" % j) - column = ",".join(column) - con2.execute(sql % (i, column)) -con2.commit() - -def get_sheng_yun(pinyin): - if pinyin == None: - return None, None - if pinyin == "ng": - return "", "en" - for i in xrange(2, 0, -1): - t = pinyin[:i] - if t in SHENGMU_DICT: - return t, pinyin[len(t):] - return "", pinyin - -def encode_pinyin(pinyin): - if pinyin == None or pinyin == "": - return 0 - return pinyin_id[pinyin] - e = 0 - for c in pinyin: - e = (e << 5) + (ord(c) - ord('a') + 1) - return e - -insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);" -con2.commit() -new_freq = 0 -freq = 0 - -print "INSERTING" -for r in con1.execute("SELECT * FROM py_phrase ORDER BY freq"): - ylen = r[0] - phrase = r[10] - if r[11] > freq: - freq = r[11] - new_freq += 1 - - if ylen <= 4: - pys = map(lambda id: ID_PINYIN_DICT[id], r[1: 1 + ylen]) - else: - pys = map(lambda id: ID_PINYIN_DICT[id], r[1: 5]) + r[5].encode("utf8").split("'") - - i = ylen - 1 - if i >= 15: - i = 15 - - pys = pys[0:16] - - sheng_yun = [] - for s, y in map(get_sheng_yun, pys): - sheng_yun.append(s) - sheng_yun.append(y) - - - column = [phrase, new_freq] + map(encode_pinyin, sheng_yun) - - sql = insert_sql % (i, ",".join(["?"] * len(column))) - con2.execute (sql, column) - -print "Remove duplicate" -for i in xrange(0, 16): - sql = "DELETE FROM py_phrase_%d WHERE rowid IN (SELECT rowid FROM (SELECT count() as count, rowid FROM py_phrase_%d GROUP by %s,phrase) WHERE count > 1)" % (i, i, ",".join(map(lambda i: "s%d,y%d"%(i,i), range(0, i + 1)))) - con2.execute(sql) -con2.commit() -print "CACUUM" -con2.execute("VACUUM;") -con2.commit() - -# con2.execute("create index index_0_0 on py_phrase_0(s0, y0)") -# -# for i in xrange(1, 16): -# con2.execute("create index index_%d_0 on py_phrase_%d(s0, y0, s1, y1)" % (i, i)) -# con2.execute("create index index_%d_1 on py_phrase_%d(s0, s1, y1)" % (i, i)) -# -# con2.execute("vacuum") -# con2.commit() diff --git a/scripts/create_index.py b/scripts/create_index.py deleted file mode 100644 index 823e616..0000000 --- a/scripts/create_index.py +++ /dev/null @@ -1,21 +0,0 @@ -import sqlite3 - -con2 = sqlite3.connect("py-new.db") -con2.execute ("PRAGMA synchronous = NORMAL;") -con2.execute ("PRAGMA temp_store = MEMORY;") - - -con2.execute("CREATE INDEX index_0_0 ON py_phrase_0(s0, y0)") -print "py_phrase_%d done" % 0 - -con2.execute("CREATE INDEX index_1_0 ON py_phrase_1(s0, y0, s1, y1)") -con2.execute("CREATE INDEX index_1_1 ON py_phrase_1(s0, s1, y1)") -print "py_phrase_%d done" % 1 - -for i in xrange(2, 16): - con2.execute("CREATE INDEX index_%d_0 ON py_phrase_%d(s0, y0, s1, y1, s2, y2)" % (i, i)) - con2.execute("CREATE INDEX index_%d_1 ON py_phrase_%d(s0, s1, s2, y2)" % (i, i)) - print "py_phrase_%d done" % i - -# con2.execute("vacuum") -con2.commit() diff --git a/scripts/create_unique_index.py b/scripts/create_unique_index.py deleted file mode 100644 index e22d986..0000000 --- a/scripts/create_unique_index.py +++ /dev/null @@ -1,27 +0,0 @@ -import sqlite3 - -con2 = sqlite3.connect("py-new.db") -con2.execute ("PRAGMA synchronous = NORMAL;") -con2.execute ("PRAGMA temp_store = MEMORY;") - - -con2.execute("CREATE UNIQUE INDEX IF NOT EXISTS index_0_0 ON py_phrase_0(s0, y0, phrase)") -print "py_phrase_%d done" % 0 - -con2.execute("CREATE UNIQUE INDEX IF NOT EXISTS index_1_0 ON py_phrase_1(s0, y0, s1, y1, phrase)") -con2.execute("CREATE INDEX IF NOT EXISTS index_1_1 ON py_phrase_1(s0, s1, y1)") -print "py_phrase_%d done" % 1 - -for i in xrange(2, 16): - sql = "CREATE UNIQUE INDEX IF NOT EXISTS index_%d_0 ON py_phrase_%d (" % (i, i) - sql = sql + "s0,y0" - for j in xrange(1, i + 1): - sql = sql + ",s%d,y%d" % (j, j) - sql = sql + ", phrase)" - print sql - con2.execute(sql) - con2.execute("CREATE INDEX IF NOT EXISTS index_%d_1 ON py_phrase_%d(s0, s1, s2, y2)" % (i, i)) - print "py_phrase_%d done" % i - -# con2.execute("vacuum") -con2.commit() |