summaryrefslogtreecommitdiffstats
path: root/data
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2012-06-06 10:29:27 +0800
committerPeng Wu <alexepico@gmail.com>2012-06-06 10:29:27 +0800
commit928a7cce363dee8b42452bc0cf99c6070c6d4626 (patch)
tree7bb825b1b58a8271f30f4302fa08c2866db30528 /data
parenteb35544148080f975b82ba279653cf9e4e7c589a (diff)
downloadibus-libpinyin-928a7cce363dee8b42452bc0cf99c6070c6d4626.tar.gz
ibus-libpinyin-928a7cce363dee8b42452bc0cf99c6070c6d4626.tar.xz
ibus-libpinyin-928a7cce363dee8b42452bc0cf99c6070c6d4626.zip
clean up scripts
Diffstat (limited to 'data')
-rwxr-xr-xdata/db/android/create_db.py83
1 files changed, 0 insertions, 83 deletions
diff --git a/data/db/android/create_db.py b/data/db/android/create_db.py
deleted file mode 100755
index 4fff1d0..0000000
--- a/data/db/android/create_db.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-from pydict import *
-from id import *
-from valid_hanzi import *
-import sys
-
-def get_sheng_yun(pinyin):
- if pinyin == None:
- return None, None
- if pinyin == "ng":
- return "", "en"
- for i in xrange(2, 0, -1):
- t = pinyin[:i]
- if t in SHENGMU_DICT:
- return t, pinyin[len(t):]
- return "", pinyin
-
-def read_phrases(filename):
- buf = file(filename).read()
- buf = unicode(buf, "utf16")
- buf = buf.strip()
- for l in buf.split(u'\n'):
- hanzi, freq, flag, pinyin = l.split(u' ', 3)
- freq = float(freq)
- pinyin = pinyin.split()
- if any(map(lambda c: c not in valid_hanzi, hanzi)):
- continue
- yield hanzi, freq, pinyin
-
-def create_db(filename):
- # import sqlite3
- # con = sqlite3.connect("main.db")
- # con.execute ("PRAGMA synchronous = NORMAL;")
- # con.execute ("PRAGMA temp_store = MEMORY;")
- # con.execute ("PRAGMA default_cache_size = 5000;")
- print "PRAGMA synchronous = NORMAL;"
- print "PRAGMA temp_store = MEMORY;"
- print "PRAGMA default_cache_size = 5000;"
-
-
- sql = "CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s);"
- for i in range(0, 16):
- column = []
- for j in range(0, i + 1):
- column.append ("s%d INTEGER" % j)
- column.append ("y%d INTEGER" % j)
- print sql % (i, ",".join(column))
- # con.execute(sql % (i, column))
- # con.commit()
-
- records = list(read_phrases(filename))
- records.sort(lambda a, b: 1 if a[1] > b[1] else -1)
- records_new = []
- i = 0
- max_freq = 0.0
- for hanzi, freq, pinyin in records:
- if max_freq / freq < 1 - 0.001:
- max_freq = freq
- i = i + 1
- records_new.append((hanzi, i, pinyin))
- records_new.reverse()
-
- print "BEGIN;"
- insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);"
- for hanzi, freq, pinyin in records_new:
- columns = []
- for py in pinyin:
- s, y = get_sheng_yun(py)
- s, y = pinyin_id[s], pinyin_id[y]
- columns.append(s)
- columns.append(y)
- values = "'%s', %d, %s" % (hanzi.encode("utf8"), freq, ",".join(map(str,columns)))
-
- sql = insert_sql % (len(hanzi) - 1, values)
- print sql
- print "COMMIT;"
- print "VACUUM;"
-
-def main():
- create_db(sys.argv[1])
-
-if __name__ == "__main__":
- main()