summary refs log tree commit diff stats
path: root/data/db/android/create_db.py
diff options
context:
space:
mode:
Diffstat (limited to 'data/db/android/create_db.py')
-rwxr-xr-x data/db/android/create_db.py 83
1 files changed, 83 insertions, 0 deletions
diff --git a/data/db/android/create_db.py b/data/db/android/create_db.py
new file mode 100755
index 0000000..4fff1d0
--- /dev/null
+++ b/data/db/android/create_db.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+from pydict import *
+from id import *
+from valid_hanzi import *
+import sys
+
+def get_sheng_yun(pinyin):
+ if pinyin == None:
+ return None, None
+ if pinyin == "ng":
+ return "", "en"
+ for i in xrange(2, 0, -1):
+ t = pinyin[:i]
+ if t in SHENGMU_DICT:
+ return t, pinyin[len(t):]
+ return "", pinyin
+
+def read_phrases(filename):
+ buf = file(filename).read()
+ buf = unicode(buf, "utf16")
+ buf = buf.strip()
+ for l in buf.split(u'\n'):
+ hanzi, freq, flag, pinyin = l.split(u' ', 3)
+ freq = float(freq)
+ pinyin = pinyin.split()
+ if any(map(lambda c: c not in valid_hanzi, hanzi)):
+ continue
+ yield hanzi, freq, pinyin
+
+def create_db(filename):
+ # import sqlite3
+ # con = sqlite3.connect("main.db")
+ # con.execute ("PRAGMA synchronous = NORMAL;")
+ # con.execute ("PRAGMA temp_store = MEMORY;")
+ # con.execute ("PRAGMA default_cache_size = 5000;")
+ print "PRAGMA synchronous = NORMAL;"
+ print "PRAGMA temp_store = MEMORY;"
+ print "PRAGMA default_cache_size = 5000;"
+
+
+ sql = "CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s);"
+ for i in range(0, 16):
+ column = []
+ for j in range(0, i + 1):
+ column.append ("s%d INTEGER" % j)
+ column.append ("y%d INTEGER" % j)
+ print sql % (i, ",".join(column))
+ # con.execute(sql % (i, column))
+ # con.commit()
+
+ records = list(read_phrases(filename))
+ records.sort(lambda a, b: 1 if a[1] > b[1] else -1)
+ records_new = []
+ i = 0
+ max_freq = 0.0
+ for hanzi, freq, pinyin in records:
+ if max_freq / freq < 1 - 0.001:
+ max_freq = freq
+ i = i + 1
+ records_new.append((hanzi, i, pinyin))
+ records_new.reverse()
+
+ print "BEGIN;"
+ insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);"
+ for hanzi, freq, pinyin in records_new:
+ columns = []
+ for py in pinyin:
+ s, y = get_sheng_yun(py)
+ s, y = pinyin_id[s], pinyin_id[y]
+ columns.append(s)
+ columns.append(y)
+ values = "'%s', %d, %s" % (hanzi.encode("utf8"), freq, ",".join(map(str,columns)))
+
+ sql = insert_sql % (len(hanzi) - 1, values)
+ print sql
+ print "COMMIT;"
+ print "VACUUM;"
+
+def main():
+ create_db(sys.argv[1])
+
+if __name__ == "__main__":
+ main()