diff options
Diffstat (limited to 'data')
-rwxr-xr-x | data/db/google/create_db_from_google.py | 16 |
1 files changed, 12 insertions, 4 deletions
diff --git a/data/db/google/create_db_from_google.py b/data/db/google/create_db_from_google.py index 4cb3c61..fe1f5f7 100755 --- a/data/db/google/create_db_from_google.py +++ b/data/db/google/create_db_from_google.py @@ -37,19 +37,27 @@ def create_db(): validate_hanzi = get_validate_hanzi() records = list(read_phrases(validate_hanzi)) - records.sort(lambda a, b: -1 if a[1] - b[1] > 0 else 1) + records.sort(lambda a, b: 1 if a[1] > b[1] else -1) + records_new = [] + i = 0 + max_freq = 0.0 + for hanzi, freq, pinyin in records: + if max_freq / freq < 1 - 0.001: + max_freq = freq + i = i + 1 + records_new.append((hanzi, i, pinyin)) + records_new.reverse() print "BEGIN;" insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);" - l = len(records) - for i, (hanzi, freq, pinyin) in enumerate(records): + for hanzi, freq, pinyin in records_new: columns = [] for py in pinyin: s, y = get_sheng_yun(py) s, y = pinyin_id[s], pinyin_id[y] columns.append(s) columns.append(y) - values = "'%s', %d, %s" % (hanzi.encode("utf8"), l - i, ",".join(map(str,columns))) + values = "'%s', %d, %s" % (hanzi.encode("utf8"), freq, ",".join(map(str,columns))) sql = insert_sql % (len(hanzi) - 1, values) print sql |