summary | refs | log | tree | commit | diff | stats
path: root/scripts
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2011-11-09 15:02:13 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-11-09 15:03:55 +0800
commit: af7acd67e0b8649c2dcd79b2de7d4a6b20446d29 (patch)
tree: 38befba4712268641efaaa7fa5477b1d70e75e67 /scripts
parent: 266386321bc1ac40551e63977d58a68ff9abb8ce (diff)
download: libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.tar.gz
libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.tar.xz
libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.zip
add origin freq for divided table
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/Makefile.data | 10
-rw-r--r-- scripts/genpinyins.awk (renamed from scripts/genpinyin2.awk) | 2
-rw-r--r-- scripts/genspecialtable.py | 7
-rw-r--r-- scripts/specialtable.py | 24
4 files changed, 25 insertions, 18 deletions
diff --git a/scripts/Makefile.data b/scripts/Makefile.data
index 2f665b2..830e294 100644
--- a/scripts/Makefile.data
+++ b/scripts/Makefile.data
@@ -1,9 +1,9 @@
-all: pinyin2.txt
+all: pinyins.txt
-pinyin2.txt:
- awk -f genpinyin2.awk ../data/gb_char.table > $@
+pinyins.txt:
+ awk -f genpinyins.awk -vlen=1 ../data/gb_char.table > $@
+ awk -f genpinyins.awk -vlen=2 ../data/gb_char.table >> $@
-.PHONY: pinyin2.txt
-
+.PHONY: pinyins.txt
diff --git a/scripts/genpinyin2.awk b/scripts/genpinyins.awk
index 00c6cad..8ab3d13 100644
--- a/scripts/genpinyin2.awk
+++ b/scripts/genpinyins.awk
@@ -1,5 +1,5 @@
#!/usr/bin/awk
- { if (length($2) == 2) pinyins[$1] += $4 }
+ { if (length($2) == len) pinyins[$1] += $4 }
END {
for (pinyin in pinyins) {
diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py
index 0b03404..4393274 100644
--- a/scripts/genspecialtable.py
+++ b/scripts/genspecialtable.py
@@ -52,11 +52,12 @@ def get_chewing_string(pinyin):
def gen_divided_table():
entries = []
- for (pinyin_key, first_key, second_key, freq) in divided_list:
+ for (pinyin_key, orig_freq, first_key, second_key, new_freq) \
+ in divided_list:
(pinyin_key, first_key, second_key) = map \
(get_chewing_string, (pinyin_key, first_key, second_key))
- entry = '{{{0}, {1}, {2}, {3}}}'.format \
- (pinyin_key, first_key, second_key, freq)
+ entry = '{{{0}, {1}, {2}, {3}, {4}}}'.format \
+ (pinyin_key, orig_freq, first_key, second_key, new_freq)
entries.append(entry)
return ',\n'.join(entries)
diff --git a/scripts/specialtable.py b/scripts/specialtable.py
index acb66cf..a71aed7 100644
--- a/scripts/specialtable.py
+++ b/scripts/specialtable.py
@@ -42,12 +42,15 @@ def load_phrase(filename):
#print(pinyin_str)
continue
- (first_key, second_key) = pinyin_str.split("'")
- if first_key[-1].isdigit():
- first_key = first_key[:-1]
- if second_key[-1].isdigit():
- second_key = second_key[:-1]
- phrase_dict[(first_key, second_key)] = freq
+ if "'" in pinyin_str:
+ (first_key, second_key) = pinyin_str.split("'")
+ if first_key[-1].isdigit():
+ first_key = first_key[:-1]
+ if second_key[-1].isdigit():
+ second_key = second_key[:-1]
+ phrase_dict[(first_key, second_key)] = freq
+ else:
+ phrase_dict[pinyin_str] = freq
phrasefile.close()
@@ -67,8 +70,11 @@ def filter_divided():
for (pinyin_key, first_key, second_key) in gen_all_divided():
if not (first_key, second_key) in phrase_dict:
continue
- freq = phrase_dict[(first_key, second_key)]
- yield pinyin_key, first_key, second_key, freq
+ orig_freq = 0
+ if pinyin_key in phrase_dict:
+ orig_freq = phrase_dict[pinyin_key]
+ new_freq = phrase_dict[(first_key, second_key)]
+ yield pinyin_key, orig_freq, first_key, second_key, new_freq
def gen_all_resplit():
@@ -104,7 +110,7 @@ def filter_resplit():
#init code
-load_phrase("pinyin2.txt")
+load_phrase("pinyins.txt")
if __name__ == "__main__":
for p in filter_divided():