diff options
author | Peng Wu <alexepico@gmail.com> | 2011-11-09 15:02:13 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-11-09 15:03:55 +0800 |
commit | af7acd67e0b8649c2dcd79b2de7d4a6b20446d29 (patch) | |
tree | 38befba4712268641efaaa7fa5477b1d70e75e67 /scripts | |
parent | 266386321bc1ac40551e63977d58a68ff9abb8ce (diff) | |
download | libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.tar.gz libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.tar.xz libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.zip |
Add the original frequency (orig_freq) to the divided table
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/Makefile.data | 10 | ||||
-rw-r--r-- | scripts/genpinyins.awk (renamed from scripts/genpinyin2.awk) | 2 | ||||
-rw-r--r-- | scripts/genspecialtable.py | 7 | ||||
-rw-r--r-- | scripts/specialtable.py | 24 |
4 files changed, 25 insertions, 18 deletions
diff --git a/scripts/Makefile.data b/scripts/Makefile.data index 2f665b2..830e294 100644 --- a/scripts/Makefile.data +++ b/scripts/Makefile.data @@ -1,9 +1,9 @@ -all: pinyin2.txt +all: pinyins.txt -pinyin2.txt: - awk -f genpinyin2.awk ../data/gb_char.table > $@ +pinyins.txt: + awk -f genpinyins.awk -vlen=1 ../data/gb_char.table > $@ + awk -f genpinyins.awk -vlen=2 ../data/gb_char.table >> $@ -.PHONY: pinyin2.txt - +.PHONY: pinyins.txt diff --git a/scripts/genpinyin2.awk b/scripts/genpinyins.awk index 00c6cad..8ab3d13 100644 --- a/scripts/genpinyin2.awk +++ b/scripts/genpinyins.awk @@ -1,5 +1,5 @@ #!/usr/bin/awk - { if (length($2) == 2) pinyins[$1] += $4 } + { if (length($2) == len) pinyins[$1] += $4 } END { for (pinyin in pinyins) { diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py index 0b03404..4393274 100644 --- a/scripts/genspecialtable.py +++ b/scripts/genspecialtable.py @@ -52,11 +52,12 @@ def get_chewing_string(pinyin): def gen_divided_table(): entries = [] - for (pinyin_key, first_key, second_key, freq) in divided_list: + for (pinyin_key, orig_freq, first_key, second_key, new_freq) \ + in divided_list: (pinyin_key, first_key, second_key) = map \ (get_chewing_string, (pinyin_key, first_key, second_key)) - entry = '{{{0}, {1}, {2}, {3}}}'.format \ - (pinyin_key, first_key, second_key, freq) + entry = '{{{0}, {1}, {2}, {3}, {4}}}'.format \ + (pinyin_key, orig_freq, first_key, second_key, new_freq) entries.append(entry) return ',\n'.join(entries) diff --git a/scripts/specialtable.py b/scripts/specialtable.py index acb66cf..a71aed7 100644 --- a/scripts/specialtable.py +++ b/scripts/specialtable.py @@ -42,12 +42,15 @@ def load_phrase(filename): #print(pinyin_str) continue - (first_key, second_key) = pinyin_str.split("'") - if first_key[-1].isdigit(): - first_key = first_key[:-1] - if second_key[-1].isdigit(): - second_key = second_key[:-1] - phrase_dict[(first_key, second_key)] = freq + if "'" in pinyin_str: + (first_key, second_key) = 
pinyin_str.split("'") + if first_key[-1].isdigit(): + first_key = first_key[:-1] + if second_key[-1].isdigit(): + second_key = second_key[:-1] + phrase_dict[(first_key, second_key)] = freq + else: + phrase_dict[pinyin_str] = freq phrasefile.close() @@ -67,8 +70,11 @@ def filter_divided(): for (pinyin_key, first_key, second_key) in gen_all_divided(): if not (first_key, second_key) in phrase_dict: continue - freq = phrase_dict[(first_key, second_key)] - yield pinyin_key, first_key, second_key, freq + orig_freq = 0 + if pinyin_key in phrase_dict: + orig_freq = phrase_dict[pinyin_key] + new_freq = phrase_dict[(first_key, second_key)] + yield pinyin_key, orig_freq, first_key, second_key, new_freq def gen_all_resplit(): @@ -104,7 +110,7 @@ def filter_resplit(): #init code -load_phrase("pinyin2.txt") +load_phrase("pinyins.txt") if __name__ == "__main__": for p in filter_divided(): |