Add original frequency to divided table entries

author: Peng Wu <alexepico@gmail.com> 2011-11-09 15:02:13 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-11-09 15:03:55 +0800
commit: af7acd67e0b8649c2dcd79b2de7d4a6b20446d29 (patch)
tree: 38befba4712268641efaaa7fa5477b1d70e75e67 /scripts
parent: 266386321bc1ac40551e63977d58a68ff9abb8ce (diff)
download: libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.tar.gz
libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.tar.xz
libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.zip
4 files changed, 25 insertions, 18 deletions
diff --git a/scripts/Makefile.data b/scripts/Makefile.data
index 2f665b2..830e294 100644
--- a/scripts/Makefile.data
+++ b/scripts/Makefile.data
@@ -1,9 +1,9 @@
-all: pinyin2.txt
+all: pinyins.txt
 
 
-pinyin2.txt:
-	awk -f genpinyin2.awk ../data/gb_char.table > $@
+pinyins.txt:
+	awk -f genpinyins.awk -vlen=1 ../data/gb_char.table > $@
+	awk -f genpinyins.awk -vlen=2 ../data/gb_char.table >> $@
 
 
-.PHONY: pinyin2.txt
-
+.PHONY: pinyins.txt
diff --git a/scripts/genpinyin2.awk b/scripts/genpinyins.awk
index 00c6cad..8ab3d13 100644
--- a/scripts/genpinyin2.awk
+++ b/scripts/genpinyins.awk
@@ -1,5 +1,5 @@
 #!/usr/bin/awk
-	{ if (length($2) == 2) pinyins[$1] += $4 }
+	{ if (length($2) == len) pinyins[$1] += $4 }
 
 END {
     for (pinyin in pinyins) {
diff --git a/scripts/genspecialtable.py b/scripts/genspecialtable.py
index 0b03404..4393274 100644
--- a/scripts/genspecialtable.py
+++ b/scripts/genspecialtable.py
@@ -52,11 +52,12 @@ def get_chewing_string(pinyin):
 
 def gen_divided_table():
     entries = []
-    for (pinyin_key, first_key, second_key, freq) in divided_list:
+    for (pinyin_key, orig_freq, first_key, second_key, new_freq) \
+            in divided_list:
         (pinyin_key, first_key, second_key) = map \
             (get_chewing_string, (pinyin_key, first_key, second_key))
-        entry = '{{{0}, {1}, {2}, {3}}}'.format \
-            (pinyin_key, first_key, second_key, freq)
+        entry = '{{{0}, {1}, {2}, {3}, {4}}}'.format \
+            (pinyin_key, orig_freq, first_key, second_key, new_freq)
         entries.append(entry)
     return ',\n'.join(entries)
 
diff --git a/scripts/specialtable.py b/scripts/specialtable.py
index acb66cf..a71aed7 100644
--- a/scripts/specialtable.py
+++ b/scripts/specialtable.py
@@ -42,12 +42,15 @@ def load_phrase(filename):
             #print(pinyin_str)
             continue
 
-        (first_key, second_key) = pinyin_str.split("'")
-        if first_key[-1].isdigit():
-            first_key = first_key[:-1]
-        if second_key[-1].isdigit():
-            second_key = second_key[:-1]
-        phrase_dict[(first_key, second_key)] = freq
+        if "'" in pinyin_str:
+            (first_key, second_key) = pinyin_str.split("'")
+            if first_key[-1].isdigit():
+                first_key = first_key[:-1]
+            if second_key[-1].isdigit():
+                second_key = second_key[:-1]
+            phrase_dict[(first_key, second_key)] = freq
+        else:
+            phrase_dict[pinyin_str] = freq
     phrasefile.close()
 
 
@@ -67,8 +70,11 @@ def filter_divided():
     for (pinyin_key, first_key, second_key) in gen_all_divided():
         if not (first_key, second_key) in phrase_dict:
             continue
-        freq = phrase_dict[(first_key, second_key)]
-        yield pinyin_key, first_key, second_key, freq
+        orig_freq = 0
+        if pinyin_key in phrase_dict:
+            orig_freq = phrase_dict[pinyin_key]
+        new_freq = phrase_dict[(first_key, second_key)]
+        yield pinyin_key, orig_freq, first_key, second_key, new_freq
 
 
 def gen_all_resplit():
@@ -104,7 +110,7 @@ def filter_resplit():
 
 
 #init code
-load_phrase("pinyin2.txt")
+load_phrase("pinyins.txt")
 
 if __name__ == "__main__":
     for p in filter_divided():
author	Peng Wu <alexepico@gmail.com>	2011-11-09 15:02:13 +0800
committer	Peng Wu <alexepico@gmail.com>	2011-11-09 15:03:55 +0800
commit	af7acd67e0b8649c2dcd79b2de7d4a6b20446d29 (patch)
tree	38befba4712268641efaaa7fa5477b1d70e75e67 /scripts
parent	266386321bc1ac40551e63977d58a68ff9abb8ce (diff)
download	libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.tar.gz libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.tar.xz libpinyin-af7acd67e0b8649c2dcd79b2de7d4a6b20446d29.zip