diff options
author | Peng Wu <alexepico@gmail.com> | 2011-10-24 16:42:22 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-10-24 16:42:22 +0800 |
commit | bf5eddadc363f60165cd89a7cbab7cfb404323bc (patch) | |
tree | d15347adc247e51894257350d183bf4be906f3e2 | |
parent | c8c7f23307fd7f8c36573c101161cb4c9ea6646e (diff) | |
download | libpinyin-bf5eddadc363f60165cd89a7cbab7cfb404323bc.tar.gz libpinyin-bf5eddadc363f60165cd89a7cbab7cfb404323bc.tar.xz libpinyin-bf5eddadc363f60165cd89a7cbab7cfb404323bc.zip |
Write pinyin table generation in genpytable.py (work in progress)
-rw-r--r-- | scripts/correct.py | 48 | ||||
-rw-r--r-- | scripts/genpytable.py | 63 | ||||
-rw-r--r-- | scripts/pinyin.py | 2 | ||||
-rw-r--r-- | src/storage/pinyin_custom2.h | 8 |
4 files changed, 84 insertions, 37 deletions
diff --git a/scripts/correct.py b/scripts/correct.py index 96b965c..a5f49cf 100644 --- a/scripts/correct.py +++ b/scripts/correct.py @@ -34,35 +34,35 @@ auto_correct = [ auto_correct_ext = [ # "correct", "wrong", flag - ("ju", "jv", "PINYIN_CORRECT_V_TO_U"), - ("qu", "qv", "PINYIN_CORRECT_V_TO_U"), - ("xu", "xv", "PINYIN_CORRECT_V_TO_U"), - ("yu", "yv", "PINYIN_CORRECT_V_TO_U"), + ("ju", "jv", "PINYIN_CORRECT_V_U"), + ("qu", "qv", "PINYIN_CORRECT_V_U"), + ("xu", "xv", "PINYIN_CORRECT_V_U"), + ("yu", "yv", "PINYIN_CORRECT_V_U"), - ("jue", "jve", "PINYIN_CORRECT_V_TO_U"), - ("que", "qve", "PINYIN_CORRECT_V_TO_U"), - ("xue", "xve", "PINYIN_CORRECT_V_TO_U"), - ("yue", "yve", "PINYIN_CORRECT_V_TO_U"), + ("jue", "jve", "PINYIN_CORRECT_V_U"), + ("que", "qve", "PINYIN_CORRECT_V_U"), + ("xue", "xve", "PINYIN_CORRECT_V_U"), + ("yue", "yve", "PINYIN_CORRECT_V_U"), - ("juan", "jvan", "PINYIN_CORRECT_V_TO_U"), - ("quan", "qvan", "PINYIN_CORRECT_V_TO_U"), - ("xuan", "xvan", "PINYIN_CORRECT_V_TO_U"), - ("yuan", "yvan", "PINYIN_CORRECT_V_TO_U"), + ("juan", "jvan", "PINYIN_CORRECT_V_U"), + ("quan", "qvan", "PINYIN_CORRECT_V_U"), + ("xuan", "xvan", "PINYIN_CORRECT_V_U"), + ("yuan", "yvan", "PINYIN_CORRECT_V_U"), - ("jun", "jvn", "PINYIN_CORRECT_V_TO_U"), - ("qun", "qvn", "PINYIN_CORRECT_V_TO_U"), - ("xun", "xvn", "PINYIN_CORRECT_V_TO_U"), - ("yun", "yvn", "PINYIN_CORRECT_V_TO_U"), + ("jun", "jvn", "PINYIN_CORRECT_V_U"), + ("qun", "qvn", "PINYIN_CORRECT_V_U"), + ("xun", "xvn", "PINYIN_CORRECT_V_U"), + ("yun", "yvn", "PINYIN_CORRECT_V_U"), - ("juang", "jvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"), - ("quang", "qvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"), - ("xuang", "xvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"), - ("yuang", "yvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"), +# ("juang", "jvang", "PINYIN_CORRECT_V_U"), +# ("quang", "qvang", "PINYIN_CORRECT_V_U"), +# ("xuang", "xvang", "PINYIN_CORRECT_V_U"), +# ("yuang", "yvang", 
"PINYIN_CORRECT_V_U"), - ("jun", "jven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"), - ("qun", "qven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"), - ("xun", "xven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"), - ("yun", "yven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"), +# ("jun", "jven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("qun", "qven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("xun", "xven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), +# ("yun", "yven", "PINYIN_CORRECT_UEN_UN | PINYIN_CORRECT_V_U"), ] diff --git a/scripts/genpytable.py b/scripts/genpytable.py index 1b339c2..5216a9d 100644 --- a/scripts/genpytable.py +++ b/scripts/genpytable.py @@ -25,21 +25,16 @@ import chewing from correct import * +pinyin_list = sorted(bopomofo.PINYIN_BOPOMOFO_MAP.keys()) +shengmu_list = sorted(pinyin.SHENGMU_DICT.keys()) + def check_pinyin_chewing_map(): for pinyin_key in pinyin.PINYIN_DICT.keys(): - if pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP.keys(): + if pinyin_key in pinyin_list: pass else: print("pinyin %s has no chewing mapping", pinyin_key) -''' - for pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP.keys(): - if pinyin_key in pinyin.PINYIN_DICT.keys(): - pass - else: - print(pinyin_key, get_chewing(pinyin_key)) -''' - def get_chewing(pinyin_key): initial, middle, final = \ 'CHEWING_ZERO_INITIAL', 'CHEWING_ZERO_MIDDLE', 'CHEWING_ZERO_FINAL' @@ -85,6 +80,52 @@ def get_chewing(pinyin_key): return initial, middle, final +def get_pinyin_list(): + #generate all pinyins in bopomofo + for pinyin_key in pinyin_list: + flags = [] + (initial, middle, final) = get_chewing(pinyin_key) + if pinyin_key in bopomofo.PINYIN_BOPOMOFO_MAP.keys(): + flags.append("IS_CHEWING") + if pinyin_key in pinyin.PINYIN_DICT.keys(): + flags.append("IS_PINYIN") + if pinyin_key in shengmu_list: + flags.append("PINYIN_INCOMPLETE") + chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] + if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP: + 
flags.append("CHEWING_INCOMPLETE") + yield pinyin_key, chewing_key, flags, (initial, final, middle) + + #generate all shengmu + for shengmu in shengmu_list: + if shengmu in pinyin_list: + continue + flags = ["IS_PINYIN", "PINYIN_INCOMPLETE"] + chewing_key = 'CHEWING_{0}'.format(shengmu.upper()) + if chewing_key in chewing.ASCII_CHEWING_INITIAL_MAP: + initial = chewing_key + chewing_key = chewing.ASCII_CHEWING_INITIAL_MAP[chewing_key] + else: + chewing_key = 'PINYIN_{0}'.format(shengmu.upper()) + initial = chewing_key + yield shengmu, chewing_key, flags, (initial, "CHEWING_ZREO_MIDDLE", "CHEWING_ZERO_FINAL") + + #generate corrections + for correct, wrong in auto_correct: + flag = ['IS_PINYIN', 'PINYIN_CORRECT_{0}_{1}'.format(wrong, correct)] + for pinyin_key in pinyin_list: + if pinyin_key.endswith(correct) and pinyin_key != correct: + chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] + new_pinyin_key = pinyin_key.replace(correct, wrong) + yield new_pinyin_key, chewing_key, flags, get_chewing(pinyin_key) + + #generate U to V + for correct, wrong, flags in auto_correct_ext: + #over-ride flags + flags = ['IS_PINYIN', 'PINYIN_CORRECT_V_U'] + pinyin_key = correct + chewing_key = bopomofo.PINYIN_BOPOMOFO_MAP[pinyin_key] + yield wrong, chewing_key, flags, get_chewing(pinyin_key) ### main function ### if __name__ == "__main__": @@ -92,5 +133,5 @@ if __name__ == "__main__": check_pinyin_chewing_map() #dump - for pinyin_key in sorted(bopomofo.PINYIN_BOPOMOFO_MAP.keys()): - print (pinyin_key, get_chewing(pinyin_key)) + for pinyin_key in get_pinyin_list(): + print (pinyin_key) diff --git a/scripts/pinyin.py b/scripts/pinyin.py index b06d1cd..a3eccdb 100644 --- a/scripts/pinyin.py +++ b/scripts/pinyin.py @@ -126,7 +126,7 @@ for pinyin, id in PINYIN_DICT.items (): ID_PINYIN_DICT[id] = pinyin SHENGMU_DICT = { - "" : 0, "b" : 1, "p" : 2, "m" : 3, "f" : 4, "d" : 5, + "b" : 1, "p" : 2, "m" : 3, "f" : 4, "d" : 5, "t" : 6, "n" : 7, "l" : 8, "g" : 9, "k" : 10, "h" : 11, "j" : 12, 
"q" : 13, "x" : 14, "zh" : 15, "ch" : 16, "sh" : 17, "r" : 18, "z" : 19, "c" : 20, "s" : 21, "y" : 22, "w" : 23 diff --git a/src/storage/pinyin_custom2.h b/src/storage/pinyin_custom2.h index 5ca796d..cd71f4f 100644 --- a/src/storage/pinyin_custom2.h +++ b/src/storage/pinyin_custom2.h @@ -24,6 +24,12 @@ namespace pinyin{ +enum PinyinTableFlag{ + IS_CHEWING = 1, + IS_PINYIN = 1 << 2, + PINYIN_INCOMPLETE = 1 << 3, + CHEWING_INCOMPLETE = 1 << 4, +}; /** * @brief enums of pinyin ambiguities. @@ -33,7 +39,7 @@ namespace pinyin{ */ enum PinyinAmbiguityBeta{ PINYIN_AMB_ANY = 0, - PINYIN_AMB_C_Ch, + PINYIN_AMB_C_Ch , PINYIN_AMB_Z_Zh, PINYIN_AMB_S_Sh, PINYIN_AMB_L_N , |