summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2012-01-12 12:30:19 +0800
committer Peng Wu <alexepico@gmail.com> 2012-01-12 12:30:19 +0800
commit 845f8ed78a9a1ffe2ac8adf7cdb111473bb997e4 (patch)
tree b00ea62f7c1e5bebea42abde1da0da0daeb90699
parent ee0b9cde31d362a95f514cbab043229940d04973 (diff)
download libpinyin-845f8ed78a9a1ffe2ac8adf7cdb111473bb997e4.zip
libpinyin-845f8ed78a9a1ffe2ac8adf7cdb111473bb997e4.tar.gz
libpinyin-845f8ed78a9a1ffe2ac8adf7cdb111473bb997e4.tar.xz
update pinyin parser
-rw-r--r-- scripts/specialtable.py 6
-rw-r--r-- src/storage/pinyin_parser2.cpp 15
2 files changed, 14 insertions, 7 deletions
diff --git a/scripts/specialtable.py b/scripts/specialtable.py
index 41f9a26..89fa097 100644
--- a/scripts/specialtable.py
+++ b/scripts/specialtable.py
@@ -107,7 +107,11 @@ def gen_all_resplit():
def filter_resplit():
for (orig_first_key, orig_second_key, new_first_key, new_second_key) \
in gen_all_resplit():
- if not (new_first_key, new_second_key) in phrase_dict:
+ #do the reverse here, as libpinyin pinyin parser is different with
+ #ibus-pinyin's parser.
+ (orig_first_key, orig_second_key, new_first_key, new_second_key) = \
+ (new_first_key, new_second_key, orig_first_key, orig_second_key)
+ if (new_first_key, new_second_key) not in phrase_dict:
continue
orig_freq = 0
new_freq = phrase_dict[(new_first_key, new_second_key)]
diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp
index ee43eaf..ceea641 100644
--- a/src/storage/pinyin_parser2.cpp
+++ b/src/storage/pinyin_parser2.cpp
@@ -299,10 +299,9 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
next_sep = k;
}
- pinyin_option_t heuristic_options = options & ~PINYIN_CORRECT_ALL;
-
+#if 0
/* Heuristic Method:
- * do maximum forward match first, and without auto corrections. */
+ * do maximum forward match first. */
for (size_t pos = i; pos < next_sep; ++pos) {
curstep = &g_array_index(m_parse_steps, parse_value_t, pos);
size_t try_len = std_lite::min
@@ -317,7 +316,7 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
ChewingKey key; ChewingKeyRest rest;
bool parsed = parse_one_key
- (heuristic_options, key, onepinyin, onepinyinlen);
+ (options, key, onepinyin, onepinyinlen);
rest.m_raw_begin = pos; rest.m_raw_end = n;
if (!parsed)
@@ -343,6 +342,7 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
break;
}
}
+#endif
/* dynamic programming here. */
for (size_t m = i; m < next_sep; ++m) {
@@ -379,6 +379,9 @@ int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
if (value.m_parsed_len == nextstep->m_parsed_len &&
value.m_num_keys < nextstep->m_num_keys)
*nextstep = value;
+ if (nextstep->m_key.m_initial == CHEWING_ZERO_INITIAL &&
+ value.m_key.m_initial != CHEWING_ZERO_INITIAL)
+ *nextstep = value;
}
}
}
@@ -489,8 +492,8 @@ bool FullPinyinParser2::post_process(pinyin_option_t options,
*cur_key = item->m_new_keys[0];
*next_key = item->m_new_keys[1];
/* assumes only moved one char in gen_all_resplit script. */
- cur_rest->m_raw_end --;
- next_rest->m_raw_begin --;
+ cur_rest->m_raw_end ++;
+ next_rest->m_raw_begin ++;
}
/* save back tones */