summary | refs | log | tree | commit | diff | stats
path: root/src/storage/pinyin_parser2.cpp
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-11-16 17:43:32 +0800
committer Peng Wu <alexepico@gmail.com> 2011-11-16 17:43:32 +0800
commit 7c2c087c4ab0cd31c12bf686fe754c0c1064847a (patch)
tree 6d80a4ce275588c8796a4844c1b9093333653014 /src/storage/pinyin_parser2.cpp
parent 479745510f0f8bd99dec45a0822d77421129b976 (diff)
download libpinyin-7c2c087c4ab0cd31c12bf686fe754c0c1064847a.tar.gz
libpinyin-7c2c087c4ab0cd31c12bf686fe754c0c1064847a.tar.xz
libpinyin-7c2c087c4ab0cd31c12bf686fe754c0c1064847a.zip
begin to write re-split post processing
Diffstat (limited to 'src/storage/pinyin_parser2.cpp')
-rw-r--r-- src/storage/pinyin_parser2.cpp 56
1 files changed, 55 insertions, 1 deletions
diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp
index 9fd85cb..e1fb1b0 100644
--- a/src/storage/pinyin_parser2.cpp
+++ b/src/storage/pinyin_parser2.cpp
@@ -268,9 +268,63 @@ int FullPinyinParser2::parse (guint32 options, ChewingKeyVector & keys,
/* final step for back tracing. */
gint16 parsed_len = final_step(step_len, keys, key_rests);
+ assert(keys->len == key_rests->len);
+ gint16 num_keys = keys->len;
/* post processing for re-split table. */
if (options & USE_RESPLIT_TABLE) {
+
+ ChewingKey * cur_key = NULL, * next_key = NULL;
+ ChewingKeyRest * cur_rest = NULL, * next_rest = NULL;
+ guint16 cur_tone = CHEWING_ZERO_TONE, next_tone = CHEWING_ZERO_TONE;
+
+ for (i = 0; i < num_keys - 1; ++i) {
+ cur_rest = &g_array_index(key_rests, ChewingKeyRest, i);
+ next_rest = &g_array_index(key_rests, ChewingKeyRest, i + 1);
+
+ /* some "'" here */
+ if (cur_rest->m_raw_end != next_rest->m_raw_begin)
+ continue;
+
+ cur_key = &g_array_index(keys, ChewingKey, i);
+ next_key = &g_array_index(keys, ChewingKey, i + 1);
+
+ if (options & USE_TONE) {
+ cur_tone = cur_key->m_tone;
+ next_tone = next_key->m_tone;
+ cur_key->m_tone = next_key->m_tone = CHEWING_ZERO_TONE;
+ }
+
+ /* lookup re-split table */
+ size_t k;
+ resplit_table_item_t * item = NULL;
+ for (k = 0; k < G_N_ELEMENTS(resplit_table); ++k) {
+ item = resplit_table + k;
+ /* no ops */
+ if (item->m_orig_freq >= item->m_new_freq)
+ continue;
+ /* TODO: refine code style here. */
+ if (item->m_orig_first_key == *cur_key &&
+ item->m_orig_second_key == *next_key)
+ break;
+ /* TODO: should use pinyin_exact_compare2 here. */
+ assert(FALSE);
+ }
+ if (k < G_N_ELEMENTS(resplit_table)) {
+ /* do re-split */
+ item = resplit_table + k;
+ *cur_key = item->m_new_first_key;
+ *next_key = item->m_new_second_key;
+ /* assumes only moved one char in gen_all_resplit script. */
+ cur_rest->m_raw_end --;
+ next_rest->m_raw_begin --;
+ /* save back tones */
+ if (options & USE_TONE) {
+ cur_key->m_tone = cur_tone;
+ next_key->m_tone = next_tone;
+ }
+ }
+ }
}
@@ -285,7 +339,7 @@ int FullPinyinParser2::final_step(size_t step_len, ChewingKeyVector & keys,
parse_value_t * curstep = NULL;
/* find longest match, which starts from the beginning of input. */
- for ( i = step_len - 1; i >= 0; --i) {
+ for (i = step_len - 1; i >= 0; --i) {
curstep = &g_array_index(m_parse_steps, parse_value_t, i);
if (i == curstep->m_parsed_len)
break;