diff options
author | Peng Wu <alexepico@gmail.com> | 2022-09-20 17:24:23 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2022-09-21 15:10:34 +0800 |
commit | 698c01539cc8636dd79bd12a49e2d9a960209f98 (patch) | |
tree | 6b469c2eddee8f4802ce8ea0546e63c9fc4830fb | |
parent | baedddb15b41d7d6dbe1030195c0b343159a3b6c (diff) | |
download | libpinyin-698c01539cc8636dd79bd12a49e2d9a960209f98.tar.gz libpinyin-698c01539cc8636dd79bd12a49e2d9a960209f98.tar.xz libpinyin-698c01539cc8636dd79bd12a49e2d9a960209f98.zip |
Support longer candidate
-rw-r--r-- | src/pinyin.cpp | 86 | ||||
-rw-r--r-- | src/pinyin.h | 1 | ||||
-rw-r--r-- | src/storage/chewing_large_table2_kyotodb.cpp | 2 |
3 files changed, 86 insertions, 3 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp index 59e424b..68e237c 100644 --- a/src/pinyin.cpp +++ b/src/pinyin.cpp @@ -1662,6 +1662,70 @@ static void _compute_frequency_of_items(pinyin_context_t * context, } } +static bool _prepend_longer_candidates(pinyin_instance_t * instance, + CandidateVector candidates) { + + pinyin_context_t * & context = instance->m_context; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + PhoneticKeyMatrix & matrix = instance->m_matrix; + size_t prefix_len = instance->m_parsed_key_len; + + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(tokens)); + phrase_index->prepare_tokens(tokens); + int result = search_suggestion_with_matrix + (context->m_pinyin_table, &matrix, prefix_len, tokens); + int num = reduce_tokens(tokens, tokenarray, false); + phrase_index->destroy_tokens(tokens); + + phrase_token_t longer_token = null_token; + PhraseItem longer_item, item; + for (int i = 0; i < tokenarray->len; ++i) { + phrase_token_t token = g_array_index(tokenarray, phrase_token_t, i); + + if (ERROR_OK != phrase_index->get_phrase_item(token, item)) + continue; + + /* skip the phrase longer than prefix_len * 2 + 1 */ + if (item.get_phrase_length() > (prefix_len * 2 + 1)) + continue; + + if (longer_token == null_token) { + longer_token = token; + phrase_index->get_phrase_item(longer_token, longer_item); + continue; + } + + if (item.get_unigram_frequency() > + longer_item.get_unigram_frequency()) { + longer_token = token; + phrase_index->get_phrase_item(longer_token, longer_item); + } + } + + if (longer_token == null_token) + return false; + + /* compute the unigram frequency. */ + gfloat lambda = context->m_system_table_info.get_lambda(); + guint32 total_freq = phrase_index->get_phrase_index_total_freq(); + guint32 freq = ((1 - lambda) * + longer_item.get_unigram_frequency() / + (gfloat) total_freq) * 256 * 256 * 256; + + /* prepend longer candidate to candidates. 
*/ + lookup_candidate_t candidate; + candidate.m_candidate_type = LONGER_CANDIDATE; + candidate.m_token = longer_token; + candidate.m_freq = freq; + g_array_prepend_val(candidates, candidate); + + g_array_free(tokenarray, TRUE); + return true; +} + static bool _prepend_sentence_candidates(pinyin_instance_t * instance, CandidateVector candidates) { const size_t size = instance->m_nbest_results.size(); @@ -1737,6 +1801,7 @@ static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance, break; } case NORMAL_CANDIDATE: + case LONGER_CANDIDATE: case PREDICTED_BIGRAM_CANDIDATE: _token_get_phrase (instance->m_context->m_phrase_index, @@ -1781,8 +1846,7 @@ static gint compare_indexed_item_with_phrase_string(gconstpointer lhs, static bool _remove_duplicated_items_by_phrase_string -(pinyin_instance_t * instance, - CandidateVector candidates) { +(pinyin_instance_t * instance, CandidateVector candidates) { size_t i; /* create the GArray of indexed item */ GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t)); @@ -1809,6 +1873,22 @@ static bool _remove_duplicated_items_by_phrase_string cur_item->m_phrase_string)) { /* found duplicated candidates */ + /* as the longer candidates are longer than the pinyin input, then only longer candidates can be equal. 
*/ + + if (LONGER_CANDIDATE == saved_item->m_candidate_type && + LONGER_CANDIDATE == cur_item->m_candidate_type) { + /* keep the high possibility one */ + if (saved_item->m_freq < cur_item->m_freq) { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + } else { + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + } + + continue; + } + /* both are nbest match candidate */ if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type && NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { @@ -2002,6 +2082,8 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance, /* post process to remove duplicated candidates */ + _prepend_longer_candidates(instance, instance->m_candidates); + _prepend_sentence_candidates(instance, instance->m_candidates); _compute_phrase_strings_of_items(instance, instance->m_candidates); diff --git a/src/pinyin.h b/src/pinyin.h index 4b631fe..cc1a5cb 100644 --- a/src/pinyin.h +++ b/src/pinyin.h @@ -46,6 +46,7 @@ typedef enum _lookup_candidate_type_t{ PREDICTED_BIGRAM_CANDIDATE, PREDICTED_PREFIX_CANDIDATE, ADDON_CANDIDATE, + LONGER_CANDIDATE, } lookup_candidate_type_t; typedef enum _sort_option_t{ diff --git a/src/storage/chewing_large_table2_kyotodb.cpp b/src/storage/chewing_large_table2_kyotodb.cpp index b05f100..d5aeeb9 100644 --- a/src/storage/chewing_large_table2_kyotodb.cpp +++ b/src/storage/chewing_large_table2_kyotodb.cpp @@ -228,7 +228,7 @@ int ChewingLargeTable2::search_suggestion_internal entry->m_chunk.set_chunk(chunk.begin(), chunk.size(), NULL); - result = entry->search(prefix_keys, tokens) | result; + result = entry->search_suggestion(prefix_len, prefix_keys, tokens) | result; entry->m_chunk.set_size(0); |