summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2013-11-19 12:11:51 +0800
committer: Peng Wu <alexepico@gmail.com> 2013-11-29 15:09:14 +0800
commit: 8e4f5fb7bcbdf3a67a4572231d757f0c6eb9ce6e (patch)
tree: 7e236b24159eb53ee0bc8de9f3933d1d3d4616b9
parent: 75b9fa26f59f44567fb159bc74dbc3cee5914182 (diff)
download: libpinyin-8e4f5fb7bcbdf3a67a4572231d757f0c6eb9ce6e.tar.gz
libpinyin-8e4f5fb7bcbdf3a67a4572231d757f0c6eb9ce6e.tar.xz
libpinyin-8e4f5fb7bcbdf3a67a4572231d757f0c6eb9ce6e.zip
write pinyin_guess_predicted_candidates
-rw-r--r-- src/pinyin.cpp 70
-rw-r--r-- src/pinyin.h 3
2 files changed, 72 insertions, 1 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 7adbca6..57057b6 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -1171,6 +1171,7 @@ static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
case NORMAL_CANDIDATE:
case DIVIDED_CANDIDATE:
case RESPLIT_CANDIDATE:
+ case PREDICTED_CANDIDATE:
pinyin_token_get_phrase
(instance, candidate->m_token, NULL,
&(candidate->m_phrase_string));
@@ -1758,6 +1759,75 @@ bool pinyin_guess_full_pinyin_candidates(pinyin_instance_t * instance,
return true;
}
+/**
+ * pinyin_guess_predicted_candidates:
+ * @instance: the pinyin instance whose candidate list is rebuilt.
+ * @prefix: the prefix text, handed to _compute_prefixes().
+ *
+ * Rebuild instance->m_candidates with PREDICTED_CANDIDATE entries taken
+ * from the merged system and user bigrams of the previous token.
+ * Candidates are emitted in groups of equal phrase length, the longest
+ * group first; each group is sorted with compare_item_with_frequency.
+ * Candidates with duplicated phrase strings are removed afterwards.
+ *
+ * Returns: always true.
+ */
+bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
+                                       const char * prefix) {
+    pinyin_context_t * & context = instance->m_context;
+    FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+
+    _free_candidates(instance->m_candidates);
+
+    _compute_prefixes(instance, prefix);
+
+    phrase_token_t prev_token = _get_previous_token(instance, 0);
+
+    /* merge the system and user bigrams of the previous token;
+       without a previous token the merged gram stays empty. */
+    SingleGram merged_gram;
+    SingleGram * system_gram = NULL, * user_gram = NULL;
+    if (null_token != prev_token) {
+        context->m_system_bigram->load(prev_token, system_gram);
+        context->m_user_bigram->load(prev_token, user_gram);
+        merge_single_gram(&merged_gram, system_gram, user_gram);
+    }
+
+    /* scratch array holding the candidates of a single phrase length. */
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
+
+    /* retrieve all items. */
+    BigramPhraseWithCountArray tokens = g_array_new
+        (FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+    merged_gram.retrieve_all(tokens);
+
+    /* sort the longer word first. */
+    PhraseItem cached_item;
+    for (size_t len = MAX_PHRASE_LENGTH; len > 0; --len) {
+        /* start each length group empty, otherwise the longer groups
+           would be sorted and appended to the candidates again. */
+        g_array_set_size(items, 0);
+
+        /* append items. */
+        for (size_t k = 0; k < tokens->len; ++k) {
+            /* the array stores BigramPhraseItemWithCount records, so it
+               must be indexed with that type to get the right stride. */
+            BigramPhraseItemWithCount * phrase_item = &g_array_index
+                (tokens, BigramPhraseItemWithCount, k);
+            phrase_token_t token = phrase_item->m_token;
+
+            /* NOTE(review): the result of get_phrase_item is not checked;
+               on failure cached_item may keep the previous phrase. */
+            phrase_index->get_phrase_item(token, cached_item);
+            if (len != cached_item.get_phrase_length())
+                continue;
+
+            lookup_candidate_t item;
+            item.m_candidate_type = PREDICTED_CANDIDATE;
+            item.m_token = token;
+            g_array_append_val(items, item);
+        }
+
+        _compute_frequency_of_items(context, prev_token, &merged_gram, items);
+
+        /* sort the candidates of the same length by frequency. */
+        g_array_sort(items, compare_item_with_frequency);
+
+        /* transfer back items to tokens, and save it into candidates */
+        for (size_t k = 0; k < items->len; ++k) {
+            lookup_candidate_t * item = &g_array_index
+                (items, lookup_candidate_t, k);
+            g_array_append_val(instance->m_candidates, *item);
+        }
+    }
+
+    g_array_free(items, TRUE);
+    /* the retrieved bigram items are no longer needed. */
+    g_array_free(tokens, TRUE);
+
+    /* deleting a NULL pointer is a no-op, so no guard is required. */
+    delete system_gram;
+    delete user_gram;
+
+    /* post process to remove duplicated candidates */
+    _compute_phrase_strings_of_items(instance, 0, instance->m_candidates);
+
+    _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates);
+
+    return true;
+}
int pinyin_choose_candidate(pinyin_instance_t * instance,
size_t offset,
diff --git a/src/pinyin.h b/src/pinyin.h
index 8c39c3d..7163435 100644
--- a/src/pinyin.h
+++ b/src/pinyin.h
@@ -44,7 +44,8 @@ typedef enum _lookup_candidate_type_t{
NORMAL_CANDIDATE,
DIVIDED_CANDIDATE,
RESPLIT_CANDIDATE,
- ZOMBIE_CANDIDATE
+ ZOMBIE_CANDIDATE,
+ PREDICTED_CANDIDATE
} lookup_candidate_type_t;
/**