summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2012-07-02 11:35:57 +0800
committerPeng Wu <alexepico@gmail.com>2012-07-02 11:35:57 +0800
commit0efcc8db282f836442907091fd24d9d2f844216a (patch)
tree89310fa6ad5caad37cc54db75332fe34757a7deb
parentcb66df1cc752cddcf35ac3e8a00264b0df05d410 (diff)
downloadlibpinyin-0efcc8db282f836442907091fd24d9d2f844216a.tar.gz
libpinyin-0efcc8db282f836442907091fd24d9d2f844216a.tar.xz
libpinyin-0efcc8db282f836442907091fd24d9d2f844216a.zip
write _remove_duplicated_items_by_phrase_string
-rw-r--r--src/pinyin.cpp89
1 files changed, 87 insertions, 2 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 904fe61..d021e21 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -911,8 +911,8 @@ static bool _prepend_sentence_candidate(CandidateVector candidates) {
return true;
}
-static bool _compute_strings_of_items(pinyin_instance_t * instance,
- CandidateVector candidates) {
+static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
+ CandidateVector candidates) {
/* populate m_phrase_string in lookup_candidate_t. */
for(size_t i = 0; i < candidates->len; ++i) {
@@ -937,6 +937,91 @@ static bool _compute_strings_of_items(pinyin_instance_t * instance,
return true;
}
+static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
+ gconstpointer rhs,
+ gpointer userdata) {
+ size_t index_lhs = *((size_t *) lhs);
+ size_t index_rhs = *((size_t *) rhs);
+ CandidateVector candidates = (CandidateVector) userdata;
+
+ lookup_candidate_t * candidate_lhs =
+ &g_array_index(candidates, lookup_candidate_t, index_lhs);
+ lookup_candidate_t * candidate_rhs =
+ &g_array_index(candidates, lookup_candidate_t, index_rhs);
+
+ return -strcmp(candidate_lhs->m_phrase_string,
+ candidate_rhs->m_phrase_string); /* in descendant order */
+}
+
+
+static bool _remove_duplicated_items_by_phrase_string
+(pinyin_instance_t * instance,
+ CandidateVector candidates) {
+ size_t i;
+ /* create the GArray of indexed item */
+ GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
+ for (i = 0; i < candidates->len; ++i)
+ g_array_append_val(indices, i);
+
+ /* sort the indices array by phrase array */
+ g_array_sort_with_data
+ (indices, compare_indexed_item_with_phrase_string, candidates);
+
+ /* mark duplicated items as zombie candidate */
+ lookup_candidate_t * cur_item, * saved_item = NULL;
+ for (i = 0; i < candidates->len; ++i) {
+ cur_item = &g_array_index(candidates, lookup_candidate_t, i);
+ if (saved_item) {
+ if (0 == strcmp(saved_item->m_phrase_string,
+ cur_item->m_phrase_string)) {
+ /* found duplicated candidates */
+
+ /* keep best match candidate */
+ if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) {
+ cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ continue;
+ }
+
+ if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
+ saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ saved_item = cur_item;
+ continue;
+ }
+
+ /* keep the higher possiblity one
+ to quickly move the word forward in the candidate list */
+ if (cur_item->m_freq > saved_item->m_freq) {
+ /* find better candidate */
+ saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ saved_item = cur_item;
+ continue;
+ } else {
+ cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
+ continue;
+ }
+ } else {
+ /* keep the current candidate */
+ saved_item = cur_item;
+ }
+ }
+ }
+
+ g_array_free(indices, TRUE);
+
+ /* remove zombie candidate from the returned candidates */
+ for (i = 0; i < candidates->len; ++i) {
+ lookup_candidate_t * candidate = &g_array_index
+ (candidates, lookup_candidate_t, i);
+
+ if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) {
+ g_array_remove_index(candidates, i);
+ i--;
+ }
+ }
+
+ return true;
+}
+
bool pinyin_get_candidates(pinyin_instance_t * instance,
size_t offset,
CandidateVector candidates) {