From 915bbf4b2719d4d4829a318e92538fd10e59a395 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 9 Feb 2017 15:44:35 +0800 Subject: use nbest results in pinyin.cpp --- src/pinyin.cpp | 70 ++++++++++++++++++++++++++++++++++++++++------------------ src/pinyin.h | 7 ++++-- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/src/pinyin.cpp b/src/pinyin.cpp index 7eb8ef0..9a522bb 100644 --- a/src/pinyin.cpp +++ b/src/pinyin.cpp @@ -1167,11 +1167,17 @@ bool pinyin_phrase_segment(pinyin_instance_t * instance, /* the returned sentence should be freed by g_free(). */ bool pinyin_get_sentence(pinyin_instance_t * instance, + gint8 index, char ** sentence){ pinyin_context_t * & context = instance->m_context; + NBestMatchResults & results = instance->m_match_results; + + MatchResult result = NULL; + assert(index < instance->m_match_results.size()); + assert(results.get_result(index, result)); bool retval = pinyin::convert_to_utf8 - (context->m_phrase_index, instance->m_match_results, + (context->m_phrase_index, result, NULL, false, *sentence); return retval; @@ -1485,19 +1491,21 @@ static void _compute_frequency_of_items(pinyin_context_t * context, } } -static bool _prepend_sentence_candidate(pinyin_instance_t * instance, - CandidateVector candidates) { - /* check whether the best match candidate exists. */ - gchar * sentence = NULL; - pinyin_get_sentence(instance, &sentence); - if (NULL == sentence) +static bool _prepend_sentence_candidates(pinyin_instance_t * instance, + CandidateVector candidates) { + const size_t size = instance->m_match_results.size(); + + /* check whether the nbest match candidate exists. */ + if (0 == size) return false; - g_free(sentence); - /* prepend best match candidate to candidates. */ - lookup_candidate_t candidate; - candidate.m_candidate_type = BEST_MATCH_CANDIDATE; - g_array_prepend_val(candidates, candidate); + /* prepend nbest match candidates to candidates. */ + for (size_t i = size - 1; i >= 0; --i) { + lookup_candidate_t candidate; + candidate.m_candidate_type = NBEST_MATCH_CANDIDATE; + candidate.m_nbest_index = i; + g_array_prepend_val(candidates, candidate); + } return true; } @@ -1515,7 +1523,7 @@ static bool _compute_phrase_length(pinyin_context_t * context, (candidates, lookup_candidate_t, i); switch(candidate->m_candidate_type) { - case BEST_MATCH_CANDIDATE: + case NBEST_MATCH_CANDIDATE: assert(FALSE); case NORMAL_CANDIDATE: case PREDICTED_CANDIDATE: { @@ -1545,9 +1553,9 @@ static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance, (candidates, lookup_candidate_t, i); switch(candidate->m_candidate_type) { - case BEST_MATCH_CANDIDATE: { + case NBEST_MATCH_CANDIDATE: { gchar * sentence = NULL; - pinyin_get_sentence(instance, &sentence); + pinyin_get_sentence(instance, candidate->m_nbest_index, &sentence); candidate->m_phrase_string = sentence; break; } @@ -1618,13 +1626,27 @@ static bool _remove_duplicated_items_by_phrase_string cur_item->m_phrase_string)) { /* found duplicated candidates */ - /* keep best match candidate */ - if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) { + /* both are nbest match candidate */ + if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type && + NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { + /* keep the high possiblity one */ + if (saved_item->m_nbest_index < cur_item->m_nbest_index) { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + } else { + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + } + + continue; + } + + /* keep nbest match candidate */ + if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type) { cur_item->m_candidate_type = ZOMBIE_CANDIDATE; continue; } - if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { + if (NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { saved_item->m_candidate_type = ZOMBIE_CANDIDATE; saved_item = cur_item; continue; @@ -2018,18 +2040,22 @@ bool pinyin_lookup_tokens(pinyin_instance_t * instance, return SEARCH_OK & retval; } -bool pinyin_train(pinyin_instance_t * instance){ +bool pinyin_train(pinyin_instance_t * instance, gint8 index){ if (!instance->m_context->m_user_dir) return false; pinyin_context_t * context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; + NBestMatchResults & results = instance->m_match_results; context->m_modified = true; - bool retval = context->m_pinyin_lookup->train_result2 - (&matrix, instance->m_constraints, - instance->m_match_results); + MatchResult result = NULL; + assert(index < results.size()); + assert(results.get_result(index, result)); + + bool retval = context->m_pinyin_lookup->train_result3 + (&matrix, &(instance->m_constraints), result); return retval; } diff --git a/src/pinyin.h b/src/pinyin.h index 9c86145..bde58c7 100644 --- a/src/pinyin.h +++ b/src/pinyin.h @@ -40,7 +40,7 @@ typedef struct _import_iterator_t import_iterator_t; typedef struct _export_iterator_t export_iterator_t; typedef enum _lookup_candidate_type_t{ - BEST_MATCH_CANDIDATE = 1, + NBEST_MATCH_CANDIDATE = 1, NORMAL_CANDIDATE, ZOMBIE_CANDIDATE, PREDICTED_CANDIDATE, @@ -351,6 +351,7 @@ bool pinyin_phrase_segment(pinyin_instance_t * instance, /** * pinyin_get_sentence: * @instance: the pinyin instance. + * @index: the index of the nbest result. * @sentence: the saved sentence in the instance. * @returns: whether the sentence is already saved in the instance. * @@ -360,6 +361,7 @@ bool pinyin_phrase_segment(pinyin_instance_t * instance, * */ bool pinyin_get_sentence(pinyin_instance_t * instance, + gint8 index, char ** sentence); /** @@ -544,12 +546,13 @@ bool pinyin_lookup_tokens(pinyin_instance_t * instance, /** * pinyin_train: * @instance: the pinyin instance. + * @index: the index of the nbest result. * @returns: whether the sentence is trained. * * Train the current user input sentence. * */ -bool pinyin_train(pinyin_instance_t * instance); +bool pinyin_train(pinyin_instance_t * instance, gint8 index); /** * pinyin_reset: -- cgit