diff options
-rw-r--r-- | src/lookup/phonetic_lookup.cpp | 85 | ||||
-rw-r--r-- | src/lookup/phonetic_lookup.h | 66 | ||||
-rw-r--r-- | src/lookup/phonetic_lookup_heap.h | 8 | ||||
-rw-r--r-- | src/lookup/phonetic_lookup_linear.h | 8 |
4 files changed, 164 insertions, 3 deletions
diff --git a/src/lookup/phonetic_lookup.cpp b/src/lookup/phonetic_lookup.cpp
index ef64338..4ea8b72 100644
--- a/src/lookup/phonetic_lookup.cpp
+++ b/src/lookup/phonetic_lookup.cpp
@@ -22,9 +22,92 @@
 
 namespace pinyin{
 
+/* internal definition */
+/* Upper bound on how many results get_top_results() keeps. */
+static const size_t nbeam = 32;
+
+/* Debug helper: print the largest m_poss found in values.
+ * Returns false, printing nothing, when values is empty. */
+bool dump_max_value(GPtrArray * values){
+    if (0 == values->len)
+        return false;
+
+    const trellis_value_t * max =
+        (const trellis_value_t *) g_ptr_array_index(values, 0);
+
+    for (size_t i = 1; i < values->len; ++i) {
+        const trellis_value_t * cur =
+            (const trellis_value_t *) g_ptr_array_index(values, i);
+
+        if (cur->m_poss > max->m_poss)
+            max = cur;
+    }
+
+    printf("max value: %f\n", max->m_poss);
+
+    return true;
+}
+
+/* Debug helper: print the m_poss of every entry in values on one line.
+ * Returns false, printing nothing, when values is empty. */
+bool dump_all_values(GPtrArray * values) {
+    if (0 == values->len)
+        return false;
+
+    printf("values:");
+    for (size_t i = 0; i < values->len; ++i) {
+        const trellis_value_t * cur =
+            (const trellis_value_t *) g_ptr_array_index(values, i);
+
+        printf("%f\t", cur->m_poss);
+    }
+    printf("\n");
+
+    return true;
+}
+
+/* Strict ordering by m_poss; passed as the comparison to the heap
+ * routines in get_top_results(). */
+static bool trellis_value_less_than(trellis_value_t * lhs,
+                                    trellis_value_t * rhs){
+    return lhs->m_poss < rhs->m_poss;
+}
+
 /* use maximum heap to get the topest results. */
 static bool get_top_results(/* out */ GPtrArray * topresults,
-                            /* in */ GPtrArray * candidates);
+                            /* in */ GPtrArray * candidates) {
+    /* topresults is emptied first; candidates is reordered in place.
+     * Returns false when there are no candidates. */
+    g_ptr_array_set_size(topresults, 0);
+
+    if (0 == candidates->len)
+        return false;
+
+    /* treat the pointer array storage as the range [begin, end). */
+    trellis_value_t ** begin =
+        (trellis_value_t **) &g_ptr_array_index(candidates, 0);
+    trellis_value_t ** end =
+        (trellis_value_t **) &g_ptr_array_index(candidates, candidates->len);
+
+    std_lite::make_heap(begin, end, trellis_value_less_than);
+
+    while (end != begin) {
+        /* take the element currently at the front of the heap. */
+        trellis_value_t * one = *begin;
+        g_ptr_array_add(topresults, one);
+
+        std_lite::pop_heap(begin, end, trellis_value_less_than);
+        --end;
+
+        /* stop once nbeam results were collected. */
+        if (topresults->len >= nbeam)
+            break;
+    }
+
+    /* dump_all_values(topresults); */
+
+    return true;
+}
 
 
 int ForwardPhoneticConstraints::add_constraint(size_t start, size_t end,
diff --git a/src/lookup/phonetic_lookup.h b/src/lookup/phonetic_lookup.h
index 65602c7..e25f3da 100644
--- a/src/lookup/phonetic_lookup.h
+++ b/src/lookup/phonetic_lookup.h
@@ -151,10 +151,72 @@ public:
     }
 
     /* Array of phrase_token_t */
-    bool fill_prefixes(/* in */ TokenVector prefixes);
+    bool fill_prefixes(/* in */ TokenVector prefixes) {
+        /* Appends one node per prefix token to step 0 and indexes
+         * each node by its token.  prefixes must not be empty. */
+        assert(prefixes->len > 0);
+
+        for (size_t i = 0; i < prefixes->len; ++i) {
+            phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
+            lookup_key_t initial_key = token;
+            trellis_value_t initial_value(log(1.f));
+            initial_value.m_handles[1] = token;
+
+            trellis_node<nbest> initial_node;
+            /* keep the call outside assert(): NDEBUG would drop it. */
+            bool stored = initial_node.eval_item(&initial_value);
+            assert(stored);
+            (void) stored;
+
+            LookupStepContent initial_step_content = (LookupStepContent)
+                g_ptr_array_index(m_steps_content, 0);
+            initial_step_content = g_array_append_val
+                (initial_step_content, initial_node);
+
+            LookupStepIndex initial_step_index = (LookupStepIndex)
+                g_ptr_array_index(m_steps_index, 0);
+            /* map the token to the node's position in step 0. */
+            g_hash_table_insert(initial_step_index,
+                                GUINT_TO_POINTER(initial_key),
+                                GUINT_TO_POINTER(initial_step_content->len - 1));
+        }
+
+        return true;
+    }
+
     /* Array of trellis_value_t */
     bool get_candidates(/* in */ gint32 index,
-                        /* out */ GArray * candidates) const;
+                        /* out */ GPtrArray * candidates) const {
+        /* Collects pointers to every value stored in the nodes of step
+         * index.  candidates is emptied first; returns false when the
+         * step has no nodes. */
+        LookupStepContent step = (LookupStepContent)
+            g_ptr_array_index(m_steps_content, index);
+
+        g_ptr_array_set_size(candidates, 0);
+
+        if (0 == step->len)
+            return false;
+
+        for (size_t i = 0; i < step->len; ++i) {
+            trellis_node<nbest> * node = &g_array_index
+                (step, trellis_node<nbest>, i);
+
+            // only initialized in the get_candidates method.
+            node->number();
+
+            /* add each stored value, not the first one repeatedly. */
+            const trellis_value_t * value = node->begin();
+            for (size_t j = 0; j < node->length(); ++j) {
+                g_ptr_array_add(candidates, (gpointer) (value + j));
+            }
+        }
+
+        /* dump_max_value(candidates); */
+
+        return true;
+    }
+
     /* insert candidate */
     bool insert_candidate(gint32 index, phrase_token_t token,
                           const trellis_value_t * candidate);
diff --git a/src/lookup/phonetic_lookup_heap.h b/src/lookup/phonetic_lookup_heap.h
index 8af1767..94f97d1 100644
--- a/src/lookup/phonetic_lookup_heap.h
+++ b/src/lookup/phonetic_lookup_heap.h
@@ -45,6 +45,14 @@ public:
     const trellis_value_t * begin() { return m_elements; }
     const trellis_value_t * end() { return m_elements + m_nelem; }
 
+    /* record each stored element's position in its m_current_index. */
+    bool number() {
+        for (ssize_t i = 0; i < m_nelem; ++i)
+            m_elements[i].m_current_index = i;
+        /* a bool function must return; falling off the end is undefined. */
+        return true;
+    }
+
     /* return true if the item is stored into m_elements. */
     bool eval_item(const trellis_value_t * item) {
         /* min heap here, and always push heap. */
diff --git a/src/lookup/phonetic_lookup_linear.h b/src/lookup/phonetic_lookup_linear.h
index fadc2bc..7abca66 100644
--- a/src/lookup/phonetic_lookup_linear.h
+++ b/src/lookup/phonetic_lookup_linear.h
@@ -38,6 +38,14 @@ public:
     const trellis_value_t * begin() { return m_elements; }
     const trellis_value_t * end() { return m_elements + m_nelem; }
 
+    /* record each stored element's position in its m_current_index. */
+    bool number() {
+        for (ssize_t i = 0; i < m_nelem; ++i)
+            m_elements[i].m_current_index = i;
+        /* a bool function must return; falling off the end is undefined. */
+        return true;
+    }
+
     /* return true if the item is stored into m_elements. */
     bool eval_item(const trellis_value_t * item) {
         /* still have space */