summaryrefslogtreecommitdiffstats
path: root/src/lookup
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2017-01-17 16:57:15 +0800
committer Peng Wu <alexepico@gmail.com> 2017-01-17 16:57:15 +0800
commit acdefd9af9188ba0de901ca3d3670850d989ced4 (patch)
tree fbf992f6802e830cd8798f8b30f5df8b0d4b9b6b /src/lookup
parent 53948ad9daa5efd0da6e2e01f8e0e4545283fc1e (diff)
download libpinyin-acdefd9af9188ba0de901ca3d3670850d989ced4.tar.gz
libpinyin-acdefd9af9188ba0de901ca3d3670850d989ced4.tar.xz
libpinyin-acdefd9af9188ba0de901ca3d3670850d989ced4.zip
write class ForwardPhoneticTrellis in progress
Diffstat (limited to 'src/lookup')
-rw-r--r-- src/lookup/phonetic_lookup.cpp 73
-rw-r--r-- src/lookup/phonetic_lookup.h 56
-rw-r--r-- src/lookup/phonetic_lookup_heap.h 5
-rw-r--r-- src/lookup/phonetic_lookup_linear.h 5
4 files changed, 136 insertions, 3 deletions
diff --git a/src/lookup/phonetic_lookup.cpp b/src/lookup/phonetic_lookup.cpp
index ef64338..4ea8b72 100644
--- a/src/lookup/phonetic_lookup.cpp
+++ b/src/lookup/phonetic_lookup.cpp
@@ -22,9 +22,80 @@
namespace pinyin{
+/* internal definition: nbeam caps how many entries get_top_results() keeps. */
+static const size_t nbeam = 32;
+
+/* Debug helper: print the largest m_poss among the trellis_value_t
+ * pointers held in values. Prints nothing and returns false when the
+ * array is empty; returns true otherwise. */
+bool dump_max_value(GPtrArray * values){
+    const size_t count = values->len;
+    if (count == 0)
+        return false;
+
+    /* start with the first entry, then keep whichever entry is larger. */
+    const trellis_value_t * best =
+        (const trellis_value_t *) g_ptr_array_index(values, 0);
+
+    size_t pos = 1;
+    while (pos < count) {
+        const trellis_value_t * item =
+            (const trellis_value_t *) g_ptr_array_index(values, pos);
+        ++pos;
+
+        /* strictly greater only: on a tie the earlier entry stays. */
+        best = (item->m_poss > best->m_poss) ? item : best;
+    }
+
+    printf("max value: %f\n", best->m_poss);
+    return true;
+}
+
+/* Debug helper: print "values:" followed by the m_poss of every
+ * trellis_value_t pointer in values, each followed by a tab, then a
+ * newline. Prints nothing and returns false when the array is empty. */
+bool dump_all_values(GPtrArray * values) {
+    if (values->len == 0)
+        return false;
+
+    printf("values:");
+
+    /* walk the pointer storage directly instead of indexing. */
+    gpointer * slot = values->pdata;
+    gpointer * const stop = slot + values->len;
+    for (; slot != stop; ++slot) {
+        const trellis_value_t * item = (const trellis_value_t *) *slot;
+        printf("%f\t", item->m_poss);
+    }
+
+    printf("\n");
+    return true;
+}
+
+/* Ordering predicate for the heap routines: true when the m_poss of lhs
+ * is smaller than the m_poss of rhs. */
+static bool trellis_value_less_than(trellis_value_t * lhs,
+                                    trellis_value_t * rhs){
+    const bool lhs_is_smaller = rhs->m_poss > lhs->m_poss;
+    return lhs_is_smaller;
+}
+
/* use a maximum heap to collect the top results. */
static bool get_top_results(/* out */ GPtrArray * topresults,
- /* in */ GPtrArray * candidates);
+ /* in */ GPtrArray * candidates) {
+    /* topresults is emptied first and then receives up to nbeam pointers
+     * taken from candidates, best m_poss first (assuming std_lite mirrors
+     * the std:: heap algorithms). The pointer order inside candidates is
+     * rearranged by the heap operations. Returns false when candidates
+     * is empty, true otherwise. */
+    g_ptr_array_set_size(topresults, 0);
+
+    const guint total = candidates->len;
+    if (total == 0)
+        return false;
+
+    /* view the pointer storage of candidates as the range [first, last). */
+    trellis_value_t ** first =
+        (trellis_value_t **) &g_ptr_array_index(candidates, 0);
+    trellis_value_t ** last = first + total;
+
+    std_lite::make_heap(first, last, trellis_value_less_than);
+
+    /* the heap root is the best remaining value: record it, then let
+     * pop_heap move it behind the shrinking range. */
+    while (last != first && topresults->len < nbeam) {
+        g_ptr_array_add(topresults, *first);
+        std_lite::pop_heap(first, last, trellis_value_less_than);
+        --last;
+    }
+
+    /* dump_all_values(topresults); */
+
+    return true;
+}
int ForwardPhoneticConstraints::add_constraint(size_t start, size_t end,
diff --git a/src/lookup/phonetic_lookup.h b/src/lookup/phonetic_lookup.h
index 65602c7..e25f3da 100644
--- a/src/lookup/phonetic_lookup.h
+++ b/src/lookup/phonetic_lookup.h
@@ -151,10 +151,62 @@ public:
}
/* Array of phrase_token_t */
- bool fill_prefixes(/* in */ TokenVector prefixes);
+ /* Seed step 0 with one trellis_node per prefix token. Each node holds a
+  * single value built from log(1.f) whose m_handles[1] is the token; the
+  * node is appended to the step-0 content array and its position there is
+  * recorded in the step-0 index under the token as key.
+  * prefixes must not be empty (asserted). Always returns true. */
+ bool fill_prefixes(/* in */ TokenVector prefixes) {
+     assert(prefixes->len > 0);
+
+     /* both step-0 containers are the same for every prefix: fetch once. */
+     LookupStepContent initial_step_content = (LookupStepContent)
+         g_ptr_array_index(m_steps_content, 0);
+     /* NOTE(review): was `steps_index`; assumes the member is named
+      * m_steps_index like its sibling m_steps_content -- confirm against
+      * the class declaration. */
+     LookupStepIndex initial_step_index = (LookupStepIndex)
+         g_ptr_array_index(m_steps_index, 0);
+
+     for (size_t i = 0; i < prefixes->len; ++i) {
+         phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
+         lookup_key_t initial_key = token;
+         trellis_value_t initial_value(log(1.f));
+         initial_value.m_handles[1] = token;
+
+         trellis_node<nbest> initial_node;
+         /* eval_item() must run outside assert(): with NDEBUG defined the
+          * assert expression is not evaluated and the node would stay empty. */
+         const bool stored = initial_node.eval_item(&initial_value);
+         assert(stored);
+         (void) stored; /* silence the unused warning in NDEBUG builds */
+
+         initial_step_content = g_array_append_val
+             (initial_step_content, initial_node);
+
+         /* map the token to the slot the node was just appended to. */
+         g_hash_table_insert(initial_step_index,
+                             GUINT_TO_POINTER(initial_key),
+                             GUINT_TO_POINTER(initial_step_content->len - 1));
+     }
+
+     return true;
+ }
+
+
/* Array of trellis_value_t */
bool get_candidates(/* in */ gint32 index,
- /* out */ GArray * candidates) const;
+ /* out */ GPtrArray * candidates) const {
+     /* Collect a pointer to every trellis_value_t stored in the nodes of
+      * step `index` into candidates, which is emptied first. Returns false
+      * when the step holds no nodes, true otherwise.
+      * candidates is a GPtrArray because it is resized and filled with the
+      * g_ptr_array_* calls below, which do not accept a GArray. */
+     LookupStepContent step = (LookupStepContent)
+         g_ptr_array_index(m_steps_content, index);
+
+     g_ptr_array_set_size(candidates, 0);
+
+     if (0 == step->len)
+         return false;
+
+     for (size_t i = 0; i < step->len; ++i) {
+         trellis_node<nbest> * node = &g_array_index
+             (step, trellis_node<nbest>, i);
+
+         // only initialized in the get_candidates method.
+         node->number();
+
+         /* add every stored value (value + j), not the first value
+          * length() times; the cast drops const because GPtrArray
+          * stores plain gpointer. */
+         const trellis_value_t * value = node->begin();
+         for (size_t j = 0; j < node->length(); ++j) {
+             g_ptr_array_add(candidates, (gpointer) (value + j));
+         }
+     }
+
+     /* dump_max_value(candidates); */
+
+     return true;
+ }
+
+
/* insert candidate */
bool insert_candidate(gint32 index, phrase_token_t token,
const trellis_value_t * candidate);
diff --git a/src/lookup/phonetic_lookup_heap.h b/src/lookup/phonetic_lookup_heap.h
index 8af1767..94f97d1 100644
--- a/src/lookup/phonetic_lookup_heap.h
+++ b/src/lookup/phonetic_lookup_heap.h
@@ -45,6 +45,11 @@ public:
const trellis_value_t * begin() { return m_elements; } /* start of the stored elements */
const trellis_value_t * end() { return m_elements + m_nelem; } /* m_nelem elements past begin() */
+ /* Write each stored element's own position into its m_current_index.
+  * Always returns true. */
+ bool number() {
+     for (ssize_t i = 0; i < m_nelem; ++i)
+         m_elements[i].m_current_index = i;
+     /* declared bool: flowing off the end without a return is undefined
+      * behaviour, so report success explicitly. */
+     return true;
+ }
+
/* return true if the item is stored into m_elements. */
bool eval_item(const trellis_value_t * item) {
/* min heap here, and always push heap. */
diff --git a/src/lookup/phonetic_lookup_linear.h b/src/lookup/phonetic_lookup_linear.h
index fadc2bc..7abca66 100644
--- a/src/lookup/phonetic_lookup_linear.h
+++ b/src/lookup/phonetic_lookup_linear.h
@@ -38,6 +38,11 @@ public:
const trellis_value_t * begin() { return m_elements; } /* start of the stored elements */
const trellis_value_t * end() { return m_elements + m_nelem; } /* m_nelem elements past begin() */
+ /* Write each stored element's own position into its m_current_index.
+  * Always returns true. */
+ bool number() {
+     for (ssize_t i = 0; i < m_nelem; ++i)
+         m_elements[i].m_current_index = i;
+     /* declared bool: flowing off the end without a return is undefined
+      * behaviour, so report success explicitly. */
+     return true;
+ }
+
/* return true if the item is stored into m_elements. */
bool eval_item(const trellis_value_t * item) {
/* still have space */