summaryrefslogtreecommitdiffstats
path: root/src/lookup/phrase_lookup.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/lookup/phrase_lookup.cpp')
-rw-r--r--src/lookup/phrase_lookup.cpp434
1 files changed, 434 insertions, 0 deletions
diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp
new file mode 100644
index 0000000..f7da0b7
--- /dev/null
+++ b/src/lookup/phrase_lookup.cpp
@@ -0,0 +1,434 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <math.h>
+#include "stl_lite.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "facade_phrase_table2.h"
+#include "ngram.h"
+#include "phrase_lookup.h"
+
+using namespace pinyin;
+
+
+/*
+const gfloat PhraseLookup::bigram_lambda = lambda;
+const gfloat PhraseLookup::unigram_lambda = 1 - lambda;
+*/
+
+static bool populate_prefixes(GPtrArray * steps_index,
+ GPtrArray * steps_content) {
+
+ lookup_key_t initial_key = sentence_start;
+ lookup_value_t initial_value(log(1));
+ initial_value.m_handles[1] = sentence_start;
+
+ LookupStepContent initial_step_content = (LookupStepContent)
+ g_ptr_array_index(steps_content, 0);
+ g_array_append_val(initial_step_content, initial_value);
+
+ LookupStepIndex initial_step_index = (LookupStepIndex)
+ g_ptr_array_index(steps_index, 0);
+ g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key),
+ GUINT_TO_POINTER(initial_step_content->len - 1));
+
+ return true;
+}
+
+static bool init_steps(GPtrArray * steps_index,
+ GPtrArray * steps_content,
+ int nstep) {
+
+ /* add null start step */
+ g_ptr_array_set_size(steps_index, nstep);
+ g_ptr_array_set_size(steps_content, nstep);
+
+ for ( int i = 0; i < nstep; ++i ){
+ /* initialize steps_index */
+ g_ptr_array_index(steps_index, i) = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+ /* initialize steps_content */
+ g_ptr_array_index(steps_content, i) = g_array_new
+ (FALSE, FALSE, sizeof(lookup_value_t));
+ }
+
+ return true;
+}
+
+static void clear_steps(GPtrArray * steps_index,
+ GPtrArray * steps_content){
+ /* clear steps_index */
+ for ( size_t i = 0; i < steps_index->len; ++i){
+ GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
+ g_hash_table_destroy(table);
+ g_ptr_array_index(steps_index, i) = NULL;
+ }
+
+ /* free steps_content */
+ for ( size_t i = 0; i < steps_content->len; ++i){
+ GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
+ g_array_free(array, TRUE);
+ g_ptr_array_index(steps_content, i) = NULL;
+ }
+}
+
+PhraseLookup::PhraseLookup(const gfloat lambda,
+ FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram)
+ : bigram_lambda(lambda),
+ unigram_lambda(1. - lambda)
+{
+ m_phrase_table = phrase_table;
+ m_phrase_index = phrase_index;
+ m_system_bigram = system_bigram;
+ m_user_bigram = user_bigram;
+
+ m_steps_index = g_ptr_array_new();
+ m_steps_content = g_ptr_array_new();
+
+ /* the member variables below are saved in get_best_match call. */
+ m_sentence = NULL;
+ m_sentence_length = 0;
+}
+
+PhraseLookup::~PhraseLookup(){
+ clear_steps(m_steps_index, m_steps_content);
+ g_ptr_array_free(m_steps_index, TRUE);
+ g_ptr_array_free(m_steps_content, TRUE);
+}
+
+bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
+ MatchResults & results){
+ m_sentence_length = sentence_length;
+ m_sentence = sentence;
+ int nstep = m_sentence_length + 1;
+
+ clear_steps(m_steps_index, m_steps_content);
+
+ init_steps(m_steps_index, m_steps_content, nstep);
+
+ populate_prefixes(m_steps_index, m_steps_content);
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ m_phrase_index->prepare_tokens(tokens);
+
+ for ( int i = 0; i < nstep - 1; ++i ){
+ for ( int m = i + 1; m < nstep; ++m ){
+
+ /* do one phrase table search. */
+ int result = m_phrase_table->search(m - i, sentence + i, tokens);
+
+ /* found next phrase */
+ if ( result & SEARCH_OK ) {
+ search_bigram2(i, tokens),
+ search_unigram2(i, tokens);
+ }
+
+ /* no longer phrase */
+ if (!(result & SEARCH_CONTINUED))
+ break;
+ }
+ }
+
+ m_phrase_index->destroy_tokens(tokens);
+
+ return final_step(results);
+}
+
+#if 0
+
+bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0);
+ /* find the maximum node */
+ for ( size_t i = 1; i < lookup_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ return unigram_gen_next_step(nstep, max_value, token);
+}
+
+bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return false;
+
+ for ( size_t i = 0; i < lookup_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
+ phrase_token_t index_token = cur_value->m_handles[1];
+ SingleGram * system, * user;
+ m_system_bigram->load(index_token, system);
+ m_user_bigram->load(index_token, user);
+
+ if ( !merge_single_gram(&m_merged_single_gram, system, user) )
+ continue;
+
+ guint32 freq;
+ if ( m_merged_single_gram.get_freq(token, freq) ){
+ guint32 total_freq;
+ m_merged_single_gram.get_total_freq(total_freq);
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
+ }
+
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+
+ return found;
+}
+
+#endif
+
+bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return found;
+
+ /* find the maximum node */
+ lookup_value_t * max_value = &g_array_index
+ (lookup_content, lookup_value_t, 0);
+
+ for (size_t i = 1; i < lookup_content->len; ++i) {
+ lookup_value_t * cur_value = &g_array_index
+ (lookup_content, lookup_value_t, i);
+ if (cur_value->m_poss > max_value->m_poss)
+ max_value = cur_value;
+ }
+
+ /* iterate over tokens */
+ for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+ GArray * array = tokens[n];
+ if (NULL == array)
+ continue;
+
+ /* just skip the loop when the length is zero. */
+ for (size_t k = 0; k < array->len; ++k) {
+ phrase_token_t token =
+ g_array_index(array, phrase_token_t, k);
+
+ found = unigram_gen_next_step
+ (nstep, max_value, token) || found;
+ }
+ }
+
+ return found;
+}
+
+bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if (0 == lookup_content->len)
+ return found;
+
+ for (size_t i = 0; i < lookup_content->len; ++i) {
+ lookup_value_t * cur_value = &g_array_index
+ (lookup_content, lookup_value_t, i);
+ phrase_token_t index_token = cur_value->m_handles[1];
+
+ SingleGram * system = NULL, * user = NULL;
+ m_system_bigram->load(index_token, system);
+ m_user_bigram->load(index_token, user);
+
+ if (!merge_single_gram
+ (&m_merged_single_gram, system, user))
+ continue;
+
+ /* iterate over tokens */
+ for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+ GArray * array = tokens[n];
+ if (NULL == array)
+ continue;
+
+ /* just skip the loop when the length is zero. */
+ for (size_t k = 0; k < array->len; ++k) {
+ phrase_token_t token =
+ g_array_index(array, phrase_token_t, k);
+
+ guint32 freq = 0;
+ if (m_merged_single_gram.get_freq(token, freq)) {
+ guint32 total_freq = 0;
+ m_merged_single_gram.get_total_freq(total_freq);
+
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
+ }
+ }
+ }
+
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+
+ return found;
+}
+
+bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
+phrase_token_t token){
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
+ m_phrase_index->get_phrase_index_total_freq();
+ if ( elem_poss < DBL_EPSILON )
+ return false;
+
+ lookup_value_t next_value;
+ next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
+ next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda);
+ next_value.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_value, &next_value);
+}
+
+bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){
+
+ if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
+ (gdouble) m_phrase_index->get_phrase_index_total_freq();
+
+ if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
+ return false;
+
+ lookup_value_t next_value;
+ next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
+ next_value.m_poss = cur_value->m_poss +
+ log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss );
+ next_value.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_value, &next_value);
+}
+
+bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){
+
+ LookupStepIndex next_lookup_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, next_step_pos);
+ LookupStepContent next_lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, next_step_pos);
+
+ lookup_key_t next_key = next_value->m_handles[1];
+
+ gpointer key = NULL, value = NULL;
+ gboolean lookup_result = g_hash_table_lookup_extended
+ (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
+
+ if (!lookup_result){
+ g_array_append_val(next_lookup_content, *next_value);
+ g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key),
+ GUINT_TO_POINTER(next_lookup_content->len - 1));
+ return true;
+ }else{
+ size_t step_index = GPOINTER_TO_UINT(value);
+ lookup_value_t * orig_next_value = &g_array_index
+ (next_lookup_content, lookup_value_t, step_index);
+
+ if ( orig_next_value->m_poss < next_value->m_poss ){
+ orig_next_value->m_handles[0] = next_value->m_handles[0];
+ assert(orig_next_value->m_handles[1] == next_value->m_handles[1]);
+ orig_next_value->m_poss = next_value->m_poss;
+ orig_next_value->m_last_step = next_value->m_last_step;
+ return true;
+ }
+ return false;
+ }
+}
+
+bool PhraseLookup::final_step(MatchResults & results ){
+
+ /* reset results */
+ g_array_set_size(results, m_steps_content->len - 1);
+ for ( size_t i = 0; i < results->len; ++i ){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ *token = null_token;
+ }
+
+ /* find max element */
+ size_t last_step_pos = m_steps_content->len - 1;
+ LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index
+ (m_steps_content, last_step_pos);
+ if ( last_step_content->len == 0 )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index
+ (last_step_content, lookup_value_t, 0);
+ for ( size_t i = 1; i < last_step_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index
+ (last_step_content, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ /* backtracing */
+ while( true ){
+ int cur_step_pos = max_value->m_last_step;
+ if ( -1 == cur_step_pos )
+ break;
+
+ phrase_token_t * token = &g_array_index
+ (results, phrase_token_t, cur_step_pos);
+ *token = max_value->m_handles[1];
+
+ phrase_token_t last_token = max_value->m_handles[0];
+ LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean result = g_hash_table_lookup_extended
+ (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
+ if ( !result )
+ return false;
+
+ LookupStepContent lookup_step_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, cur_step_pos);
+ max_value = &g_array_index
+ (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+ }
+
+ /* no need to reverse the result */
+ return true;
+}