diff options
Diffstat (limited to 'src/lookup/phrase_lookup.cpp')
-rw-r--r-- | src/lookup/phrase_lookup.cpp | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp new file mode 100644 index 0000000..f7da0b7 --- /dev/null +++ b/src/lookup/phrase_lookup.cpp @@ -0,0 +1,434 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <math.h> +#include "stl_lite.h" +#include "novel_types.h" +#include "phrase_index.h" +#include "facade_phrase_table2.h" +#include "ngram.h" +#include "phrase_lookup.h" + +using namespace pinyin; + + +/* +const gfloat PhraseLookup::bigram_lambda = lambda; +const gfloat PhraseLookup::unigram_lambda = 1 - lambda; +*/ + +static bool populate_prefixes(GPtrArray * steps_index, + GPtrArray * steps_content) { + + lookup_key_t initial_key = sentence_start; + lookup_value_t initial_value(log(1)); + initial_value.m_handles[1] = sentence_start; + + LookupStepContent initial_step_content = (LookupStepContent) + g_ptr_array_index(steps_content, 0); + g_array_append_val(initial_step_content, initial_value); + + LookupStepIndex initial_step_index = (LookupStepIndex) + g_ptr_array_index(steps_index, 0); + g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), + GUINT_TO_POINTER(initial_step_content->len - 1)); + + return true; +} + +static bool init_steps(GPtrArray * steps_index, + GPtrArray * steps_content, + int nstep) { + + /* add null start step */ + g_ptr_array_set_size(steps_index, nstep); + g_ptr_array_set_size(steps_content, nstep); + + for ( int i = 0; i < nstep; ++i ){ + /* initialize steps_index */ + g_ptr_array_index(steps_index, i) = g_hash_table_new + (g_direct_hash, g_direct_equal); + /* initialize steps_content */ + g_ptr_array_index(steps_content, i) = g_array_new + (FALSE, FALSE, sizeof(lookup_value_t)); + } + + return true; +} + +static void clear_steps(GPtrArray * steps_index, + GPtrArray * steps_content){ + /* clear steps_index */ + for ( size_t i = 0; i < steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(steps_index, i) = NULL; + } + + /* free steps_content */ + for ( size_t i = 0; i < steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(steps_content, i) = NULL; + } +} + +PhraseLookup::PhraseLookup(const gfloat lambda, + FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram) + : bigram_lambda(lambda), + unigram_lambda(1. - lambda) +{ + m_phrase_table = phrase_table; + m_phrase_index = phrase_index; + m_system_bigram = system_bigram; + m_user_bigram = user_bigram; + + m_steps_index = g_ptr_array_new(); + m_steps_content = g_ptr_array_new(); + + /* the member variables below are saved in get_best_match call. */ + m_sentence = NULL; + m_sentence_length = 0; +} + +PhraseLookup::~PhraseLookup(){ + clear_steps(m_steps_index, m_steps_content); + g_ptr_array_free(m_steps_index, TRUE); + g_ptr_array_free(m_steps_content, TRUE); +} + +bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[], + MatchResults & results){ + m_sentence_length = sentence_length; + m_sentence = sentence; + int nstep = m_sentence_length + 1; + + clear_steps(m_steps_index, m_steps_content); + + init_steps(m_steps_index, m_steps_content, nstep); + + populate_prefixes(m_steps_index, m_steps_content); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + m_phrase_index->prepare_tokens(tokens); + + for ( int i = 0; i < nstep - 1; ++i ){ + for ( int m = i + 1; m < nstep; ++m ){ + + /* do one phrase table search. */ + int result = m_phrase_table->search(m - i, sentence + i, tokens); + + /* found next phrase */ + if ( result & SEARCH_OK ) { + search_bigram2(i, tokens), + search_unigram2(i, tokens); + } + + /* no longer phrase */ + if (!(result & SEARCH_CONTINUED)) + break; + } + } + + m_phrase_index->destroy_tokens(tokens); + + return final_step(results); +} + +#if 0 + +bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){ + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return false; + + lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0); + /* find the maximum node */ + for ( size_t i = 1; i < lookup_content->len; ++i ){ + lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + return unigram_gen_next_step(nstep, max_value, token); +} + +bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return false; + + for ( size_t i = 0; i < lookup_content->len; ++i ){ + lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i); + phrase_token_t index_token = cur_value->m_handles[1]; + SingleGram * system, * user; + m_system_bigram->load(index_token, system); + m_user_bigram->load(index_token, user); + + if ( !merge_single_gram(&m_merged_single_gram, system, user) ) + continue; + + guint32 freq; + if ( m_merged_single_gram.get_freq(token, freq) ){ + guint32 total_freq; + m_merged_single_gram.get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; + } + + if (system) + delete system; + if (user) + delete user; + } + + return found; +} + +#endif + +bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return found; + + /* find the maximum node */ + lookup_value_t * max_value = &g_array_index + (lookup_content, lookup_value_t, 0); + + for (size_t i = 1; i < lookup_content->len; ++i) { + lookup_value_t * cur_value = &g_array_index + (lookup_content, lookup_value_t, i); + if (cur_value->m_poss > max_value->m_poss) + max_value = cur_value; + } + + /* iterate over tokens */ + for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) { + GArray * array = tokens[n]; + if (NULL == array) + continue; + + /* just skip the loop when the length is zero. */ + for (size_t k = 0; k < array->len; ++k) { + phrase_token_t token = + g_array_index(array, phrase_token_t, k); + + found = unigram_gen_next_step + (nstep, max_value, token) || found; + } + } + + return found; +} + +bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if (0 == lookup_content->len) + return found; + + for (size_t i = 0; i < lookup_content->len; ++i) { + lookup_value_t * cur_value = &g_array_index + (lookup_content, lookup_value_t, i); + phrase_token_t index_token = cur_value->m_handles[1]; + + SingleGram * system = NULL, * user = NULL; + m_system_bigram->load(index_token, system); + m_user_bigram->load(index_token, user); + + if (!merge_single_gram + (&m_merged_single_gram, system, user)) + continue; + + /* iterate over tokens */ + for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) { + GArray * array = tokens[n]; + if (NULL == array) + continue; + + /* just skip the loop when the length is zero. */ + for (size_t k = 0; k < array->len; ++k) { + phrase_token_t token = + g_array_index(array, phrase_token_t, k); + + guint32 freq = 0; + if (m_merged_single_gram.get_freq(token, freq)) { + guint32 total_freq = 0; + m_merged_single_gram.get_total_freq(total_freq); + + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; + } + } + } + + if (system) + delete system; + if (user) + delete user; + } + + return found; +} + +bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value, +phrase_token_t token){ + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) + m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < DBL_EPSILON ) + return false; + + lookup_value_t next_value; + next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token; + next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda); + next_value.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_value, &next_value); +} + +bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){ + + if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / + (gdouble) m_phrase_index->get_phrase_index_total_freq(); + + if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) + return false; + + lookup_value_t next_value; + next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token; + next_value.m_poss = cur_value->m_poss + + log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss ); + next_value.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_value, &next_value); +} + +bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){ + + LookupStepIndex next_lookup_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, next_step_pos); + LookupStepContent next_lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, next_step_pos); + + lookup_key_t next_key = next_value->m_handles[1]; + + gpointer key = NULL, value = NULL; + gboolean lookup_result = g_hash_table_lookup_extended + (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); + + if (!lookup_result){ + g_array_append_val(next_lookup_content, *next_value); + g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), + GUINT_TO_POINTER(next_lookup_content->len - 1)); + return true; + }else{ + size_t step_index = GPOINTER_TO_UINT(value); + lookup_value_t * orig_next_value = &g_array_index + (next_lookup_content, lookup_value_t, step_index); + + if ( orig_next_value->m_poss < next_value->m_poss ){ + orig_next_value->m_handles[0] = next_value->m_handles[0]; + assert(orig_next_value->m_handles[1] == next_value->m_handles[1]); + orig_next_value->m_poss = next_value->m_poss; + orig_next_value->m_last_step = next_value->m_last_step; + return true; + } + return false; + } +} + +bool PhraseLookup::final_step(MatchResults & results ){ + + /* reset results */ + g_array_set_size(results, m_steps_content->len - 1); + for ( size_t i = 0; i < results->len; ++i ){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + *token = null_token; + } + + /* find max element */ + size_t last_step_pos = m_steps_content->len - 1; + LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index + (m_steps_content, last_step_pos); + if ( last_step_content->len == 0 ) + return false; + + lookup_value_t * max_value = &g_array_index + (last_step_content, lookup_value_t, 0); + for ( size_t i = 1; i < last_step_content->len; ++i ){ + lookup_value_t * cur_value = &g_array_index + (last_step_content, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + /* backtracing */ + while( true ){ + int cur_step_pos = max_value->m_last_step; + if ( -1 == cur_step_pos ) + break; + + phrase_token_t * token = &g_array_index + (results, phrase_token_t, cur_step_pos); + *token = max_value->m_handles[1]; + + phrase_token_t last_token = max_value->m_handles[0]; + LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos); + + gpointer key = NULL, value = NULL; + gboolean result = g_hash_table_lookup_extended + (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); + if ( !result ) + return false; + + LookupStepContent lookup_step_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, cur_step_pos); + max_value = &g_array_index + (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); + } + + /* no need to reverse the result */ + return true; +} |