/*
* libpinyin
* Library to deal with pinyin.
*
* Copyright (C) 2010 Peng Wu
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
#include
#include "stl_lite.h"
#include "novel_types.h"
#include "phrase_index.h"
#include "facade_phrase_table3.h"
#include "phrase_lookup.h"
using namespace pinyin;
/*
const gfloat PhraseLookup::bigram_lambda = lambda;
const gfloat PhraseLookup::unigram_lambda = 1 - lambda;
*/
static bool populate_prefixes(GPtrArray * steps_index,
GPtrArray * steps_content) {
lookup_key_t initial_key = sentence_start;
lookup_value_t initial_value(log(1.f));
initial_value.m_handles[1] = sentence_start;
LookupStepContent initial_step_content = (LookupStepContent)
g_ptr_array_index(steps_content, 0);
g_array_append_val(initial_step_content, initial_value);
LookupStepIndex initial_step_index = (LookupStepIndex)
g_ptr_array_index(steps_index, 0);
g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key),
GUINT_TO_POINTER(initial_step_content->len - 1));
return true;
}
static bool init_steps(GPtrArray * steps_index,
GPtrArray * steps_content,
int nstep) {
/* add null start step */
g_ptr_array_set_size(steps_index, nstep);
g_ptr_array_set_size(steps_content, nstep);
for ( int i = 0; i < nstep; ++i ){
/* initialize steps_index */
g_ptr_array_index(steps_index, i) = g_hash_table_new
(g_direct_hash, g_direct_equal);
/* initialize steps_content */
g_ptr_array_index(steps_content, i) = g_array_new
(FALSE, FALSE, sizeof(lookup_value_t));
}
return true;
}
static void clear_steps(GPtrArray * steps_index,
GPtrArray * steps_content){
/* clear steps_index */
for ( size_t i = 0; i < steps_index->len; ++i){
GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
g_hash_table_destroy(table);
g_ptr_array_index(steps_index, i) = NULL;
}
/* free steps_content */
for ( size_t i = 0; i < steps_content->len; ++i){
GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
g_array_free(array, TRUE);
g_ptr_array_index(steps_content, i) = NULL;
}
}
PhraseLookup::PhraseLookup(const gfloat lambda,
FacadePhraseTable3 * phrase_table,
FacadePhraseIndex * phrase_index,
Bigram * system_bigram,
Bigram * user_bigram)
: bigram_lambda(lambda),
unigram_lambda(1. - lambda)
{
m_phrase_table = phrase_table;
m_phrase_index = phrase_index;
m_system_bigram = system_bigram;
m_user_bigram = user_bigram;
m_steps_index = g_ptr_array_new();
m_steps_content = g_ptr_array_new();
/* the member variables below are saved in get_best_match call. */
m_sentence = NULL;
m_sentence_length = 0;
}
PhraseLookup::~PhraseLookup(){
clear_steps(m_steps_index, m_steps_content);
g_ptr_array_free(m_steps_index, TRUE);
g_ptr_array_free(m_steps_content, TRUE);
}
bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
MatchResults & results){
m_sentence_length = sentence_length;
m_sentence = sentence;
int nstep = m_sentence_length + 1;
clear_steps(m_steps_index, m_steps_content);
init_steps(m_steps_index, m_steps_content, nstep);
populate_prefixes(m_steps_index, m_steps_content);
PhraseTokens tokens;
memset(tokens, 0, sizeof(PhraseTokens));
m_phrase_index->prepare_tokens(tokens);
for ( int i = 0; i < nstep - 1; ++i ){
for ( int m = i + 1; m < nstep; ++m ){
m_phrase_index->clear_tokens(tokens);
/* do one phrase table search. */
int result = m_phrase_table->search(m - i, sentence + i, tokens);
/* found next phrase */
if ( result & SEARCH_OK ) {
search_bigram2(i, tokens),
search_unigram2(i, tokens);
}
/* no longer phrase */
if (!(result & SEARCH_CONTINUED))
break;
}
}
m_phrase_index->destroy_tokens(tokens);
return final_step(results);
}
#if 0
bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
LookupStepContent lookup_content = (LookupStepContent)
g_ptr_array_index(m_steps_content, nstep);
if ( 0 == lookup_content->len )
return false;
lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0);
/* find the maximum node */
for ( size_t i = 1; i < lookup_content->len; ++i ){
lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
if ( cur_value->m_poss > max_value->m_poss )
max_value = cur_value;
}
return unigram_gen_next_step(nstep, max_value, token);
}
bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
bool found = false;
LookupStepContent lookup_content = (LookupStepContent)
g_ptr_array_index(m_steps_content, nstep);
if ( 0 == lookup_content->len )
return false;
for ( size_t i = 0; i < lookup_content->len; ++i ){
lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
phrase_token_t index_token = cur_value->m_handles[1];
SingleGram * system, * user;
m_system_bigram->load(index_token, system);
m_user_bigram->load(index_token, user);
if ( !merge_single_gram(&m_merged_single_gram, system, user) )
continue;
guint32 freq;
if ( m_merged_single_gram.get_freq(token, freq) ){
guint32 total_freq;
m_merged_single_gram.get_total_freq(total_freq);
gfloat bigram_poss = freq / (gfloat) total_freq;
found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
}
if (system)
delete system;
if (user)
delete user;
}
return found;
}
#endif
bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
bool found = false;
LookupStepContent lookup_content = (LookupStepContent)
g_ptr_array_index(m_steps_content, nstep);
if ( 0 == lookup_content->len )
return found;
/* find the maximum node */
lookup_value_t * max_value = &g_array_index
(lookup_content, lookup_value_t, 0);
for (size_t i = 1; i < lookup_content->len; ++i) {
lookup_value_t * cur_value = &g_array_index
(lookup_content, lookup_value_t, i);
if (cur_value->m_poss > max_value->m_poss)
max_value = cur_value;
}
/* iterate over tokens */
for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
GArray * array = tokens[n];
if (NULL == array)
continue;
/* just skip the loop when the length is zero. */
for (size_t k = 0; k < array->len; ++k) {
phrase_token_t token =
g_array_index(array, phrase_token_t, k);
found = unigram_gen_next_step
(nstep, max_value, token) || found;
}
}
return found;
}
bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
bool found = false;
LookupStepContent lookup_content = (LookupStepContent)
g_ptr_array_index(m_steps_content, nstep);
if (0 == lookup_content->len)
return found;
for (size_t i = 0; i < lookup_content->len; ++i) {
lookup_value_t * cur_value = &g_array_index
(lookup_content, lookup_value_t, i);
phrase_token_t index_token = cur_value->m_handles[1];
SingleGram * system = NULL, * user = NULL;
m_system_bigram->load(index_token, system);
m_user_bigram->load(index_token, user);
if (!merge_single_gram
(&m_merged_single_gram, system, user))
continue;
/* iterate over tokens */
for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
GArray * array = tokens[n];
if (NULL == array)
continue;
/* just skip the loop when the length is zero. */
for (size_t k = 0; k < array->len; ++k) {
phrase_token_t token =
g_array_index(array, phrase_token_t, k);
guint32 freq = 0;
if (m_merged_single_gram.get_freq(token, freq)) {
guint32 total_freq = 0;
m_merged_single_gram.get_total_freq(total_freq);
gfloat bigram_poss = freq / (gfloat) total_freq;
found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
}
}
}
if (system)
delete system;
if (user)
delete user;
}
return found;
}
bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
phrase_token_t token){
if (m_phrase_index->get_phrase_item(token, m_cached_phrase_item))
return false;
size_t phrase_length = m_cached_phrase_item.get_phrase_length();
gdouble elem_poss = m_cached_phrase_item.get_unigram_frequency() / (gdouble)
m_phrase_index->get_phrase_index_total_freq();
if ( elem_poss < DBL_EPSILON )
return false;
lookup_value_t next_value;
next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda);
next_value.m_last_step = nstep;
return save_next_step(nstep + phrase_length, cur_value, &next_value);
}
bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){
if ( m_phrase_index->get_phrase_item(token, m_cached_phrase_item))
return false;
size_t phrase_length = m_cached_phrase_item.get_phrase_length();
gdouble unigram_poss = m_cached_phrase_item.get_unigram_frequency() /
(gdouble) m_phrase_index->get_phrase_index_total_freq();
if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
return false;
lookup_value_t next_value;
next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
next_value.m_poss = cur_value->m_poss +
log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss );
next_value.m_last_step = nstep;
return save_next_step(nstep + phrase_length, cur_value, &next_value);
}
bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){
LookupStepIndex next_lookup_index = (LookupStepIndex)
g_ptr_array_index(m_steps_index, next_step_pos);
LookupStepContent next_lookup_content = (LookupStepContent)
g_ptr_array_index(m_steps_content, next_step_pos);
lookup_key_t next_key = next_value->m_handles[1];
gpointer key = NULL, value = NULL;
gboolean lookup_result = g_hash_table_lookup_extended
(next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
if (!lookup_result){
g_array_append_val(next_lookup_content, *next_value);
g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key),
GUINT_TO_POINTER(next_lookup_content->len - 1));
return true;
}else{
size_t step_index = GPOINTER_TO_UINT(value);
lookup_value_t * orig_next_value = &g_array_index
(next_lookup_content, lookup_value_t, step_index);
if ( orig_next_value->m_poss < next_value->m_poss ){
orig_next_value->m_handles[0] = next_value->m_handles[0];
assert(orig_next_value->m_handles[1] == next_value->m_handles[1]);
orig_next_value->m_poss = next_value->m_poss;
orig_next_value->m_last_step = next_value->m_last_step;
return true;
}
return false;
}
}
bool PhraseLookup::final_step(MatchResults & results ){
/* reset results */
g_array_set_size(results, m_steps_content->len - 1);
for ( size_t i = 0; i < results->len; ++i ){
phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
*token = null_token;
}
/* find max element */
size_t last_step_pos = m_steps_content->len - 1;
LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index
(m_steps_content, last_step_pos);
if ( last_step_content->len == 0 )
return false;
lookup_value_t * max_value = &g_array_index
(last_step_content, lookup_value_t, 0);
for ( size_t i = 1; i < last_step_content->len; ++i ){
lookup_value_t * cur_value = &g_array_index
(last_step_content, lookup_value_t, i);
if ( cur_value->m_poss > max_value->m_poss )
max_value = cur_value;
}
/* backtracing */
while( true ){
int cur_step_pos = max_value->m_last_step;
if ( -1 == cur_step_pos )
break;
phrase_token_t * token = &g_array_index
(results, phrase_token_t, cur_step_pos);
*token = max_value->m_handles[1];
phrase_token_t last_token = max_value->m_handles[0];
LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos);
gpointer key = NULL, value = NULL;
gboolean result = g_hash_table_lookup_extended
(lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
if ( !result )
return false;
LookupStepContent lookup_step_content = (LookupStepContent)
g_ptr_array_index(m_steps_content, cur_step_pos);
max_value = &g_array_index
(lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
}
/* no need to reverse the result */
return true;
}