From 4477ff8cb78805d3c86ee612338f18ba7728e07e Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 11 Sep 2012 15:55:26 +0800 Subject: remove pinyin_lookup.* --- src/lookup/pinyin_lookup.cpp | 546 ------------------------------------------- src/lookup/pinyin_lookup.h | 254 -------------------- 2 files changed, 800 deletions(-) delete mode 100644 src/lookup/pinyin_lookup.cpp delete mode 100644 src/lookup/pinyin_lookup.h diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp deleted file mode 100644 index 547031a..0000000 --- a/src/lookup/pinyin_lookup.cpp +++ /dev/null @@ -1,546 +0,0 @@ -/* - * libpinyin - * Library to deal with pinyin. - * - * Copyright (C) 2006-2007 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - - -#include "pinyin_lookup.h" -#include -#include -#include "stl_lite.h" -#include "novel_types.h" -#include "pinyin_phrase2.h" -#include "facade_chewing_table.h" -#include "ngram.h" -#include "winner_tree.h" - -using namespace pinyin; - -const gfloat PinyinLookup::bigram_lambda = LAMBDA_PARAMETER; -const gfloat PinyinLookup::unigram_lambda = 1 - LAMBDA_PARAMETER; - -static void clear_steps(GPtrArray * steps_index, GPtrArray * steps_content){ - /* clear steps_index */ - for ( size_t i = 0; i < steps_index->len; ++i){ - GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i); - g_hash_table_destroy(table); - g_ptr_array_index(steps_index, i) = NULL; - } - - /* clear steps_content */ - for ( size_t i = 0; i < steps_content->len; ++i){ - GArray * array = (GArray *) g_ptr_array_index(steps_content, i); - g_array_free(array, TRUE); - g_ptr_array_index(steps_content, i) = NULL; - } -} - -PinyinLookup::PinyinLookup(pinyin_option_t options, - FacadeChewingTable * pinyin_table, - FacadePhraseIndex * phrase_index, - Bigram * system_bigram, - Bigram * user_bigram){ - m_options = options; - m_pinyin_table = pinyin_table; - m_phrase_index = phrase_index; - m_system_bigram = system_bigram; - m_user_bigram = user_bigram; - - m_winner_tree = new WinnerTree; - m_steps_index = g_ptr_array_new(); - m_steps_content = g_ptr_array_new(); - m_table_cache = g_array_new(FALSE, TRUE, sizeof(PhraseIndexRanges)); - g_array_set_size(m_table_cache, 1); -} - -PinyinLookup::~PinyinLookup(){ - if ( m_winner_tree ) - delete m_winner_tree; - m_winner_tree = NULL; - - //free resources - for ( size_t i = 0; i < m_table_cache->len; ++i){ - PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); - m_phrase_index->destroy_ranges(*ranges); - } - - g_array_free(m_table_cache, TRUE); - - clear_steps(m_steps_index, m_steps_content); - g_ptr_array_free(m_steps_index, TRUE); - g_ptr_array_free(m_steps_content, TRUE); -} - -size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){ - /* free resources */ - for ( size_t i = 0; i < m_table_cache->len; ++i){ - PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); - m_phrase_index->destroy_ranges(*ranges); - } - - ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data; - pinyin_keys += nstep; - g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1); - - int len, total_len = std_lite::min(total_pinyin, MAX_PHRASE_LENGTH); - - for ( len = 1; len <= total_len; ++len){ - PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, len); - m_phrase_index->prepare_ranges(*ranges); - int result = m_pinyin_table->search(len, pinyin_keys, *ranges); - if (!( result & SEARCH_CONTINUED)){ - ++len; - break; - } - } - g_array_set_size(m_table_cache, std_lite::min(len, MAX_PHRASE_LENGTH + 1)); - return m_table_cache->len - 1; -} - -bool PinyinLookup::get_best_match(TokenVector prefixes, - ChewingKeyVector keys, - CandidateConstraints constraints, - MatchResults & results){ - //g_array_set_size(results, 0); - - m_constraints = constraints; - m_keys = keys; - int nstep = keys->len + 1; - - clear_steps(m_steps_index, m_steps_content); - - //add null start step - g_ptr_array_set_size(m_steps_index, nstep); - g_ptr_array_set_size(m_steps_content, nstep); - - for ( int i = 0; i < nstep; ++i ){ - //initialize m_steps_index - g_ptr_array_index(m_steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal); - //initialize m_steps_content - g_ptr_array_index(m_steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t)); - } - - for ( int i = 0; i < prefixes->len; ++i ){ - phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); - lookup_key_t initial_key = token; - lookup_value_t initial_value(log(1)); - initial_value.m_handles[1] = token; - - GArray * initial_step_content = (GArray *) g_ptr_array_index(m_steps_content, 0); - initial_step_content = g_array_append_val(initial_step_content, initial_value); - GHashTable * initial_step_index = (GHashTable *) g_ptr_array_index(m_steps_index, 0); - g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), GUINT_TO_POINTER(initial_step_content->len - 1)); - } - - /* begin the Viterbi beam search. */ - for ( int i = 0; i < nstep - 1; ++i ){ - LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, i); - IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step); - size_t npinyin = prepare_table_cache(i, keys->len - i); - search_bigram(iter, i, npinyin), - search_unigram(iter, i, npinyin); - delete iter; - } - return final_step(results); -} - -bool PinyinLookup::search_unigram(IBranchIterator * iter, int nstep, int npinyin){ - lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep); - if ( CONSTRAINT_NOSEARCH == constraint->m_type ) - return false; - GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep); - if ( 0 == lookup_content->len ) - return false; - lookup_value_t max_step = iter->max(); - if ( CONSTRAINT_ONESTEP == constraint->m_type){ - return unigram_gen_next_step(nstep, &max_step, constraint->m_token); - } - if ( NO_CONSTRAINT == constraint->m_type ){ - bool found = false; - for ( size_t i = 1; i < m_table_cache->len && i <= MAX_PHRASE_LENGTH; ++i){ - lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1); - if ( constraint->m_type == CONSTRAINT_NOSEARCH ) - break; - PhraseIndexRanges * ranges = &g_array_index(m_table_cache,PhraseIndexRanges, i); - for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ - GArray * array = (*ranges)[m]; - if ( !array ) continue; - for ( size_t n = 0; n < array->len; ++n){ - PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n); - for ( phrase_token_t token = range->m_range_begin; - token != range->m_range_end; ++token){ - found = unigram_gen_next_step(nstep, &max_step, token)|| found; - } - } - } - } - return found; - } - return false; -} - - -bool PinyinLookup::search_bigram(IBranchIterator * iter, - int nstep, int npinyin){ - lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep); - if ( CONSTRAINT_NOSEARCH == constraint->m_type ) - return false; - - bool found = false; - BigramPhraseArray bigram_phrase_items = g_array_new(FALSE, FALSE, - sizeof(BigramPhraseItem)); - while ( iter->has_next() ){ - lookup_value_t cur_step = iter->next(); - //printf("token:%d\t%d\n", cur_step.m_handles[0], cur_step.m_handles[1]); - phrase_token_t index_token = cur_step.m_handles[1]; - SingleGram * system, * user; - m_system_bigram->load(index_token, system); - m_user_bigram->load(index_token, user); - - if ( !merge_single_gram(&m_merged_single_gram, system, user) ) - continue; - - if ( CONSTRAINT_ONESTEP == constraint->m_type ){ - phrase_token_t token = constraint->m_token; - - guint32 freq; - if( m_merged_single_gram.get_freq(token, freq) ){ - guint32 total_freq; - m_merged_single_gram.get_total_freq(total_freq); - gfloat bigram_poss = freq / (gfloat) total_freq; - found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found; - } - } - - if ( NO_CONSTRAINT == constraint->m_type ){ - for ( size_t i = 1; i < m_table_cache->len - && i <= MAX_PHRASE_LENGTH; ++i ){ - lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1); - if ( constraint->m_type == CONSTRAINT_NOSEARCH ) - break; - - PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); - for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ - GArray * array = (*ranges)[m]; - if ( !array ) continue; - - for ( size_t n = 0; n < array->len; ++n){ - PhraseIndexRange * range = - &g_array_index(array, PhraseIndexRange, n); - - g_array_set_size(bigram_phrase_items, 0); - m_merged_single_gram.search(range, bigram_phrase_items); - for( size_t k = 0; k < bigram_phrase_items->len; - ++k){ - BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k); - found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found; - } - } - } - } - } - if (system) - delete system; - if (user) - delete user; - } - g_array_free(bigram_phrase_items, TRUE); - return found; -} - - -bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){ - ChewingKey * pinyinkeys = ((ChewingKey *)m_keys->data) + nstep; - if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return false; - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) - m_phrase_index->get_phrase_index_total_freq(); - if ( elem_poss < DBL_EPSILON ) - return false; - gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyinkeys); - if (pinyin_poss < FLT_EPSILON ) - return false; - lookup_value_t next_step; - next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; - next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda); - next_step.m_last_step = nstep; - - return save_next_step(nstep + phrase_length, cur_step, &next_step); -} - -bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){ - ChewingKey * pinyinkeys = ((ChewingKey *)m_keys->data) + nstep; - if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return false; - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / - (gdouble) m_phrase_index->get_phrase_index_total_freq(); - - if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) - return false; - gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyinkeys); - if ( pinyin_poss < FLT_EPSILON ) - return false; - lookup_value_t next_step; - next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; - next_step.m_poss = cur_step->m_poss + - log(( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) *pinyin_poss); - next_step.m_last_step = nstep; - - return save_next_step(nstep + phrase_length, cur_step, &next_step); -} - -bool PinyinLookup::save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step){ - lookup_key_t next_key = next_step->m_handles[1]; - GHashTable * next_lookup_index = (GHashTable *) g_ptr_array_index(m_steps_index, next_step_pos); - GArray * next_lookup_content = (GArray *) g_ptr_array_index(m_steps_content, next_step_pos); - - gpointer key, value; - gboolean lookup_result = g_hash_table_lookup_extended(next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); - size_t step_index = GPOINTER_TO_UINT(value); - if ( !lookup_result ){ - g_array_append_val(next_lookup_content, *next_step); - g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1)); - return true; - }else{ - lookup_value_t * orig_next_value = &g_array_index(next_lookup_content, lookup_value_t,step_index); - if ( orig_next_value->m_poss < next_step->m_poss) { - orig_next_value->m_handles[0] = next_step->m_handles[0]; - assert(orig_next_value->m_handles[1] == next_step->m_handles[1]); - orig_next_value->m_poss = next_step->m_poss; - orig_next_value->m_last_step = next_step->m_last_step; - return true; - } - return false; - } -} - -bool PinyinLookup::final_step(MatchResults & results){ - //reset results - g_array_set_size(results, m_steps_content->len - 1); - for ( size_t i = 0 ; i < results->len ; ++i){ - phrase_token_t * token = &g_array_index(results, phrase_token_t, i); - *token = null_token; - } - //find max element - size_t last_step_pos = m_steps_content->len - 1; - - GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos); - if ( last_step_array->len == 0 ) - return false; - lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0); - for ( size_t i = 1; i < last_step_array->len; ++i){ - lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i); - if ( cur_value->m_poss > max_value->m_poss ) - max_value = cur_value; - } - - //backtracing - while( true ){ - int cur_step_pos = max_value->m_last_step; - if ( -1 == cur_step_pos ) - break; - - phrase_token_t * token = &g_array_index(results, phrase_token_t, cur_step_pos); - *token = max_value->m_handles[1]; - - phrase_token_t last_token = max_value->m_handles[0]; - - - GHashTable * lookup_step_index = (GHashTable *)g_ptr_array_index(m_steps_index, cur_step_pos); - gpointer key, value; - gboolean result = g_hash_table_lookup_extended(lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); - if (!result) - return false; - GArray * lookup_step_content = (GArray *)g_ptr_array_index(m_steps_content, cur_step_pos); - - max_value = &g_array_index(lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); - } - - //no need to reverse the result - - return true; -} - -bool PinyinLookup::train_result2(ChewingKeyVector keys, - CandidateConstraints constraints, - MatchResults results) { - const guint32 initial_seed = 23 * 15; - const guint32 expand_factor = 2; - const guint32 unigram_factor = 7; - const guint32 pinyin_factor = 1; - const guint32 ceiling_seed = 23 * 15 * 64; - - /* begin training based on constraints and results. */ - bool train_next = false; - ChewingKey * pinyin_keys = (ChewingKey *) keys->data; - - phrase_token_t last_token = sentence_start; - /* constraints->len + 1 == results->len */ - for (size_t i = 0; i < constraints->len; ++i) { - phrase_token_t * token = &g_array_index(results, phrase_token_t, i); - if (null_token == *token) - continue; - - lookup_constraint_t * constraint = &g_array_index - (constraints, lookup_constraint_t, i); - if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) { - if (CONSTRAINT_ONESTEP == constraint->m_type) { - assert(*token == constraint->m_token); - train_next = true; - } else { - train_next = false; - } - - guint32 seed = initial_seed; - /* train bi-gram first, and get train seed. */ - if (last_token) { - SingleGram * user = NULL; - m_user_bigram->load(last_token, user); - - guint32 total_freq = 0; - if (!user) { - user = new SingleGram; - } - assert(user->get_total_freq(total_freq)); - - guint32 freq = 0; - /* compute train factor */ - if (!user->get_freq(*token, freq)) { - assert(user->insert_freq(*token, 0)); - seed = initial_seed; - } else { - seed = std_lite::max(freq, initial_seed); - seed *= expand_factor; - seed = std_lite::min(seed, ceiling_seed); - } - - /* protect against total_freq overflow */ - if (seed > 0 && total_freq > total_freq + seed) - goto next; - - assert(user->set_total_freq(total_freq + seed)); - /* if total_freq is not overflow, then freq won't overflow. */ - assert(user->set_freq(*token, freq + seed)); - assert(m_user_bigram->store(last_token, user)); - next: - if (user) - delete user; - } - - /* train uni-gram */ - m_phrase_index->get_phrase_item(*token, m_cache_phrase_item); - m_cache_phrase_item.increase_pronunciation_possibility - (m_options, pinyin_keys + i, seed * pinyin_factor); - m_phrase_index->add_unigram_frequency - (*token, seed * unigram_factor); - } - last_token = *token; - } - return true; -} - -guint8 PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){ - if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item) ) - return 0; - - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - if ( index + phrase_length > constraints->len ) - return 0; - - for ( size_t i = index; i < index + phrase_length ; ++i ){ - clear_constraint(constraints, i); - } - - lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index); - constraint->m_type = CONSTRAINT_ONESTEP; - constraint->m_token = token; - - for (size_t i = 1; i < phrase_length; ++i){ - constraint = &g_array_index(constraints, lookup_constraint_t, index + i); - constraint->m_type = CONSTRAINT_NOSEARCH; - constraint->m_constraint_step = index; - } - return phrase_length; -} - -bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t index){ - if ( index < 0 || index >= constraints->len ) - return false; - lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index); - if (constraint->m_type == NO_CONSTRAINT) - return false; - if (constraint->m_type == CONSTRAINT_NOSEARCH){ - index = constraint->m_constraint_step; - constraint = &g_array_index(constraints, lookup_constraint_t, index); - } - - assert(constraint->m_type == CONSTRAINT_ONESTEP); - - phrase_token_t token = constraint->m_token; - if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return false; - - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - for ( size_t i = 0; i < phrase_length; ++i){ - if ( index + i >= constraints->len ) - continue; - constraint = &g_array_index(constraints, lookup_constraint_t, index + i); - constraint->m_type = NO_CONSTRAINT; - } - return true; -} - -bool PinyinLookup::validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys){ - //resize constraints array - size_t constraints_length = constraints->len; - if ( keys->len > constraints_length ){ - g_array_set_size(constraints, keys->len); - //initialize new element - for( size_t i = constraints_length; i < keys->len; ++i){ - lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); - constraint->m_type = NO_CONSTRAINT; - } - }else if (keys->len < constraints_length ){ - g_array_set_size(constraints, keys->len); - } - - ChewingKey * pinyin_keys = (ChewingKey *)keys->data; - - for ( size_t i = 0; i < constraints->len; ++i){ - lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); - if ( constraint->m_type == CONSTRAINT_ONESTEP ){ - phrase_token_t token = constraint->m_token; - m_phrase_index->get_phrase_item(token, m_cache_phrase_item); - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - //clear too long constraint - if ( i + phrase_length > constraints->len ){ - clear_constraint(constraints, i); - continue; - } - //clear invalidated pinyin - gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys + i); - if ( pinyin_poss < FLT_EPSILON ){ - clear_constraint(constraints, i); - } - } - } - return true; -} diff --git a/src/lookup/pinyin_lookup.h b/src/lookup/pinyin_lookup.h deleted file mode 100644 index b23c39a..0000000 --- a/src/lookup/pinyin_lookup.h +++ /dev/null @@ -1,254 +0,0 @@ -/* - * libpinyin - * Library to deal with pinyin. - * - * Copyright (C) 2006-2007 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#ifndef PINYIN_LOOKUP_H -#define PINYIN_LOOKUP_H - - -#include -#include -#include "novel_types.h" -#include "chewing_key.h" -#include "phrase_index.h" -#include "ngram.h" -#include "lookup.h" - - -namespace pinyin{ - -class WinnerTree; - -/** - * pinyin_lookup.h - * - * The definitions of pinyin lookup related classes and structs. - * - */ - -enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH }; - -struct lookup_constraint_t{ - /* current type of the step */ - constraint_type m_type; - - /* Note: - * value of m_type: - * NO_CONSTRAINT: - * no values in the below union. - * search all possible next words. - * CONSTRAINT_ONESTEP: - * m_token contains the next word. - * only one word can be used to search for the next step, - * use case for user selected candidates. - * CONSTRAINT_NOSEARCH: - * m_constraint_step contains the value - * which points back to the CONSTRAINT_ONESTEP step. - * no search is allowed for the current step. - */ - - union{ - phrase_token_t m_token; - guint32 m_constraint_step; /* index of m_token */ - }; -}; - - -/** - * IBranchIterator: - * - * The iterator to get the 32 highest values. - * - * Note: The winner tree for Viterbi beam search. - * - */ -class IBranchIterator{ -public: - virtual ~IBranchIterator(){} - virtual bool has_next() = 0; - virtual lookup_value_t next() = 0; - virtual lookup_value_t max() = 0; -}; - -/** - * PinyinLookup: - * - * The pinyin lookup class to convert pinyin keys to guessed sentence. - * - */ -class PinyinLookup{ -private: - static const gfloat bigram_lambda; - static const gfloat unigram_lambda; - - PhraseItem m_cache_phrase_item; - SingleGram m_merged_single_gram; -protected: - //saved varibles - CandidateConstraints m_constraints; - ChewingKeyVector m_keys; - - FacadeChewingTable * m_pinyin_table; - FacadePhraseIndex * m_phrase_index; - pinyin_option_t m_options; - Bigram * m_system_bigram; - Bigram * m_user_bigram; - - //internal step data structure - GPtrArray * m_steps_index; - /* Array of LookupStepIndex */ - GPtrArray * m_steps_content; - /* Array of LookupStepContent */ - - GArray * m_table_cache; - /* Array of PhraseIndexRanges, - * PhraseIndexRanges is an array of GArray of PhraseIndexRange, - * indexed by phrase library (only contains enabled phrase libraries). - */ - - WinnerTree * m_winner_tree; - - size_t prepare_table_cache(int nstep, int total_pinyin); - - bool search_unigram(IBranchIterator * iter, int nstep, int npinyin); - bool search_bigram(IBranchIterator * iter, int nstep, int npinyin); - - bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token); - bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss); - - bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step); - - bool final_step(MatchResults & results); -public: - /** - * PinyinLookup::PinyinLookup: - * @options: the pinyin options. - * @pinyin_table: the pinyin table. - * @phrase_index: the phrase index. - * @system_bigram: the system bi-gram. - * @user_bigram: the user bi-gram. - * - * The constructor of the PinyinLookup. - * - */ - PinyinLookup(pinyin_option_t options, FacadeChewingTable * pinyin_table, - FacadePhraseIndex * phrase_index, Bigram * system_bigram, - Bigram * user_bigram); - - /** - * PinyinLookup::~PinyinLookup: - * - * The destructor of the PinyinLookup. - * - */ - ~PinyinLookup(); - - /** - * PinyinLookup::set_options: - * @options: the pinyin options. - * @returns: whether the set operation is successful. - * - * Set the pinyin options of this PinyinLookup. - * - */ - bool set_options(pinyin_option_t options) { - m_options = options; - return true; - } - - /** - * PinyinLookup::get_best_match: - * @prefixes: the phrase tokens before the guessed sentence. - * @keys: the pinyin keys of the guessed sentence. - * @constraints: the constraints on the guessed sentence. - * @results: the guessed sentence in the form of the phrase tokens. - * @returns: whether the guess operation is successful. - * - * Guess the best sentence according to user inputs. - * - */ - bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results); - - /** - * PinyinLookup::train_result2: - * @keys: the pinyin keys of the guessed sentence. - * @constraints: the constraints on the guessed sentence. - * @results: the guessed sentence in the form of the phrase tokens. - * @returns: whether the train operation is successful. - * - * Self learning the guessed sentence based on the constraints. - * - */ - bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results); - - /** - * PinyinLookup::convert_to_utf8: - * @results: the guessed sentence in the form of the phrase tokens. - * @result_string: the guessed sentence in the utf8 encoding. - * @returns: whether the convert operation is successful. - * - * Convert the guessed sentence from the phrase tokens to the utf8 string. - * - */ - bool convert_to_utf8(MatchResults results, - /* out */ char * & result_string) - { - return pinyin::convert_to_utf8(m_phrase_index, results, - NULL, result_string); - } - - /** - * PinyinLookup::add_constraint: - * @constraints: the constraints on the guessed sentence. - * @index: the character offset in the guessed sentence. - * @token: the phrase token in the candidate list chosen by user. - * @returns: the number of the characters in the chosen token. - * - * Add one constraint to the constraints on the guessed sentence. - * - */ - guint8 add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token); - - /** - * PinyinLookup::clear_constraint: - * @constraints: the constraints on the guessed sentence. - * @index: the character offset in the guessed sentence. - * @returns: whether the clear operation is successful. - * - * Clear one constraint in the constraints on the guessed sentence. - * - */ - bool clear_constraint(CandidateConstraints constraints, size_t index); - - /** - * PinyinLookup::validate_constraint: - * @constraints: the constraints on the guessed sentence. - * @keys: the pinyin keys of the guessed sentence. - * @returns: whether the validate operation is successful. - * - * Validate the old constraints with the new pinyin keys. - * - */ - bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys); - -}; - -}; -#endif -- cgit