summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2012-09-11 15:55:26 +0800
committerPeng Wu <alexepico@gmail.com>2012-09-11 15:55:26 +0800
commit4477ff8cb78805d3c86ee612338f18ba7728e07e (patch)
tree13ee561f122d5b239a2a27b2913f58f8412ffed5
parente37e1cd96935184bace71c8b2b4d4f57db56ef85 (diff)
downloadlibpinyin-4477ff8cb78805d3c86ee612338f18ba7728e07e.tar.gz
libpinyin-4477ff8cb78805d3c86ee612338f18ba7728e07e.tar.xz
libpinyin-4477ff8cb78805d3c86ee612338f18ba7728e07e.zip
remove pinyin_lookup.*
-rw-r--r--src/lookup/pinyin_lookup.cpp546
-rw-r--r--src/lookup/pinyin_lookup.h254
2 files changed, 0 insertions, 800 deletions
diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
deleted file mode 100644
index 547031a..0000000
--- a/src/lookup/pinyin_lookup.cpp
+++ /dev/null
@@ -1,546 +0,0 @@
-/*
- * libpinyin
- * Library to deal with pinyin.
- *
- * Copyright (C) 2006-2007 Peng Wu
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-
-#include "pinyin_lookup.h"
-#include <math.h>
-#include <assert.h>
-#include "stl_lite.h"
-#include "novel_types.h"
-#include "pinyin_phrase2.h"
-#include "facade_chewing_table.h"
-#include "ngram.h"
-#include "winner_tree.h"
-
-using namespace pinyin;
-
-const gfloat PinyinLookup::bigram_lambda = LAMBDA_PARAMETER;
-const gfloat PinyinLookup::unigram_lambda = 1 - LAMBDA_PARAMETER;
-
-static void clear_steps(GPtrArray * steps_index, GPtrArray * steps_content){
- /* clear steps_index */
- for ( size_t i = 0; i < steps_index->len; ++i){
- GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
- g_hash_table_destroy(table);
- g_ptr_array_index(steps_index, i) = NULL;
- }
-
- /* clear steps_content */
- for ( size_t i = 0; i < steps_content->len; ++i){
- GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
- g_array_free(array, TRUE);
- g_ptr_array_index(steps_content, i) = NULL;
- }
-}
-
-PinyinLookup::PinyinLookup(pinyin_option_t options,
- FacadeChewingTable * pinyin_table,
- FacadePhraseIndex * phrase_index,
- Bigram * system_bigram,
- Bigram * user_bigram){
- m_options = options;
- m_pinyin_table = pinyin_table;
- m_phrase_index = phrase_index;
- m_system_bigram = system_bigram;
- m_user_bigram = user_bigram;
-
- m_winner_tree = new WinnerTree;
- m_steps_index = g_ptr_array_new();
- m_steps_content = g_ptr_array_new();
- m_table_cache = g_array_new(FALSE, TRUE, sizeof(PhraseIndexRanges));
- g_array_set_size(m_table_cache, 1);
-}
-
-PinyinLookup::~PinyinLookup(){
- if ( m_winner_tree )
- delete m_winner_tree;
- m_winner_tree = NULL;
-
- //free resources
- for ( size_t i = 0; i < m_table_cache->len; ++i){
- PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
- m_phrase_index->destroy_ranges(*ranges);
- }
-
- g_array_free(m_table_cache, TRUE);
-
- clear_steps(m_steps_index, m_steps_content);
- g_ptr_array_free(m_steps_index, TRUE);
- g_ptr_array_free(m_steps_content, TRUE);
-}
-
-size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){
- /* free resources */
- for ( size_t i = 0; i < m_table_cache->len; ++i){
- PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
- m_phrase_index->destroy_ranges(*ranges);
- }
-
- ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data;
- pinyin_keys += nstep;
- g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1);
-
- int len, total_len = std_lite::min(total_pinyin, MAX_PHRASE_LENGTH);
-
- for ( len = 1; len <= total_len; ++len){
- PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, len);
- m_phrase_index->prepare_ranges(*ranges);
- int result = m_pinyin_table->search(len, pinyin_keys, *ranges);
- if (!( result & SEARCH_CONTINUED)){
- ++len;
- break;
- }
- }
- g_array_set_size(m_table_cache, std_lite::min(len, MAX_PHRASE_LENGTH + 1));
- return m_table_cache->len - 1;
-}
-
-bool PinyinLookup::get_best_match(TokenVector prefixes,
- ChewingKeyVector keys,
- CandidateConstraints constraints,
- MatchResults & results){
- //g_array_set_size(results, 0);
-
- m_constraints = constraints;
- m_keys = keys;
- int nstep = keys->len + 1;
-
- clear_steps(m_steps_index, m_steps_content);
-
- //add null start step
- g_ptr_array_set_size(m_steps_index, nstep);
- g_ptr_array_set_size(m_steps_content, nstep);
-
- for ( int i = 0; i < nstep; ++i ){
- //initialize m_steps_index
- g_ptr_array_index(m_steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal);
- //initialize m_steps_content
- g_ptr_array_index(m_steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t));
- }
-
- for ( int i = 0; i < prefixes->len; ++i ){
- phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
- lookup_key_t initial_key = token;
- lookup_value_t initial_value(log(1));
- initial_value.m_handles[1] = token;
-
- GArray * initial_step_content = (GArray *) g_ptr_array_index(m_steps_content, 0);
- initial_step_content = g_array_append_val(initial_step_content, initial_value);
- GHashTable * initial_step_index = (GHashTable *) g_ptr_array_index(m_steps_index, 0);
- g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), GUINT_TO_POINTER(initial_step_content->len - 1));
- }
-
- /* begin the Viterbi beam search. */
- for ( int i = 0; i < nstep - 1; ++i ){
- LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, i);
- IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step);
- size_t npinyin = prepare_table_cache(i, keys->len - i);
- search_bigram(iter, i, npinyin),
- search_unigram(iter, i, npinyin);
- delete iter;
- }
- return final_step(results);
-}
-
-bool PinyinLookup::search_unigram(IBranchIterator * iter, int nstep, int npinyin){
- lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep);
- if ( CONSTRAINT_NOSEARCH == constraint->m_type )
- return false;
- GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep);
- if ( 0 == lookup_content->len )
- return false;
- lookup_value_t max_step = iter->max();
- if ( CONSTRAINT_ONESTEP == constraint->m_type){
- return unigram_gen_next_step(nstep, &max_step, constraint->m_token);
- }
- if ( NO_CONSTRAINT == constraint->m_type ){
- bool found = false;
- for ( size_t i = 1; i < m_table_cache->len && i <= MAX_PHRASE_LENGTH; ++i){
- lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1);
- if ( constraint->m_type == CONSTRAINT_NOSEARCH )
- break;
- PhraseIndexRanges * ranges = &g_array_index(m_table_cache,PhraseIndexRanges, i);
- for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
- GArray * array = (*ranges)[m];
- if ( !array ) continue;
- for ( size_t n = 0; n < array->len; ++n){
- PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
- for ( phrase_token_t token = range->m_range_begin;
- token != range->m_range_end; ++token){
- found = unigram_gen_next_step(nstep, &max_step, token)|| found;
- }
- }
- }
- }
- return found;
- }
- return false;
-}
-
-
-bool PinyinLookup::search_bigram(IBranchIterator * iter,
- int nstep, int npinyin){
- lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep);
- if ( CONSTRAINT_NOSEARCH == constraint->m_type )
- return false;
-
- bool found = false;
- BigramPhraseArray bigram_phrase_items = g_array_new(FALSE, FALSE,
- sizeof(BigramPhraseItem));
- while ( iter->has_next() ){
- lookup_value_t cur_step = iter->next();
- //printf("token:%d\t%d\n", cur_step.m_handles[0], cur_step.m_handles[1]);
- phrase_token_t index_token = cur_step.m_handles[1];
- SingleGram * system, * user;
- m_system_bigram->load(index_token, system);
- m_user_bigram->load(index_token, user);
-
- if ( !merge_single_gram(&m_merged_single_gram, system, user) )
- continue;
-
- if ( CONSTRAINT_ONESTEP == constraint->m_type ){
- phrase_token_t token = constraint->m_token;
-
- guint32 freq;
- if( m_merged_single_gram.get_freq(token, freq) ){
- guint32 total_freq;
- m_merged_single_gram.get_total_freq(total_freq);
- gfloat bigram_poss = freq / (gfloat) total_freq;
- found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found;
- }
- }
-
- if ( NO_CONSTRAINT == constraint->m_type ){
- for ( size_t i = 1; i < m_table_cache->len
- && i <= MAX_PHRASE_LENGTH; ++i ){
- lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1);
- if ( constraint->m_type == CONSTRAINT_NOSEARCH )
- break;
-
- PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
- for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
- GArray * array = (*ranges)[m];
- if ( !array ) continue;
-
- for ( size_t n = 0; n < array->len; ++n){
- PhraseIndexRange * range =
- &g_array_index(array, PhraseIndexRange, n);
-
- g_array_set_size(bigram_phrase_items, 0);
- m_merged_single_gram.search(range, bigram_phrase_items);
- for( size_t k = 0; k < bigram_phrase_items->len;
- ++k){
- BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
- found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found;
- }
- }
- }
- }
- }
- if (system)
- delete system;
- if (user)
- delete user;
- }
- g_array_free(bigram_phrase_items, TRUE);
- return found;
-}
-
-
-bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){
- ChewingKey * pinyinkeys = ((ChewingKey *)m_keys->data) + nstep;
- if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
- return false;
- size_t phrase_length = m_cache_phrase_item.get_phrase_length();
- gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
- m_phrase_index->get_phrase_index_total_freq();
- if ( elem_poss < DBL_EPSILON )
- return false;
- gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyinkeys);
- if (pinyin_poss < FLT_EPSILON )
- return false;
- lookup_value_t next_step;
- next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
- next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
- next_step.m_last_step = nstep;
-
- return save_next_step(nstep + phrase_length, cur_step, &next_step);
-}
-
-bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){
- ChewingKey * pinyinkeys = ((ChewingKey *)m_keys->data) + nstep;
- if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
- return false;
- size_t phrase_length = m_cache_phrase_item.get_phrase_length();
- gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
- (gdouble) m_phrase_index->get_phrase_index_total_freq();
-
- if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
- return false;
- gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyinkeys);
- if ( pinyin_poss < FLT_EPSILON )
- return false;
- lookup_value_t next_step;
- next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
- next_step.m_poss = cur_step->m_poss +
- log(( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) *pinyin_poss);
- next_step.m_last_step = nstep;
-
- return save_next_step(nstep + phrase_length, cur_step, &next_step);
-}
-
-bool PinyinLookup::save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step){
- lookup_key_t next_key = next_step->m_handles[1];
- GHashTable * next_lookup_index = (GHashTable *) g_ptr_array_index(m_steps_index, next_step_pos);
- GArray * next_lookup_content = (GArray *) g_ptr_array_index(m_steps_content, next_step_pos);
-
- gpointer key, value;
- gboolean lookup_result = g_hash_table_lookup_extended(next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
- size_t step_index = GPOINTER_TO_UINT(value);
- if ( !lookup_result ){
- g_array_append_val(next_lookup_content, *next_step);
- g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1));
- return true;
- }else{
- lookup_value_t * orig_next_value = &g_array_index(next_lookup_content, lookup_value_t,step_index);
- if ( orig_next_value->m_poss < next_step->m_poss) {
- orig_next_value->m_handles[0] = next_step->m_handles[0];
- assert(orig_next_value->m_handles[1] == next_step->m_handles[1]);
- orig_next_value->m_poss = next_step->m_poss;
- orig_next_value->m_last_step = next_step->m_last_step;
- return true;
- }
- return false;
- }
-}
-
-bool PinyinLookup::final_step(MatchResults & results){
- //reset results
- g_array_set_size(results, m_steps_content->len - 1);
- for ( size_t i = 0 ; i < results->len ; ++i){
- phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
- *token = null_token;
- }
- //find max element
- size_t last_step_pos = m_steps_content->len - 1;
-
- GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos);
- if ( last_step_array->len == 0 )
- return false;
- lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0);
- for ( size_t i = 1; i < last_step_array->len; ++i){
- lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i);
- if ( cur_value->m_poss > max_value->m_poss )
- max_value = cur_value;
- }
-
- //backtracing
- while( true ){
- int cur_step_pos = max_value->m_last_step;
- if ( -1 == cur_step_pos )
- break;
-
- phrase_token_t * token = &g_array_index(results, phrase_token_t, cur_step_pos);
- *token = max_value->m_handles[1];
-
- phrase_token_t last_token = max_value->m_handles[0];
-
-
- GHashTable * lookup_step_index = (GHashTable *)g_ptr_array_index(m_steps_index, cur_step_pos);
- gpointer key, value;
- gboolean result = g_hash_table_lookup_extended(lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
- if (!result)
- return false;
- GArray * lookup_step_content = (GArray *)g_ptr_array_index(m_steps_content, cur_step_pos);
-
- max_value = &g_array_index(lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
- }
-
- //no need to reverse the result
-
- return true;
-}
-
-bool PinyinLookup::train_result2(ChewingKeyVector keys,
- CandidateConstraints constraints,
- MatchResults results) {
- const guint32 initial_seed = 23 * 15;
- const guint32 expand_factor = 2;
- const guint32 unigram_factor = 7;
- const guint32 pinyin_factor = 1;
- const guint32 ceiling_seed = 23 * 15 * 64;
-
- /* begin training based on constraints and results. */
- bool train_next = false;
- ChewingKey * pinyin_keys = (ChewingKey *) keys->data;
-
- phrase_token_t last_token = sentence_start;
- /* constraints->len + 1 == results->len */
- for (size_t i = 0; i < constraints->len; ++i) {
- phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
- if (null_token == *token)
- continue;
-
- lookup_constraint_t * constraint = &g_array_index
- (constraints, lookup_constraint_t, i);
- if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) {
- if (CONSTRAINT_ONESTEP == constraint->m_type) {
- assert(*token == constraint->m_token);
- train_next = true;
- } else {
- train_next = false;
- }
-
- guint32 seed = initial_seed;
- /* train bi-gram first, and get train seed. */
- if (last_token) {
- SingleGram * user = NULL;
- m_user_bigram->load(last_token, user);
-
- guint32 total_freq = 0;
- if (!user) {
- user = new SingleGram;
- }
- assert(user->get_total_freq(total_freq));
-
- guint32 freq = 0;
- /* compute train factor */
- if (!user->get_freq(*token, freq)) {
- assert(user->insert_freq(*token, 0));
- seed = initial_seed;
- } else {
- seed = std_lite::max(freq, initial_seed);
- seed *= expand_factor;
- seed = std_lite::min(seed, ceiling_seed);
- }
-
- /* protect against total_freq overflow */
- if (seed > 0 && total_freq > total_freq + seed)
- goto next;
-
- assert(user->set_total_freq(total_freq + seed));
- /* if total_freq is not overflow, then freq won't overflow. */
- assert(user->set_freq(*token, freq + seed));
- assert(m_user_bigram->store(last_token, user));
- next:
- if (user)
- delete user;
- }
-
- /* train uni-gram */
- m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
- m_cache_phrase_item.increase_pronunciation_possibility
- (m_options, pinyin_keys + i, seed * pinyin_factor);
- m_phrase_index->add_unigram_frequency
- (*token, seed * unigram_factor);
- }
- last_token = *token;
- }
- return true;
-}
-
-guint8 PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){
- if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
- return 0;
-
- size_t phrase_length = m_cache_phrase_item.get_phrase_length();
- if ( index + phrase_length > constraints->len )
- return 0;
-
- for ( size_t i = index; i < index + phrase_length ; ++i ){
- clear_constraint(constraints, i);
- }
-
- lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index);
- constraint->m_type = CONSTRAINT_ONESTEP;
- constraint->m_token = token;
-
- for (size_t i = 1; i < phrase_length; ++i){
- constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
- constraint->m_type = CONSTRAINT_NOSEARCH;
- constraint->m_constraint_step = index;
- }
- return phrase_length;
-}
-
-bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t index){
- if ( index < 0 || index >= constraints->len )
- return false;
- lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index);
- if (constraint->m_type == NO_CONSTRAINT)
- return false;
- if (constraint->m_type == CONSTRAINT_NOSEARCH){
- index = constraint->m_constraint_step;
- constraint = &g_array_index(constraints, lookup_constraint_t, index);
- }
-
- assert(constraint->m_type == CONSTRAINT_ONESTEP);
-
- phrase_token_t token = constraint->m_token;
- if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
- return false;
-
- size_t phrase_length = m_cache_phrase_item.get_phrase_length();
- for ( size_t i = 0; i < phrase_length; ++i){
- if ( index + i >= constraints->len )
- continue;
- constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
- constraint->m_type = NO_CONSTRAINT;
- }
- return true;
-}
-
-bool PinyinLookup::validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys){
- //resize constraints array
- size_t constraints_length = constraints->len;
- if ( keys->len > constraints_length ){
- g_array_set_size(constraints, keys->len);
- //initialize new element
- for( size_t i = constraints_length; i < keys->len; ++i){
- lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
- constraint->m_type = NO_CONSTRAINT;
- }
- }else if (keys->len < constraints_length ){
- g_array_set_size(constraints, keys->len);
- }
-
- ChewingKey * pinyin_keys = (ChewingKey *)keys->data;
-
- for ( size_t i = 0; i < constraints->len; ++i){
- lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
- if ( constraint->m_type == CONSTRAINT_ONESTEP ){
- phrase_token_t token = constraint->m_token;
- m_phrase_index->get_phrase_item(token, m_cache_phrase_item);
- size_t phrase_length = m_cache_phrase_item.get_phrase_length();
- //clear too long constraint
- if ( i + phrase_length > constraints->len ){
- clear_constraint(constraints, i);
- continue;
- }
- //clear invalidated pinyin
- gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys + i);
- if ( pinyin_poss < FLT_EPSILON ){
- clear_constraint(constraints, i);
- }
- }
- }
- return true;
-}
diff --git a/src/lookup/pinyin_lookup.h b/src/lookup/pinyin_lookup.h
deleted file mode 100644
index b23c39a..0000000
--- a/src/lookup/pinyin_lookup.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * libpinyin
- * Library to deal with pinyin.
- *
- * Copyright (C) 2006-2007 Peng Wu
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-#ifndef PINYIN_LOOKUP_H
-#define PINYIN_LOOKUP_H
-
-
-#include <float.h>
-#include <glib.h>
-#include "novel_types.h"
-#include "chewing_key.h"
-#include "phrase_index.h"
-#include "ngram.h"
-#include "lookup.h"
-
-
-namespace pinyin{
-
-class WinnerTree;
-
-/**
- * pinyin_lookup.h
- *
- * The definitions of pinyin lookup related classes and structs.
- *
- */
-
-enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH };
-
-struct lookup_constraint_t{
- /* current type of the step */
- constraint_type m_type;
-
- /* Note:
- * value of m_type:
- * NO_CONSTRAINT:
- * no values in the below union.
- * search all possible next words.
- * CONSTRAINT_ONESTEP:
- * m_token contains the next word.
- * only one word can be used to search for the next step,
- * use case for user selected candidates.
- * CONSTRAINT_NOSEARCH:
- * m_constraint_step contains the value
- * which points back to the CONSTRAINT_ONESTEP step.
- * no search is allowed for the current step.
- */
-
- union{
- phrase_token_t m_token;
- guint32 m_constraint_step; /* index of m_token */
- };
-};
-
-
-/**
- * IBranchIterator:
- *
- * The iterator to get the 32 highest values.
- *
- * Note: The winner tree for Viterbi beam search.
- *
- */
-class IBranchIterator{
-public:
- virtual ~IBranchIterator(){}
- virtual bool has_next() = 0;
- virtual lookup_value_t next() = 0;
- virtual lookup_value_t max() = 0;
-};
-
-/**
- * PinyinLookup:
- *
- * The pinyin lookup class to convert pinyin keys to guessed sentence.
- *
- */
-class PinyinLookup{
-private:
- static const gfloat bigram_lambda;
- static const gfloat unigram_lambda;
-
- PhraseItem m_cache_phrase_item;
- SingleGram m_merged_single_gram;
-protected:
- //saved varibles
- CandidateConstraints m_constraints;
- ChewingKeyVector m_keys;
-
- FacadeChewingTable * m_pinyin_table;
- FacadePhraseIndex * m_phrase_index;
- pinyin_option_t m_options;
- Bigram * m_system_bigram;
- Bigram * m_user_bigram;
-
- //internal step data structure
- GPtrArray * m_steps_index;
- /* Array of LookupStepIndex */
- GPtrArray * m_steps_content;
- /* Array of LookupStepContent */
-
- GArray * m_table_cache;
- /* Array of PhraseIndexRanges,
- * PhraseIndexRanges is an array of GArray of PhraseIndexRange,
- * indexed by phrase library (only contains enabled phrase libraries).
- */
-
- WinnerTree * m_winner_tree;
-
- size_t prepare_table_cache(int nstep, int total_pinyin);
-
- bool search_unigram(IBranchIterator * iter, int nstep, int npinyin);
- bool search_bigram(IBranchIterator * iter, int nstep, int npinyin);
-
- bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
- bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss);
-
- bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step);
-
- bool final_step(MatchResults & results);
-public:
- /**
- * PinyinLookup::PinyinLookup:
- * @options: the pinyin options.
- * @pinyin_table: the pinyin table.
- * @phrase_index: the phrase index.
- * @system_bigram: the system bi-gram.
- * @user_bigram: the user bi-gram.
- *
- * The constructor of the PinyinLookup.
- *
- */
- PinyinLookup(pinyin_option_t options, FacadeChewingTable * pinyin_table,
- FacadePhraseIndex * phrase_index, Bigram * system_bigram,
- Bigram * user_bigram);
-
- /**
- * PinyinLookup::~PinyinLookup:
- *
- * The destructor of the PinyinLookup.
- *
- */
- ~PinyinLookup();
-
- /**
- * PinyinLookup::set_options:
- * @options: the pinyin options.
- * @returns: whether the set operation is successful.
- *
- * Set the pinyin options of this PinyinLookup.
- *
- */
- bool set_options(pinyin_option_t options) {
- m_options = options;
- return true;
- }
-
- /**
- * PinyinLookup::get_best_match:
- * @prefixes: the phrase tokens before the guessed sentence.
- * @keys: the pinyin keys of the guessed sentence.
- * @constraints: the constraints on the guessed sentence.
- * @results: the guessed sentence in the form of the phrase tokens.
- * @returns: whether the guess operation is successful.
- *
- * Guess the best sentence according to user inputs.
- *
- */
- bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
-
- /**
- * PinyinLookup::train_result2:
- * @keys: the pinyin keys of the guessed sentence.
- * @constraints: the constraints on the guessed sentence.
- * @results: the guessed sentence in the form of the phrase tokens.
- * @returns: whether the train operation is successful.
- *
- * Self learning the guessed sentence based on the constraints.
- *
- */
- bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results);
-
- /**
- * PinyinLookup::convert_to_utf8:
- * @results: the guessed sentence in the form of the phrase tokens.
- * @result_string: the guessed sentence in the utf8 encoding.
- * @returns: whether the convert operation is successful.
- *
- * Convert the guessed sentence from the phrase tokens to the utf8 string.
- *
- */
- bool convert_to_utf8(MatchResults results,
- /* out */ char * & result_string)
- {
- return pinyin::convert_to_utf8(m_phrase_index, results,
- NULL, result_string);
- }
-
- /**
- * PinyinLookup::add_constraint:
- * @constraints: the constraints on the guessed sentence.
- * @index: the character offset in the guessed sentence.
- * @token: the phrase token in the candidate list chosen by user.
- * @returns: the number of the characters in the chosen token.
- *
- * Add one constraint to the constraints on the guessed sentence.
- *
- */
- guint8 add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token);
-
- /**
- * PinyinLookup::clear_constraint:
- * @constraints: the constraints on the guessed sentence.
- * @index: the character offset in the guessed sentence.
- * @returns: whether the clear operation is successful.
- *
- * Clear one constraint in the constraints on the guessed sentence.
- *
- */
- bool clear_constraint(CandidateConstraints constraints, size_t index);
-
- /**
- * PinyinLookup::validate_constraint:
- * @constraints: the constraints on the guessed sentence.
- * @keys: the pinyin keys of the guessed sentence.
- * @returns: whether the validate operation is successful.
- *
- * Validate the old constraints with the new pinyin keys.
- *
- */
- bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys);
-
-};
-
-};
-#endif