From f41d1fdf83408e042ab07925710a8913bad0c27c Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 3 Aug 2010 10:42:47 +0800 Subject: import from pinyin. --- src/lookup/Makefile.am | 30 +++ src/lookup/lookup.h | 144 +++++++++++ src/lookup/pinyin_lookup.cpp | 587 +++++++++++++++++++++++++++++++++++++++++++ src/lookup/winner_tree.cpp | 141 +++++++++++ src/lookup/winner_tree.h | 148 +++++++++++ 5 files changed, 1050 insertions(+) create mode 100644 src/lookup/Makefile.am create mode 100644 src/lookup/lookup.h create mode 100644 src/lookup/pinyin_lookup.cpp create mode 100644 src/lookup/winner_tree.cpp create mode 100644 src/lookup/winner_tree.h (limited to 'src/lookup') diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am new file mode 100644 index 0000000..2b7d21f --- /dev/null +++ b/src/lookup/Makefile.am @@ -0,0 +1,30 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CPPFLAGS@ + +noinst_HEADERS = lookup.h winner_tree.h + +noinst_PROGRAMS = + +noinst_LTLIBRARIES = liblookup.la + +liblookup_la_SOURCES = pinyin_lookup.cpp winner_tree.cpp diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h new file mode 100644 index 0000000..676c6ea --- /dev/null +++ b/src/lookup/lookup.h @@ -0,0 +1,144 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LOOKUP_H +#define LOOKUP_H + +#include +#include +#include "novel_types.h" +#include "pinyin_base.h" + +class WinnerTree; + +/** @file lookup.h + * @brief the definitions of lookup related classes and structs. + * Currently only contains pinyin lookup. + */ + +typedef phrase_token_t lookup_key_t; + +struct lookup_value_t{ + phrase_token_t m_handles[2]; + gfloat m_poss; + gint32 m_last_step; + lookup_value_t(gfloat poss = FLT_MAX){ + m_handles[0] = NULL; m_handles[1] = NULL; + m_poss = poss; + m_last_step = -1; + } +}; + +enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH }; + +struct lookup_constraint_t{ + constraint_type m_type; + union{ + phrase_token_t m_token; + guint32 m_constraint_step; /* index of m_token */ + }; +}; + +typedef GArray * CandidateConstraints; /* Array of lookup_constraint_t */ +typedef GArray * MatchResults; /* Array of phrase_token_t */ + +namespace novel{ +class PinyinLargeTable; +class FacadePhraseIndex; +class Bigram; +}; + +typedef GHashTable * LookupStepIndex; +/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */ +typedef GArray * LookupStepContent; /* array of lookup_value_t */ + + +class IBranchIterator{ +public: + virtual ~IBranchIterator(){} + virtual bool has_next() = 0; + virtual lookup_value_t next() = 0; + virtual lookup_value_t max() = 0; +}; + +class PinyinLookup{ +private: + static const gfloat bigram_lambda = LAMBDA_PARAMETER; + static const gfloat unigram_lambda = 1 - LAMBDA_PARAMETER; + + PhraseItem m_cache_phrase_item; +protected: + //saved varibles + CandidateConstraints m_constraints; + PinyinKeyVector m_keys; + + novel::PinyinLargeTable * m_pinyin_table; + novel::FacadePhraseIndex * m_phrase_index; + novel::PinyinCustomSettings * m_custom; + novel::Bigram * m_bigram; + + //internal step data structure + GPtrArray * m_steps_index; + /* Array of LookupStepIndex */ + GPtrArray * m_steps_content; + /* Array of LookupStepContent */ + + GArray * m_table_cache; + /* Array of PhraseIndexRanges */ + + WinnerTree * m_winner_tree; + + size_t prepare_table_cache(int nstep, int total_pinyin); + + bool search_unigram(IBranchIterator * iter, int nstep, int npinyin); + bool search_bigram(IBranchIterator * iter, int nstep, int npinyin); + + bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token); + bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss); + + bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step); + + bool final_step(MatchResults & results); +public: + PinyinLookup( PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram); + + ~PinyinLookup(); + + bool get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results); + + bool train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results); + + bool convert_to_utf8(MatchResults results, /* out */ char * & result_string); + + bool add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token); + + bool clear_constraint(CandidateConstraints constraints, size_t index); + + bool validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys); + + /* init pinyin table lookup array */ + bool prepare_pinyin_lookup(PhraseIndexRanges ranges); + /* destroy pinyin table lookup array */ + bool destroy_pinyin_lookup(PhraseIndexRanges ranges); +}; + +#endif diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp new file mode 100644 index 0000000..c335453 --- /dev/null +++ b/src/lookup/pinyin_lookup.cpp @@ -0,0 +1,587 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "stl_lite.h" +#include "novel_types.h" +#include "pinyin_base.h" +#include "pinyin_phrase.h" +#include "pinyin_large_table.h" +#include "phrase_index.h" +#include "ngram.h" +#include "lookup.h" +#include "winner_tree.h" + +const gfloat PinyinLookup::bigram_lambda; +const gfloat PinyinLookup::unigram_lambda; + +PinyinLookup::PinyinLookup(PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram){ + m_custom = custom; + m_pinyin_table = pinyin_table; + m_phrase_index = phrase_index; + m_bigram = bigram; + m_winner_tree = new WinnerTree; + m_steps_index = g_ptr_array_new(); + m_steps_content = g_ptr_array_new(); + m_table_cache = g_array_new(FALSE, TRUE, sizeof(PhraseIndexRanges)); + g_array_set_size(m_table_cache, 1); +} + +PinyinLookup::~PinyinLookup(){ + if ( m_winner_tree ) + delete m_winner_tree; + m_winner_tree = NULL; + //free resources + for ( size_t i = 0; i < m_table_cache->len; ++i){ + PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); + destroy_pinyin_lookup(*ranges); + } + //g_array_set_size(m_table_cache, 1); + g_array_free(m_table_cache, TRUE); + + //free m_steps_index + for ( size_t i = 0; i < m_steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(m_steps_index, i) = NULL; + } + g_ptr_array_free(m_steps_index, TRUE); + + //free m_steps_content + for ( size_t i = 0; i < m_steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(m_steps_content, i) = NULL; + } + g_ptr_array_free(m_steps_content, TRUE); + +} + +bool PinyinLookup::prepare_pinyin_lookup(PhraseIndexRanges ranges){ + //memset(ranges, 0, sizeof(ranges)); + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){ + GArray * & array = ranges[i]; + assert(NULL == array); + if (m_phrase_index->m_sub_phrase_indices[i]){ + array = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange)); + } + } + return true; +} + +bool PinyinLookup::destroy_pinyin_lookup(PhraseIndexRanges ranges){ + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ + GArray * & array = ranges[i]; + if ( array ) + g_array_free(array, TRUE); + array = NULL; + } + return true; +} + +size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){ + //free resources + for ( size_t i = 0; i < m_table_cache->len; ++i){ + PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); + destroy_pinyin_lookup(*ranges); + } + //g_array_set_size(m_table_cache, 1); + PinyinKey * pinyin_keys = (PinyinKey *)m_keys->data; + pinyin_keys += nstep; + //init resources + g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1); + size_t len; + for ( len = 1; len <= total_pinyin && len <= MAX_PHRASE_LENGTH; ++len){ + PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, len); + prepare_pinyin_lookup(*ranges); + int result = m_pinyin_table->search(len, pinyin_keys, *ranges); + if (!( result & SEARCH_CONTINUED)){ + ++len; + break; + } + } + g_array_set_size(m_table_cache, std_lite::min(len, (size_t) MAX_PHRASE_LENGTH + 1)); + return m_table_cache->len - 1; +} + +bool PinyinLookup::get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){ + //g_array_set_size(results, 0); + + m_constraints = constraints; + m_keys = keys; + int nstep = keys->len + 1; + + //free m_steps_index + for ( size_t i = 0; i < m_steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(m_steps_index, i) = NULL; + } + + //free m_steps_content + for ( size_t i = 0; i < m_steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(m_steps_content, i) = NULL; + } + + //add null start step + g_ptr_array_set_size(m_steps_index, nstep); + g_ptr_array_set_size(m_steps_content, nstep); + + for ( size_t i = 0 ; i < nstep; ++i ){ + //initialize m_steps_index + g_ptr_array_index(m_steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal); + //initialize m_steps_content + g_ptr_array_index(m_steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t)); + } + + lookup_key_t initial_key = sentence_start; + lookup_value_t initial_value(log(1)); + initial_value.m_handles[1] = sentence_start; + GArray * initial_step_content = (GArray *) g_ptr_array_index(m_steps_content, 0); + initial_step_content = g_array_append_val(initial_step_content, initial_value); + GHashTable * initial_step_index = (GHashTable *) g_ptr_array_index(m_steps_index, 0); + g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), GUINT_TO_POINTER(initial_step_content->len - 1)); + +#if 0 + LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, 0); + IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step); + size_t npinyin = prepare_table_cache(0, keys->len); + search_unigram(iter, 0, npinyin); + delete iter; +#endif + + for ( size_t i = 0 ; i < nstep - 1 ; ++i ){ + LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, i); + IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step); + size_t npinyin = prepare_table_cache(i, keys->len - i); + search_bigram(iter, i, npinyin), + search_unigram(iter, i, npinyin); + delete iter; + } + return final_step(results); +} + +bool PinyinLookup::search_unigram(IBranchIterator * iter, int nstep, int npinyin){ + lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep); + if ( CONSTRAINT_NOSEARCH == constraint->m_type ) + return false; + GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return false; + lookup_value_t max_step = iter->max(); + if ( CONSTRAINT_ONESTEP == constraint->m_type){ + return unigram_gen_next_step(nstep, &max_step, constraint->m_token); + } + if ( NO_CONSTRAINT == constraint->m_type ){ + bool found = false; + for ( size_t i = 1; i < m_table_cache->len && i <= MAX_PHRASE_LENGTH; ++i){ + lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1); + if ( constraint->m_type != NO_CONSTRAINT ) + continue; + PhraseIndexRanges * ranges = &g_array_index(m_table_cache,PhraseIndexRanges, i); + for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ + GArray * array = (*ranges)[m]; + if ( !array ) continue; + for ( size_t n = 0; n < array->len; ++n){ + PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n); + for ( phrase_token_t token = range->m_range_begin; + token != range->m_range_end; ++token){ + found = unigram_gen_next_step(nstep, &max_step, token)|| found; + } + } + } + } + return found; + } + return false; +} + + +bool PinyinLookup::search_bigram(IBranchIterator * iter, + int nstep, int npinyin){ + lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep); + if ( CONSTRAINT_NOSEARCH == constraint->m_type ) + return false; + GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep); + + bool found = false; + BigramPhraseArray bigram_phrase_items = g_array_new(FALSE, FALSE, + sizeof(BigramPhraseItem)); + while ( iter->has_next() ){ + lookup_value_t cur_step = iter->next(); + //printf("token:%d\t%d\n", cur_step.m_handles[0], cur_step.m_handles[1]); + phrase_token_t index_token = cur_step.m_handles[1]; + SingleGram * system, * user; + m_bigram->load(index_token, system, user); + if ( system && user ){ + guint32 total_freq; + assert(user->get_total_freq(total_freq)); + assert(system->set_total_freq(total_freq)); + } + if ( CONSTRAINT_ONESTEP == constraint->m_type ){ + phrase_token_t token = constraint->m_token; + if ( system ){ + guint32 freq; + if( system->get_freq(token, freq) ){ + guint32 total_freq; + system->get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found; + } + } + if ( user ){ + guint32 freq; + if( user->get_freq(token, freq)){ + guint32 total_freq; + user->get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found; + } + } + } + + if ( NO_CONSTRAINT == constraint->m_type ){ + for ( size_t i = 1; i < m_table_cache->len + && i <= MAX_PHRASE_LENGTH;++i ){ + lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1); + if ( constraint->m_type != NO_CONSTRAINT ) + continue; + + PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i); + for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ + GArray * array = (*ranges)[m]; + if ( !array ) continue; + for ( size_t n = 0; n < array->len; ++n){ + PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n); + if (system){ + g_array_set_size(bigram_phrase_items, 0); + system->search(range, bigram_phrase_items); + for( size_t k = 0; k < bigram_phrase_items->len; + ++k){ + BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k); + found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found; + } + } + if (user){ + g_array_set_size(bigram_phrase_items, 0); + user->search(range, bigram_phrase_items); + for( size_t k = 0; k < bigram_phrase_items->len; + ++k){ + BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k); + found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found; + } + } + } + } + } + } + if (system) + delete system; + if (user) + delete user; + } + g_array_free(bigram_phrase_items, TRUE); + return found; +} + + +bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){ + PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep; + if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gfloat elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat) + m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < FLT_EPSILON ) + return false; + gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys); + if (pinyin_poss < FLT_EPSILON ) + return false; + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){ + PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep; + if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gfloat unigram_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat) + m_phrase_index->get_phrase_index_total_freq(); + if ( bigram_poss < FLT_EPSILON && unigram_poss < FLT_EPSILON ) + return false; + gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys); + if ( pinyin_poss < FLT_EPSILON ) + return false; + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + + log(( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) *pinyin_poss); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup::save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step){ + lookup_key_t next_key = next_step->m_handles[1]; + GHashTable * next_lookup_index = (GHashTable *) g_ptr_array_index(m_steps_index, next_step_pos); + GArray * next_lookup_content = (GArray *) g_ptr_array_index(m_steps_content, next_step_pos); + + gpointer key, value; + gboolean lookup_result = g_hash_table_lookup_extended(next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); + size_t step_index = GPOINTER_TO_UINT(value); + if ( !lookup_result ){ + g_array_append_val(next_lookup_content, *next_step); + g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1)); + return true; + }else{ + lookup_value_t * orig_next_value = &g_array_index(next_lookup_content, lookup_value_t,step_index); + if ( orig_next_value->m_poss < next_step->m_poss) { + orig_next_value->m_handles[0] = next_step->m_handles[0]; + assert(orig_next_value->m_handles[1] == next_step->m_handles[1]); + orig_next_value->m_poss = next_step->m_poss; + orig_next_value->m_last_step = next_step->m_last_step; + return true; + } + return false; + } +} + +bool PinyinLookup::final_step(MatchResults & results){ + //reset results + g_array_set_size(results, m_steps_content->len); + for ( size_t i = 0 ; i < m_steps_content->len ; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + *token = NULL; + } + //find max element + size_t last_step_pos = m_steps_content->len - 1; + + GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos); + if ( last_step_array->len == 0 ) + return false; + lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0); + for ( size_t i = 1; i < last_step_array->len; ++i){ + lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + //backtracing + while( true ){ + int cur_step_pos = max_value->m_last_step; + if ( -1 == cur_step_pos ) + break; + + phrase_token_t * token = &g_array_index(results, phrase_token_t, cur_step_pos); + *token = max_value->m_handles[1]; + + phrase_token_t last_token = max_value->m_handles[0]; + + + GHashTable * lookup_step_index = (GHashTable *)g_ptr_array_index(m_steps_index, cur_step_pos); + gpointer key, value; + gboolean result = g_hash_table_lookup_extended(lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); + if (!result) + return false; + GArray * lookup_step_content = (GArray *)g_ptr_array_index(m_steps_content, cur_step_pos); + + max_value = &g_array_index(lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); + } + + //no need to reverse the result + + return true; +} + +bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){ + bool train_next = false; + PinyinKey * pinyin_keys = (PinyinKey *)keys->data; + //TODO: verify the new training method. + phrase_token_t last_token = sentence_start; + // constraints->len + 1 == results->len + guint32 train_factor = 23; + for ( size_t i = 0; i < constraints->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if ( *token == NULL ) + continue; + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + if (train_next || CONSTRAINT_ONESTEP == constraint->m_type ){ + if (CONSTRAINT_ONESTEP == constraint->m_type){ + assert(*token == constraint->m_token); + train_next = true; + }else{ + train_next = false; + } + //add pi-gram frequency + //std::cout<<"i:"<get_phrase_item(*token, m_cache_phrase_item); + m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor); + m_phrase_index->add_unigram_frequency(*token, train_factor); + if ( last_token ){ + SingleGram * system, *user; + m_bigram->load(last_token, system, user); + guint32 total_freq; + if ( !user ){ + total_freq = 0; + if ( system ) + assert(system->get_total_freq(total_freq)); + user = new SingleGram; + user->set_total_freq(total_freq); + } + guint32 freq = 0; + if ( !user->get_freq(*token, freq)){ + if (system) system->get_freq(*token, freq); + user->set_freq(*token, freq); + } + assert(user->get_total_freq(total_freq)); + //protect against total_freq overflow. + if ( train_factor > 0 && total_freq > total_freq + train_factor) + goto next; + assert(user->set_total_freq(total_freq + train_factor)); + assert(user->get_freq(*token, freq)); + //if total_freq is not overflow, then freq won't overflow. + assert(user->set_freq(*token, freq + train_factor)); + assert(m_bigram->store(last_token, user)); + next: + if (system) delete system; + if (user) delete user; + } + } + last_token = *token; + } + return true; +} + +bool PinyinLookup::convert_to_utf8(MatchResults results, /* out */ char * & result_string){ + result_string = g_strdup(""); + for ( size_t i = 0; i < results->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if ( NULL == *token ) + continue; + m_phrase_index->get_phrase_item(*token, m_cache_phrase_item); + utf16_t buffer[MAX_PHRASE_LENGTH]; + m_cache_phrase_item.get_phrase_string(buffer); + guint8 length = m_cache_phrase_item.get_phrase_length(); + gchar * phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL); + char * tmp = result_string; + result_string = g_strconcat(result_string, phrase, NULL); + g_free(tmp); g_free(phrase); + } + return true; +} + +bool PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){ + if ( !m_phrase_index->get_phrase_item(token, m_cache_phrase_item) ) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + if ( index + phrase_length > constraints->len ) + return false; + + for ( size_t i = index; i < index + phrase_length ; ++i ){ + clear_constraint(constraints, i); + } + + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index); + constraint->m_type = CONSTRAINT_ONESTEP; + constraint->m_token = token; + + for (size_t i = 1; i < phrase_length; ++i){ + constraint = &g_array_index(constraints, lookup_constraint_t, index + i); + constraint->m_type = CONSTRAINT_NOSEARCH; + constraint->m_constraint_step = index; + } + return true; +} + +bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t index){ + if ( index < 0 || index >= constraints->len ) + return false; + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index); + if (constraint->m_type == NO_CONSTRAINT) + return false; + if (constraint->m_type == CONSTRAINT_NOSEARCH){ + index = constraint->m_constraint_step; + constraint = &g_array_index(constraints, lookup_constraint_t, index); + } + + assert(constraint->m_type == CONSTRAINT_ONESTEP); + + phrase_token_t token = constraint->m_token; + if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + for ( size_t i = 0; i < phrase_length; ++i){ + if ( index + i >= constraints->len ) + continue; + constraint = &g_array_index(constraints, lookup_constraint_t, index + i); + constraint->m_type = NO_CONSTRAINT; + } + return true; +} + +bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys){ + //resize constraints array + size_t constraints_length = constraints->len; + if ( m_parsed_keys->len > constraints_length ){ + g_array_set_size(constraints, m_parsed_keys->len); + //initialize new element + for( size_t i = constraints_length; i < m_parsed_keys->len; ++i){ + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + }else if (m_parsed_keys->len < constraints_length ){ + g_array_set_size(constraints, m_parsed_keys->len); + } + + PinyinKey * pinyin_keys = (PinyinKey *)m_parsed_keys->data; + + for ( size_t i = 0; i < constraints->len; ++i){ + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + if ( constraint->m_type == CONSTRAINT_ONESTEP ){ + phrase_token_t token = constraint->m_token; + m_phrase_index->get_phrase_item(token, m_cache_phrase_item); + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + //clear too long constraint + if ( i + phrase_length > constraints->len ){ + clear_constraint(constraints, i); + continue; + } + //clear invalidated pinyin + gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyin_keys + i); + if ( pinyin_poss < FLT_EPSILON ){ + clear_constraint(constraints, i); + } + } + } + return true; +} diff --git a/src/lookup/winner_tree.cpp b/src/lookup/winner_tree.cpp new file mode 100644 index 0000000..248a749 --- /dev/null +++ b/src/lookup/winner_tree.cpp @@ -0,0 +1,141 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "memory_chunk.h" +#include "phrase_index.h" +#include "lookup.h" +#include "winner_tree.h" + +WinnerTreeBranchIterator::WinnerTreeBranchIterator(WinnerTree & tree) + :m_tree(tree), m_counter(0){ + m_max_value = m_tree.m_items[m_tree.get_winner()]; + m_counter = 0; +} + +bool WinnerTreeBranchIterator::has_next(){ + if ( m_counter >= m_tree.m_tree_size) + return false; + return m_counter < nbranch; +} + +lookup_value_t WinnerTreeBranchIterator::next(){ + int winner = m_tree.get_winner(); + lookup_value_t tmp = m_tree.m_items[winner]; + m_tree.m_items[winner].m_poss = + - FLT_MAX; + m_tree.replay(winner); + ++m_counter; + return tmp; +} + +void WinnerTree::play(int p, int lc, int rc){ + m_tree[p] = winner(lc, rc); + //continue competition + while( p > 1 && p % 2) { + m_tree[p/2] = winner( m_tree[p - 1], m_tree[p]); + p/=2; + } +} + + +bool WinnerTree::initialize(LookupStepContent cur_step){ + size_t size = cur_step->len; + if ( size > m_max_tree_size ){ + init(size); + } + assert(size > nbranch); + m_tree_size = size; + + //initialize array tree + int nindex = 1; + + for( size_t i = 0; i < cur_step->len ; ++i){ + lookup_value_t * cur_value = &g_array_index(cur_step, lookup_value_t, i); + m_items[nindex++] = *cur_value; + } + + //compute s = 2 ^ log(n -1) + int i, s; + for( s = 1; 2 * s <= m_tree_size - 1; s += s); + + m_low_ext = 2 * (m_tree_size - s); + m_offset = 2 * s - 1; + + //compute outside nodes + for( i = 2; i <= m_low_ext; i += 2) + play((m_offset + i)/2, i - 1, i); + //compute other nodes + if ( m_tree_size % 2){ + play( m_tree_size / 2, m_tree[m_tree_size - 1], m_low_ext +1); + i = m_low_ext + 3; + }else i = m_low_ext + 2; + + //compute others + for( ; i <= m_tree_size; i += 2) + play( (i - m_low_ext + m_tree_size - 1) / 2, i - 1, i); + return true; +} + +void WinnerTree::replay(int i){ + assert( 1 <= i && i <= m_tree_size); + + int p; //compete node + int lc; //p's left child + int rc; //p's right child + + //first compete + if ( i <= m_low_ext){ + p = (m_offset + i) / 2; + lc = 2 * p - m_offset; + rc = lc + 1; + }else{ + p = (i - m_low_ext + m_tree_size -1) / 2; + if ( 2 * p == m_tree_size - 1 ){ + lc = m_tree[2*p]; + rc = i; + }else{ + lc = 2 * p - m_tree_size + 1 + m_low_ext; + rc = lc + 1; + } + } + + m_tree[p] = winner(lc, rc); + + //added by wupeng + if ( ( p | 0x01 ) == m_tree_size ){ + p /= 2; + m_tree[p] = winner( m_tree[2 * p], m_low_ext + 1 ); + } + + //compute others + p /= 2; + for( ; p >= 1 ; p /= 2) + m_tree[p] = winner( m_tree[2 * p], m_tree[2 * p + 1]); +} + +int WinnerTree::winner(int lc, int rc){ + return m_items[lc].m_poss > m_items[rc].m_poss ? + lc : rc; +} diff --git a/src/lookup/winner_tree.h b/src/lookup/winner_tree.h new file mode 100644 index 0000000..262f196 --- /dev/null +++ b/src/lookup/winner_tree.h @@ -0,0 +1,148 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LOOKUP_WINNER_TREE_H +#define LOOKUP_WINNER_TREE_H + +#include +#include "lookup.h" + +const int nbranch = 32; + +class DirectBranchIterator: public IBranchIterator{//for nitem <= nbranch + LookupStepContent m_step_content; + size_t m_iter_pos; +public: + //Constructor + DirectBranchIterator(LookupStepContent step_content) + :m_step_content(step_content) + { m_iter_pos = 0; } + + //Destructor + virtual ~DirectBranchIterator(){} + + //Member Function + bool has_next(){ + return m_iter_pos != m_step_content->len; + } + + lookup_value_t next(){ + lookup_value_t * tmp = &g_array_index(m_step_content, + lookup_value_t, m_iter_pos); + ++m_iter_pos; + return *tmp; + } + + lookup_value_t max(){ + lookup_value_t * max_value = &g_array_index(m_step_content, lookup_value_t, 0); + for ( size_t i = 1 ; i < m_step_content->len; ++i){ + lookup_value_t * cur_value = &g_array_index(m_step_content, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + return *max_value; + } +}; + +class WinnerTree; + +class WinnerTreeBranchIterator: public IBranchIterator{//for nitem <= nbranch + WinnerTree& m_tree; + int m_counter; + lookup_value_t m_max_value; +public: + //Constructor + WinnerTreeBranchIterator(WinnerTree & tree); + + //Destructor + virtual ~WinnerTreeBranchIterator(){} + + //Member Function + bool has_next(); + + lookup_value_t next(); + + lookup_value_t max(){ + return m_max_value; + } + +}; + +class WinnerTree{ + friend class WinnerTreeBranchIterator; +private: + size_t m_max_tree_size; // maxsize + int m_tree_size; // n + int m_low_ext; + int m_offset; + int * m_tree; + MemoryChunk m_buffer; + MemoryChunk m_tree_buffer; + lookup_value_t * m_items; + + int winner(int lc, int rc); + + void play(int p, int lc, int rc); + + void init(int tree_size){ + m_max_tree_size = tree_size; + //data buffer + m_buffer.set_size( sizeof(lookup_value_t) * (tree_size + 1) ); + m_items = (lookup_value_t *) m_buffer.begin(); + + //tree item buffer + m_tree_buffer.set_size( sizeof(int) * m_max_tree_size); + m_tree = (int * ) m_tree_buffer.begin(); + m_tree_size = 0; + } + +public: + + //Constructor + WinnerTree(int tree_size = 10){ + init(tree_size); + } + + //Destructor + ~WinnerTree() { } + + //need delete this + IBranchIterator* get_iterator(LookupStepContent step){ + if ( step->len <= nbranch ) + return new DirectBranchIterator(step); + //TODO:another situation > nbranch + assert(initialize(step)); + return new WinnerTreeBranchIterator(*this); + } + +protected: + + int get_winner() const { + return (m_tree_size)? m_tree[1] : 0; + } + + //Member Function + bool initialize(LookupStepContent cur_step); + void replay(int i); +}; + +#endif -- cgit