summaryrefslogtreecommitdiffstats
path: root/src/lookup
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2010-08-03 10:42:47 +0800
committerPeng Wu <alexepico@gmail.com>2010-08-03 10:42:47 +0800
commitf41d1fdf83408e042ab07925710a8913bad0c27c (patch)
tree1757833ac4cdd0830834d2f9ef92be07c0bc1a5b /src/lookup
parent34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff)
downloadlibpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz
libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz
libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip
import from pinyin.
Diffstat (limited to 'src/lookup')
-rw-r--r--src/lookup/Makefile.am30
-rw-r--r--src/lookup/lookup.h144
-rw-r--r--src/lookup/pinyin_lookup.cpp587
-rw-r--r--src/lookup/winner_tree.cpp141
-rw-r--r--src/lookup/winner_tree.h148
5 files changed, 1050 insertions, 0 deletions
diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am
new file mode 100644
index 0000000..2b7d21f
--- /dev/null
+++ b/src/lookup/Makefile.am
@@ -0,0 +1,30 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+INCLUDES = -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ @GLIB2_CPPFLAGS@
+
+noinst_HEADERS = lookup.h winner_tree.h
+
+noinst_PROGRAMS =
+
+noinst_LTLIBRARIES = liblookup.la
+
+liblookup_la_SOURCES = pinyin_lookup.cpp winner_tree.cpp
diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h
new file mode 100644
index 0000000..676c6ea
--- /dev/null
+++ b/src/lookup/lookup.h
@@ -0,0 +1,144 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LOOKUP_H
+#define LOOKUP_H
+
+#include <float.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "pinyin_base.h"
+
+class WinnerTree;
+
+/** @file lookup.h
+ * @brief the definitions of lookup related classes and structs.
+ * Currently only contains pinyin lookup.
+ */
+
+typedef phrase_token_t lookup_key_t;
+
+struct lookup_value_t{
+ phrase_token_t m_handles[2];
+ gfloat m_poss;
+ gint32 m_last_step;
+ lookup_value_t(gfloat poss = FLT_MAX){
+ m_handles[0] = NULL; m_handles[1] = NULL;
+ m_poss = poss;
+ m_last_step = -1;
+ }
+};
+
+enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH };
+
+struct lookup_constraint_t{
+ constraint_type m_type;
+ union{
+ phrase_token_t m_token;
+ guint32 m_constraint_step; /* index of m_token */
+ };
+};
+
+typedef GArray * CandidateConstraints; /* Array of lookup_constraint_t */
+typedef GArray * MatchResults; /* Array of phrase_token_t */
+
+namespace novel{
+class PinyinLargeTable;
+class FacadePhraseIndex;
+class Bigram;
+};
+
+typedef GHashTable * LookupStepIndex;
+/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */
+typedef GArray * LookupStepContent; /* array of lookup_value_t */
+
+
+class IBranchIterator{
+public:
+ virtual ~IBranchIterator(){}
+ virtual bool has_next() = 0;
+ virtual lookup_value_t next() = 0;
+ virtual lookup_value_t max() = 0;
+};
+
+class PinyinLookup{
+private:
+ static const gfloat bigram_lambda = LAMBDA_PARAMETER;
+ static const gfloat unigram_lambda = 1 - LAMBDA_PARAMETER;
+
+ PhraseItem m_cache_phrase_item;
+protected:
+ //saved varibles
+ CandidateConstraints m_constraints;
+ PinyinKeyVector m_keys;
+
+ novel::PinyinLargeTable * m_pinyin_table;
+ novel::FacadePhraseIndex * m_phrase_index;
+ novel::PinyinCustomSettings * m_custom;
+ novel::Bigram * m_bigram;
+
+ //internal step data structure
+ GPtrArray * m_steps_index;
+ /* Array of LookupStepIndex */
+ GPtrArray * m_steps_content;
+ /* Array of LookupStepContent */
+
+ GArray * m_table_cache;
+ /* Array of PhraseIndexRanges */
+
+ WinnerTree * m_winner_tree;
+
+ size_t prepare_table_cache(int nstep, int total_pinyin);
+
+ bool search_unigram(IBranchIterator * iter, int nstep, int npinyin);
+ bool search_bigram(IBranchIterator * iter, int nstep, int npinyin);
+
+ bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
+ bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss);
+
+ bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step);
+
+ bool final_step(MatchResults & results);
+public:
+ PinyinLookup( PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram);
+
+ ~PinyinLookup();
+
+ bool get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+
+ bool train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+
+ bool convert_to_utf8(MatchResults results, /* out */ char * & result_string);
+
+ bool add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token);
+
+ bool clear_constraint(CandidateConstraints constraints, size_t index);
+
+ bool validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys);
+
+ /* init pinyin table lookup array */
+ bool prepare_pinyin_lookup(PhraseIndexRanges ranges);
+ /* destroy pinyin table lookup array */
+ bool destroy_pinyin_lookup(PhraseIndexRanges ranges);
+};
+
+#endif
diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
new file mode 100644
index 0000000..c335453
--- /dev/null
+++ b/src/lookup/pinyin_lookup.cpp
@@ -0,0 +1,587 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <math.h>
+#include <assert.h>
+#include <iostream>
+#include "stl_lite.h"
+#include "novel_types.h"
+#include "pinyin_base.h"
+#include "pinyin_phrase.h"
+#include "pinyin_large_table.h"
+#include "phrase_index.h"
+#include "ngram.h"
+#include "lookup.h"
+#include "winner_tree.h"
+
+const gfloat PinyinLookup::bigram_lambda;
+const gfloat PinyinLookup::unigram_lambda;
+
+PinyinLookup::PinyinLookup(PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * bigram){
+ m_custom = custom;
+ m_pinyin_table = pinyin_table;
+ m_phrase_index = phrase_index;
+ m_bigram = bigram;
+ m_winner_tree = new WinnerTree;
+ m_steps_index = g_ptr_array_new();
+ m_steps_content = g_ptr_array_new();
+ m_table_cache = g_array_new(FALSE, TRUE, sizeof(PhraseIndexRanges));
+ g_array_set_size(m_table_cache, 1);
+}
+
+PinyinLookup::~PinyinLookup(){
+ if ( m_winner_tree )
+ delete m_winner_tree;
+ m_winner_tree = NULL;
+ //free resources
+ for ( size_t i = 0; i < m_table_cache->len; ++i){
+ PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
+ destroy_pinyin_lookup(*ranges);
+ }
+ //g_array_set_size(m_table_cache, 1);
+ g_array_free(m_table_cache, TRUE);
+
+ //free m_steps_index
+ for ( size_t i = 0; i < m_steps_index->len; ++i){
+ GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i);
+ g_hash_table_destroy(table);
+ g_ptr_array_index(m_steps_index, i) = NULL;
+ }
+ g_ptr_array_free(m_steps_index, TRUE);
+
+ //free m_steps_content
+ for ( size_t i = 0; i < m_steps_content->len; ++i){
+ GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i);
+ g_array_free(array, TRUE);
+ g_ptr_array_index(m_steps_content, i) = NULL;
+ }
+ g_ptr_array_free(m_steps_content, TRUE);
+
+}
+
+bool PinyinLookup::prepare_pinyin_lookup(PhraseIndexRanges ranges){
+ //memset(ranges, 0, sizeof(ranges));
+ for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i ){
+ GArray * & array = ranges[i];
+ assert(NULL == array);
+ if (m_phrase_index->m_sub_phrase_indices[i]){
+ array = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange));
+ }
+ }
+ return true;
+}
+
+bool PinyinLookup::destroy_pinyin_lookup(PhraseIndexRanges ranges){
+ for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
+ GArray * & array = ranges[i];
+ if ( array )
+ g_array_free(array, TRUE);
+ array = NULL;
+ }
+ return true;
+}
+
+size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){
+ //free resources
+ for ( size_t i = 0; i < m_table_cache->len; ++i){
+ PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
+ destroy_pinyin_lookup(*ranges);
+ }
+ //g_array_set_size(m_table_cache, 1);
+ PinyinKey * pinyin_keys = (PinyinKey *)m_keys->data;
+ pinyin_keys += nstep;
+ //init resources
+ g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1);
+ size_t len;
+ for ( len = 1; len <= total_pinyin && len <= MAX_PHRASE_LENGTH; ++len){
+ PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, len);
+ prepare_pinyin_lookup(*ranges);
+ int result = m_pinyin_table->search(len, pinyin_keys, *ranges);
+ if (!( result & SEARCH_CONTINUED)){
+ ++len;
+ break;
+ }
+ }
+ g_array_set_size(m_table_cache, std_lite::min(len, (size_t) MAX_PHRASE_LENGTH + 1));
+ return m_table_cache->len - 1;
+}
+
+bool PinyinLookup::get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){
+ //g_array_set_size(results, 0);
+
+ m_constraints = constraints;
+ m_keys = keys;
+ int nstep = keys->len + 1;
+
+ //free m_steps_index
+ for ( size_t i = 0; i < m_steps_index->len; ++i){
+ GHashTable * table = (GHashTable *) g_ptr_array_index(m_steps_index, i);
+ g_hash_table_destroy(table);
+ g_ptr_array_index(m_steps_index, i) = NULL;
+ }
+
+ //free m_steps_content
+ for ( size_t i = 0; i < m_steps_content->len; ++i){
+ GArray * array = (GArray *) g_ptr_array_index(m_steps_content, i);
+ g_array_free(array, TRUE);
+ g_ptr_array_index(m_steps_content, i) = NULL;
+ }
+
+ //add null start step
+ g_ptr_array_set_size(m_steps_index, nstep);
+ g_ptr_array_set_size(m_steps_content, nstep);
+
+ for ( size_t i = 0 ; i < nstep; ++i ){
+ //initialize m_steps_index
+ g_ptr_array_index(m_steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal);
+ //initialize m_steps_content
+ g_ptr_array_index(m_steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t));
+ }
+
+ lookup_key_t initial_key = sentence_start;
+ lookup_value_t initial_value(log(1));
+ initial_value.m_handles[1] = sentence_start;
+ GArray * initial_step_content = (GArray *) g_ptr_array_index(m_steps_content, 0);
+ initial_step_content = g_array_append_val(initial_step_content, initial_value);
+ GHashTable * initial_step_index = (GHashTable *) g_ptr_array_index(m_steps_index, 0);
+ g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), GUINT_TO_POINTER(initial_step_content->len - 1));
+
+#if 0
+ LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, 0);
+ IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step);
+ size_t npinyin = prepare_table_cache(0, keys->len);
+ search_unigram(iter, 0, npinyin);
+ delete iter;
+#endif
+
+ for ( size_t i = 0 ; i < nstep - 1 ; ++i ){
+ LookupStepContent tmp_step = (LookupStepContent) g_ptr_array_index(m_steps_content, i);
+ IBranchIterator * iter = m_winner_tree->get_iterator(tmp_step);
+ size_t npinyin = prepare_table_cache(i, keys->len - i);
+ search_bigram(iter, i, npinyin),
+ search_unigram(iter, i, npinyin);
+ delete iter;
+ }
+ return final_step(results);
+}
+
+bool PinyinLookup::search_unigram(IBranchIterator * iter, int nstep, int npinyin){
+ lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep);
+ if ( CONSTRAINT_NOSEARCH == constraint->m_type )
+ return false;
+ GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return false;
+ lookup_value_t max_step = iter->max();
+ if ( CONSTRAINT_ONESTEP == constraint->m_type){
+ return unigram_gen_next_step(nstep, &max_step, constraint->m_token);
+ }
+ if ( NO_CONSTRAINT == constraint->m_type ){
+ bool found = false;
+ for ( size_t i = 1; i < m_table_cache->len && i <= MAX_PHRASE_LENGTH; ++i){
+ lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1);
+ if ( constraint->m_type != NO_CONSTRAINT )
+ continue;
+ PhraseIndexRanges * ranges = &g_array_index(m_table_cache,PhraseIndexRanges, i);
+ for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+ GArray * array = (*ranges)[m];
+ if ( !array ) continue;
+ for ( size_t n = 0; n < array->len; ++n){
+ PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+ for ( phrase_token_t token = range->m_range_begin;
+ token != range->m_range_end; ++token){
+ found = unigram_gen_next_step(nstep, &max_step, token)|| found;
+ }
+ }
+ }
+ }
+ return found;
+ }
+ return false;
+}
+
+
+bool PinyinLookup::search_bigram(IBranchIterator * iter,
+ int nstep, int npinyin){
+ lookup_constraint_t* constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep);
+ if ( CONSTRAINT_NOSEARCH == constraint->m_type )
+ return false;
+ GArray * lookup_content = (GArray *) g_ptr_array_index(m_steps_content, nstep);
+
+ bool found = false;
+ BigramPhraseArray bigram_phrase_items = g_array_new(FALSE, FALSE,
+ sizeof(BigramPhraseItem));
+ while ( iter->has_next() ){
+ lookup_value_t cur_step = iter->next();
+ //printf("token:%d\t%d\n", cur_step.m_handles[0], cur_step.m_handles[1]);
+ phrase_token_t index_token = cur_step.m_handles[1];
+ SingleGram * system, * user;
+ m_bigram->load(index_token, system, user);
+ if ( system && user ){
+ guint32 total_freq;
+ assert(user->get_total_freq(total_freq));
+ assert(system->set_total_freq(total_freq));
+ }
+ if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+ phrase_token_t token = constraint->m_token;
+ if ( system ){
+ guint32 freq;
+ if( system->get_freq(token, freq) ){
+ guint32 total_freq;
+ system->get_total_freq(total_freq);
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found;
+ }
+ }
+ if ( user ){
+ guint32 freq;
+ if( user->get_freq(token, freq)){
+ guint32 total_freq;
+ user->get_total_freq(total_freq);
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, &cur_step, token, bigram_poss) || found;
+ }
+ }
+ }
+
+ if ( NO_CONSTRAINT == constraint->m_type ){
+ for ( size_t i = 1; i < m_table_cache->len
+ && i <= MAX_PHRASE_LENGTH;++i ){
+ lookup_constraint_t * constraint = &g_array_index(m_constraints, lookup_constraint_t, nstep + i - 1);
+ if ( constraint->m_type != NO_CONSTRAINT )
+ continue;
+
+ PhraseIndexRanges * ranges = &g_array_index(m_table_cache, PhraseIndexRanges, i);
+ for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+ GArray * array = (*ranges)[m];
+ if ( !array ) continue;
+ for ( size_t n = 0; n < array->len; ++n){
+ PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+ if (system){
+ g_array_set_size(bigram_phrase_items, 0);
+ system->search(range, bigram_phrase_items);
+ for( size_t k = 0; k < bigram_phrase_items->len;
+ ++k){
+ BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+ found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found;
+ }
+ }
+ if (user){
+ g_array_set_size(bigram_phrase_items, 0);
+ user->search(range, bigram_phrase_items);
+ for( size_t k = 0; k < bigram_phrase_items->len;
+ ++k){
+ BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+ found = bigram_gen_next_step(nstep, &cur_step, item->m_token, item->m_freq) || found;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+ g_array_free(bigram_phrase_items, TRUE);
+ return found;
+}
+
+
+bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){
+ PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep;
+ if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gfloat elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat)
+ m_phrase_index->get_phrase_index_total_freq();
+ if ( elem_poss < FLT_EPSILON )
+ return false;
+ gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys);
+ if (pinyin_poss < FLT_EPSILON )
+ return false;
+ lookup_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
+ next_step.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){
+ PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep;
+ if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gfloat unigram_poss = m_cache_phrase_item.get_unigram_frequency() / (gfloat)
+ m_phrase_index->get_phrase_index_total_freq();
+ if ( bigram_poss < FLT_EPSILON && unigram_poss < FLT_EPSILON )
+ return false;
+ gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys);
+ if ( pinyin_poss < FLT_EPSILON )
+ return false;
+ lookup_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_poss = cur_step->m_poss +
+ log(( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) *pinyin_poss);
+ next_step.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup::save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step){
+ lookup_key_t next_key = next_step->m_handles[1];
+ GHashTable * next_lookup_index = (GHashTable *) g_ptr_array_index(m_steps_index, next_step_pos);
+ GArray * next_lookup_content = (GArray *) g_ptr_array_index(m_steps_content, next_step_pos);
+
+ gpointer key, value;
+ gboolean lookup_result = g_hash_table_lookup_extended(next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
+ size_t step_index = GPOINTER_TO_UINT(value);
+ if ( !lookup_result ){
+ g_array_append_val(next_lookup_content, *next_step);
+ g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1));
+ return true;
+ }else{
+ lookup_value_t * orig_next_value = &g_array_index(next_lookup_content, lookup_value_t,step_index);
+ if ( orig_next_value->m_poss < next_step->m_poss) {
+ orig_next_value->m_handles[0] = next_step->m_handles[0];
+ assert(orig_next_value->m_handles[1] == next_step->m_handles[1]);
+ orig_next_value->m_poss = next_step->m_poss;
+ orig_next_value->m_last_step = next_step->m_last_step;
+ return true;
+ }
+ return false;
+ }
+}
+
+bool PinyinLookup::final_step(MatchResults & results){
+ //reset results
+ g_array_set_size(results, m_steps_content->len);
+ for ( size_t i = 0 ; i < m_steps_content->len ; ++i){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ *token = NULL;
+ }
+ //find max element
+ size_t last_step_pos = m_steps_content->len - 1;
+
+ GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos);
+ if ( last_step_array->len == 0 )
+ return false;
+ lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0);
+ for ( size_t i = 1; i < last_step_array->len; ++i){
+ lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ //backtracing
+ while( true ){
+ int cur_step_pos = max_value->m_last_step;
+ if ( -1 == cur_step_pos )
+ break;
+
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, cur_step_pos);
+ *token = max_value->m_handles[1];
+
+ phrase_token_t last_token = max_value->m_handles[0];
+
+
+ GHashTable * lookup_step_index = (GHashTable *)g_ptr_array_index(m_steps_index, cur_step_pos);
+ gpointer key, value;
+ gboolean result = g_hash_table_lookup_extended(lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
+ if (!result)
+ return false;
+ GArray * lookup_step_content = (GArray *)g_ptr_array_index(m_steps_content, cur_step_pos);
+
+ max_value = &g_array_index(lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+ }
+
+ //no need to reverse the result
+
+ return true;
+}
+
+bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){
+ bool train_next = false;
+ PinyinKey * pinyin_keys = (PinyinKey *)keys->data;
+ //TODO: verify the new training method.
+ phrase_token_t last_token = sentence_start;
+ // constraints->len + 1 == results->len
+ guint32 train_factor = 23;
+ for ( size_t i = 0; i < constraints->len; ++i){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ if ( *token == NULL )
+ continue;
+ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+ if (train_next || CONSTRAINT_ONESTEP == constraint->m_type ){
+ if (CONSTRAINT_ONESTEP == constraint->m_type){
+ assert(*token == constraint->m_token);
+ train_next = true;
+ }else{
+ train_next = false;
+ }
+ //add pi-gram frequency
+ //std::cout<<"i:"<<i<<"last_token:"<<last_token<<"\ttoken:"<<*token<<std::endl;
+ m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+ m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor);
+ m_phrase_index->add_unigram_frequency(*token, train_factor);
+ if ( last_token ){
+ SingleGram * system, *user;
+ m_bigram->load(last_token, system, user);
+ guint32 total_freq;
+ if ( !user ){
+ total_freq = 0;
+ if ( system )
+ assert(system->get_total_freq(total_freq));
+ user = new SingleGram;
+ user->set_total_freq(total_freq);
+ }
+ guint32 freq = 0;
+ if ( !user->get_freq(*token, freq)){
+ if (system) system->get_freq(*token, freq);
+ user->set_freq(*token, freq);
+ }
+ assert(user->get_total_freq(total_freq));
+ //protect against total_freq overflow.
+ if ( train_factor > 0 && total_freq > total_freq + train_factor)
+ goto next;
+ assert(user->set_total_freq(total_freq + train_factor));
+ assert(user->get_freq(*token, freq));
+ //if total_freq is not overflow, then freq won't overflow.
+ assert(user->set_freq(*token, freq + train_factor));
+ assert(m_bigram->store(last_token, user));
+ next:
+ if (system) delete system;
+ if (user) delete user;
+ }
+ }
+ last_token = *token;
+ }
+ return true;
+}
+
+bool PinyinLookup::convert_to_utf8(MatchResults results, /* out */ char * & result_string){
+ result_string = g_strdup("");
+ for ( size_t i = 0; i < results->len; ++i){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ if ( NULL == *token )
+ continue;
+ m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+ utf16_t buffer[MAX_PHRASE_LENGTH];
+ m_cache_phrase_item.get_phrase_string(buffer);
+ guint8 length = m_cache_phrase_item.get_phrase_length();
+ gchar * phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
+ char * tmp = result_string;
+ result_string = g_strconcat(result_string, phrase, NULL);
+ g_free(tmp); g_free(phrase);
+ }
+ return true;
+}
+
+bool PinyinLookup::add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token){
+ if ( !m_phrase_index->get_phrase_item(token, m_cache_phrase_item) )
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ if ( index + phrase_length > constraints->len )
+ return false;
+
+ for ( size_t i = index; i < index + phrase_length ; ++i ){
+ clear_constraint(constraints, i);
+ }
+
+ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index);
+ constraint->m_type = CONSTRAINT_ONESTEP;
+ constraint->m_token = token;
+
+ for (size_t i = 1; i < phrase_length; ++i){
+ constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
+ constraint->m_type = CONSTRAINT_NOSEARCH;
+ constraint->m_constraint_step = index;
+ }
+ return true;
+}
+
+bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t index){
+ if ( index < 0 || index >= constraints->len )
+ return false;
+ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, index);
+ if (constraint->m_type == NO_CONSTRAINT)
+ return false;
+ if (constraint->m_type == CONSTRAINT_NOSEARCH){
+ index = constraint->m_constraint_step;
+ constraint = &g_array_index(constraints, lookup_constraint_t, index);
+ }
+
+ assert(constraint->m_type == CONSTRAINT_ONESTEP);
+
+ phrase_token_t token = constraint->m_token;
+ if (!m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ for ( size_t i = 0; i < phrase_length; ++i){
+ if ( index + i >= constraints->len )
+ continue;
+ constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+ return true;
+}
+
+bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys){
+ //resize constraints array
+ size_t constraints_length = constraints->len;
+ if ( m_parsed_keys->len > constraints_length ){
+ g_array_set_size(constraints, m_parsed_keys->len);
+ //initialize new element
+ for( size_t i = constraints_length; i < m_parsed_keys->len; ++i){
+ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+ }else if (m_parsed_keys->len < constraints_length ){
+ g_array_set_size(constraints, m_parsed_keys->len);
+ }
+
+ PinyinKey * pinyin_keys = (PinyinKey *)m_parsed_keys->data;
+
+ for ( size_t i = 0; i < constraints->len; ++i){
+ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+ if ( constraint->m_type == CONSTRAINT_ONESTEP ){
+ phrase_token_t token = constraint->m_token;
+ m_phrase_index->get_phrase_item(token, m_cache_phrase_item);
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ //clear too long constraint
+ if ( i + phrase_length > constraints->len ){
+ clear_constraint(constraints, i);
+ continue;
+ }
+ //clear invalidated pinyin
+ gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyin_keys + i);
+ if ( pinyin_poss < FLT_EPSILON ){
+ clear_constraint(constraints, i);
+ }
+ }
+ }
+ return true;
+}
diff --git a/src/lookup/winner_tree.cpp b/src/lookup/winner_tree.cpp
new file mode 100644
index 0000000..248a749
--- /dev/null
+++ b/src/lookup/winner_tree.cpp
@@ -0,0 +1,141 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <float.h>
+#include <limits.h>
+#include <stdio.h>
+#include "memory_chunk.h"
+#include "phrase_index.h"
+#include "lookup.h"
+#include "winner_tree.h"
+
+WinnerTreeBranchIterator::WinnerTreeBranchIterator(WinnerTree & tree)
+ :m_tree(tree), m_counter(0){
+ m_max_value = m_tree.m_items[m_tree.get_winner()];
+ m_counter = 0;
+}
+
+bool WinnerTreeBranchIterator::has_next(){
+ if ( m_counter >= m_tree.m_tree_size)
+ return false;
+ return m_counter < nbranch;
+}
+
+lookup_value_t WinnerTreeBranchIterator::next(){
+ int winner = m_tree.get_winner();
+ lookup_value_t tmp = m_tree.m_items[winner];
+ m_tree.m_items[winner].m_poss =
+ - FLT_MAX;
+ m_tree.replay(winner);
+ ++m_counter;
+ return tmp;
+}
+
+void WinnerTree::play(int p, int lc, int rc){
+ m_tree[p] = winner(lc, rc);
+ //continue competition
+ while( p > 1 && p % 2) {
+ m_tree[p/2] = winner( m_tree[p - 1], m_tree[p]);
+ p/=2;
+ }
+}
+
+
+bool WinnerTree::initialize(LookupStepContent cur_step){
+ size_t size = cur_step->len;
+ if ( size > m_max_tree_size ){
+ init(size);
+ }
+ assert(size > nbranch);
+ m_tree_size = size;
+
+ //initialize array tree
+ int nindex = 1;
+
+ for( size_t i = 0; i < cur_step->len ; ++i){
+ lookup_value_t * cur_value = &g_array_index(cur_step, lookup_value_t, i);
+ m_items[nindex++] = *cur_value;
+ }
+
+ //compute s = 2 ^ log(n -1)
+ int i, s;
+ for( s = 1; 2 * s <= m_tree_size - 1; s += s);
+
+ m_low_ext = 2 * (m_tree_size - s);
+ m_offset = 2 * s - 1;
+
+ //compute outside nodes
+ for( i = 2; i <= m_low_ext; i += 2)
+ play((m_offset + i)/2, i - 1, i);
+ //compute other nodes
+ if ( m_tree_size % 2){
+ play( m_tree_size / 2, m_tree[m_tree_size - 1], m_low_ext +1);
+ i = m_low_ext + 3;
+ }else i = m_low_ext + 2;
+
+ //compute others
+ for( ; i <= m_tree_size; i += 2)
+ play( (i - m_low_ext + m_tree_size - 1) / 2, i - 1, i);
+ return true;
+}
+
+void WinnerTree::replay(int i){
+ assert( 1 <= i && i <= m_tree_size);
+
+ int p; //compete node
+ int lc; //p's left child
+ int rc; //p's right child
+
+ //first compete
+ if ( i <= m_low_ext){
+ p = (m_offset + i) / 2;
+ lc = 2 * p - m_offset;
+ rc = lc + 1;
+ }else{
+ p = (i - m_low_ext + m_tree_size -1) / 2;
+ if ( 2 * p == m_tree_size - 1 ){
+ lc = m_tree[2*p];
+ rc = i;
+ }else{
+ lc = 2 * p - m_tree_size + 1 + m_low_ext;
+ rc = lc + 1;
+ }
+ }
+
+ m_tree[p] = winner(lc, rc);
+
+ //added by wupeng
+ if ( ( p | 0x01 ) == m_tree_size ){
+ p /= 2;
+ m_tree[p] = winner( m_tree[2 * p], m_low_ext + 1 );
+ }
+
+ //compute others
+ p /= 2;
+ for( ; p >= 1 ; p /= 2)
+ m_tree[p] = winner( m_tree[2 * p], m_tree[2 * p + 1]);
+}
+
+int WinnerTree::winner(int lc, int rc){
+ return m_items[lc].m_poss > m_items[rc].m_poss ?
+ lc : rc;
+}
diff --git a/src/lookup/winner_tree.h b/src/lookup/winner_tree.h
new file mode 100644
index 0000000..262f196
--- /dev/null
+++ b/src/lookup/winner_tree.h
@@ -0,0 +1,148 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LOOKUP_WINNER_TREE_H
+#define LOOKUP_WINNER_TREE_H
+
+#include <assert.h>
+#include "lookup.h"
+
+const int nbranch = 32;
+
+class DirectBranchIterator: public IBranchIterator{//for nitem <= nbranch
+ LookupStepContent m_step_content;
+ size_t m_iter_pos;
+public:
+ //Constructor
+ DirectBranchIterator(LookupStepContent step_content)
+ :m_step_content(step_content)
+ { m_iter_pos = 0; }
+
+ //Destructor
+ virtual ~DirectBranchIterator(){}
+
+ //Member Function
+ bool has_next(){
+ return m_iter_pos != m_step_content->len;
+ }
+
+ lookup_value_t next(){
+ lookup_value_t * tmp = &g_array_index(m_step_content,
+ lookup_value_t, m_iter_pos);
+ ++m_iter_pos;
+ return *tmp;
+ }
+
+ lookup_value_t max(){
+ lookup_value_t * max_value = &g_array_index(m_step_content, lookup_value_t, 0);
+ for ( size_t i = 1 ; i < m_step_content->len; ++i){
+ lookup_value_t * cur_value = &g_array_index(m_step_content, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+ return *max_value;
+ }
+};
+
+class WinnerTree;
+
+class WinnerTreeBranchIterator: public IBranchIterator{//for nitem <= nbranch
+ WinnerTree& m_tree;
+ int m_counter;
+ lookup_value_t m_max_value;
+public:
+ //Constructor
+ WinnerTreeBranchIterator(WinnerTree & tree);
+
+ //Destructor
+ virtual ~WinnerTreeBranchIterator(){}
+
+ //Member Function
+ bool has_next();
+
+ lookup_value_t next();
+
+ lookup_value_t max(){
+ return m_max_value;
+ }
+
+};
+
+class WinnerTree{
+ friend class WinnerTreeBranchIterator;
+private:
+ size_t m_max_tree_size; // maxsize
+ int m_tree_size; // n
+ int m_low_ext;
+ int m_offset;
+ int * m_tree;
+ MemoryChunk m_buffer;
+ MemoryChunk m_tree_buffer;
+ lookup_value_t * m_items;
+
+ int winner(int lc, int rc);
+
+ void play(int p, int lc, int rc);
+
+ void init(int tree_size){
+ m_max_tree_size = tree_size;
+ //data buffer
+ m_buffer.set_size( sizeof(lookup_value_t) * (tree_size + 1) );
+ m_items = (lookup_value_t *) m_buffer.begin();
+
+ //tree item buffer
+ m_tree_buffer.set_size( sizeof(int) * m_max_tree_size);
+ m_tree = (int * ) m_tree_buffer.begin();
+ m_tree_size = 0;
+ }
+
+public:
+
+ //Constructor
+ WinnerTree(int tree_size = 10){
+ init(tree_size);
+ }
+
+ //Destructor
+ ~WinnerTree() { }
+
+ //need delete this
+ IBranchIterator* get_iterator(LookupStepContent step){
+ if ( step->len <= nbranch )
+ return new DirectBranchIterator(step);
+ //TODO:another situation > nbranch
+ assert(initialize(step));
+ return new WinnerTreeBranchIterator(*this);
+ }
+
+protected:
+
+ int get_winner() const {
+ return (m_tree_size)? m_tree[1] : 0;
+ }
+
+ //Member Function
+ bool initialize(LookupStepContent cur_step);
+ void replay(int i);
+};
+
+#endif