summaryrefslogtreecommitdiffstats
path: root/src/lookup
diff options
context:
space:
mode:
Diffstat (limited to 'src/lookup')
-rw-r--r--src/lookup/CMakeLists.txt23
-rw-r--r--src/lookup/Makefile.am36
-rw-r--r--src/lookup/lookup.cpp73
-rw-r--r--src/lookup/lookup.h79
-rw-r--r--src/lookup/phrase_lookup.cpp434
-rw-r--r--src/lookup/phrase_lookup.h142
-rw-r--r--src/lookup/pinyin_lookup2.cpp730
-rw-r--r--src/lookup/pinyin_lookup2.h240
8 files changed, 1757 insertions, 0 deletions
diff --git a/src/lookup/CMakeLists.txt b/src/lookup/CMakeLists.txt
new file mode 100644
index 0000000..937b2cb
--- /dev/null
+++ b/src/lookup/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(
+ CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC"
+)
+
+set(
+ LIBLOOKUP_SOURCES
+ pinyin_lookup2.cpp
+ phrase_lookup.cpp
+ lookup.cpp
+)
+
+add_library(
+ lookup
+ STATIC
+ ${LIBLOOKUP_SOURCES}
+)
+
+install(
+ FILES
+ ${LIBLOOKUP_HEADERS}
+ DESTINATION
+ ${DIR_INCLUDE_LIBPINYIN}
+)
diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am
new file mode 100644
index 0000000..00d7df4
--- /dev/null
+++ b/src/lookup/Makefile.am
@@ -0,0 +1,36 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+MAINTAINERCLEANFILES = Makefile.in
+
+INCLUDES = -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/src/storage \
+ @GLIB2_CFLAGS@
+
+noinst_HEADERS = lookup.h \
+ pinyin_lookup2.h \
+ phrase_lookup.h
+
+noinst_LTLIBRARIES = liblookup.la
+
+liblookup_la_CXXFLAGS = "-fPIC"
+
+liblookup_la_LDFLAGS = -static
+
+liblookup_la_SOURCES = pinyin_lookup2.cpp \
+ phrase_lookup.cpp \
+ lookup.cpp
diff --git a/src/lookup/lookup.cpp b/src/lookup/lookup.cpp
new file mode 100644
index 0000000..c32a0ec
--- /dev/null
+++ b/src/lookup/lookup.cpp
@@ -0,0 +1,73 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#include "lookup.h"
+#include "phrase_index.h"
+
+namespace pinyin{
+
+bool convert_to_utf8(FacadePhraseIndex * phrase_index,
+ MatchResults match_results,
+ /* in */ const char * delimiter,
+ /* in */ bool show_tokens,
+ /* out */ char * & result_string){
+ //init variables
+ if ( NULL == delimiter )
+ delimiter = "";
+ result_string = NULL;
+
+ PhraseItem item;
+
+ for ( size_t i = 0; i < match_results->len; ++i ){
+ phrase_token_t token = g_array_index
+ (match_results, phrase_token_t, i);
+ if ( null_token == token )
+ continue;
+
+ phrase_index->get_phrase_item(token, item);
+ ucs4_t buffer[MAX_PHRASE_LENGTH];
+ item.get_phrase_string(buffer);
+
+ guint8 length = item.get_phrase_length();
+ gchar * phrase = NULL;
+ char * tmp = NULL;
+
+ if (show_tokens) {
+ tmp = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
+ phrase = g_strdup_printf("%d %s", token, tmp);
+ g_free(tmp);
+ } else {
+ phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
+ }
+
+ tmp = result_string;
+ if ( NULL == result_string )
+ result_string = g_strdup(phrase);
+ else
+ result_string = g_strconcat(result_string, delimiter, phrase, NULL);
+ g_free(phrase);
+ g_free(tmp);
+ }
+ return true;
+}
+
+};
diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h
new file mode 100644
index 0000000..8dc1a89
--- /dev/null
+++ b/src/lookup/lookup.h
@@ -0,0 +1,79 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef LOOKUP_H
+#define LOOKUP_H
+
+
+/** @file lookup.h
+ * @brief the definitions of common lookup related classes and structs.
+ */
+
+#include "novel_types.h"
+#include <limits.h>
+
+namespace pinyin{
+
+typedef phrase_token_t lookup_key_t;
+
+struct lookup_value_t{
+ /* previous and current tokens of the node */
+ phrase_token_t m_handles[2];
+ /* maximum possibility of current node */
+ gfloat m_poss;
+ /* trace back information for final step */
+ gint32 m_last_step;
+
+ lookup_value_t(gfloat poss = FLT_MAX){
+ m_handles[0] = null_token; m_handles[1] = null_token;
+ m_poss = poss;
+ m_last_step = -1;
+ }
+};
+
+
+class FacadePhraseIndex;
+
+
+/* Note:
+ * LookupStepIndex:
+ * the main purpose of lookup step index is served for an index
+ * for lookup step content, which can quickly merge the same node
+ * with different possibilities,
+ * then only keep the highest value of the node.
+ * LookupStepContent:
+ * the place to store the lookup values of current step,
+ * and indexed by lookup step index.
+ * See also comments on lookup_value_t.
+ */
+
+typedef GHashTable * LookupStepIndex;
+/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */
+typedef GArray * LookupStepContent; /* array of lookup_value_t */
+
+bool convert_to_utf8(FacadePhraseIndex * phrase_index,
+ MatchResults match_results,
+ /* in */ const char * delimiter,
+ /* in */ bool show_tokens,
+ /* out */ char * & result_string);
+
+};
+#endif
diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp
new file mode 100644
index 0000000..f7da0b7
--- /dev/null
+++ b/src/lookup/phrase_lookup.cpp
@@ -0,0 +1,434 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <math.h>
+#include "stl_lite.h"
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "facade_phrase_table2.h"
+#include "ngram.h"
+#include "phrase_lookup.h"
+
+using namespace pinyin;
+
+
+/*
+const gfloat PhraseLookup::bigram_lambda = lambda;
+const gfloat PhraseLookup::unigram_lambda = 1 - lambda;
+*/
+
+static bool populate_prefixes(GPtrArray * steps_index,
+ GPtrArray * steps_content) {
+
+ lookup_key_t initial_key = sentence_start;
+ lookup_value_t initial_value(log(1));
+ initial_value.m_handles[1] = sentence_start;
+
+ LookupStepContent initial_step_content = (LookupStepContent)
+ g_ptr_array_index(steps_content, 0);
+ g_array_append_val(initial_step_content, initial_value);
+
+ LookupStepIndex initial_step_index = (LookupStepIndex)
+ g_ptr_array_index(steps_index, 0);
+ g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key),
+ GUINT_TO_POINTER(initial_step_content->len - 1));
+
+ return true;
+}
+
+static bool init_steps(GPtrArray * steps_index,
+ GPtrArray * steps_content,
+ int nstep) {
+
+ /* add null start step */
+ g_ptr_array_set_size(steps_index, nstep);
+ g_ptr_array_set_size(steps_content, nstep);
+
+ for ( int i = 0; i < nstep; ++i ){
+ /* initialize steps_index */
+ g_ptr_array_index(steps_index, i) = g_hash_table_new
+ (g_direct_hash, g_direct_equal);
+ /* initialize steps_content */
+ g_ptr_array_index(steps_content, i) = g_array_new
+ (FALSE, FALSE, sizeof(lookup_value_t));
+ }
+
+ return true;
+}
+
+static void clear_steps(GPtrArray * steps_index,
+ GPtrArray * steps_content){
+ /* clear steps_index */
+ for ( size_t i = 0; i < steps_index->len; ++i){
+ GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
+ g_hash_table_destroy(table);
+ g_ptr_array_index(steps_index, i) = NULL;
+ }
+
+ /* free steps_content */
+ for ( size_t i = 0; i < steps_content->len; ++i){
+ GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
+ g_array_free(array, TRUE);
+ g_ptr_array_index(steps_content, i) = NULL;
+ }
+}
+
+PhraseLookup::PhraseLookup(const gfloat lambda,
+ FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram)
+ : bigram_lambda(lambda),
+ unigram_lambda(1. - lambda)
+{
+ m_phrase_table = phrase_table;
+ m_phrase_index = phrase_index;
+ m_system_bigram = system_bigram;
+ m_user_bigram = user_bigram;
+
+ m_steps_index = g_ptr_array_new();
+ m_steps_content = g_ptr_array_new();
+
+ /* the member variables below are saved in get_best_match call. */
+ m_sentence = NULL;
+ m_sentence_length = 0;
+}
+
+PhraseLookup::~PhraseLookup(){
+ clear_steps(m_steps_index, m_steps_content);
+ g_ptr_array_free(m_steps_index, TRUE);
+ g_ptr_array_free(m_steps_content, TRUE);
+}
+
+bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
+ MatchResults & results){
+ m_sentence_length = sentence_length;
+ m_sentence = sentence;
+ int nstep = m_sentence_length + 1;
+
+ clear_steps(m_steps_index, m_steps_content);
+
+ init_steps(m_steps_index, m_steps_content, nstep);
+
+ populate_prefixes(m_steps_index, m_steps_content);
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ m_phrase_index->prepare_tokens(tokens);
+
+ for ( int i = 0; i < nstep - 1; ++i ){
+ for ( int m = i + 1; m < nstep; ++m ){
+
+ /* do one phrase table search. */
+ int result = m_phrase_table->search(m - i, sentence + i, tokens);
+
+ /* found next phrase */
+ if ( result & SEARCH_OK ) {
+ search_bigram2(i, tokens),
+ search_unigram2(i, tokens);
+ }
+
+ /* no longer phrase */
+ if (!(result & SEARCH_CONTINUED))
+ break;
+ }
+ }
+
+ m_phrase_index->destroy_tokens(tokens);
+
+ return final_step(results);
+}
+
+#if 0
+
+bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0);
+ /* find the maximum node */
+ for ( size_t i = 1; i < lookup_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ return unigram_gen_next_step(nstep, max_value, token);
+}
+
+bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return false;
+
+ for ( size_t i = 0; i < lookup_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
+ phrase_token_t index_token = cur_value->m_handles[1];
+ SingleGram * system, * user;
+ m_system_bigram->load(index_token, system);
+ m_user_bigram->load(index_token, user);
+
+ if ( !merge_single_gram(&m_merged_single_gram, system, user) )
+ continue;
+
+ guint32 freq;
+ if ( m_merged_single_gram.get_freq(token, freq) ){
+ guint32 total_freq;
+ m_merged_single_gram.get_total_freq(total_freq);
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
+ }
+
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+
+ return found;
+}
+
+#endif
+
+bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if ( 0 == lookup_content->len )
+ return found;
+
+ /* find the maximum node */
+ lookup_value_t * max_value = &g_array_index
+ (lookup_content, lookup_value_t, 0);
+
+ for (size_t i = 1; i < lookup_content->len; ++i) {
+ lookup_value_t * cur_value = &g_array_index
+ (lookup_content, lookup_value_t, i);
+ if (cur_value->m_poss > max_value->m_poss)
+ max_value = cur_value;
+ }
+
+ /* iterate over tokens */
+ for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+ GArray * array = tokens[n];
+ if (NULL == array)
+ continue;
+
+ /* just skip the loop when the length is zero. */
+ for (size_t k = 0; k < array->len; ++k) {
+ phrase_token_t token =
+ g_array_index(array, phrase_token_t, k);
+
+ found = unigram_gen_next_step
+ (nstep, max_value, token) || found;
+ }
+ }
+
+ return found;
+}
+
+bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
+ bool found = false;
+
+ LookupStepContent lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, nstep);
+ if (0 == lookup_content->len)
+ return found;
+
+ for (size_t i = 0; i < lookup_content->len; ++i) {
+ lookup_value_t * cur_value = &g_array_index
+ (lookup_content, lookup_value_t, i);
+ phrase_token_t index_token = cur_value->m_handles[1];
+
+ SingleGram * system = NULL, * user = NULL;
+ m_system_bigram->load(index_token, system);
+ m_user_bigram->load(index_token, user);
+
+ if (!merge_single_gram
+ (&m_merged_single_gram, system, user))
+ continue;
+
+ /* iterate over tokens */
+ for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
+ GArray * array = tokens[n];
+ if (NULL == array)
+ continue;
+
+ /* just skip the loop when the length is zero. */
+ for (size_t k = 0; k < array->len; ++k) {
+ phrase_token_t token =
+ g_array_index(array, phrase_token_t, k);
+
+ guint32 freq = 0;
+ if (m_merged_single_gram.get_freq(token, freq)) {
+ guint32 total_freq = 0;
+ m_merged_single_gram.get_total_freq(total_freq);
+
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
+ }
+ }
+ }
+
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+
+ return found;
+}
+
+bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
+phrase_token_t token){
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
+ m_phrase_index->get_phrase_index_total_freq();
+ if ( elem_poss < DBL_EPSILON )
+ return false;
+
+ lookup_value_t next_value;
+ next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
+ next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda);
+ next_value.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_value, &next_value);
+}
+
+bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){
+
+ if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
+ (gdouble) m_phrase_index->get_phrase_index_total_freq();
+
+ if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
+ return false;
+
+ lookup_value_t next_value;
+ next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
+ next_value.m_poss = cur_value->m_poss +
+ log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss );
+ next_value.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_value, &next_value);
+}
+
+bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){
+
+ LookupStepIndex next_lookup_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, next_step_pos);
+ LookupStepContent next_lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, next_step_pos);
+
+ lookup_key_t next_key = next_value->m_handles[1];
+
+ gpointer key = NULL, value = NULL;
+ gboolean lookup_result = g_hash_table_lookup_extended
+ (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
+
+ if (!lookup_result){
+ g_array_append_val(next_lookup_content, *next_value);
+ g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key),
+ GUINT_TO_POINTER(next_lookup_content->len - 1));
+ return true;
+ }else{
+ size_t step_index = GPOINTER_TO_UINT(value);
+ lookup_value_t * orig_next_value = &g_array_index
+ (next_lookup_content, lookup_value_t, step_index);
+
+ if ( orig_next_value->m_poss < next_value->m_poss ){
+ orig_next_value->m_handles[0] = next_value->m_handles[0];
+ assert(orig_next_value->m_handles[1] == next_value->m_handles[1]);
+ orig_next_value->m_poss = next_value->m_poss;
+ orig_next_value->m_last_step = next_value->m_last_step;
+ return true;
+ }
+ return false;
+ }
+}
+
+bool PhraseLookup::final_step(MatchResults & results ){
+
+ /* reset results */
+ g_array_set_size(results, m_steps_content->len - 1);
+ for ( size_t i = 0; i < results->len; ++i ){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ *token = null_token;
+ }
+
+ /* find max element */
+ size_t last_step_pos = m_steps_content->len - 1;
+ LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index
+ (m_steps_content, last_step_pos);
+ if ( last_step_content->len == 0 )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index
+ (last_step_content, lookup_value_t, 0);
+ for ( size_t i = 1; i < last_step_content->len; ++i ){
+ lookup_value_t * cur_value = &g_array_index
+ (last_step_content, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ /* backtracing */
+ while( true ){
+ int cur_step_pos = max_value->m_last_step;
+ if ( -1 == cur_step_pos )
+ break;
+
+ phrase_token_t * token = &g_array_index
+ (results, phrase_token_t, cur_step_pos);
+ *token = max_value->m_handles[1];
+
+ phrase_token_t last_token = max_value->m_handles[0];
+ LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean result = g_hash_table_lookup_extended
+ (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
+ if ( !result )
+ return false;
+
+ LookupStepContent lookup_step_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, cur_step_pos);
+ max_value = &g_array_index
+ (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+ }
+
+ /* no need to reverse the result */
+ return true;
+}
diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h
new file mode 100644
index 0000000..cf65692
--- /dev/null
+++ b/src/lookup/phrase_lookup.h
@@ -0,0 +1,142 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef PHRASE_LOOKUP_H
+#define PHRASE_LOOKUP_H
+
+#include "novel_types.h"
+#include "ngram.h"
+#include "lookup.h"
+
+/**
+ * phrase_lookup.h
+ *
+ * The definitions of phrase lookup related classes and structs.
+ *
+ */
+
+namespace pinyin{
+
+/**
+ * PhraseLookup:
+ *
+ * The phrase lookup class to convert the sentence to phrase tokens.
+ *
+ */
+class PhraseLookup{
+private:
+ const gfloat bigram_lambda;
+ const gfloat unigram_lambda;
+
+ PhraseItem m_cache_phrase_item;
+ SingleGram m_merged_single_gram;
+protected:
+ //saved varibles
+ FacadePhraseTable2 * m_phrase_table;
+ FacadePhraseIndex * m_phrase_index;
+ Bigram * m_system_bigram;
+ Bigram * m_user_bigram;
+
+ //internal step data structure
+ GPtrArray * m_steps_index;
+ /* Array of LookupStepIndex */
+ GPtrArray * m_steps_content;
+ /* Array of LookupStepContent */
+
+ /* Saved sentence */
+ int m_sentence_length;
+ ucs4_t * m_sentence;
+
+protected:
+ /* Explicitly search the next phrase,
+ * to avoid double phrase lookup as the next token has only one.
+ */
+ bool search_unigram2(int nstep, PhraseTokens tokens);
+ bool search_bigram2(int nstep, PhraseTokens tokens);
+
+ bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
+ bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);
+
+ bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step);
+
+ bool final_step(MatchResults & results);
+public:
+ /**
+ * PhraseLookup::PhraseLookup:
+ * @lambda: the lambda parameter for interpolation model.
+ * @phrase_table: the phrase table.
+ * @phrase_index: the phrase index.
+ * @system_bigram: the system bi-gram.
+ * @user_bigram: the user bi-gram.
+ *
+ * The constructor of the PhraseLookup.
+ *
+ */
+ PhraseLookup(const gfloat lambda,
+ FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram);
+
+ /**
+ * PhraseLookup::~PhraseLookup:
+ *
+ * The destructor of the PhraseLookup.
+ *
+ */
+ ~PhraseLookup();
+
+ /**
+ * PhraseLookup::get_best_match:
+ * @sentence_length: the length of the sentence in ucs4 characters.
+ * @sentence: the ucs4 characters of the sentence.
+ * @results: the segmented sentence in the form of phrase tokens.
+ * @returns: whether the segment operation is successful.
+ *
+ * Segment the sentence into phrase tokens.
+ *
+ * Note: this method only accepts the characters in phrase large table.
+ *
+ */
+ bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results);
+
+ /**
+ * PhraseLookup::convert_to_utf8:
+ * @results: the guessed sentence in the form of phrase tokens.
+ * @result_string: the converted sentence in utf8 string.
+ * @returns: whether the convert operation is successful.
+ *
+ * Convert the sentence from phrase tokens to the utf8 string.
+ *
+ * Note: free the result_string by g_free.
+ *
+ */
+ bool convert_to_utf8(MatchResults results,
+ /* out */ char * & result_string)
+ {
+ return pinyin::convert_to_utf8(m_phrase_index, results,
+ "\n", true, result_string);
+ }
+};
+
+};
+
+#endif
diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp
new file mode 100644
index 0000000..2250a93
--- /dev/null
+++ b/src/lookup/pinyin_lookup2.cpp
@@ -0,0 +1,730 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <math.h>
+#include "facade_chewing_table.h"
+#include "pinyin_lookup2.h"
+#include "stl_lite.h"
+
+using namespace pinyin;
+
+/*
+const gfloat PinyinLookup2::bigram_lambda = lambda;
+const gfloat PinyinLookup2::unigram_lambda = 1 - lambda;
+*/
+
+/* internal definition */
+static const size_t nbeam = 32;
+
+static bool dump_max_value(GPtrArray * values){
+ if (0 == values->len)
+ return false;
+
+ const lookup_value_t * max =
+ (const lookup_value_t *) g_ptr_array_index(values, 0);
+
+ for (size_t i = 1; i < values->len; ++i) {
+ const lookup_value_t * cur =
+ (const lookup_value_t *) g_ptr_array_index(values, i);
+
+ if (cur->m_poss > max->m_poss)
+ max = cur;
+ }
+
+ printf("max value: %f\n", max->m_poss);
+
+ return true;
+}
+
+static bool dump_all_values(GPtrArray * values) {
+ if (0 == values->len)
+ return false;
+
+ printf("values:");
+ for (size_t i = 0; i < values->len; ++i) {
+ const lookup_value_t * cur =
+ (const lookup_value_t *) g_ptr_array_index(values, i);
+
+ printf("%f\t", cur->m_poss);
+ }
+ printf("\n");
+
+ return true;
+}
+
+/* populate the candidates. */
+static bool populate_candidates(/* out */ GPtrArray * candidates,
+ /* in */ LookupStepContent step) {
+ g_ptr_array_set_size(candidates, 0);
+
+ if (0 == step->len)
+ return false;
+
+ for (size_t i = 0; i < step->len; ++i) {
+ lookup_value_t * value = &g_array_index
+ (step, lookup_value_t, i);
+
+ g_ptr_array_add(candidates, value);
+ }
+
+ /* dump_max_value(candidates); */
+
+ return true;
+}
+
+static bool lookup_value_less_than(lookup_value_t * lhs, lookup_value_t * rhs){
+ return lhs->m_poss < rhs->m_poss;
+}
+
+/* use maximum heap to get the topest results. */
+static bool get_top_results(/* out */ GPtrArray * topresults,
+ /* in */ GPtrArray * candidates) {
+ g_ptr_array_set_size(topresults, 0);
+
+ if (0 == candidates->len)
+ return false;
+
+ lookup_value_t ** begin =
+ (lookup_value_t **) &g_ptr_array_index(candidates, 0);
+ lookup_value_t ** end =
+ (lookup_value_t **) &g_ptr_array_index(candidates, candidates->len);
+
+ std_lite::make_heap(begin, end, lookup_value_less_than);
+
+ while (end != begin) {
+ lookup_value_t * one = *begin;
+ g_ptr_array_add(topresults, one);
+
+ std_lite::pop_heap(begin, end, lookup_value_less_than);
+ --end;
+
+ if (topresults->len >= nbeam)
+ break;
+ }
+
+ /* dump_all_values(topresults); */
+
+ return true;
+}
+
+static bool populate_prefixes(GPtrArray * steps_index,
+ GPtrArray * steps_content,
+ TokenVector prefixes) {
+ assert(prefixes->len > 0);
+
+ for (size_t i = 0; i < prefixes->len; ++i) {
+ phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
+ lookup_key_t initial_key = token;
+ lookup_value_t initial_value(log(1));
+ initial_value.m_handles[1] = token;
+
+ LookupStepContent initial_step_content = (LookupStepContent)
+ g_ptr_array_index(steps_content, 0);
+ initial_step_content = g_array_append_val
+ (initial_step_content, initial_value);
+
+ LookupStepIndex initial_step_index = (LookupStepIndex)
+ g_ptr_array_index(steps_index, 0);
+ g_hash_table_insert(initial_step_index,
+ GUINT_TO_POINTER(initial_key),
+ GUINT_TO_POINTER(initial_step_content->len - 1));
+ }
+
+ return true;
+}
+
+static bool init_steps(GPtrArray * steps_index,
+ GPtrArray * steps_content,
+ int nstep){
+ /* add null start step */
+ g_ptr_array_set_size(steps_index, nstep);
+ g_ptr_array_set_size(steps_content, nstep);
+
+ for (int i = 0; i < nstep; ++i) {
+ /* initialize steps_index */
+ g_ptr_array_index(steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal);
+ /* initialize steps_content */
+ g_ptr_array_index(steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t));
+ }
+
+ return true;
+}
+
+static void clear_steps(GPtrArray * steps_index, GPtrArray * steps_content){
+ /* clear steps_index */
+ for ( size_t i = 0; i < steps_index->len; ++i){
+ GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
+ g_hash_table_destroy(table);
+ g_ptr_array_index(steps_index, i) = NULL;
+ }
+
+ /* clear steps_content */
+ for ( size_t i = 0; i < steps_content->len; ++i){
+ GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
+ g_array_free(array, TRUE);
+ g_ptr_array_index(steps_content, i) = NULL;
+ }
+}
+
+
+PinyinLookup2::PinyinLookup2(const gfloat lambda,
+ pinyin_option_t options,
+ FacadeChewingTable * pinyin_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram)
+ : bigram_lambda(lambda),
+ unigram_lambda(1. - lambda)
+{
+ m_options = options;
+ m_pinyin_table = pinyin_table;
+ m_phrase_index = phrase_index;
+ m_system_bigram = system_bigram;
+ m_user_bigram = user_bigram;
+
+ m_steps_index = g_ptr_array_new();
+ m_steps_content = g_ptr_array_new();
+
+ /* the member variables below are saved in get_best_match call. */
+ m_keys = NULL;
+ m_constraints = NULL;
+}
+
+PinyinLookup2::~PinyinLookup2(){
+ clear_steps(m_steps_index, m_steps_content);
+ g_ptr_array_free(m_steps_index, TRUE);
+ g_ptr_array_free(m_steps_content, TRUE);
+}
+
+
+bool PinyinLookup2::get_best_match(TokenVector prefixes,
+ ChewingKeyVector keys,
+ CandidateConstraints constraints,
+ MatchResults & results){
+ m_constraints = constraints;
+ m_keys = keys;
+ int nstep = keys->len + 1;
+
+ clear_steps(m_steps_index, m_steps_content);
+
+ init_steps(m_steps_index, m_steps_content, nstep);
+
+ populate_prefixes(m_steps_index, m_steps_content, prefixes);
+
+ PhraseIndexRanges ranges;
+ memset(ranges, 0, sizeof(PhraseIndexRanges));
+ m_phrase_index->prepare_ranges(ranges);
+
+ GPtrArray * candidates = g_ptr_array_new();
+ GPtrArray * topresults = g_ptr_array_new();
+
+ /* begin the viterbi beam search. */
+ for ( int i = 0; i < nstep - 1; ++i ){
+ lookup_constraint_t * cur_constraint = &g_array_index
+ (m_constraints, lookup_constraint_t, i);
+
+ if (CONSTRAINT_NOSEARCH == cur_constraint->m_type)
+ continue;
+
+ LookupStepContent step = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, i);
+
+ populate_candidates(candidates, step);
+ get_top_results(topresults, candidates);
+
+ if (0 == topresults->len)
+ continue;
+
+ for ( int m = i + 1; m < nstep; ++m ){
+ const int len = m - i;
+ if (len > MAX_PHRASE_LENGTH)
+ break;
+
+ lookup_constraint_t * next_constraint = &g_array_index
+ (m_constraints, lookup_constraint_t, m - 1);
+
+ if (CONSTRAINT_NOSEARCH == next_constraint->m_type)
+ break;
+
+ ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data;
+ /* do one pinyin table search. */
+ int result = m_pinyin_table->search(len, pinyin_keys + i, ranges);
+
+ if (result & SEARCH_OK) {
+ /* assume topresults always contains items. */
+ search_bigram2(topresults, i, ranges),
+ search_unigram2(topresults, i, ranges);
+ }
+
+ /* poke the next constraint. */
+ ++ next_constraint;
+ if (CONSTRAINT_ONESTEP == next_constraint->m_type)
+ break;
+
+ /* no longer pinyin */
+ if (!(result & SEARCH_CONTINUED))
+ break;
+ }
+ }
+
+ m_phrase_index->destroy_ranges(ranges);
+
+ g_ptr_array_free(candidates, TRUE);
+ g_ptr_array_free(topresults, TRUE);
+
+ return final_step(results);
+}
+
+bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep,
+ PhraseIndexRanges ranges) {
+
+ if (0 == topresults->len)
+ return false;
+
+ lookup_value_t * max = (lookup_value_t *)
+ g_ptr_array_index(topresults, 0);
+
+ lookup_constraint_t * constraint =
+ &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+ if (CONSTRAINT_ONESTEP == constraint->m_type) {
+ return unigram_gen_next_step(nstep, max, constraint->m_token);
+ }
+
+ bool found = false;
+
+ if (NO_CONSTRAINT == constraint->m_type) {
+ for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+ GArray * array = ranges[m];
+ if ( !array ) continue;
+
+ for ( size_t n = 0; n < array->len; ++n){
+ PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n);
+ for ( phrase_token_t token = range->m_range_begin;
+ token != range->m_range_end; ++token){
+ found = unigram_gen_next_step(nstep, max, token)|| found;
+ }
+ }
+ }
+ }
+
+ return found;
+}
+
+bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep,
+ PhraseIndexRanges ranges) {
+
+ lookup_constraint_t * constraint =
+ &g_array_index(m_constraints, lookup_constraint_t, nstep);
+
+ bool found = false;
+ BigramPhraseArray bigram_phrase_items = g_array_new
+ (FALSE, FALSE, sizeof(BigramPhraseItem));
+
+ for (size_t i = 0; i < topresults->len; ++i) {
+ lookup_value_t * value = (lookup_value_t *)
+ g_ptr_array_index(topresults, i);
+
+ phrase_token_t index_token = value->m_handles[1];
+
+ SingleGram * system = NULL, * user = NULL;
+ m_system_bigram->load(index_token, system);
+ m_user_bigram->load(index_token, user);
+
+ if ( !merge_single_gram(&m_merged_single_gram, system, user) )
+ continue;
+
+ if ( CONSTRAINT_ONESTEP == constraint->m_type ){
+ phrase_token_t token = constraint->m_token;
+
+ guint32 freq;
+ if( m_merged_single_gram.get_freq(token, freq) ){
+ guint32 total_freq;
+ m_merged_single_gram.get_total_freq(total_freq);
+ gfloat bigram_poss = freq / (gfloat) total_freq;
+ found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found;
+ }
+ }
+
+ if (NO_CONSTRAINT == constraint->m_type) {
+ for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){
+ GArray * array = ranges[m];
+ if ( !array ) continue;
+
+ for ( size_t n = 0; n < array->len; ++n){
+ PhraseIndexRange * range =
+ &g_array_index(array, PhraseIndexRange, n);
+
+ g_array_set_size(bigram_phrase_items, 0);
+ m_merged_single_gram.search(range, bigram_phrase_items);
+ for( size_t k = 0; k < bigram_phrase_items->len; ++k) {
+ BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k);
+ found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found;
+ }
+ }
+ }
+ }
+ if (system)
+ delete system;
+ if (user)
+ delete user;
+ }
+
+ g_array_free(bigram_phrase_items, TRUE);
+ return found;
+}
+
+
+bool PinyinLookup2::unigram_gen_next_step(int nstep,
+ lookup_value_t * cur_step,
+ phrase_token_t token) {
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble)
+ m_phrase_index->get_phrase_index_total_freq();
+ if ( elem_poss < DBL_EPSILON )
+ return false;
+
+ ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
+ gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
+ if (pinyin_poss < FLT_EPSILON )
+ return false;
+
+ lookup_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda);
+ next_step.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup2::bigram_gen_next_step(int nstep,
+ lookup_value_t * cur_step,
+ phrase_token_t token,
+ gfloat bigram_poss) {
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() /
+ (gdouble) m_phrase_index->get_phrase_index_total_freq();
+ if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
+ return false;
+
+ ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep;
+ gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys);
+ if ( pinyin_poss < FLT_EPSILON )
+ return false;
+
+ lookup_value_t next_step;
+ next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token;
+ next_step.m_poss = cur_step->m_poss +
+ log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss);
+ next_step.m_last_step = nstep;
+
+ return save_next_step(nstep + phrase_length, cur_step, &next_step);
+}
+
+bool PinyinLookup2::save_next_step(int next_step_pos,
+ lookup_value_t * cur_step,
+ lookup_value_t * next_step){
+
+ lookup_key_t next_key = next_step->m_handles[1];
+ LookupStepIndex next_lookup_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, next_step_pos);
+ LookupStepContent next_lookup_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, next_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean lookup_result = g_hash_table_lookup_extended
+ (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
+
+ if ( !lookup_result ){
+ g_array_append_val(next_lookup_content, *next_step);
+ g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1));
+ return true;
+ }else{
+ size_t step_index = GPOINTER_TO_UINT(value);
+ lookup_value_t * orig_next_value = &g_array_index
+ (next_lookup_content, lookup_value_t, step_index);
+
+ if ( orig_next_value->m_poss < next_step->m_poss) {
+ /* found better result. */
+ orig_next_value->m_handles[0] = next_step->m_handles[0];
+ assert(orig_next_value->m_handles[1] == next_step->m_handles[1]);
+ orig_next_value->m_poss = next_step->m_poss;
+ orig_next_value->m_last_step = next_step->m_last_step;
+ return true;
+ }
+
+ return false;
+ }
+}
+
+bool PinyinLookup2::final_step(MatchResults & results){
+
+ /* reset results */
+ g_array_set_size(results, m_steps_content->len - 1);
+ for (size_t i = 0; i < results->len; ++i){
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ *token = null_token;
+ }
+
+ /* find max element */
+ size_t last_step_pos = m_steps_content->len - 1;
+ GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos);
+ if ( last_step_array->len == 0 )
+ return false;
+
+ lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0);
+ for ( size_t i = 1; i < last_step_array->len; ++i){
+ lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i);
+ if ( cur_value->m_poss > max_value->m_poss )
+ max_value = cur_value;
+ }
+
+ /* backtracing */
+ while( true ){
+ int cur_step_pos = max_value->m_last_step;
+ if ( -1 == cur_step_pos )
+ break;
+
+ phrase_token_t * token = &g_array_index
+ (results, phrase_token_t, cur_step_pos);
+ *token = max_value->m_handles[1];
+
+ phrase_token_t last_token = max_value->m_handles[0];
+ LookupStepIndex lookup_step_index = (LookupStepIndex)
+ g_ptr_array_index(m_steps_index, cur_step_pos);
+
+ gpointer key = NULL, value = NULL;
+ gboolean result = g_hash_table_lookup_extended
+ (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
+ if (!result)
+ return false;
+
+ LookupStepContent lookup_step_content = (LookupStepContent)
+ g_ptr_array_index(m_steps_content, cur_step_pos);
+ max_value = &g_array_index
+ (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
+ }
+
+ /* no need to reverse the result */
+ return true;
+}
+
+
+bool PinyinLookup2::train_result2(ChewingKeyVector keys,
+ CandidateConstraints constraints,
+ MatchResults results) {
+ const guint32 initial_seed = 23 * 3;
+ const guint32 expand_factor = 2;
+ const guint32 unigram_factor = 7;
+ const guint32 pinyin_factor = 1;
+ const guint32 ceiling_seed = 23 * 15 * 64;
+
+ /* begin training based on constraints and results. */
+ bool train_next = false;
+ ChewingKey * pinyin_keys = (ChewingKey *) keys->data;
+
+ phrase_token_t last_token = sentence_start;
+ /* constraints->len + 1 == results->len */
+ for (size_t i = 0; i < constraints->len; ++i) {
+ phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ if (null_token == *token)
+ continue;
+
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, i);
+ if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) {
+ if (CONSTRAINT_ONESTEP == constraint->m_type) {
+ assert(*token == constraint->m_token);
+ train_next = true;
+ } else {
+ train_next = false;
+ }
+
+ guint32 seed = initial_seed;
+ /* train bi-gram first, and get train seed. */
+ if (last_token) {
+ SingleGram * user = NULL;
+ m_user_bigram->load(last_token, user);
+
+ guint32 total_freq = 0;
+ if (!user) {
+ user = new SingleGram;
+ }
+ assert(user->get_total_freq(total_freq));
+
+ guint32 freq = 0;
+ /* compute train factor */
+ if (!user->get_freq(*token, freq)) {
+ assert(user->insert_freq(*token, 0));
+ seed = initial_seed;
+ } else {
+ seed = std_lite::max(freq, initial_seed);
+ seed *= expand_factor;
+ seed = std_lite::min(seed, ceiling_seed);
+ }
+
+ /* protect against total_freq overflow */
+ if (seed > 0 && total_freq > total_freq + seed)
+ goto next;
+
+ assert(user->set_total_freq(total_freq + seed));
+ /* if total_freq is not overflow, then freq won't overflow. */
+ assert(user->set_freq(*token, freq + seed));
+ assert(m_user_bigram->store(last_token, user));
+ next:
+ assert(NULL != user);
+ if (user)
+ delete user;
+ }
+
+ /* train uni-gram */
+ m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+ m_cache_phrase_item.increase_pronunciation_possibility
+ (m_options, pinyin_keys + i, seed * pinyin_factor);
+ m_phrase_index->add_unigram_frequency
+ (*token, seed * unigram_factor);
+ }
+ last_token = *token;
+ }
+ return true;
+}
+
+
+int PinyinLookup2::add_constraint(CandidateConstraints constraints,
+ size_t index,
+ phrase_token_t token) {
+
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return 0;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ if ( index + phrase_length > constraints->len )
+ return 0;
+
+ for (size_t i = index; i < index + phrase_length; ++i){
+ clear_constraint(constraints, i);
+ }
+
+ /* store one step constraint */
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, index);
+ constraint->m_type = CONSTRAINT_ONESTEP;
+ constraint->m_token = token;
+
+ /* propagate no search constraint */
+ for (size_t i = 1; i < phrase_length; ++i){
+ constraint = &g_array_index(constraints, lookup_constraint_t, index + i);
+ constraint->m_type = CONSTRAINT_NOSEARCH;
+ constraint->m_constraint_step = index;
+ }
+
+ return phrase_length;
+}
+
+bool PinyinLookup2::clear_constraint(CandidateConstraints constraints,
+ int index) {
+ if (index < 0 || index >= constraints->len)
+ return false;
+
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, index);
+
+ if (NO_CONSTRAINT == constraint->m_type)
+ return false;
+
+ if (CONSTRAINT_NOSEARCH == constraint->m_type){
+ index = constraint->m_constraint_step;
+ constraint = &g_array_index(constraints, lookup_constraint_t, index);
+ }
+
+ /* now var constraint points to the one step constraint. */
+ assert(constraint->m_type == CONSTRAINT_ONESTEP);
+
+ phrase_token_t token = constraint->m_token;
+ if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
+ return false;
+
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+ for ( size_t i = 0; i < phrase_length; ++i){
+ if (index + i >= constraints->len)
+ continue;
+
+ constraint = &g_array_index
+ (constraints, lookup_constraint_t, index + i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+
+ return true;
+}
+
+bool PinyinLookup2::validate_constraint(CandidateConstraints constraints,
+ ChewingKeyVector keys) {
+ /* resize constraints array first */
+ size_t constraints_length = constraints->len;
+
+ if ( keys->len > constraints_length ){
+ g_array_set_size(constraints, keys->len);
+
+ /* initialize new element */
+ for( size_t i = constraints_length; i < keys->len; ++i){
+ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
+ constraint->m_type = NO_CONSTRAINT;
+ }
+
+ }else if (keys->len < constraints_length ){
+ /* just shrink it */
+ g_array_set_size(constraints, keys->len);
+ }
+
+ for ( size_t i = 0; i < constraints->len; ++i){
+ lookup_constraint_t * constraint = &g_array_index
+ (constraints, lookup_constraint_t, i);
+
+ /* handle one step constraint */
+ if ( constraint->m_type == CONSTRAINT_ONESTEP ){
+
+ phrase_token_t token = constraint->m_token;
+ m_phrase_index->get_phrase_item(token, m_cache_phrase_item);
+ size_t phrase_length = m_cache_phrase_item.get_phrase_length();
+
+ /* clear too long constraint */
+ if (i + phrase_length > constraints->len){
+ clear_constraint(constraints, i);
+ continue;
+ }
+
+ ChewingKey * pinyin_keys = (ChewingKey *)keys->data;
+ /* clear invalid pinyin */
+ gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys + i);
+ if (pinyin_poss < FLT_EPSILON)
+ clear_constraint(constraints, i);
+ }
+ }
+ return true;
+}
diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h
new file mode 100644
index 0000000..dbe15c9
--- /dev/null
+++ b/src/lookup/pinyin_lookup2.h
@@ -0,0 +1,240 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef PINYIN_LOOKUP2_H
+#define PINYIN_LOOKUP2_H
+
+
+#include <float.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "chewing_key.h"
+#include "phrase_index.h"
+#include "ngram.h"
+#include "lookup.h"
+
+
+namespace pinyin{
+
+/**
+ * pinyin_lookup2.h
+ *
+ * The definitions of pinyin lookup related classes and structs.
+ *
+ */
+
+
+
+enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH };
+
+struct lookup_constraint_t{
+ /* current type of the step */
+ constraint_type m_type;
+
+ /* Note:
+ * value of m_type:
+ * NO_CONSTRAINT:
+ * no values in the below union.
+ * search all possible next words.
+ * CONSTRAINT_ONESTEP:
+ * m_token contains the next word.
+ * only one word can be used to search for the next step,
+ * use case for user selected candidates.
+ * CONSTRAINT_NOSEARCH:
+ * m_constraint_step contains the value
+ * which points back to the CONSTRAINT_ONESTEP step.
+ * no search is allowed for the current step.
+ */
+
+ union{
+ phrase_token_t m_token;
+ guint32 m_constraint_step; /* index of m_token */
+ };
+};
+
+
+/**
+ * PinyinLookup2:
+ *
+ * The pinyin lookup class to convert pinyin keys to guessed sentence.
+ *
+ */
+class PinyinLookup2{
+private:
+ const gfloat bigram_lambda;
+ const gfloat unigram_lambda;
+
+ PhraseItem m_cache_phrase_item;
+ SingleGram m_merged_single_gram;
+
+protected:
+ /* saved varibles */
+ CandidateConstraints m_constraints;
+ ChewingKeyVector m_keys;
+
+ pinyin_option_t m_options;
+ FacadeChewingTable * m_pinyin_table;
+ FacadePhraseIndex * m_phrase_index;
+ Bigram * m_system_bigram;
+ Bigram * m_user_bigram;
+
+ /* internal step data structure */
+ GPtrArray * m_steps_index;
+ /* Array of LookupStepIndex */
+ GPtrArray * m_steps_content;
+ /* Array of LookupStepContent */
+
+
+ bool search_unigram2(GPtrArray * topresults, int nstep,
+ PhraseIndexRanges ranges);
+ bool search_bigram2(GPtrArray * topresults, int nstep,
+ PhraseIndexRanges ranges);
+
+ bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
+ bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss);
+
+ bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step);
+
+ bool final_step(MatchResults & results);
+
+public:
+ /**
+ * PinyinLookup2::PinyinLookup2:
+ * @lambda: the lambda parameter for interpolation model.
+ * @options: the pinyin options.
+ * @pinyin_table: the pinyin table.
+ * @phrase_index: the phrase index.
+ * @system_bigram: the system bi-gram.
+ * @user_bigram: the user bi-gram.
+ *
+ * The constructor of the PinyinLookup2.
+ *
+ */
+ PinyinLookup2(const gfloat lambda,
+ pinyin_option_t options,
+ FacadeChewingTable * pinyin_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram);
+
+ /**
+ * PinyinLookup2::~PinyinLookup2:
+ *
+ * The destructor of the PinyinLookup2.
+ *
+ */
+ ~PinyinLookup2();
+
+ /**
+ * PinyinLookup2::set_options:
+ * @options: the pinyin options.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the pinyin options.
+ *
+ */
+ bool set_options(pinyin_option_t options) {
+ m_options = options;
+ return true;
+ }
+
+ /**
+ * PinyinLookup2::get_best_match:
+ * @prefixes: the phrase tokens before the guessed sentence.
+ * @keys: the pinyin keys of the guessed sentence.
+ * @constraints: the constraints on the guessed sentence.
+ * @results: the guessed sentence in the form of the phrase tokens.
+ * @returns: whether the guess operation is successful.
+ *
+ * Guess the best sentence according to user inputs.
+ *
+ */
+ bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+
+ /**
+ * PinyinLookup2::train_result2:
+ * @keys: the pinyin keys of the guessed sentence.
+ * @constraints: the constraints on the guessed sentence.
+ * @results: the guessed sentence in the form of the phrase tokens.
+ * @returns: whether the train operation is successful.
+ *
+ * Self learning the guessed sentence based on the constraints.
+ *
+ */
+ bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results);
+
+ /**
+ * PinyinLookup2::convert_to_utf8:
+ * @results: the guessed sentence in the form of the phrase tokens.
+ * @result_string: the guessed sentence in the utf8 encoding.
+ * @returns: whether the convert operation is successful.
+ *
+ * Convert the guessed sentence from the phrase tokens to the utf8 string.
+ *
+ */
+ bool convert_to_utf8(MatchResults results,
+ /* out */ char * & result_string)
+ {
+ return pinyin::convert_to_utf8(m_phrase_index, results,
+ NULL, false, result_string);
+ }
+
+
+ /**
+ * PinyinLookup2::add_constraint:
+ * @constraints: the constraints on the guessed sentence.
+ * @index: the character offset in the guessed sentence.
+ * @token: the phrase token in the candidate list chosen by user.
+ * @returns: the number of the characters in the chosen token.
+ *
+ * Add one constraint to the constraints on the guessed sentence.
+ *
+ */
+ int add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token);
+
+ /**
+ * PinyinLookup2::clear_constraint:
+ * @constraints: the constraints on the guessed sentence.
+ * @index: the character offset in the guessed sentence.
+ * @returns: whether the clear operation is successful.
+ *
+ * Clear one constraint in the constraints on the guessed sentence.
+ *
+ */
+ bool clear_constraint(CandidateConstraints constraints, int index);
+
+ /**
+ * PinyinLookup2::validate_constraint:
+ * @constraints: the constraints on the guessed sentence.
+ * @keys: the pinyin keys of the guessed sentence.
+ * @returns: whether the validate operation is successful.
+ *
+ * Validate the old constraints with the new pinyin keys.
+ *
+ */
+ bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys);
+
+};
+
+};
+
+#endif