diff options
Diffstat (limited to 'src/lookup')
-rw-r--r-- | src/lookup/CMakeLists.txt | 23 | ||||
-rw-r--r-- | src/lookup/Makefile.am | 36 | ||||
-rw-r--r-- | src/lookup/lookup.cpp | 73 | ||||
-rw-r--r-- | src/lookup/lookup.h | 79 | ||||
-rw-r--r-- | src/lookup/phrase_lookup.cpp | 434 | ||||
-rw-r--r-- | src/lookup/phrase_lookup.h | 142 | ||||
-rw-r--r-- | src/lookup/pinyin_lookup2.cpp | 730 | ||||
-rw-r--r-- | src/lookup/pinyin_lookup2.h | 240 |
8 files changed, 1757 insertions, 0 deletions
diff --git a/src/lookup/CMakeLists.txt b/src/lookup/CMakeLists.txt new file mode 100644 index 0000000..937b2cb --- /dev/null +++ b/src/lookup/CMakeLists.txt @@ -0,0 +1,23 @@ +set( + CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" +) + +set( + LIBLOOKUP_SOURCES + pinyin_lookup2.cpp + phrase_lookup.cpp + lookup.cpp +) + +add_library( + lookup + STATIC + ${LIBLOOKUP_SOURCES} +) + +install( + FILES + ${LIBLOOKUP_HEADERS} + DESTINATION + ${DIR_INCLUDE_LIBPINYIN} +) diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am new file mode 100644 index 0000000..00d7df4 --- /dev/null +++ b/src/lookup/Makefile.am @@ -0,0 +1,36 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/src/storage \ + @GLIB2_CFLAGS@ + +noinst_HEADERS = lookup.h \ + pinyin_lookup2.h \ + phrase_lookup.h + +noinst_LTLIBRARIES = liblookup.la + +liblookup_la_CXXFLAGS = "-fPIC" + +liblookup_la_LDFLAGS = -static + +liblookup_la_SOURCES = pinyin_lookup2.cpp \ + phrase_lookup.cpp \ + lookup.cpp diff --git a/src/lookup/lookup.cpp b/src/lookup/lookup.cpp new file mode 100644 index 0000000..c32a0ec --- /dev/null +++ b/src/lookup/lookup.cpp @@ -0,0 +1,73 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "lookup.h" +#include "phrase_index.h" + +namespace pinyin{ + +bool convert_to_utf8(FacadePhraseIndex * phrase_index, + MatchResults match_results, + /* in */ const char * delimiter, + /* in */ bool show_tokens, + /* out */ char * & result_string){ + //init variables + if ( NULL == delimiter ) + delimiter = ""; + result_string = NULL; + + PhraseItem item; + + for ( size_t i = 0; i < match_results->len; ++i ){ + phrase_token_t token = g_array_index + (match_results, phrase_token_t, i); + if ( null_token == token ) + continue; + + phrase_index->get_phrase_item(token, item); + ucs4_t buffer[MAX_PHRASE_LENGTH]; + item.get_phrase_string(buffer); + + guint8 length = item.get_phrase_length(); + gchar * phrase = NULL; + char * tmp = NULL; + + if (show_tokens) { + tmp = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + phrase = g_strdup_printf("%d %s", token, tmp); + g_free(tmp); + } else { + phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + } + + tmp = result_string; + if ( NULL == result_string ) + result_string = g_strdup(phrase); + else + result_string = g_strconcat(result_string, delimiter, phrase, NULL); + g_free(phrase); + g_free(tmp); + } + return true; +} + +}; diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h new file mode 100644 index 0000000..8dc1a89 --- /dev/null +++ b/src/lookup/lookup.h @@ -0,0 +1,79 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef LOOKUP_H +#define LOOKUP_H + + +/** @file lookup.h + * @brief the definitions of common lookup related classes and structs. + */ + +#include "novel_types.h" +#include <limits.h> + +namespace pinyin{ + +typedef phrase_token_t lookup_key_t; + +struct lookup_value_t{ + /* previous and current tokens of the node */ + phrase_token_t m_handles[2]; + /* maximum possibility of current node */ + gfloat m_poss; + /* trace back information for final step */ + gint32 m_last_step; + + lookup_value_t(gfloat poss = FLT_MAX){ + m_handles[0] = null_token; m_handles[1] = null_token; + m_poss = poss; + m_last_step = -1; + } +}; + + +class FacadePhraseIndex; + + +/* Note: + * LookupStepIndex: + * the main purpose of lookup step index is served for an index + * for lookup step content, which can quickly merge the same node + * with different possibilities, + * then only keep the highest value of the node. + * LookupStepContent: + * the place to store the lookup values of current step, + * and indexed by lookup step index. + * See also comments on lookup_value_t. + */ + +typedef GHashTable * LookupStepIndex; +/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */ +typedef GArray * LookupStepContent; /* array of lookup_value_t */ + +bool convert_to_utf8(FacadePhraseIndex * phrase_index, + MatchResults match_results, + /* in */ const char * delimiter, + /* in */ bool show_tokens, + /* out */ char * & result_string); + +}; +#endif diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp new file mode 100644 index 0000000..f7da0b7 --- /dev/null +++ b/src/lookup/phrase_lookup.cpp @@ -0,0 +1,434 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <math.h> +#include "stl_lite.h" +#include "novel_types.h" +#include "phrase_index.h" +#include "facade_phrase_table2.h" +#include "ngram.h" +#include "phrase_lookup.h" + +using namespace pinyin; + + +/* +const gfloat PhraseLookup::bigram_lambda = lambda; +const gfloat PhraseLookup::unigram_lambda = 1 - lambda; +*/ + +static bool populate_prefixes(GPtrArray * steps_index, + GPtrArray * steps_content) { + + lookup_key_t initial_key = sentence_start; + lookup_value_t initial_value(log(1)); + initial_value.m_handles[1] = sentence_start; + + LookupStepContent initial_step_content = (LookupStepContent) + g_ptr_array_index(steps_content, 0); + g_array_append_val(initial_step_content, initial_value); + + LookupStepIndex initial_step_index = (LookupStepIndex) + g_ptr_array_index(steps_index, 0); + g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), + GUINT_TO_POINTER(initial_step_content->len - 1)); + + return true; +} + +static bool init_steps(GPtrArray * steps_index, + GPtrArray * steps_content, + int nstep) { + + /* add null start step */ + g_ptr_array_set_size(steps_index, nstep); + g_ptr_array_set_size(steps_content, nstep); + + for ( int i = 0; i < nstep; ++i ){ + /* initialize steps_index */ + g_ptr_array_index(steps_index, i) = g_hash_table_new + (g_direct_hash, g_direct_equal); + /* initialize steps_content */ + g_ptr_array_index(steps_content, i) = g_array_new + (FALSE, FALSE, sizeof(lookup_value_t)); + } + + return true; +} + +static void clear_steps(GPtrArray * steps_index, + GPtrArray * steps_content){ + /* clear steps_index */ + for ( size_t i = 0; i < steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(steps_index, i) = NULL; + } + + /* free steps_content */ + for ( size_t i = 0; i < steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(steps_content, i) = NULL; + } +} + +PhraseLookup::PhraseLookup(const gfloat lambda, + FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram) + : bigram_lambda(lambda), + unigram_lambda(1. - lambda) +{ + m_phrase_table = phrase_table; + m_phrase_index = phrase_index; + m_system_bigram = system_bigram; + m_user_bigram = user_bigram; + + m_steps_index = g_ptr_array_new(); + m_steps_content = g_ptr_array_new(); + + /* the member variables below are saved in get_best_match call. */ + m_sentence = NULL; + m_sentence_length = 0; +} + +PhraseLookup::~PhraseLookup(){ + clear_steps(m_steps_index, m_steps_content); + g_ptr_array_free(m_steps_index, TRUE); + g_ptr_array_free(m_steps_content, TRUE); +} + +bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[], + MatchResults & results){ + m_sentence_length = sentence_length; + m_sentence = sentence; + int nstep = m_sentence_length + 1; + + clear_steps(m_steps_index, m_steps_content); + + init_steps(m_steps_index, m_steps_content, nstep); + + populate_prefixes(m_steps_index, m_steps_content); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + m_phrase_index->prepare_tokens(tokens); + + for ( int i = 0; i < nstep - 1; ++i ){ + for ( int m = i + 1; m < nstep; ++m ){ + + /* do one phrase table search. */ + int result = m_phrase_table->search(m - i, sentence + i, tokens); + + /* found next phrase */ + if ( result & SEARCH_OK ) { + search_bigram2(i, tokens), + search_unigram2(i, tokens); + } + + /* no longer phrase */ + if (!(result & SEARCH_CONTINUED)) + break; + } + } + + m_phrase_index->destroy_tokens(tokens); + + return final_step(results); +} + +#if 0 + +bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){ + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return false; + + lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0); + /* find the maximum node */ + for ( size_t i = 1; i < lookup_content->len; ++i ){ + lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + return unigram_gen_next_step(nstep, max_value, token); +} + +bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return false; + + for ( size_t i = 0; i < lookup_content->len; ++i ){ + lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i); + phrase_token_t index_token = cur_value->m_handles[1]; + SingleGram * system, * user; + m_system_bigram->load(index_token, system); + m_user_bigram->load(index_token, user); + + if ( !merge_single_gram(&m_merged_single_gram, system, user) ) + continue; + + guint32 freq; + if ( m_merged_single_gram.get_freq(token, freq) ){ + guint32 total_freq; + m_merged_single_gram.get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; + } + + if (system) + delete system; + if (user) + delete user; + } + + return found; +} + +#endif + +bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if ( 0 == lookup_content->len ) + return found; + + /* find the maximum node */ + lookup_value_t * max_value = &g_array_index + (lookup_content, lookup_value_t, 0); + + for (size_t i = 1; i < lookup_content->len; ++i) { + lookup_value_t * cur_value = &g_array_index + (lookup_content, lookup_value_t, i); + if (cur_value->m_poss > max_value->m_poss) + max_value = cur_value; + } + + /* iterate over tokens */ + for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) { + GArray * array = tokens[n]; + if (NULL == array) + continue; + + /* just skip the loop when the length is zero. */ + for (size_t k = 0; k < array->len; ++k) { + phrase_token_t token = + g_array_index(array, phrase_token_t, k); + + found = unigram_gen_next_step + (nstep, max_value, token) || found; + } + } + + return found; +} + +bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){ + bool found = false; + + LookupStepContent lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, nstep); + if (0 == lookup_content->len) + return found; + + for (size_t i = 0; i < lookup_content->len; ++i) { + lookup_value_t * cur_value = &g_array_index + (lookup_content, lookup_value_t, i); + phrase_token_t index_token = cur_value->m_handles[1]; + + SingleGram * system = NULL, * user = NULL; + m_system_bigram->load(index_token, system); + m_user_bigram->load(index_token, user); + + if (!merge_single_gram + (&m_merged_single_gram, system, user)) + continue; + + /* iterate over tokens */ + for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) { + GArray * array = tokens[n]; + if (NULL == array) + continue; + + /* just skip the loop when the length is zero. */ + for (size_t k = 0; k < array->len; ++k) { + phrase_token_t token = + g_array_index(array, phrase_token_t, k); + + guint32 freq = 0; + if (m_merged_single_gram.get_freq(token, freq)) { + guint32 total_freq = 0; + m_merged_single_gram.get_total_freq(total_freq); + + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; + } + } + } + + if (system) + delete system; + if (user) + delete user; + } + + return found; +} + +bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value, +phrase_token_t token){ + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) + m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < DBL_EPSILON ) + return false; + + lookup_value_t next_value; + next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token; + next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda); + next_value.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_value, &next_value); +} + +bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){ + + if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / + (gdouble) m_phrase_index->get_phrase_index_total_freq(); + + if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) + return false; + + lookup_value_t next_value; + next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token; + next_value.m_poss = cur_value->m_poss + + log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss ); + next_value.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_value, &next_value); +} + +bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){ + + LookupStepIndex next_lookup_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, next_step_pos); + LookupStepContent next_lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, next_step_pos); + + lookup_key_t next_key = next_value->m_handles[1]; + + gpointer key = NULL, value = NULL; + gboolean lookup_result = g_hash_table_lookup_extended + (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); + + if (!lookup_result){ + g_array_append_val(next_lookup_content, *next_value); + g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), + GUINT_TO_POINTER(next_lookup_content->len - 1)); + return true; + }else{ + size_t step_index = GPOINTER_TO_UINT(value); + lookup_value_t * orig_next_value = &g_array_index + (next_lookup_content, lookup_value_t, step_index); + + if ( orig_next_value->m_poss < next_value->m_poss ){ + orig_next_value->m_handles[0] = next_value->m_handles[0]; + assert(orig_next_value->m_handles[1] == next_value->m_handles[1]); + orig_next_value->m_poss = next_value->m_poss; + orig_next_value->m_last_step = next_value->m_last_step; + return true; + } + return false; + } +} + +bool PhraseLookup::final_step(MatchResults & results ){ + + /* reset results */ + g_array_set_size(results, m_steps_content->len - 1); + for ( size_t i = 0; i < results->len; ++i ){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + *token = null_token; + } + + /* find max element */ + size_t last_step_pos = m_steps_content->len - 1; + LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index + (m_steps_content, last_step_pos); + if ( last_step_content->len == 0 ) + return false; + + lookup_value_t * max_value = &g_array_index + (last_step_content, lookup_value_t, 0); + for ( size_t i = 1; i < last_step_content->len; ++i ){ + lookup_value_t * cur_value = &g_array_index + (last_step_content, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + /* backtracing */ + while( true ){ + int cur_step_pos = max_value->m_last_step; + if ( -1 == cur_step_pos ) + break; + + phrase_token_t * token = &g_array_index + (results, phrase_token_t, cur_step_pos); + *token = max_value->m_handles[1]; + + phrase_token_t last_token = max_value->m_handles[0]; + LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos); + + gpointer key = NULL, value = NULL; + gboolean result = g_hash_table_lookup_extended + (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); + if ( !result ) + return false; + + LookupStepContent lookup_step_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, cur_step_pos); + max_value = &g_array_index + (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); + } + + /* no need to reverse the result */ + return true; +} diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h new file mode 100644 index 0000000..cf65692 --- /dev/null +++ b/src/lookup/phrase_lookup.h @@ -0,0 +1,142 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef PHRASE_LOOKUP_H +#define PHRASE_LOOKUP_H + +#include "novel_types.h" +#include "ngram.h" +#include "lookup.h" + +/** + * phrase_lookup.h + * + * The definitions of phrase lookup related classes and structs. + * + */ + +namespace pinyin{ + +/** + * PhraseLookup: + * + * The phrase lookup class to convert the sentence to phrase tokens. + * + */ +class PhraseLookup{ +private: + const gfloat bigram_lambda; + const gfloat unigram_lambda; + + PhraseItem m_cache_phrase_item; + SingleGram m_merged_single_gram; +protected: + //saved varibles + FacadePhraseTable2 * m_phrase_table; + FacadePhraseIndex * m_phrase_index; + Bigram * m_system_bigram; + Bigram * m_user_bigram; + + //internal step data structure + GPtrArray * m_steps_index; + /* Array of LookupStepIndex */ + GPtrArray * m_steps_content; + /* Array of LookupStepContent */ + + /* Saved sentence */ + int m_sentence_length; + ucs4_t * m_sentence; + +protected: + /* Explicitly search the next phrase, + * to avoid double phrase lookup as the next token has only one. + */ + bool search_unigram2(int nstep, PhraseTokens tokens); + bool search_bigram2(int nstep, PhraseTokens tokens); + + bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token); + bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss); + + bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step); + + bool final_step(MatchResults & results); +public: + /** + * PhraseLookup::PhraseLookup: + * @lambda: the lambda parameter for interpolation model. + * @phrase_table: the phrase table. + * @phrase_index: the phrase index. + * @system_bigram: the system bi-gram. + * @user_bigram: the user bi-gram. + * + * The constructor of the PhraseLookup. + * + */ + PhraseLookup(const gfloat lambda, + FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram); + + /** + * PhraseLookup::~PhraseLookup: + * + * The destructor of the PhraseLookup. + * + */ + ~PhraseLookup(); + + /** + * PhraseLookup::get_best_match: + * @sentence_length: the length of the sentence in ucs4 characters. + * @sentence: the ucs4 characters of the sentence. + * @results: the segmented sentence in the form of phrase tokens. + * @returns: whether the segment operation is successful. + * + * Segment the sentence into phrase tokens. + * + * Note: this method only accepts the characters in phrase large table. + * + */ + bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results); + + /** + * PhraseLookup::convert_to_utf8: + * @results: the guessed sentence in the form of phrase tokens. + * @result_string: the converted sentence in utf8 string. + * @returns: whether the convert operation is successful. + * + * Convert the sentence from phrase tokens to the utf8 string. + * + * Note: free the result_string by g_free. + * + */ + bool convert_to_utf8(MatchResults results, + /* out */ char * & result_string) + { + return pinyin::convert_to_utf8(m_phrase_index, results, + "\n", true, result_string); + } +}; + +}; + +#endif diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp new file mode 100644 index 0000000..2250a93 --- /dev/null +++ b/src/lookup/pinyin_lookup2.cpp @@ -0,0 +1,730 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <math.h> +#include "facade_chewing_table.h" +#include "pinyin_lookup2.h" +#include "stl_lite.h" + +using namespace pinyin; + +/* +const gfloat PinyinLookup2::bigram_lambda = lambda; +const gfloat PinyinLookup2::unigram_lambda = 1 - lambda; +*/ + +/* internal definition */ +static const size_t nbeam = 32; + +static bool dump_max_value(GPtrArray * values){ + if (0 == values->len) + return false; + + const lookup_value_t * max = + (const lookup_value_t *) g_ptr_array_index(values, 0); + + for (size_t i = 1; i < values->len; ++i) { + const lookup_value_t * cur = + (const lookup_value_t *) g_ptr_array_index(values, i); + + if (cur->m_poss > max->m_poss) + max = cur; + } + + printf("max value: %f\n", max->m_poss); + + return true; +} + +static bool dump_all_values(GPtrArray * values) { + if (0 == values->len) + return false; + + printf("values:"); + for (size_t i = 0; i < values->len; ++i) { + const lookup_value_t * cur = + (const lookup_value_t *) g_ptr_array_index(values, i); + + printf("%f\t", cur->m_poss); + } + printf("\n"); + + return true; +} + +/* populate the candidates. */ +static bool populate_candidates(/* out */ GPtrArray * candidates, + /* in */ LookupStepContent step) { + g_ptr_array_set_size(candidates, 0); + + if (0 == step->len) + return false; + + for (size_t i = 0; i < step->len; ++i) { + lookup_value_t * value = &g_array_index + (step, lookup_value_t, i); + + g_ptr_array_add(candidates, value); + } + + /* dump_max_value(candidates); */ + + return true; +} + +static bool lookup_value_less_than(lookup_value_t * lhs, lookup_value_t * rhs){ + return lhs->m_poss < rhs->m_poss; +} + +/* use maximum heap to get the topest results. */ +static bool get_top_results(/* out */ GPtrArray * topresults, + /* in */ GPtrArray * candidates) { + g_ptr_array_set_size(topresults, 0); + + if (0 == candidates->len) + return false; + + lookup_value_t ** begin = + (lookup_value_t **) &g_ptr_array_index(candidates, 0); + lookup_value_t ** end = + (lookup_value_t **) &g_ptr_array_index(candidates, candidates->len); + + std_lite::make_heap(begin, end, lookup_value_less_than); + + while (end != begin) { + lookup_value_t * one = *begin; + g_ptr_array_add(topresults, one); + + std_lite::pop_heap(begin, end, lookup_value_less_than); + --end; + + if (topresults->len >= nbeam) + break; + } + + /* dump_all_values(topresults); */ + + return true; +} + +static bool populate_prefixes(GPtrArray * steps_index, + GPtrArray * steps_content, + TokenVector prefixes) { + assert(prefixes->len > 0); + + for (size_t i = 0; i < prefixes->len; ++i) { + phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); + lookup_key_t initial_key = token; + lookup_value_t initial_value(log(1)); + initial_value.m_handles[1] = token; + + LookupStepContent initial_step_content = (LookupStepContent) + g_ptr_array_index(steps_content, 0); + initial_step_content = g_array_append_val + (initial_step_content, initial_value); + + LookupStepIndex initial_step_index = (LookupStepIndex) + g_ptr_array_index(steps_index, 0); + g_hash_table_insert(initial_step_index, + GUINT_TO_POINTER(initial_key), + GUINT_TO_POINTER(initial_step_content->len - 1)); + } + + return true; +} + +static bool init_steps(GPtrArray * steps_index, + GPtrArray * steps_content, + int nstep){ + /* add null start step */ + g_ptr_array_set_size(steps_index, nstep); + g_ptr_array_set_size(steps_content, nstep); + + for (int i = 0; i < nstep; ++i) { + /* initialize steps_index */ + g_ptr_array_index(steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal); + /* initialize steps_content */ + g_ptr_array_index(steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t)); + } + + return true; +} + +static void clear_steps(GPtrArray * steps_index, GPtrArray * steps_content){ + /* clear steps_index */ + for ( size_t i = 0; i < steps_index->len; ++i){ + GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i); + g_hash_table_destroy(table); + g_ptr_array_index(steps_index, i) = NULL; + } + + /* clear steps_content */ + for ( size_t i = 0; i < steps_content->len; ++i){ + GArray * array = (GArray *) g_ptr_array_index(steps_content, i); + g_array_free(array, TRUE); + g_ptr_array_index(steps_content, i) = NULL; + } +} + + +PinyinLookup2::PinyinLookup2(const gfloat lambda, + pinyin_option_t options, + FacadeChewingTable * pinyin_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram) + : bigram_lambda(lambda), + unigram_lambda(1. - lambda) +{ + m_options = options; + m_pinyin_table = pinyin_table; + m_phrase_index = phrase_index; + m_system_bigram = system_bigram; + m_user_bigram = user_bigram; + + m_steps_index = g_ptr_array_new(); + m_steps_content = g_ptr_array_new(); + + /* the member variables below are saved in get_best_match call. */ + m_keys = NULL; + m_constraints = NULL; +} + +PinyinLookup2::~PinyinLookup2(){ + clear_steps(m_steps_index, m_steps_content); + g_ptr_array_free(m_steps_index, TRUE); + g_ptr_array_free(m_steps_content, TRUE); +} + + +bool PinyinLookup2::get_best_match(TokenVector prefixes, + ChewingKeyVector keys, + CandidateConstraints constraints, + MatchResults & results){ + m_constraints = constraints; + m_keys = keys; + int nstep = keys->len + 1; + + clear_steps(m_steps_index, m_steps_content); + + init_steps(m_steps_index, m_steps_content, nstep); + + populate_prefixes(m_steps_index, m_steps_content, prefixes); + + PhraseIndexRanges ranges; + memset(ranges, 0, sizeof(PhraseIndexRanges)); + m_phrase_index->prepare_ranges(ranges); + + GPtrArray * candidates = g_ptr_array_new(); + GPtrArray * topresults = g_ptr_array_new(); + + /* begin the viterbi beam search. */ + for ( int i = 0; i < nstep - 1; ++i ){ + lookup_constraint_t * cur_constraint = &g_array_index + (m_constraints, lookup_constraint_t, i); + + if (CONSTRAINT_NOSEARCH == cur_constraint->m_type) + continue; + + LookupStepContent step = (LookupStepContent) + g_ptr_array_index(m_steps_content, i); + + populate_candidates(candidates, step); + get_top_results(topresults, candidates); + + if (0 == topresults->len) + continue; + + for ( int m = i + 1; m < nstep; ++m ){ + const int len = m - i; + if (len > MAX_PHRASE_LENGTH) + break; + + lookup_constraint_t * next_constraint = &g_array_index + (m_constraints, lookup_constraint_t, m - 1); + + if (CONSTRAINT_NOSEARCH == next_constraint->m_type) + break; + + ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data; + /* do one pinyin table search. */ + int result = m_pinyin_table->search(len, pinyin_keys + i, ranges); + + if (result & SEARCH_OK) { + /* assume topresults always contains items. */ + search_bigram2(topresults, i, ranges), + search_unigram2(topresults, i, ranges); + } + + /* poke the next constraint. */ + ++ next_constraint; + if (CONSTRAINT_ONESTEP == next_constraint->m_type) + break; + + /* no longer pinyin */ + if (!(result & SEARCH_CONTINUED)) + break; + } + } + + m_phrase_index->destroy_ranges(ranges); + + g_ptr_array_free(candidates, TRUE); + g_ptr_array_free(topresults, TRUE); + + return final_step(results); +} + +bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep, + PhraseIndexRanges ranges) { + + if (0 == topresults->len) + return false; + + lookup_value_t * max = (lookup_value_t *) + g_ptr_array_index(topresults, 0); + + lookup_constraint_t * constraint = + &g_array_index(m_constraints, lookup_constraint_t, nstep); + + if (CONSTRAINT_ONESTEP == constraint->m_type) { + return unigram_gen_next_step(nstep, max, constraint->m_token); + } + + bool found = false; + + if (NO_CONSTRAINT == constraint->m_type) { + for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ + GArray * array = ranges[m]; + if ( !array ) continue; + + for ( size_t n = 0; n < array->len; ++n){ + PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n); + for ( phrase_token_t token = range->m_range_begin; + token != range->m_range_end; ++token){ + found = unigram_gen_next_step(nstep, max, token)|| found; + } + } + } + } + + return found; +} + +bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep, + PhraseIndexRanges ranges) { + + lookup_constraint_t * constraint = + &g_array_index(m_constraints, lookup_constraint_t, nstep); + + bool found = false; + BigramPhraseArray bigram_phrase_items = g_array_new + (FALSE, FALSE, sizeof(BigramPhraseItem)); + + for (size_t i = 0; i < topresults->len; ++i) { + lookup_value_t * value = (lookup_value_t *) + g_ptr_array_index(topresults, i); + + phrase_token_t index_token = value->m_handles[1]; + + SingleGram * system = NULL, * user = NULL; + m_system_bigram->load(index_token, system); + m_user_bigram->load(index_token, user); + + if ( !merge_single_gram(&m_merged_single_gram, system, user) ) + continue; + + if ( CONSTRAINT_ONESTEP == constraint->m_type ){ + phrase_token_t token = constraint->m_token; + + guint32 freq; + if( m_merged_single_gram.get_freq(token, freq) ){ + guint32 total_freq; + m_merged_single_gram.get_total_freq(total_freq); + gfloat bigram_poss = freq / (gfloat) total_freq; + found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found; + } + } + + if (NO_CONSTRAINT == constraint->m_type) { + for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ + GArray * array = ranges[m]; + if ( !array ) continue; + + for ( size_t n = 0; n < array->len; ++n){ + PhraseIndexRange * range = + &g_array_index(array, PhraseIndexRange, n); + + g_array_set_size(bigram_phrase_items, 0); + m_merged_single_gram.search(range, bigram_phrase_items); + for( size_t k = 0; k < bigram_phrase_items->len; ++k) { + BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k); + found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found; + } + } + } + } + if (system) + delete system; + if (user) + delete user; + } + + g_array_free(bigram_phrase_items, TRUE); + return found; +} + + +bool PinyinLookup2::unigram_gen_next_step(int nstep, + lookup_value_t * cur_step, + phrase_token_t token) { + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) + m_phrase_index->get_phrase_index_total_freq(); + if ( elem_poss < DBL_EPSILON ) + return false; + + ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep; + gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys); + if (pinyin_poss < FLT_EPSILON ) + return false; + + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup2::bigram_gen_next_step(int nstep, + lookup_value_t * cur_step, + phrase_token_t token, + gfloat bigram_poss) { + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / + (gdouble) m_phrase_index->get_phrase_index_total_freq(); + if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) + return false; + + ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep; + gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys); + if ( pinyin_poss < FLT_EPSILON ) + return false; + + lookup_value_t next_step; + next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; + next_step.m_poss = cur_step->m_poss + + log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss); + next_step.m_last_step = nstep; + + return save_next_step(nstep + phrase_length, cur_step, &next_step); +} + +bool PinyinLookup2::save_next_step(int next_step_pos, + lookup_value_t * cur_step, + lookup_value_t * next_step){ + + lookup_key_t next_key = next_step->m_handles[1]; + LookupStepIndex next_lookup_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, next_step_pos); + LookupStepContent next_lookup_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, next_step_pos); + + gpointer key = NULL, value = NULL; + gboolean lookup_result = g_hash_table_lookup_extended + (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); + + if ( !lookup_result ){ + g_array_append_val(next_lookup_content, *next_step); + g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1)); + return true; + }else{ + size_t step_index = GPOINTER_TO_UINT(value); + lookup_value_t * orig_next_value = &g_array_index + (next_lookup_content, lookup_value_t, step_index); + + if ( orig_next_value->m_poss < next_step->m_poss) { + /* found better result. */ + orig_next_value->m_handles[0] = next_step->m_handles[0]; + assert(orig_next_value->m_handles[1] == next_step->m_handles[1]); + orig_next_value->m_poss = next_step->m_poss; + orig_next_value->m_last_step = next_step->m_last_step; + return true; + } + + return false; + } +} + +bool PinyinLookup2::final_step(MatchResults & results){ + + /* reset results */ + g_array_set_size(results, m_steps_content->len - 1); + for (size_t i = 0; i < results->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + *token = null_token; + } + + /* find max element */ + size_t last_step_pos = m_steps_content->len - 1; + GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos); + if ( last_step_array->len == 0 ) + return false; + + lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0); + for ( size_t i = 1; i < last_step_array->len; ++i){ + lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i); + if ( cur_value->m_poss > max_value->m_poss ) + max_value = cur_value; + } + + /* backtracing */ + while( true ){ + int cur_step_pos = max_value->m_last_step; + if ( -1 == cur_step_pos ) + break; + + phrase_token_t * token = &g_array_index + (results, phrase_token_t, cur_step_pos); + *token = max_value->m_handles[1]; + + phrase_token_t last_token = max_value->m_handles[0]; + LookupStepIndex lookup_step_index = (LookupStepIndex) + g_ptr_array_index(m_steps_index, cur_step_pos); + + gpointer key = NULL, value = NULL; + gboolean result = g_hash_table_lookup_extended + (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); + if (!result) + return false; + + LookupStepContent lookup_step_content = (LookupStepContent) + g_ptr_array_index(m_steps_content, cur_step_pos); + max_value = &g_array_index + (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); + } + + /* no need to reverse the result */ + return true; +} + + +bool PinyinLookup2::train_result2(ChewingKeyVector keys, + CandidateConstraints constraints, + MatchResults results) { + const guint32 initial_seed = 23 * 3; + const guint32 expand_factor = 2; + const guint32 unigram_factor = 7; + const guint32 pinyin_factor = 1; + const guint32 ceiling_seed = 23 * 15 * 64; + + /* begin training based on constraints and results. */ + bool train_next = false; + ChewingKey * pinyin_keys = (ChewingKey *) keys->data; + + phrase_token_t last_token = sentence_start; + /* constraints->len + 1 == results->len */ + for (size_t i = 0; i < constraints->len; ++i) { + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if (null_token == *token) + continue; + + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, i); + if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) { + if (CONSTRAINT_ONESTEP == constraint->m_type) { + assert(*token == constraint->m_token); + train_next = true; + } else { + train_next = false; + } + + guint32 seed = initial_seed; + /* train bi-gram first, and get train seed. */ + if (last_token) { + SingleGram * user = NULL; + m_user_bigram->load(last_token, user); + + guint32 total_freq = 0; + if (!user) { + user = new SingleGram; + } + assert(user->get_total_freq(total_freq)); + + guint32 freq = 0; + /* compute train factor */ + if (!user->get_freq(*token, freq)) { + assert(user->insert_freq(*token, 0)); + seed = initial_seed; + } else { + seed = std_lite::max(freq, initial_seed); + seed *= expand_factor; + seed = std_lite::min(seed, ceiling_seed); + } + + /* protect against total_freq overflow */ + if (seed > 0 && total_freq > total_freq + seed) + goto next; + + assert(user->set_total_freq(total_freq + seed)); + /* if total_freq is not overflow, then freq won't overflow. */ + assert(user->set_freq(*token, freq + seed)); + assert(m_user_bigram->store(last_token, user)); + next: + assert(NULL != user); + if (user) + delete user; + } + + /* train uni-gram */ + m_phrase_index->get_phrase_item(*token, m_cache_phrase_item); + m_cache_phrase_item.increase_pronunciation_possibility + (m_options, pinyin_keys + i, seed * pinyin_factor); + m_phrase_index->add_unigram_frequency + (*token, seed * unigram_factor); + } + last_token = *token; + } + return true; +} + + +int PinyinLookup2::add_constraint(CandidateConstraints constraints, + size_t index, + phrase_token_t token) { + + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return 0; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + if ( index + phrase_length > constraints->len ) + return 0; + + for (size_t i = index; i < index + phrase_length; ++i){ + clear_constraint(constraints, i); + } + + /* store one step constraint */ + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, index); + constraint->m_type = CONSTRAINT_ONESTEP; + constraint->m_token = token; + + /* propagate no search constraint */ + for (size_t i = 1; i < phrase_length; ++i){ + constraint = &g_array_index(constraints, lookup_constraint_t, index + i); + constraint->m_type = CONSTRAINT_NOSEARCH; + constraint->m_constraint_step = index; + } + + return phrase_length; +} + +bool PinyinLookup2::clear_constraint(CandidateConstraints constraints, + int index) { + if (index < 0 || index >= constraints->len) + return false; + + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, index); + + if (NO_CONSTRAINT == constraint->m_type) + return false; + + if (CONSTRAINT_NOSEARCH == constraint->m_type){ + index = constraint->m_constraint_step; + constraint = &g_array_index(constraints, lookup_constraint_t, index); + } + + /* now var constraint points to the one step constraint. */ + assert(constraint->m_type == CONSTRAINT_ONESTEP); + + phrase_token_t token = constraint->m_token; + if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) + return false; + + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + for ( size_t i = 0; i < phrase_length; ++i){ + if (index + i >= constraints->len) + continue; + + constraint = &g_array_index + (constraints, lookup_constraint_t, index + i); + constraint->m_type = NO_CONSTRAINT; + } + + return true; +} + +bool PinyinLookup2::validate_constraint(CandidateConstraints constraints, + ChewingKeyVector keys) { + /* resize constraints array first */ + size_t constraints_length = constraints->len; + + if ( keys->len > constraints_length ){ + g_array_set_size(constraints, keys->len); + + /* initialize new element */ + for( size_t i = constraints_length; i < keys->len; ++i){ + lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + }else if (keys->len < constraints_length ){ + /* just shrink it */ + g_array_set_size(constraints, keys->len); + } + + for ( size_t i = 0; i < constraints->len; ++i){ + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, i); + + /* handle one step constraint */ + if ( constraint->m_type == CONSTRAINT_ONESTEP ){ + + phrase_token_t token = constraint->m_token; + m_phrase_index->get_phrase_item(token, m_cache_phrase_item); + size_t phrase_length = m_cache_phrase_item.get_phrase_length(); + + /* clear too long constraint */ + if (i + phrase_length > constraints->len){ + clear_constraint(constraints, i); + continue; + } + + ChewingKey * pinyin_keys = (ChewingKey *)keys->data; + /* clear invalid pinyin */ + gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys + i); + if (pinyin_poss < FLT_EPSILON) + clear_constraint(constraints, i); + } + } + return true; +} diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h new file mode 100644 index 0000000..dbe15c9 --- /dev/null +++ b/src/lookup/pinyin_lookup2.h @@ -0,0 +1,240 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef PINYIN_LOOKUP2_H +#define PINYIN_LOOKUP2_H + + +#include <float.h> +#include <glib.h> +#include "novel_types.h" +#include "chewing_key.h" +#include "phrase_index.h" +#include "ngram.h" +#include "lookup.h" + + +namespace pinyin{ + +/** + * pinyin_lookup2.h + * + * The definitions of pinyin lookup related classes and structs. + * + */ + + + +enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH }; + +struct lookup_constraint_t{ + /* current type of the step */ + constraint_type m_type; + + /* Note: + * value of m_type: + * NO_CONSTRAINT: + * no values in the below union. + * search all possible next words. + * CONSTRAINT_ONESTEP: + * m_token contains the next word. + * only one word can be used to search for the next step, + * use case for user selected candidates. + * CONSTRAINT_NOSEARCH: + * m_constraint_step contains the value + * which points back to the CONSTRAINT_ONESTEP step. + * no search is allowed for the current step. + */ + + union{ + phrase_token_t m_token; + guint32 m_constraint_step; /* index of m_token */ + }; +}; + + +/** + * PinyinLookup2: + * + * The pinyin lookup class to convert pinyin keys to guessed sentence. + * + */ +class PinyinLookup2{ +private: + const gfloat bigram_lambda; + const gfloat unigram_lambda; + + PhraseItem m_cache_phrase_item; + SingleGram m_merged_single_gram; + +protected: + /* saved varibles */ + CandidateConstraints m_constraints; + ChewingKeyVector m_keys; + + pinyin_option_t m_options; + FacadeChewingTable * m_pinyin_table; + FacadePhraseIndex * m_phrase_index; + Bigram * m_system_bigram; + Bigram * m_user_bigram; + + /* internal step data structure */ + GPtrArray * m_steps_index; + /* Array of LookupStepIndex */ + GPtrArray * m_steps_content; + /* Array of LookupStepContent */ + + + bool search_unigram2(GPtrArray * topresults, int nstep, + PhraseIndexRanges ranges); + bool search_bigram2(GPtrArray * topresults, int nstep, + PhraseIndexRanges ranges); + + bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token); + bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss); + + bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step); + + bool final_step(MatchResults & results); + +public: + /** + * PinyinLookup2::PinyinLookup2: + * @lambda: the lambda parameter for interpolation model. + * @options: the pinyin options. + * @pinyin_table: the pinyin table. + * @phrase_index: the phrase index. + * @system_bigram: the system bi-gram. + * @user_bigram: the user bi-gram. + * + * The constructor of the PinyinLookup2. + * + */ + PinyinLookup2(const gfloat lambda, + pinyin_option_t options, + FacadeChewingTable * pinyin_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram); + + /** + * PinyinLookup2::~PinyinLookup2: + * + * The destructor of the PinyinLookup2. + * + */ + ~PinyinLookup2(); + + /** + * PinyinLookup2::set_options: + * @options: the pinyin options. + * @returns: whether the set operation is successful. + * + * Set the pinyin options. + * + */ + bool set_options(pinyin_option_t options) { + m_options = options; + return true; + } + + /** + * PinyinLookup2::get_best_match: + * @prefixes: the phrase tokens before the guessed sentence. + * @keys: the pinyin keys of the guessed sentence. + * @constraints: the constraints on the guessed sentence. + * @results: the guessed sentence in the form of the phrase tokens. + * @returns: whether the guess operation is successful. + * + * Guess the best sentence according to user inputs. + * + */ + bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results); + + /** + * PinyinLookup2::train_result2: + * @keys: the pinyin keys of the guessed sentence. + * @constraints: the constraints on the guessed sentence. + * @results: the guessed sentence in the form of the phrase tokens. + * @returns: whether the train operation is successful. + * + * Self learning the guessed sentence based on the constraints. + * + */ + bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results); + + /** + * PinyinLookup2::convert_to_utf8: + * @results: the guessed sentence in the form of the phrase tokens. + * @result_string: the guessed sentence in the utf8 encoding. + * @returns: whether the convert operation is successful. + * + * Convert the guessed sentence from the phrase tokens to the utf8 string. + * + */ + bool convert_to_utf8(MatchResults results, + /* out */ char * & result_string) + { + return pinyin::convert_to_utf8(m_phrase_index, results, + NULL, false, result_string); + } + + + /** + * PinyinLookup2::add_constraint: + * @constraints: the constraints on the guessed sentence. + * @index: the character offset in the guessed sentence. + * @token: the phrase token in the candidate list chosen by user. + * @returns: the number of the characters in the chosen token. + * + * Add one constraint to the constraints on the guessed sentence. + * + */ + int add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token); + + /** + * PinyinLookup2::clear_constraint: + * @constraints: the constraints on the guessed sentence. + * @index: the character offset in the guessed sentence. + * @returns: whether the clear operation is successful. + * + * Clear one constraint in the constraints on the guessed sentence. + * + */ + bool clear_constraint(CandidateConstraints constraints, int index); + + /** + * PinyinLookup2::validate_constraint: + * @constraints: the constraints on the guessed sentence. + * @keys: the pinyin keys of the guessed sentence. + * @returns: whether the validate operation is successful. + * + * Validate the old constraints with the new pinyin keys. + * + */ + bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys); + +}; + +}; + +#endif |