diff options
author | Peng Wu <alexepico@gmail.com> | 2017-08-21 17:39:27 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2017-08-21 17:39:27 +0800 |
commit | 8972c0f556f427c46b4e6960d00db6d4ec0e302e (patch) | |
tree | 5d46cfda898c791a74d23232fd5060bc4acdd07d /src/lookup | |
parent | b0217b44f0025d563def780109daff3fcfa0840f (diff) | |
download | libzhuyin-8972c0f556f427c46b4e6960d00db6d4ec0e302e.tar.gz libzhuyin-8972c0f556f427c46b4e6960d00db6d4ec0e302e.tar.xz libzhuyin-8972c0f556f427c46b4e6960d00db6d4ec0e302e.zip |
merged into libpinyin
Diffstat (limited to 'src/lookup')
-rw-r--r-- | src/lookup/CMakeLists.txt | 23 | ||||
-rw-r--r-- | src/lookup/Makefile.am | 36 | ||||
-rw-r--r-- | src/lookup/lookup.cpp | 73 | ||||
-rw-r--r-- | src/lookup/lookup.h | 79 | ||||
-rw-r--r-- | src/lookup/phrase_lookup.cpp | 434 | ||||
-rw-r--r-- | src/lookup/phrase_lookup.h | 142 | ||||
-rw-r--r-- | src/lookup/pinyin_lookup2.cpp | 730 | ||||
-rw-r--r-- | src/lookup/pinyin_lookup2.h | 240 |
8 files changed, 0 insertions, 1757 deletions
diff --git a/src/lookup/CMakeLists.txt b/src/lookup/CMakeLists.txt deleted file mode 100644 index 937b2cb..0000000 --- a/src/lookup/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -set( - CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" -) - -set( - LIBLOOKUP_SOURCES - pinyin_lookup2.cpp - phrase_lookup.cpp - lookup.cpp -) - -add_library( - lookup - STATIC - ${LIBLOOKUP_SOURCES} -) - -install( - FILES - ${LIBLOOKUP_HEADERS} - DESTINATION - ${DIR_INCLUDE_LIBPINYIN} -) diff --git a/src/lookup/Makefile.am b/src/lookup/Makefile.am deleted file mode 100644 index 00d7df4..0000000 --- a/src/lookup/Makefile.am +++ /dev/null @@ -1,36 +0,0 @@ -## Makefile.am -- Process this file with automake to produce Makefile.in -## Copyright (C) 2007 Peng Wu -## -## This program is free software; you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation; either version 2, or (at your option) -## any later version. -## -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with this program; if not, write to the Free Software -## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -MAINTAINERCLEANFILES = Makefile.in - -INCLUDES = -I$(top_srcdir)/src/include \ - -I$(top_srcdir)/src/storage \ - @GLIB2_CFLAGS@ - -noinst_HEADERS = lookup.h \ - pinyin_lookup2.h \ - phrase_lookup.h - -noinst_LTLIBRARIES = liblookup.la - -liblookup_la_CXXFLAGS = "-fPIC" - -liblookup_la_LDFLAGS = -static - -liblookup_la_SOURCES = pinyin_lookup2.cpp \ - phrase_lookup.cpp \ - lookup.cpp diff --git a/src/lookup/lookup.cpp b/src/lookup/lookup.cpp deleted file mode 100644 index c605f61..0000000 --- a/src/lookup/lookup.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* - * libzhuyin - * Library to deal with zhuyin. - * - * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - - -#include "lookup.h" -#include "phrase_index.h" - -namespace zhuyin{ - -bool convert_to_utf8(FacadePhraseIndex * phrase_index, - MatchResults match_results, - /* in */ const char * delimiter, - /* in */ bool show_tokens, - /* out */ char * & result_string){ - //init variables - if ( NULL == delimiter ) - delimiter = ""; - result_string = NULL; - - PhraseItem item; - - for ( size_t i = 0; i < match_results->len; ++i ){ - phrase_token_t token = g_array_index - (match_results, phrase_token_t, i); - if ( null_token == token ) - continue; - - phrase_index->get_phrase_item(token, item); - ucs4_t buffer[MAX_PHRASE_LENGTH]; - item.get_phrase_string(buffer); - - guint8 length = item.get_phrase_length(); - gchar * phrase = NULL; - char * tmp = NULL; - - if (show_tokens) { - tmp = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); - phrase = g_strdup_printf("%d %s", token, tmp); - g_free(tmp); - } else { - phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); - } - - tmp = result_string; - if ( NULL == result_string ) - result_string = g_strdup(phrase); - else - result_string = g_strconcat(result_string, delimiter, phrase, NULL); - g_free(phrase); - g_free(tmp); - } - return true; -} - -}; diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h deleted file mode 100644 index 952be6d..0000000 --- a/src/lookup/lookup.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * libzhuyin - * Library to deal with zhuyin. - * - * Copyright (C) 2006-2007 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#ifndef LOOKUP_H -#define LOOKUP_H - - -/** @file lookup.h - * @brief the definitions of common lookup related classes and structs. - */ - -#include "novel_types.h" -#include <limits.h> - -namespace zhuyin{ - -typedef phrase_token_t lookup_key_t; - -struct lookup_value_t{ - /* previous and current tokens of the node */ - phrase_token_t m_handles[2]; - /* maximum possibility of current node */ - gfloat m_poss; - /* trace back information for final step */ - gint32 m_last_step; - - lookup_value_t(gfloat poss = FLT_MAX){ - m_handles[0] = null_token; m_handles[1] = null_token; - m_poss = poss; - m_last_step = -1; - } -}; - - -class FacadePhraseIndex; - - -/* Note: - * LookupStepIndex: - * the main purpose of lookup step index is served for an index - * for lookup step content, which can quickly merge the same node - * with different possibilities, - * then only keep the highest value of the node. - * LookupStepContent: - * the place to store the lookup values of current step, - * and indexed by lookup step index. - * See also comments on lookup_value_t. - */ - -typedef GHashTable * LookupStepIndex; -/* Key: lookup_key_t, Value: int m, index to m_steps_content[i][m] */ -typedef GArray * LookupStepContent; /* array of lookup_value_t */ - -bool convert_to_utf8(FacadePhraseIndex * phrase_index, - MatchResults match_results, - /* in */ const char * delimiter, - /* in */ bool show_tokens, - /* out */ char * & result_string); - -}; -#endif diff --git a/src/lookup/phrase_lookup.cpp b/src/lookup/phrase_lookup.cpp deleted file mode 100644 index fd457a4..0000000 --- a/src/lookup/phrase_lookup.cpp +++ /dev/null @@ -1,434 +0,0 @@ -/* - * libzhuyin - * Library to deal with zhuyin. - * - * Copyright (C) 2010 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#include <math.h> -#include "stl_lite.h" -#include "novel_types.h" -#include "phrase_index.h" -#include "facade_phrase_table2.h" -#include "ngram.h" -#include "phrase_lookup.h" - -using namespace zhuyin; - - -/* -const gfloat PhraseLookup::bigram_lambda = lambda; -const gfloat PhraseLookup::unigram_lambda = 1 - lambda; -*/ - -static bool populate_prefixes(GPtrArray * steps_index, - GPtrArray * steps_content) { - - lookup_key_t initial_key = sentence_start; - lookup_value_t initial_value(log(1)); - initial_value.m_handles[1] = sentence_start; - - LookupStepContent initial_step_content = (LookupStepContent) - g_ptr_array_index(steps_content, 0); - g_array_append_val(initial_step_content, initial_value); - - LookupStepIndex initial_step_index = (LookupStepIndex) - g_ptr_array_index(steps_index, 0); - g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key), - GUINT_TO_POINTER(initial_step_content->len - 1)); - - return true; -} - -static bool init_steps(GPtrArray * steps_index, - GPtrArray * steps_content, - int nstep) { - - /* add null start step */ - g_ptr_array_set_size(steps_index, nstep); - g_ptr_array_set_size(steps_content, nstep); - - for ( int i = 0; i < nstep; ++i ){ - /* initialize steps_index */ - g_ptr_array_index(steps_index, i) = g_hash_table_new - (g_direct_hash, g_direct_equal); - /* initialize steps_content */ - g_ptr_array_index(steps_content, i) = g_array_new - (FALSE, FALSE, sizeof(lookup_value_t)); - } - - return true; -} - -static void clear_steps(GPtrArray * steps_index, - GPtrArray * steps_content){ - /* clear steps_index */ - for ( size_t i = 0; i < steps_index->len; ++i){ - GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i); - g_hash_table_destroy(table); - g_ptr_array_index(steps_index, i) = NULL; - } - - /* free steps_content */ - for ( size_t i = 0; i < steps_content->len; ++i){ - GArray * array = (GArray *) g_ptr_array_index(steps_content, i); - g_array_free(array, TRUE); - g_ptr_array_index(steps_content, i) = NULL; - } -} - -PhraseLookup::PhraseLookup(const gfloat lambda, - FacadePhraseTable2 * phrase_table, - FacadePhraseIndex * phrase_index, - Bigram * system_bigram, - Bigram * user_bigram) - : bigram_lambda(lambda), - unigram_lambda(1. - lambda) -{ - m_phrase_table = phrase_table; - m_phrase_index = phrase_index; - m_system_bigram = system_bigram; - m_user_bigram = user_bigram; - - m_steps_index = g_ptr_array_new(); - m_steps_content = g_ptr_array_new(); - - /* the member variables below are saved in get_best_match call. */ - m_sentence = NULL; - m_sentence_length = 0; -} - -PhraseLookup::~PhraseLookup(){ - clear_steps(m_steps_index, m_steps_content); - g_ptr_array_free(m_steps_index, TRUE); - g_ptr_array_free(m_steps_content, TRUE); -} - -bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[], - MatchResults & results){ - m_sentence_length = sentence_length; - m_sentence = sentence; - int nstep = m_sentence_length + 1; - - clear_steps(m_steps_index, m_steps_content); - - init_steps(m_steps_index, m_steps_content, nstep); - - populate_prefixes(m_steps_index, m_steps_content); - - PhraseTokens tokens; - memset(tokens, 0, sizeof(PhraseTokens)); - m_phrase_index->prepare_tokens(tokens); - - for ( int i = 0; i < nstep - 1; ++i ){ - for ( int m = i + 1; m < nstep; ++m ){ - - /* do one phrase table search. */ - int result = m_phrase_table->search(m - i, sentence + i, tokens); - - /* found next phrase */ - if ( result & SEARCH_OK ) { - search_bigram2(i, tokens), - search_unigram2(i, tokens); - } - - /* no longer phrase */ - if (!(result & SEARCH_CONTINUED)) - break; - } - } - - m_phrase_index->destroy_tokens(tokens); - - return final_step(results); -} - -#if 0 - -bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){ - - LookupStepContent lookup_content = (LookupStepContent) - g_ptr_array_index(m_steps_content, nstep); - if ( 0 == lookup_content->len ) - return false; - - lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0); - /* find the maximum node */ - for ( size_t i = 1; i < lookup_content->len; ++i ){ - lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i); - if ( cur_value->m_poss > max_value->m_poss ) - max_value = cur_value; - } - - return unigram_gen_next_step(nstep, max_value, token); -} - -bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){ - bool found = false; - - LookupStepContent lookup_content = (LookupStepContent) - g_ptr_array_index(m_steps_content, nstep); - if ( 0 == lookup_content->len ) - return false; - - for ( size_t i = 0; i < lookup_content->len; ++i ){ - lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i); - phrase_token_t index_token = cur_value->m_handles[1]; - SingleGram * system, * user; - m_system_bigram->load(index_token, system); - m_user_bigram->load(index_token, user); - - if ( !merge_single_gram(&m_merged_single_gram, system, user) ) - continue; - - guint32 freq; - if ( m_merged_single_gram.get_freq(token, freq) ){ - guint32 total_freq; - m_merged_single_gram.get_total_freq(total_freq); - gfloat bigram_poss = freq / (gfloat) total_freq; - found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; - } - - if (system) - delete system; - if (user) - delete user; - } - - return found; -} - -#endif - -bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){ - bool found = false; - - LookupStepContent lookup_content = (LookupStepContent) - g_ptr_array_index(m_steps_content, nstep); - if ( 0 == lookup_content->len ) - return found; - - /* find the maximum node */ - lookup_value_t * max_value = &g_array_index - (lookup_content, lookup_value_t, 0); - - for (size_t i = 1; i < lookup_content->len; ++i) { - lookup_value_t * cur_value = &g_array_index - (lookup_content, lookup_value_t, i); - if (cur_value->m_poss > max_value->m_poss) - max_value = cur_value; - } - - /* iterate over tokens */ - for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) { - GArray * array = tokens[n]; - if (NULL == array) - continue; - - /* just skip the loop when the length is zero. */ - for (size_t k = 0; k < array->len; ++k) { - phrase_token_t token = - g_array_index(array, phrase_token_t, k); - - found = unigram_gen_next_step - (nstep, max_value, token) || found; - } - } - - return found; -} - -bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){ - bool found = false; - - LookupStepContent lookup_content = (LookupStepContent) - g_ptr_array_index(m_steps_content, nstep); - if (0 == lookup_content->len) - return found; - - for (size_t i = 0; i < lookup_content->len; ++i) { - lookup_value_t * cur_value = &g_array_index - (lookup_content, lookup_value_t, i); - phrase_token_t index_token = cur_value->m_handles[1]; - - SingleGram * system = NULL, * user = NULL; - m_system_bigram->load(index_token, system); - m_user_bigram->load(index_token, user); - - if (!merge_single_gram - (&m_merged_single_gram, system, user)) - continue; - - /* iterate over tokens */ - for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) { - GArray * array = tokens[n]; - if (NULL == array) - continue; - - /* just skip the loop when the length is zero. */ - for (size_t k = 0; k < array->len; ++k) { - phrase_token_t token = - g_array_index(array, phrase_token_t, k); - - guint32 freq = 0; - if (m_merged_single_gram.get_freq(token, freq)) { - guint32 total_freq = 0; - m_merged_single_gram.get_total_freq(total_freq); - - gfloat bigram_poss = freq / (gfloat) total_freq; - found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found; - } - } - } - - if (system) - delete system; - if (user) - delete user; - } - - return found; -} - -bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value, -phrase_token_t token){ - - if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return false; - - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) - m_phrase_index->get_phrase_index_total_freq(); - if ( elem_poss < DBL_EPSILON ) - return false; - - lookup_value_t next_value; - next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token; - next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda); - next_value.m_last_step = nstep; - - return save_next_step(nstep + phrase_length, cur_value, &next_value); -} - -bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){ - - if ( m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return false; - - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / - (gdouble) m_phrase_index->get_phrase_index_total_freq(); - - if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) - return false; - - lookup_value_t next_value; - next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token; - next_value.m_poss = cur_value->m_poss + - log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss ); - next_value.m_last_step = nstep; - - return save_next_step(nstep + phrase_length, cur_value, &next_value); -} - -bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){ - - LookupStepIndex next_lookup_index = (LookupStepIndex) - g_ptr_array_index(m_steps_index, next_step_pos); - LookupStepContent next_lookup_content = (LookupStepContent) - g_ptr_array_index(m_steps_content, next_step_pos); - - lookup_key_t next_key = next_value->m_handles[1]; - - gpointer key = NULL, value = NULL; - gboolean lookup_result = g_hash_table_lookup_extended - (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); - - if (!lookup_result){ - g_array_append_val(next_lookup_content, *next_value); - g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), - GUINT_TO_POINTER(next_lookup_content->len - 1)); - return true; - }else{ - size_t step_index = GPOINTER_TO_UINT(value); - lookup_value_t * orig_next_value = &g_array_index - (next_lookup_content, lookup_value_t, step_index); - - if ( orig_next_value->m_poss < next_value->m_poss ){ - orig_next_value->m_handles[0] = next_value->m_handles[0]; - assert(orig_next_value->m_handles[1] == next_value->m_handles[1]); - orig_next_value->m_poss = next_value->m_poss; - orig_next_value->m_last_step = next_value->m_last_step; - return true; - } - return false; - } -} - -bool PhraseLookup::final_step(MatchResults & results ){ - - /* reset results */ - g_array_set_size(results, m_steps_content->len - 1); - for ( size_t i = 0; i < results->len; ++i ){ - phrase_token_t * token = &g_array_index(results, phrase_token_t, i); - *token = null_token; - } - - /* find max element */ - size_t last_step_pos = m_steps_content->len - 1; - LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index - (m_steps_content, last_step_pos); - if ( last_step_content->len == 0 ) - return false; - - lookup_value_t * max_value = &g_array_index - (last_step_content, lookup_value_t, 0); - for ( size_t i = 1; i < last_step_content->len; ++i ){ - lookup_value_t * cur_value = &g_array_index - (last_step_content, lookup_value_t, i); - if ( cur_value->m_poss > max_value->m_poss ) - max_value = cur_value; - } - - /* backtracing */ - while( true ){ - int cur_step_pos = max_value->m_last_step; - if ( -1 == cur_step_pos ) - break; - - phrase_token_t * token = &g_array_index - (results, phrase_token_t, cur_step_pos); - *token = max_value->m_handles[1]; - - phrase_token_t last_token = max_value->m_handles[0]; - LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos); - - gpointer key = NULL, value = NULL; - gboolean result = g_hash_table_lookup_extended - (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); - if ( !result ) - return false; - - LookupStepContent lookup_step_content = (LookupStepContent) - g_ptr_array_index(m_steps_content, cur_step_pos); - max_value = &g_array_index - (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); - } - - /* no need to reverse the result */ - return true; -} diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h deleted file mode 100644 index 6262380..0000000 --- a/src/lookup/phrase_lookup.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * libzhuyin - * Library to deal with zhuyin. - * - * Copyright (C) 2006-2007 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#ifndef PHRASE_LOOKUP_H -#define PHRASE_LOOKUP_H - -#include "novel_types.h" -#include "ngram.h" -#include "lookup.h" - -/** - * phrase_lookup.h - * - * The definitions of phrase lookup related classes and structs. - * - */ - -namespace zhuyin{ - -/** - * PhraseLookup: - * - * The phrase lookup class to convert the sentence to phrase tokens. - * - */ -class PhraseLookup{ -private: - const gfloat bigram_lambda; - const gfloat unigram_lambda; - - PhraseItem m_cache_phrase_item; - SingleGram m_merged_single_gram; -protected: - //saved varibles - FacadePhraseTable2 * m_phrase_table; - FacadePhraseIndex * m_phrase_index; - Bigram * m_system_bigram; - Bigram * m_user_bigram; - - //internal step data structure - GPtrArray * m_steps_index; - /* Array of LookupStepIndex */ - GPtrArray * m_steps_content; - /* Array of LookupStepContent */ - - /* Saved sentence */ - int m_sentence_length; - ucs4_t * m_sentence; - -protected: - /* Explicitly search the next phrase, - * to avoid double phrase lookup as the next token has only one. - */ - bool search_unigram2(int nstep, PhraseTokens tokens); - bool search_bigram2(int nstep, PhraseTokens tokens); - - bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token); - bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss); - - bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step); - - bool final_step(MatchResults & results); -public: - /** - * PhraseLookup::PhraseLookup: - * @lambda: the lambda parameter for interpolation model. - * @phrase_table: the phrase table. - * @phrase_index: the phrase index. - * @system_bigram: the system bi-gram. - * @user_bigram: the user bi-gram. - * - * The constructor of the PhraseLookup. - * - */ - PhraseLookup(const gfloat lambda, - FacadePhraseTable2 * phrase_table, - FacadePhraseIndex * phrase_index, - Bigram * system_bigram, - Bigram * user_bigram); - - /** - * PhraseLookup::~PhraseLookup: - * - * The destructor of the PhraseLookup. - * - */ - ~PhraseLookup(); - - /** - * PhraseLookup::get_best_match: - * @sentence_length: the length of the sentence in ucs4 characters. - * @sentence: the ucs4 characters of the sentence. - * @results: the segmented sentence in the form of phrase tokens. - * @returns: whether the segment operation is successful. - * - * Segment the sentence into phrase tokens. - * - * Note: this method only accepts the characters in phrase large table. - * - */ - bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results); - - /** - * PhraseLookup::convert_to_utf8: - * @results: the guessed sentence in the form of phrase tokens. - * @result_string: the converted sentence in utf8 string. - * @returns: whether the convert operation is successful. - * - * Convert the sentence from phrase tokens to the utf8 string. - * - * Note: free the result_string by g_free. - * - */ - bool convert_to_utf8(MatchResults results, - /* out */ char * & result_string) - { - return zhuyin::convert_to_utf8(m_phrase_index, results, - "\n", true, result_string); - } -}; - -}; - -#endif diff --git a/src/lookup/pinyin_lookup2.cpp b/src/lookup/pinyin_lookup2.cpp deleted file mode 100644 index 7f1f613..0000000 --- a/src/lookup/pinyin_lookup2.cpp +++ /dev/null @@ -1,730 +0,0 @@ -/* - * libzhuyin - * Library to deal with zhuyin. - * - * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#include <math.h> -#include "facade_chewing_table.h" -#include "pinyin_lookup2.h" -#include "stl_lite.h" - -using namespace zhuyin; - -/* -const gfloat PinyinLookup2::bigram_lambda = lambda; -const gfloat PinyinLookup2::unigram_lambda = 1 - lambda; -*/ - -/* internal definition */ -static const size_t nbeam = 32; - -static bool dump_max_value(GPtrArray * values){ - if (0 == values->len) - return false; - - const lookup_value_t * max = - (const lookup_value_t *) g_ptr_array_index(values, 0); - - for (size_t i = 1; i < values->len; ++i) { - const lookup_value_t * cur = - (const lookup_value_t *) g_ptr_array_index(values, i); - - if (cur->m_poss > max->m_poss) - max = cur; - } - - printf("max value: %f\n", max->m_poss); - - return true; -} - -static bool dump_all_values(GPtrArray * values) { - if (0 == values->len) - return false; - - printf("values:"); - for (size_t i = 0; i < values->len; ++i) { - const lookup_value_t * cur = - (const lookup_value_t *) g_ptr_array_index(values, i); - - printf("%f\t", cur->m_poss); - } - printf("\n"); - - return true; -} - -/* populate the candidates. */ -static bool populate_candidates(/* out */ GPtrArray * candidates, - /* in */ LookupStepContent step) { - g_ptr_array_set_size(candidates, 0); - - if (0 == step->len) - return false; - - for (size_t i = 0; i < step->len; ++i) { - lookup_value_t * value = &g_array_index - (step, lookup_value_t, i); - - g_ptr_array_add(candidates, value); - } - - /* dump_max_value(candidates); */ - - return true; -} - -static bool lookup_value_less_than(lookup_value_t * lhs, lookup_value_t * rhs){ - return lhs->m_poss < rhs->m_poss; -} - -/* use maximum heap to get the topest results. */ -static bool get_top_results(/* out */ GPtrArray * topresults, - /* in */ GPtrArray * candidates) { - g_ptr_array_set_size(topresults, 0); - - if (0 == candidates->len) - return false; - - lookup_value_t ** begin = - (lookup_value_t **) &g_ptr_array_index(candidates, 0); - lookup_value_t ** end = - (lookup_value_t **) &g_ptr_array_index(candidates, candidates->len); - - std_lite::make_heap(begin, end, lookup_value_less_than); - - while (end != begin) { - lookup_value_t * one = *begin; - g_ptr_array_add(topresults, one); - - std_lite::pop_heap(begin, end, lookup_value_less_than); - --end; - - if (topresults->len >= nbeam) - break; - } - - /* dump_all_values(topresults); */ - - return true; -} - -static bool populate_prefixes(GPtrArray * steps_index, - GPtrArray * steps_content, - TokenVector prefixes) { - assert(prefixes->len > 0); - - for (size_t i = 0; i < prefixes->len; ++i) { - phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); - lookup_key_t initial_key = token; - lookup_value_t initial_value(log(1)); - initial_value.m_handles[1] = token; - - LookupStepContent initial_step_content = (LookupStepContent) - g_ptr_array_index(steps_content, 0); - initial_step_content = g_array_append_val - (initial_step_content, initial_value); - - LookupStepIndex initial_step_index = (LookupStepIndex) - g_ptr_array_index(steps_index, 0); - g_hash_table_insert(initial_step_index, - GUINT_TO_POINTER(initial_key), - GUINT_TO_POINTER(initial_step_content->len - 1)); - } - - return true; -} - -static bool init_steps(GPtrArray * steps_index, - GPtrArray * steps_content, - int nstep){ - /* add null start step */ - g_ptr_array_set_size(steps_index, nstep); - g_ptr_array_set_size(steps_content, nstep); - - for (int i = 0; i < nstep; ++i) { - /* initialize steps_index */ - g_ptr_array_index(steps_index, i) = g_hash_table_new(g_direct_hash, g_direct_equal); - /* initialize steps_content */ - g_ptr_array_index(steps_content, i) = g_array_new(FALSE, FALSE, sizeof(lookup_value_t)); - } - - return true; -} - -static void clear_steps(GPtrArray * steps_index, GPtrArray * steps_content){ - /* clear steps_index */ - for ( size_t i = 0; i < steps_index->len; ++i){ - GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i); - g_hash_table_destroy(table); - g_ptr_array_index(steps_index, i) = NULL; - } - - /* clear steps_content */ - for ( size_t i = 0; i < steps_content->len; ++i){ - GArray * array = (GArray *) g_ptr_array_index(steps_content, i); - g_array_free(array, TRUE); - g_ptr_array_index(steps_content, i) = NULL; - } -} - - -PinyinLookup2::PinyinLookup2(const gfloat lambda, - pinyin_option_t options, - FacadeChewingTable * pinyin_table, - FacadePhraseIndex * phrase_index, - Bigram * system_bigram, - Bigram * user_bigram) - : bigram_lambda(lambda), - unigram_lambda(1. - lambda) -{ - m_options = options; - m_pinyin_table = pinyin_table; - m_phrase_index = phrase_index; - m_system_bigram = system_bigram; - m_user_bigram = user_bigram; - - m_steps_index = g_ptr_array_new(); - m_steps_content = g_ptr_array_new(); - - /* the member variables below are saved in get_best_match call. */ - m_keys = NULL; - m_constraints = NULL; -} - -PinyinLookup2::~PinyinLookup2(){ - clear_steps(m_steps_index, m_steps_content); - g_ptr_array_free(m_steps_index, TRUE); - g_ptr_array_free(m_steps_content, TRUE); -} - - -bool PinyinLookup2::get_best_match(TokenVector prefixes, - ChewingKeyVector keys, - CandidateConstraints constraints, - MatchResults & results){ - m_constraints = constraints; - m_keys = keys; - int nstep = keys->len + 1; - - clear_steps(m_steps_index, m_steps_content); - - init_steps(m_steps_index, m_steps_content, nstep); - - populate_prefixes(m_steps_index, m_steps_content, prefixes); - - PhraseIndexRanges ranges; - memset(ranges, 0, sizeof(PhraseIndexRanges)); - m_phrase_index->prepare_ranges(ranges); - - GPtrArray * candidates = g_ptr_array_new(); - GPtrArray * topresults = g_ptr_array_new(); - - /* begin the viterbi beam search. */ - for ( int i = 0; i < nstep - 1; ++i ){ - lookup_constraint_t * cur_constraint = &g_array_index - (m_constraints, lookup_constraint_t, i); - - if (CONSTRAINT_NOSEARCH == cur_constraint->m_type) - continue; - - LookupStepContent step = (LookupStepContent) - g_ptr_array_index(m_steps_content, i); - - populate_candidates(candidates, step); - get_top_results(topresults, candidates); - - if (0 == topresults->len) - continue; - - for ( int m = i + 1; m < nstep; ++m ){ - const int len = m - i; - if (len > MAX_PHRASE_LENGTH) - break; - - lookup_constraint_t * next_constraint = &g_array_index - (m_constraints, lookup_constraint_t, m - 1); - - if (CONSTRAINT_NOSEARCH == next_constraint->m_type) - break; - - ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data; - /* do one pinyin table search. */ - int result = m_pinyin_table->search(len, pinyin_keys + i, ranges); - - if (result & SEARCH_OK) { - /* assume topresults always contains items. */ - search_bigram2(topresults, i, ranges), - search_unigram2(topresults, i, ranges); - } - - /* poke the next constraint. */ - ++ next_constraint; - if (CONSTRAINT_ONESTEP == next_constraint->m_type) - break; - - /* no longer pinyin */ - if (!(result & SEARCH_CONTINUED)) - break; - } - } - - m_phrase_index->destroy_ranges(ranges); - - g_ptr_array_free(candidates, TRUE); - g_ptr_array_free(topresults, TRUE); - - return final_step(results); -} - -bool PinyinLookup2::search_unigram2(GPtrArray * topresults, int nstep, - PhraseIndexRanges ranges) { - - if (0 == topresults->len) - return false; - - lookup_value_t * max = (lookup_value_t *) - g_ptr_array_index(topresults, 0); - - lookup_constraint_t * constraint = - &g_array_index(m_constraints, lookup_constraint_t, nstep); - - if (CONSTRAINT_ONESTEP == constraint->m_type) { - return unigram_gen_next_step(nstep, max, constraint->m_token); - } - - bool found = false; - - if (NO_CONSTRAINT == constraint->m_type) { - for ( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ - GArray * array = ranges[m]; - if ( !array ) continue; - - for ( size_t n = 0; n < array->len; ++n){ - PhraseIndexRange * range = &g_array_index(array, PhraseIndexRange, n); - for ( phrase_token_t token = range->m_range_begin; - token != range->m_range_end; ++token){ - found = unigram_gen_next_step(nstep, max, token)|| found; - } - } - } - } - - return found; -} - -bool PinyinLookup2::search_bigram2(GPtrArray * topresults, int nstep, - PhraseIndexRanges ranges) { - - lookup_constraint_t * constraint = - &g_array_index(m_constraints, lookup_constraint_t, nstep); - - bool found = false; - BigramPhraseArray bigram_phrase_items = g_array_new - (FALSE, FALSE, sizeof(BigramPhraseItem)); - - for (size_t i = 0; i < topresults->len; ++i) { - lookup_value_t * value = (lookup_value_t *) - g_ptr_array_index(topresults, i); - - phrase_token_t index_token = value->m_handles[1]; - - SingleGram * system = NULL, * user = NULL; - m_system_bigram->load(index_token, system); - m_user_bigram->load(index_token, user); - - if ( !merge_single_gram(&m_merged_single_gram, system, user) ) - continue; - - if ( CONSTRAINT_ONESTEP == constraint->m_type ){ - phrase_token_t token = constraint->m_token; - - guint32 freq; - if( m_merged_single_gram.get_freq(token, freq) ){ - guint32 total_freq; - m_merged_single_gram.get_total_freq(total_freq); - gfloat bigram_poss = freq / (gfloat) total_freq; - found = bigram_gen_next_step(nstep, value, token, bigram_poss) || found; - } - } - - if (NO_CONSTRAINT == constraint->m_type) { - for( size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m){ - GArray * array = ranges[m]; - if ( !array ) continue; - - for ( size_t n = 0; n < array->len; ++n){ - PhraseIndexRange * range = - &g_array_index(array, PhraseIndexRange, n); - - g_array_set_size(bigram_phrase_items, 0); - m_merged_single_gram.search(range, bigram_phrase_items); - for( size_t k = 0; k < bigram_phrase_items->len; ++k) { - BigramPhraseItem * item = &g_array_index(bigram_phrase_items, BigramPhraseItem, k); - found = bigram_gen_next_step(nstep, value, item->m_token, item->m_freq) || found; - } - } - } - } - if (system) - delete system; - if (user) - delete user; - } - - g_array_free(bigram_phrase_items, TRUE); - return found; -} - - -bool PinyinLookup2::unigram_gen_next_step(int nstep, - lookup_value_t * cur_step, - phrase_token_t token) { - - if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return false; - - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - gdouble elem_poss = m_cache_phrase_item.get_unigram_frequency() / (gdouble) - m_phrase_index->get_phrase_index_total_freq(); - if ( elem_poss < DBL_EPSILON ) - return false; - - ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep; - gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys); - if (pinyin_poss < FLT_EPSILON ) - return false; - - lookup_value_t next_step; - next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; - next_step.m_poss = cur_step->m_poss + log(elem_poss * pinyin_poss * unigram_lambda); - next_step.m_last_step = nstep; - - return save_next_step(nstep + phrase_length, cur_step, &next_step); -} - -bool PinyinLookup2::bigram_gen_next_step(int nstep, - lookup_value_t * cur_step, - phrase_token_t token, - gfloat bigram_poss) { - - if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return false; - - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - gdouble unigram_poss = m_cache_phrase_item.get_unigram_frequency() / - (gdouble) m_phrase_index->get_phrase_index_total_freq(); - if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) - return false; - - ChewingKey * pinyin_keys = ((ChewingKey *)m_keys->data) + nstep; - gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys); - if ( pinyin_poss < FLT_EPSILON ) - return false; - - lookup_value_t next_step; - next_step.m_handles[0] = cur_step->m_handles[1]; next_step.m_handles[1] = token; - next_step.m_poss = cur_step->m_poss + - log((bigram_lambda * bigram_poss + unigram_lambda * unigram_poss) * pinyin_poss); - next_step.m_last_step = nstep; - - return save_next_step(nstep + phrase_length, cur_step, &next_step); -} - -bool PinyinLookup2::save_next_step(int next_step_pos, - lookup_value_t * cur_step, - lookup_value_t * next_step){ - - lookup_key_t next_key = next_step->m_handles[1]; - LookupStepIndex next_lookup_index = (LookupStepIndex) - g_ptr_array_index(m_steps_index, next_step_pos); - LookupStepContent next_lookup_content = (LookupStepContent) - g_ptr_array_index(m_steps_content, next_step_pos); - - gpointer key = NULL, value = NULL; - gboolean lookup_result = g_hash_table_lookup_extended - (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value); - - if ( !lookup_result ){ - g_array_append_val(next_lookup_content, *next_step); - g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key), GUINT_TO_POINTER(next_lookup_content->len - 1)); - return true; - }else{ - size_t step_index = GPOINTER_TO_UINT(value); - lookup_value_t * orig_next_value = &g_array_index - (next_lookup_content, lookup_value_t, step_index); - - if ( orig_next_value->m_poss < next_step->m_poss) { - /* found better result. */ - orig_next_value->m_handles[0] = next_step->m_handles[0]; - assert(orig_next_value->m_handles[1] == next_step->m_handles[1]); - orig_next_value->m_poss = next_step->m_poss; - orig_next_value->m_last_step = next_step->m_last_step; - return true; - } - - return false; - } -} - -bool PinyinLookup2::final_step(MatchResults & results){ - - /* reset results */ - g_array_set_size(results, m_steps_content->len - 1); - for (size_t i = 0; i < results->len; ++i){ - phrase_token_t * token = &g_array_index(results, phrase_token_t, i); - *token = null_token; - } - - /* find max element */ - size_t last_step_pos = m_steps_content->len - 1; - GArray * last_step_array = (GArray *)g_ptr_array_index(m_steps_content, last_step_pos); - if ( last_step_array->len == 0 ) - return false; - - lookup_value_t * max_value = &g_array_index(last_step_array, lookup_value_t, 0); - for ( size_t i = 1; i < last_step_array->len; ++i){ - lookup_value_t * cur_value = &g_array_index(last_step_array, lookup_value_t, i); - if ( cur_value->m_poss > max_value->m_poss ) - max_value = cur_value; - } - - /* backtracing */ - while( true ){ - int cur_step_pos = max_value->m_last_step; - if ( -1 == cur_step_pos ) - break; - - phrase_token_t * token = &g_array_index - (results, phrase_token_t, cur_step_pos); - *token = max_value->m_handles[1]; - - phrase_token_t last_token = max_value->m_handles[0]; - LookupStepIndex lookup_step_index = (LookupStepIndex) - g_ptr_array_index(m_steps_index, cur_step_pos); - - gpointer key = NULL, value = NULL; - gboolean result = g_hash_table_lookup_extended - (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value); - if (!result) - return false; - - LookupStepContent lookup_step_content = (LookupStepContent) - g_ptr_array_index(m_steps_content, cur_step_pos); - max_value = &g_array_index - (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value)); - } - - /* no need to reverse the result */ - return true; -} - - -bool PinyinLookup2::train_result2(ChewingKeyVector keys, - CandidateConstraints constraints, - MatchResults results) { - const guint32 initial_seed = 23 * 3; - const guint32 expand_factor = 2; - const guint32 unigram_factor = 7; - const guint32 pinyin_factor = 1; - const guint32 ceiling_seed = 23 * 15 * 64; - - /* begin training based on constraints and results. */ - bool train_next = false; - ChewingKey * pinyin_keys = (ChewingKey *) keys->data; - - phrase_token_t last_token = sentence_start; - /* constraints->len + 1 == results->len */ - for (size_t i = 0; i < constraints->len; ++i) { - phrase_token_t * token = &g_array_index(results, phrase_token_t, i); - if (null_token == *token) - continue; - - lookup_constraint_t * constraint = &g_array_index - (constraints, lookup_constraint_t, i); - if (train_next || CONSTRAINT_ONESTEP == constraint->m_type) { - if (CONSTRAINT_ONESTEP == constraint->m_type) { - assert(*token == constraint->m_token); - train_next = true; - } else { - train_next = false; - } - - guint32 seed = initial_seed; - /* train bi-gram first, and get train seed. */ - if (last_token) { - SingleGram * user = NULL; - m_user_bigram->load(last_token, user); - - guint32 total_freq = 0; - if (!user) { - user = new SingleGram; - } - assert(user->get_total_freq(total_freq)); - - guint32 freq = 0; - /* compute train factor */ - if (!user->get_freq(*token, freq)) { - assert(user->insert_freq(*token, 0)); - seed = initial_seed; - } else { - seed = std_lite::max(freq, initial_seed); - seed *= expand_factor; - seed = std_lite::min(seed, ceiling_seed); - } - - /* protect against total_freq overflow */ - if (seed > 0 && total_freq > total_freq + seed) - goto next; - - assert(user->set_total_freq(total_freq + seed)); - /* if total_freq is not overflow, then freq won't overflow. */ - assert(user->set_freq(*token, freq + seed)); - assert(m_user_bigram->store(last_token, user)); - next: - assert(NULL != user); - if (user) - delete user; - } - - /* train uni-gram */ - m_phrase_index->get_phrase_item(*token, m_cache_phrase_item); - m_cache_phrase_item.increase_pronunciation_possibility - (m_options, pinyin_keys + i, seed * pinyin_factor); - m_phrase_index->add_unigram_frequency - (*token, seed * unigram_factor); - } - last_token = *token; - } - return true; -} - - -int PinyinLookup2::add_constraint(CandidateConstraints constraints, - size_t index, - phrase_token_t token) { - - if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return 0; - - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - if ( index + phrase_length > constraints->len ) - return 0; - - for (size_t i = index; i < index + phrase_length; ++i){ - clear_constraint(constraints, i); - } - - /* store one step constraint */ - lookup_constraint_t * constraint = &g_array_index - (constraints, lookup_constraint_t, index); - constraint->m_type = CONSTRAINT_ONESTEP; - constraint->m_token = token; - - /* propagate no search constraint */ - for (size_t i = 1; i < phrase_length; ++i){ - constraint = &g_array_index(constraints, lookup_constraint_t, index + i); - constraint->m_type = CONSTRAINT_NOSEARCH; - constraint->m_constraint_step = index; - } - - return phrase_length; -} - -bool PinyinLookup2::clear_constraint(CandidateConstraints constraints, - int index) { - if (index < 0 || index >= constraints->len) - return false; - - lookup_constraint_t * constraint = &g_array_index - (constraints, lookup_constraint_t, index); - - if (NO_CONSTRAINT == constraint->m_type) - return false; - - if (CONSTRAINT_NOSEARCH == constraint->m_type){ - index = constraint->m_constraint_step; - constraint = &g_array_index(constraints, lookup_constraint_t, index); - } - - /* now var constraint points to the one step constraint. */ - assert(constraint->m_type == CONSTRAINT_ONESTEP); - - phrase_token_t token = constraint->m_token; - if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) - return false; - - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - for ( size_t i = 0; i < phrase_length; ++i){ - if (index + i >= constraints->len) - continue; - - constraint = &g_array_index - (constraints, lookup_constraint_t, index + i); - constraint->m_type = NO_CONSTRAINT; - } - - return true; -} - -bool PinyinLookup2::validate_constraint(CandidateConstraints constraints, - ChewingKeyVector keys) { - /* resize constraints array first */ - size_t constraints_length = constraints->len; - - if ( keys->len > constraints_length ){ - g_array_set_size(constraints, keys->len); - - /* initialize new element */ - for( size_t i = constraints_length; i < keys->len; ++i){ - lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); - constraint->m_type = NO_CONSTRAINT; - } - - }else if (keys->len < constraints_length ){ - /* just shrink it */ - g_array_set_size(constraints, keys->len); - } - - for ( size_t i = 0; i < constraints->len; ++i){ - lookup_constraint_t * constraint = &g_array_index - (constraints, lookup_constraint_t, i); - - /* handle one step constraint */ - if ( constraint->m_type == CONSTRAINT_ONESTEP ){ - - phrase_token_t token = constraint->m_token; - m_phrase_index->get_phrase_item(token, m_cache_phrase_item); - size_t phrase_length = m_cache_phrase_item.get_phrase_length(); - - /* clear too long constraint */ - if (i + phrase_length > constraints->len){ - clear_constraint(constraints, i); - continue; - } - - ChewingKey * pinyin_keys = (ChewingKey *)keys->data; - /* clear invalid pinyin */ - gfloat pinyin_poss = m_cache_phrase_item.get_pronunciation_possibility(m_options, pinyin_keys + i); - if (pinyin_poss < FLT_EPSILON) - clear_constraint(constraints, i); - } - } - return true; -} diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h deleted file mode 100644 index a05ccf7..0000000 --- a/src/lookup/pinyin_lookup2.h +++ /dev/null @@ -1,240 +0,0 @@ -/* - * libzhuyin - * Library to deal with zhuyin. - * - * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - - -#ifndef PINYIN_LOOKUP2_H -#define PINYIN_LOOKUP2_H - - -#include <float.h> -#include <glib.h> -#include "novel_types.h" -#include "chewing_key.h" -#include "phrase_index.h" -#include "ngram.h" -#include "lookup.h" - - -namespace zhuyin{ - -/** - * pinyin_lookup2.h - * - * The definitions of pinyin lookup related classes and structs. - * - */ - - - -enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH }; - -struct lookup_constraint_t{ - /* current type of the step */ - constraint_type m_type; - - /* Note: - * value of m_type: - * NO_CONSTRAINT: - * no values in the below union. - * search all possible next words. - * CONSTRAINT_ONESTEP: - * m_token contains the next word. - * only one word can be used to search for the next step, - * use case for user selected candidates. - * CONSTRAINT_NOSEARCH: - * m_constraint_step contains the value - * which points back to the CONSTRAINT_ONESTEP step. - * no search is allowed for the current step. - */ - - union{ - phrase_token_t m_token; - guint32 m_constraint_step; /* index of m_token */ - }; -}; - - -/** - * PinyinLookup2: - * - * The pinyin lookup class to convert pinyin keys to guessed sentence. - * - */ -class PinyinLookup2{ -private: - const gfloat bigram_lambda; - const gfloat unigram_lambda; - - PhraseItem m_cache_phrase_item; - SingleGram m_merged_single_gram; - -protected: - /* saved varibles */ - CandidateConstraints m_constraints; - ChewingKeyVector m_keys; - - pinyin_option_t m_options; - FacadeChewingTable * m_pinyin_table; - FacadePhraseIndex * m_phrase_index; - Bigram * m_system_bigram; - Bigram * m_user_bigram; - - /* internal step data structure */ - GPtrArray * m_steps_index; - /* Array of LookupStepIndex */ - GPtrArray * m_steps_content; - /* Array of LookupStepContent */ - - - bool search_unigram2(GPtrArray * topresults, int nstep, - PhraseIndexRanges ranges); - bool search_bigram2(GPtrArray * topresults, int nstep, - PhraseIndexRanges ranges); - - bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token); - bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss); - - bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step); - - bool final_step(MatchResults & results); - -public: - /** - * PinyinLookup2::PinyinLookup2: - * @lambda: the lambda parameter for interpolation model. - * @options: the pinyin options. - * @pinyin_table: the pinyin table. - * @phrase_index: the phrase index. - * @system_bigram: the system bi-gram. - * @user_bigram: the user bi-gram. - * - * The constructor of the PinyinLookup2. - * - */ - PinyinLookup2(const gfloat lambda, - pinyin_option_t options, - FacadeChewingTable * pinyin_table, - FacadePhraseIndex * phrase_index, - Bigram * system_bigram, - Bigram * user_bigram); - - /** - * PinyinLookup2::~PinyinLookup2: - * - * The destructor of the PinyinLookup2. - * - */ - ~PinyinLookup2(); - - /** - * PinyinLookup2::set_options: - * @options: the pinyin options. - * @returns: whether the set operation is successful. - * - * Set the pinyin options. - * - */ - bool set_options(pinyin_option_t options) { - m_options = options; - return true; - } - - /** - * PinyinLookup2::get_best_match: - * @prefixes: the phrase tokens before the guessed sentence. - * @keys: the pinyin keys of the guessed sentence. - * @constraints: the constraints on the guessed sentence. - * @results: the guessed sentence in the form of the phrase tokens. - * @returns: whether the guess operation is successful. - * - * Guess the best sentence according to user inputs. - * - */ - bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results); - - /** - * PinyinLookup2::train_result2: - * @keys: the pinyin keys of the guessed sentence. - * @constraints: the constraints on the guessed sentence. - * @results: the guessed sentence in the form of the phrase tokens. - * @returns: whether the train operation is successful. - * - * Self learning the guessed sentence based on the constraints. - * - */ - bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results); - - /** - * PinyinLookup2::convert_to_utf8: - * @results: the guessed sentence in the form of the phrase tokens. - * @result_string: the guessed sentence in the utf8 encoding. - * @returns: whether the convert operation is successful. - * - * Convert the guessed sentence from the phrase tokens to the utf8 string. - * - */ - bool convert_to_utf8(MatchResults results, - /* out */ char * & result_string) - { - return zhuyin::convert_to_utf8(m_phrase_index, results, - NULL, false, result_string); - } - - - /** - * PinyinLookup2::add_constraint: - * @constraints: the constraints on the guessed sentence. - * @index: the character offset in the guessed sentence. - * @token: the phrase token in the candidate list chosen by user. - * @returns: the number of the characters in the chosen token. - * - * Add one constraint to the constraints on the guessed sentence. - * - */ - int add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token); - - /** - * PinyinLookup2::clear_constraint: - * @constraints: the constraints on the guessed sentence. - * @index: the character offset in the guessed sentence. - * @returns: whether the clear operation is successful. - * - * Clear one constraint in the constraints on the guessed sentence. - * - */ - bool clear_constraint(CandidateConstraints constraints, int index); - - /** - * PinyinLookup2::validate_constraint: - * @constraints: the constraints on the guessed sentence. - * @keys: the pinyin keys of the guessed sentence. - * @returns: whether the validate operation is successful. - * - * Validate the old constraints with the new pinyin keys. - * - */ - bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys); - -}; - -}; - -#endif |