From 669d5b63478a83df3938ae128f52dd3ef4fd7d2f Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 8 Dec 2011 16:55:49 +0800 Subject: port pinyin lookup --- src/lookup/lookup.cpp | 3 ++- src/lookup/lookup.h | 3 +++ src/lookup/pinyin_lookup.cpp | 38 ++++++++++++++++++-------------------- src/lookup/pinyin_lookup.h | 22 ++++++++++++++-------- 4 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/lookup/lookup.cpp b/src/lookup/lookup.cpp index 66278cd..a22c246 100644 --- a/src/lookup/lookup.cpp +++ b/src/lookup/lookup.cpp @@ -19,8 +19,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ -#include "phrase_index.h" + #include "lookup.h" +#include "phrase_index.h" namespace pinyin{ diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h index c5e814a..0fc590b 100644 --- a/src/lookup/lookup.h +++ b/src/lookup/lookup.h @@ -22,10 +22,13 @@ #ifndef LOOKUP_H #define LOOKUP_H + /** @file lookup.h * @brief the definitions of common lookup related classes and structs. */ +#include "novel_types.h" + namespace pinyin{ typedef phrase_token_t lookup_key_t; diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp index 8ce68af..dab4b96 100644 --- a/src/lookup/pinyin_lookup.cpp +++ b/src/lookup/pinyin_lookup.cpp @@ -19,16 +19,14 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#include "pinyin_lookup.h" #include #include #include "stl_lite.h" #include "novel_types.h" -#include "pinyin_base.h" -#include "pinyin_phrase.h" -#include "pinyin_large_table.h" -#include "phrase_index.h" +#include "pinyin_phrase2.h" #include "ngram.h" -#include "pinyin_lookup.h" #include "winner_tree.h" using namespace pinyin; @@ -36,12 +34,12 @@ using namespace pinyin; const gfloat PinyinLookup::bigram_lambda = LAMBDA_PARAMETER; const gfloat PinyinLookup::unigram_lambda = 1 - LAMBDA_PARAMETER; -PinyinLookup::PinyinLookup(PinyinCustomSettings * custom, - PinyinLargeTable * pinyin_table, +PinyinLookup::PinyinLookup(pinyin_option_t options, + ChewingLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * system_bigram, Bigram * user_bigram){ - m_custom = custom; + m_options = options; m_pinyin_table = pinyin_table; m_phrase_index = phrase_index; m_system_bigram = system_bigram; @@ -113,7 +111,7 @@ size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){ destroy_pinyin_lookup(*ranges); } - PinyinKey * pinyin_keys = (PinyinKey *)m_keys->data; + ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data; pinyin_keys += nstep; g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1); @@ -140,7 +138,7 @@ size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){ return m_table_cache->len - 1; } -bool PinyinLookup::get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){ +bool PinyinLookup::get_best_match(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results){ //g_array_set_size(results, 0); m_constraints = constraints; @@ -329,7 +327,7 @@ bool PinyinLookup::search_bigram(IBranchIterator * iter, bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){ - PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep; + ChewingKey * pinyinkeys = ((ChewingKey *)m_keys->data) + nstep; if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) return false; size_t phrase_length = m_cache_phrase_item.get_phrase_length(); @@ -337,7 +335,7 @@ bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, p m_phrase_index->get_phrase_index_total_freq(); if ( elem_poss < DBL_EPSILON ) return false; - gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys); + gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(m_options, pinyinkeys); if (pinyin_poss < FLT_EPSILON ) return false; lookup_value_t next_step; @@ -349,7 +347,7 @@ bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, p } bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){ - PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep; + ChewingKey * pinyinkeys = ((ChewingKey *)m_keys->data) + nstep; if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item)) return false; size_t phrase_length = m_cache_phrase_item.get_phrase_length(); @@ -358,7 +356,7 @@ bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, ph if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON ) return false; - gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys); + gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(m_options, pinyinkeys); if ( pinyin_poss < FLT_EPSILON ) return false; lookup_value_t next_step; @@ -442,9 +440,9 @@ bool PinyinLookup::final_step(MatchResults & results){ return true; } -bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){ +bool PinyinLookup::train_result(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results){ bool train_next = false; - PinyinKey * pinyin_keys = (PinyinKey *)keys->data; + ChewingKey * pinyin_keys = (ChewingKey *)keys->data; //TODO: verify the new training method. phrase_token_t last_token = sentence_start; // constraints->len + 1 == results->len @@ -464,7 +462,7 @@ bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints const //add pi-gram frequency //printf("i:%d\tlast_token:%d\ttoken:%d\n", i, last_token, *token); m_phrase_index->get_phrase_item(*token, m_cache_phrase_item); - m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor); + m_cache_phrase_item.increase_pinyin_possibility(m_options, pinyin_keys + i, train_factor); m_phrase_index->add_unigram_frequency(*token, train_factor * 10); if ( last_token ){ SingleGram * system, *user; @@ -553,7 +551,7 @@ bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t ind return true; } -bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys){ +bool PinyinLookup::validate_constraint(CandidateConstraints constraints, ChewingKeyVector m_parsed_keys){ //resize constraints array size_t constraints_length = constraints->len; if ( m_parsed_keys->len > constraints_length ){ @@ -567,7 +565,7 @@ bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinK g_array_set_size(constraints, m_parsed_keys->len); } - PinyinKey * pinyin_keys = (PinyinKey *)m_parsed_keys->data; + ChewingKey * pinyin_keys = (ChewingKey *)m_parsed_keys->data; for ( size_t i = 0; i < constraints->len; ++i){ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); @@ -581,7 +579,7 @@ bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinK continue; } //clear invalidated pinyin - gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyin_keys + i); + gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(m_options, pinyin_keys + i); if ( pinyin_poss < FLT_EPSILON ){ clear_constraint(constraints, i); } diff --git a/src/lookup/pinyin_lookup.h b/src/lookup/pinyin_lookup.h index dd28b4d..3211acf 100644 --- a/src/lookup/pinyin_lookup.h +++ b/src/lookup/pinyin_lookup.h @@ -22,12 +22,16 @@ #ifndef PINYIN_LOOKUP_H #define PINYIN_LOOKUP_H + #include #include #include "novel_types.h" -#include "pinyin_base.h" +#include "chewing_key.h" +#include "phrase_index.h" +#include "chewing_large_table.h" #include "lookup.h" + namespace pinyin{ class WinnerTree; @@ -84,11 +88,11 @@ private: protected: //saved varibles CandidateConstraints m_constraints; - PinyinKeyVector m_keys; + ChewingKeyVector m_keys; - PinyinLargeTable * m_pinyin_table; + ChewingLargeTable * m_pinyin_table; FacadePhraseIndex * m_phrase_index; - PinyinCustomSettings * m_custom; + pinyin_option_t m_options; Bigram * m_system_bigram; Bigram * m_user_bigram; @@ -118,13 +122,15 @@ protected: bool final_step(MatchResults & results); public: - PinyinLookup( PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * system_bigram, Bigram * user_bigram); + PinyinLookup(pinyin_option_t options, ChewingLargeTable * pinyin_table, + FacadePhraseIndex * phrase_index, Bigram * system_bigram, + Bigram * user_bigram); ~PinyinLookup(); - bool get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results); + bool get_best_match(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results); - bool train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results); + bool train_result(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results); bool convert_to_utf8(MatchResults results, /* out */ char * & result_string) @@ -138,7 +144,7 @@ public: bool clear_constraint(CandidateConstraints constraints, size_t index); - bool validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys); + bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector m_parsed_keys); /* init pinyin table lookup array */ bool prepare_pinyin_lookup(PhraseIndexRanges ranges); -- cgit