From ee9436f88eab4fcee2ade36bf847812ea241e689 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Fri, 7 Sep 2012 14:15:19 +0800 Subject: write pinyin_lookup2.h --- src/lookup/pinyin_lookup2.h | 236 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 src/lookup/pinyin_lookup2.h diff --git a/src/lookup/pinyin_lookup2.h b/src/lookup/pinyin_lookup2.h new file mode 100644 index 0000000..bf59f1c --- /dev/null +++ b/src/lookup/pinyin_lookup2.h @@ -0,0 +1,236 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef PINYIN_LOOKUP2_H +#define PINYIN_LOOKUP2_H + + +#include +#include +#include "novel_types.h" +#include "chewing_key.h" +#include "phrase_index.h" +#include "ngram.h" +#include "lookup.h" + + +namespace pinyin{ + +/** + * pinyin_lookup2.h + * + * The definitions of pinyin lookup related classes and structs. + * + */ + + + +enum constraint_type{NO_CONSTRAINT, CONSTRAINT_ONESTEP, CONSTRAINT_NOSEARCH }; + +struct lookup_constraint_t{ + /* current type of the step */ + constraint_type m_type; + + /* Note: + * value of m_type: + * NO_CONSTRAINT: + * no values in the below union. + * search all possible next words. + * CONSTRAINT_ONESTEP: + * m_token contains the next word. + * only one word can be used to search for the next step, + * use case for user selected candidates. + * CONSTRAINT_NOSEARCH: + * m_constraint_step contains the value + * which points back to the CONSTRAINT_ONESTEP step. + * no search is allowed for the current step. + */ + + union{ + phrase_token_t m_token; + guint32 m_constraint_step; /* index of m_token */ + }; +}; + + +/** + * PinyinLookup2: + * + * The pinyin lookup class to convert pinyin keys to guessed sentence. + * + */ +class PinyinLookup2{ +private: + static const gfloat bigram_lambda; + static const gfloat unigram_lambda; + + PhraseItem m_cache_phrase_item; + SingleGram m_merged_single_gram; + +protected: + /* saved varibles */ + CandidateConstraints m_constraints; + ChewingKeyVector m_keys; + + pinyin_option_t m_options; + FacadeChewingTable * m_pinyin_table; + FacadePhraseIndex * m_phrase_index; + Bigram * m_system_bigram; + Bigram * m_user_bigram; + + /* internal step data structure */ + GPtrArray * m_steps_index; + /* Array of LookupStepIndex */ + GPtrArray * m_steps_content; + /* Array of LookupStepContent */ + + + bool search_unigram2(GPtrArray * topresults, int nstep, int npinyin, + PhraseIndexRanges ranges); + bool search_bigram2(GPtrArray * topresults, int nstep, int npinyin, + PhraseIndexRanges ranges); + + bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token); + bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss); + + bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step); + + bool final_step(MatchResults & results); + +public: + /** + * PinyinLookup2::PinyinLookup2: + * @options: the pinyin options. + * @pinyin_table: the pinyin table. + * @phrase_index: the phrase index. + * @system_bigram: the system bi-gram. + * @user_bigram: the user bi-gram. + * + * The constructor of the PinyinLookup2. + * + */ + PinyinLookup2(pinyin_option_t options, FacadeChewingTable * pinyin_table, + FacadePhraseIndex * phrase_index, Bigram * system_bigram, + Bigram * user_bigram); + + /** + * PinyinLookup2::~PinyinLookup2: + * + * The destructor of the PinyinLookup2. + * + */ + ~PinyinLookup2(); + + /** + * PinyinLookup2::set_options: + * @options: the pinyin options. + * @returns: whether the set operation is successful. + * + * Set the pinyin options. + * + */ + bool set_options(pinyin_option_t options) { + m_options = options; + return true; + } + + /** + * PinyinLookup2::get_best_match: + * @prefixes: the phrase tokens before the guessed sentence. + * @keys: the pinyin keys of the guessed sentence. + * @constraints: the constraints on the guessed sentence. + * @results: the guessed sentence in the form of the phrase tokens. + * @returns: whether the guess operation is successful. + * + * Guess the best sentence according to user inputs. + * + */ + bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results); + + /** + * PinyinLookup2::train_result2: + * @keys: the pinyin keys of the guessed sentence. + * @constraints: the constraints on the guessed sentence. + * @results: the guessed sentence in the form of the phrase tokens. + * @returns: whether the train operation is successful. + * + * Self learning the guessed sentence based on the constraints. + * + */ + bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results); + + /** + * PinyinLookup2::convert_to_utf8: + * @results: the guessed sentence in the form of the phrase tokens. + * @result_string: the guessed sentence in the utf8 encoding. + * @returns: whether the convert operation is successful. + * + * Convert the guessed sentence from the phrase tokens to the utf8 string. + * + */ + bool convert_to_utf8(MatchResults results, + /* out */ char * & result_string) + { + return pinyin::convert_to_utf8(m_phrase_index, results, + NULL, result_string); + } + + + /** + * PinyinLookup2::add_constraint: + * @constraints: the constraints on the guessed sentence. + * @index: the character offset in the guessed sentence. + * @token: the phrase token in the candidate list chosen by user. + * @returns: the number of the characters in the chosen token. + * + * Add one constraint to the constraints on the guessed sentence. + * + */ + guint8 add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token); + + /** + * PinyinLookup2::clear_constraint: + * @constraints: the constraints on the guessed sentence. + * @index: the character offset in the guessed sentence. + * @returns: whether the clear operation is successful. + * + * Clear one constraint in the constraints on the guessed sentence. + * + */ + bool clear_constraint(CandidateConstraints constraints, size_t index); + + /** + * PinyinLookup2::validate_constraint: + * @constraints: the constraints on the guessed sentence. + * @keys: the pinyin keys of the guessed sentence. + * @returns: whether the validate operation is successful. + * + * Validate the old constraints with the new pinyin keys. + * + */ + bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys); + +}; + +}; + +#endif -- cgit