diff options
Diffstat (limited to 'src/lookup/phrase_lookup.h')
-rw-r--r-- | src/lookup/phrase_lookup.h | 142 |
1 files changed, 142 insertions, 0 deletions
diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h new file mode 100644 index 0000000..cf65692 --- /dev/null +++ b/src/lookup/phrase_lookup.h @@ -0,0 +1,142 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef PHRASE_LOOKUP_H +#define PHRASE_LOOKUP_H + +#include "novel_types.h" +#include "ngram.h" +#include "lookup.h" + +/** + * phrase_lookup.h + * + * The definitions of phrase lookup related classes and structs. + * + */ + +namespace pinyin{ + +/** + * PhraseLookup: + * + * The phrase lookup class to convert the sentence to phrase tokens. + * + */ +class PhraseLookup{ +private: + const gfloat bigram_lambda; + const gfloat unigram_lambda; + + PhraseItem m_cache_phrase_item; + SingleGram m_merged_single_gram; +protected: + //saved varibles + FacadePhraseTable2 * m_phrase_table; + FacadePhraseIndex * m_phrase_index; + Bigram * m_system_bigram; + Bigram * m_user_bigram; + + //internal step data structure + GPtrArray * m_steps_index; + /* Array of LookupStepIndex */ + GPtrArray * m_steps_content; + /* Array of LookupStepContent */ + + /* Saved sentence */ + int m_sentence_length; + ucs4_t * m_sentence; + +protected: + /* Explicitly search the next phrase, + * to avoid double phrase lookup as the next token has only one. + */ + bool search_unigram2(int nstep, PhraseTokens tokens); + bool search_bigram2(int nstep, PhraseTokens tokens); + + bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token); + bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss); + + bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step); + + bool final_step(MatchResults & results); +public: + /** + * PhraseLookup::PhraseLookup: + * @lambda: the lambda parameter for interpolation model. + * @phrase_table: the phrase table. + * @phrase_index: the phrase index. + * @system_bigram: the system bi-gram. + * @user_bigram: the user bi-gram. + * + * The constructor of the PhraseLookup. + * + */ + PhraseLookup(const gfloat lambda, + FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + Bigram * system_bigram, + Bigram * user_bigram); + + /** + * PhraseLookup::~PhraseLookup: + * + * The destructor of the PhraseLookup. + * + */ + ~PhraseLookup(); + + /** + * PhraseLookup::get_best_match: + * @sentence_length: the length of the sentence in ucs4 characters. + * @sentence: the ucs4 characters of the sentence. + * @results: the segmented sentence in the form of phrase tokens. + * @returns: whether the segment operation is successful. + * + * Segment the sentence into phrase tokens. + * + * Note: this method only accepts the characters in phrase large table. + * + */ + bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results); + + /** + * PhraseLookup::convert_to_utf8: + * @results: the guessed sentence in the form of phrase tokens. + * @result_string: the converted sentence in utf8 string. + * @returns: whether the convert operation is successful. + * + * Convert the sentence from phrase tokens to the utf8 string. + * + * Note: free the result_string by g_free. + * + */ + bool convert_to_utf8(MatchResults results, + /* out */ char * & result_string) + { + return pinyin::convert_to_utf8(m_phrase_index, results, + "\n", true, result_string); + } +}; + +}; + +#endif |