summaryrefslogtreecommitdiffstats
path: root/src/lookup/phrase_lookup.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/lookup/phrase_lookup.h')
-rw-r--r--src/lookup/phrase_lookup.h142
1 files changed, 142 insertions, 0 deletions
diff --git a/src/lookup/phrase_lookup.h b/src/lookup/phrase_lookup.h
new file mode 100644
index 0000000..cf65692
--- /dev/null
+++ b/src/lookup/phrase_lookup.h
@@ -0,0 +1,142 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef PHRASE_LOOKUP_H
+#define PHRASE_LOOKUP_H
+
+#include "novel_types.h"
+#include "ngram.h"
+#include "lookup.h"
+
+/**
+ * phrase_lookup.h
+ *
+ * The definitions of phrase lookup related classes and structs.
+ *
+ */
+
+namespace pinyin{
+
+/**
+ * PhraseLookup:
+ *
+ * The phrase lookup class to convert the sentence to phrase tokens.
+ *
+ */
+class PhraseLookup{
+private:
+ const gfloat bigram_lambda;
+ const gfloat unigram_lambda;
+
+ PhraseItem m_cache_phrase_item;
+ SingleGram m_merged_single_gram;
+protected:
+ //saved varibles
+ FacadePhraseTable2 * m_phrase_table;
+ FacadePhraseIndex * m_phrase_index;
+ Bigram * m_system_bigram;
+ Bigram * m_user_bigram;
+
+ //internal step data structure
+ GPtrArray * m_steps_index;
+ /* Array of LookupStepIndex */
+ GPtrArray * m_steps_content;
+ /* Array of LookupStepContent */
+
+ /* Saved sentence */
+ int m_sentence_length;
+ ucs4_t * m_sentence;
+
+protected:
+ /* Explicitly search the next phrase,
+ * to avoid double phrase lookup as the next token has only one.
+ */
+ bool search_unigram2(int nstep, PhraseTokens tokens);
+ bool search_bigram2(int nstep, PhraseTokens tokens);
+
+ bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
+ bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);
+
+ bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step);
+
+ bool final_step(MatchResults & results);
+public:
+ /**
+ * PhraseLookup::PhraseLookup:
+ * @lambda: the lambda parameter for interpolation model.
+ * @phrase_table: the phrase table.
+ * @phrase_index: the phrase index.
+ * @system_bigram: the system bi-gram.
+ * @user_bigram: the user bi-gram.
+ *
+ * The constructor of the PhraseLookup.
+ *
+ */
+ PhraseLookup(const gfloat lambda,
+ FacadePhraseTable2 * phrase_table,
+ FacadePhraseIndex * phrase_index,
+ Bigram * system_bigram,
+ Bigram * user_bigram);
+
+ /**
+ * PhraseLookup::~PhraseLookup:
+ *
+ * The destructor of the PhraseLookup.
+ *
+ */
+ ~PhraseLookup();
+
+ /**
+ * PhraseLookup::get_best_match:
+ * @sentence_length: the length of the sentence in ucs4 characters.
+ * @sentence: the ucs4 characters of the sentence.
+ * @results: the segmented sentence in the form of phrase tokens.
+ * @returns: whether the segment operation is successful.
+ *
+ * Segment the sentence into phrase tokens.
+ *
+ * Note: this method only accepts the characters in phrase large table.
+ *
+ */
+ bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results);
+
+ /**
+ * PhraseLookup::convert_to_utf8:
+ * @results: the guessed sentence in the form of phrase tokens.
+ * @result_string: the converted sentence in utf8 string.
+ * @returns: whether the convert operation is successful.
+ *
+ * Convert the sentence from phrase tokens to the utf8 string.
+ *
+ * Note: free the result_string by g_free.
+ *
+ */
+ bool convert_to_utf8(MatchResults results,
+ /* out */ char * & result_string)
+ {
+ return pinyin::convert_to_utf8(m_phrase_index, results,
+ "\n", true, result_string);
+ }
+};
+
+};
+
+#endif