summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2011-12-08 16:55:49 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-12-09 13:34:47 +0800
commit: 669d5b63478a83df3938ae128f52dd3ef4fd7d2f (patch)
tree: 1d3671aeafed52f11054b75f93fcd3b886978829
parent: fcd6aa844697457b3b5e8d38c485d57f92789ea9 (diff)
download: libpinyin-669d5b63478a83df3938ae128f52dd3ef4fd7d2f.tar.gz
download: libpinyin-669d5b63478a83df3938ae128f52dd3ef4fd7d2f.tar.xz
download: libpinyin-669d5b63478a83df3938ae128f52dd3ef4fd7d2f.zip
port pinyin lookup
-rw-r--r-- src/lookup/lookup.cpp 3
-rw-r--r-- src/lookup/lookup.h 3
-rw-r--r-- src/lookup/pinyin_lookup.cpp 38
-rw-r--r-- src/lookup/pinyin_lookup.h 22
4 files changed, 37 insertions, 29 deletions
diff --git a/src/lookup/lookup.cpp b/src/lookup/lookup.cpp
index 66278cd..a22c246 100644
--- a/src/lookup/lookup.cpp
+++ b/src/lookup/lookup.cpp
@@ -19,8 +19,9 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include "phrase_index.h"
+
#include "lookup.h"
+#include "phrase_index.h"
namespace pinyin{
diff --git a/src/lookup/lookup.h b/src/lookup/lookup.h
index c5e814a..0fc590b 100644
--- a/src/lookup/lookup.h
+++ b/src/lookup/lookup.h
@@ -22,10 +22,13 @@
#ifndef LOOKUP_H
#define LOOKUP_H
+
/** @file lookup.h
* @brief the definitions of common lookup related classes and structs.
*/
+#include "novel_types.h"
+
namespace pinyin{
typedef phrase_token_t lookup_key_t;
diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
index 8ce68af..dab4b96 100644
--- a/src/lookup/pinyin_lookup.cpp
+++ b/src/lookup/pinyin_lookup.cpp
@@ -19,16 +19,14 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
+
+#include "pinyin_lookup.h"
#include <math.h>
#include <assert.h>
#include "stl_lite.h"
#include "novel_types.h"
-#include "pinyin_base.h"
-#include "pinyin_phrase.h"
-#include "pinyin_large_table.h"
-#include "phrase_index.h"
+#include "pinyin_phrase2.h"
#include "ngram.h"
-#include "pinyin_lookup.h"
#include "winner_tree.h"
using namespace pinyin;
@@ -36,12 +34,12 @@ using namespace pinyin;
const gfloat PinyinLookup::bigram_lambda = LAMBDA_PARAMETER;
const gfloat PinyinLookup::unigram_lambda = 1 - LAMBDA_PARAMETER;
-PinyinLookup::PinyinLookup(PinyinCustomSettings * custom,
- PinyinLargeTable * pinyin_table,
+PinyinLookup::PinyinLookup(pinyin_option_t options,
+ ChewingLargeTable * pinyin_table,
FacadePhraseIndex * phrase_index,
Bigram * system_bigram,
Bigram * user_bigram){
- m_custom = custom;
+ m_options = options;
m_pinyin_table = pinyin_table;
m_phrase_index = phrase_index;
m_system_bigram = system_bigram;
@@ -113,7 +111,7 @@ size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){
destroy_pinyin_lookup(*ranges);
}
- PinyinKey * pinyin_keys = (PinyinKey *)m_keys->data;
+ ChewingKey * pinyin_keys = (ChewingKey *)m_keys->data;
pinyin_keys += nstep;
g_array_set_size(m_table_cache, MAX_PHRASE_LENGTH + 1);
@@ -140,7 +138,7 @@ size_t PinyinLookup::prepare_table_cache(int nstep, int total_pinyin){
return m_table_cache->len - 1;
}
-bool PinyinLookup::get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){
+bool PinyinLookup::get_best_match(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results){
//g_array_set_size(results, 0);
m_constraints = constraints;
@@ -329,7 +327,7 @@ bool PinyinLookup::search_bigram(IBranchIterator * iter,
bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token){
- PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep;
+ ChewingKey * pinyinkeys = ((ChewingKey *)m_keys->data) + nstep;
if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
return false;
size_t phrase_length = m_cache_phrase_item.get_phrase_length();
@@ -337,7 +335,7 @@ bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, p
m_phrase_index->get_phrase_index_total_freq();
if ( elem_poss < DBL_EPSILON )
return false;
- gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys);
+ gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(m_options, pinyinkeys);
if (pinyin_poss < FLT_EPSILON )
return false;
lookup_value_t next_step;
@@ -349,7 +347,7 @@ bool PinyinLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_step, p
}
bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss){
- PinyinKey * pinyinkeys = ((PinyinKey *)m_keys->data) + nstep;
+ ChewingKey * pinyinkeys = ((ChewingKey *)m_keys->data) + nstep;
if (m_phrase_index->get_phrase_item(token, m_cache_phrase_item))
return false;
size_t phrase_length = m_cache_phrase_item.get_phrase_length();
@@ -358,7 +356,7 @@ bool PinyinLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_step, ph
if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
return false;
- gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyinkeys);
+ gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(m_options, pinyinkeys);
if ( pinyin_poss < FLT_EPSILON )
return false;
lookup_value_t next_step;
@@ -442,9 +440,9 @@ bool PinyinLookup::final_step(MatchResults & results){
return true;
}
-bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results){
+bool PinyinLookup::train_result(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results){
bool train_next = false;
- PinyinKey * pinyin_keys = (PinyinKey *)keys->data;
+ ChewingKey * pinyin_keys = (ChewingKey *)keys->data;
//TODO: verify the new training method.
phrase_token_t last_token = sentence_start;
// constraints->len + 1 == results->len
@@ -464,7 +462,7 @@ bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints const
//add pi-gram frequency
//printf("i:%d\tlast_token:%d\ttoken:%d\n", i, last_token, *token);
m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
- m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor);
+ m_cache_phrase_item.increase_pinyin_possibility(m_options, pinyin_keys + i, train_factor);
m_phrase_index->add_unigram_frequency(*token, train_factor * 10);
if ( last_token ){
SingleGram * system, *user;
@@ -553,7 +551,7 @@ bool PinyinLookup::clear_constraint(CandidateConstraints constraints, size_t ind
return true;
}
-bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys){
+bool PinyinLookup::validate_constraint(CandidateConstraints constraints, ChewingKeyVector m_parsed_keys){
//resize constraints array
size_t constraints_length = constraints->len;
if ( m_parsed_keys->len > constraints_length ){
@@ -567,7 +565,7 @@ bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinK
g_array_set_size(constraints, m_parsed_keys->len);
}
- PinyinKey * pinyin_keys = (PinyinKey *)m_parsed_keys->data;
+ ChewingKey * pinyin_keys = (ChewingKey *)m_parsed_keys->data;
for ( size_t i = 0; i < constraints->len; ++i){
lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
@@ -581,7 +579,7 @@ bool PinyinLookup::validate_constraint(CandidateConstraints constraints, PinyinK
continue;
}
//clear invalidated pinyin
- gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(*m_custom, pinyin_keys + i);
+ gfloat pinyin_poss = m_cache_phrase_item.get_pinyin_possibility(m_options, pinyin_keys + i);
if ( pinyin_poss < FLT_EPSILON ){
clear_constraint(constraints, i);
}
diff --git a/src/lookup/pinyin_lookup.h b/src/lookup/pinyin_lookup.h
index dd28b4d..3211acf 100644
--- a/src/lookup/pinyin_lookup.h
+++ b/src/lookup/pinyin_lookup.h
@@ -22,12 +22,16 @@
#ifndef PINYIN_LOOKUP_H
#define PINYIN_LOOKUP_H
+
#include <float.h>
#include <glib.h>
#include "novel_types.h"
-#include "pinyin_base.h"
+#include "chewing_key.h"
+#include "phrase_index.h"
+#include "chewing_large_table.h"
#include "lookup.h"
+
namespace pinyin{
class WinnerTree;
@@ -84,11 +88,11 @@ private:
protected:
//saved varibles
CandidateConstraints m_constraints;
- PinyinKeyVector m_keys;
+ ChewingKeyVector m_keys;
- PinyinLargeTable * m_pinyin_table;
+ ChewingLargeTable * m_pinyin_table;
FacadePhraseIndex * m_phrase_index;
- PinyinCustomSettings * m_custom;
+ pinyin_option_t m_options;
Bigram * m_system_bigram;
Bigram * m_user_bigram;
@@ -118,13 +122,15 @@ protected:
bool final_step(MatchResults & results);
public:
- PinyinLookup( PinyinCustomSettings * custom, PinyinLargeTable * pinyin_table, FacadePhraseIndex * phrase_index, Bigram * system_bigram, Bigram * user_bigram);
+ PinyinLookup(pinyin_option_t options, ChewingLargeTable * pinyin_table,
+ FacadePhraseIndex * phrase_index, Bigram * system_bigram,
+ Bigram * user_bigram);
~PinyinLookup();
- bool get_best_match(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+ bool get_best_match(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
- bool train_result(PinyinKeyVector keys, CandidateConstraints constraints, MatchResults & results);
+ bool train_result(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
bool convert_to_utf8(MatchResults results,
/* out */ char * & result_string)
@@ -138,7 +144,7 @@ public:
bool clear_constraint(CandidateConstraints constraints, size_t index);
- bool validate_constraint(CandidateConstraints constraints, PinyinKeyVector m_parsed_keys);
+ bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector m_parsed_keys);
/* init pinyin table lookup array */
bool prepare_pinyin_lookup(PhraseIndexRanges ranges);