1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
|
/*
* libpinyin
* Library to deal with pinyin.
*
* Copyright (C) 2006-2007 Peng Wu
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifndef PINYIN_LOOKUP_H
#define PINYIN_LOOKUP_H
#include <float.h>
#include <glib.h>
#include "novel_types.h"
#include "chewing_key.h"
#include "phrase_index.h"
#include "ngram.h"
#include "lookup.h"
namespace pinyin{
class WinnerTree;
/**
* pinyin_lookup.h
*
* the definitions of pinyin lookup related classes and structs.
*
*/
/* Kind of restriction applied to one step of the lookup
 * (stored in lookup_constraint_t::m_type).
 */
enum constraint_type{
    /* No restriction: all possible next words are searched. */
    NO_CONSTRAINT,
    /* Only one word may be used to search for the next step,
     * e.g. a candidate selected by the user.
     */
    CONSTRAINT_ONESTEP,
    /* No search is allowed for the step. */
    CONSTRAINT_NOSEARCH
};
/* Constraint attached to a single lookup step.
 *
 * m_type selects which member of the anonymous union (if any) is meaningful.
 */
struct lookup_constraint_t {
    /* Constraint type of this step. */
    constraint_type m_type;

    /* Meaning of the union for each value of m_type:
     *
     *   NO_CONSTRAINT
     *       Neither union member holds a value;
     *       all possible next words are searched.
     *
     *   CONSTRAINT_ONESTEP
     *       m_token holds the next word. Only that one word can be used
     *       to search for the next step (use case: a candidate selected
     *       by the user).
     *
     *   CONSTRAINT_NOSEARCH
     *       m_constraint_step holds the value which points back to the
     *       CONSTRAINT_ONESTEP step; no search is allowed for this step.
     */
    union {
        /* The next word (CONSTRAINT_ONESTEP). */
        phrase_token_t m_token;
        /* Index of the step holding m_token (CONSTRAINT_NOSEARCH). */
        guint32 m_constraint_step;
    };
};
/**
 * IBranchIterator:
 *
 * Abstract iterator interface over lookup_value_t items; according to the
 * original documentation it is used to get the 32 highest values.
 *
 * Note: the original documentation relates this interface to the winner
 * tree used for the Viterbi beam search.
 *
 */
class IBranchIterator {
public:
    /* Virtual so that implementations can be destroyed through this interface. */
    virtual ~IBranchIterator() {}

    /* Presumably tells whether next() can still deliver a value
     * -- confirm against the implementing classes.
     */
    virtual bool has_next() = 0;

    /* Presumably delivers the next value of the iteration
     * -- confirm against the implementing classes.
     */
    virtual lookup_value_t next() = 0;

    /* Presumably delivers the maximum value known to the iterator
     * -- confirm against the implementing classes.
     */
    virtual lookup_value_t max() = 0;
};
/**
* PinyinLookup:
*
* The pinyin lookup class to convert pinyin keys to guessed sentence.
*
*/
class PinyinLookup{
private:
static const gfloat bigram_lambda;
static const gfloat unigram_lambda;
PhraseItem m_cache_phrase_item;
SingleGram m_merged_single_gram;
protected:
//saved varibles
CandidateConstraints m_constraints;
ChewingKeyVector m_keys;
FacadeChewingTable * m_pinyin_table;
FacadePhraseIndex * m_phrase_index;
pinyin_option_t m_options;
Bigram * m_system_bigram;
Bigram * m_user_bigram;
//internal step data structure
GPtrArray * m_steps_index;
/* Array of LookupStepIndex */
GPtrArray * m_steps_content;
/* Array of LookupStepContent */
GArray * m_table_cache;
/* Array of PhraseIndexRanges,
* PhraseIndexRanges is an array of GArray of PhraseIndexRange,
* indexed by phrase library (only contains enabled phrase libraries).
*/
WinnerTree * m_winner_tree;
size_t prepare_table_cache(int nstep, int total_pinyin);
/* init pinyin table lookup array */
bool prepare_pinyin_lookup(PhraseIndexRanges ranges);
/* destroy pinyin table lookup array */
bool destroy_pinyin_lookup(PhraseIndexRanges ranges);
bool search_unigram(IBranchIterator * iter, int nstep, int npinyin);
bool search_bigram(IBranchIterator * iter, int nstep, int npinyin);
bool unigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token);
bool bigram_gen_next_step(int nstep, lookup_value_t * cur_step, phrase_token_t token, gfloat bigram_poss);
bool save_next_step(int next_step_pos, lookup_value_t * cur_step, lookup_value_t * next_step);
bool final_step(MatchResults & results);
public:
/**
* PinyinLookup::PinyinLookup:
* @options: the pinyin options.
* @pinyin_table: the pinyin table.
* @phrase_index: the phrase index.
* @system_bigram: the system bi-gram.
* @user_bigram: the user bi-gram.
*
* The constructor of the PinyinLookup.
*
*/
PinyinLookup(pinyin_option_t options, FacadeChewingTable * pinyin_table,
FacadePhraseIndex * phrase_index, Bigram * system_bigram,
Bigram * user_bigram);
/**
* PinyinLookup::~PinyinLookup:
*
* The destructor of the PinyinLookup.
*
*/
~PinyinLookup();
/**
* PinyinLookup::set_options:
* @options: the pinyin options.
* @returns: whether the set operation is successful.
*
* Set the pinyin options of this PinyinLookup.
*
*/
bool set_options(pinyin_option_t options) {
m_options = options;
return true;
}
/**
* PinyinLookup::get_best_match:
* @prefixes: the phrase tokens before the guessed sentence.
* @keys: the pinyin keys of the guessed sentence.
* @constraints: the constraints on the guessed sentence.
* @results: the guessed sentence in the form of the phrase tokens.
* @returns: whether the guess operation is successful.
*
* Guess the best sentence according to user inputs.
*
*/
bool get_best_match(TokenVector prefixes, ChewingKeyVector keys, CandidateConstraints constraints, MatchResults & results);
/**
* PinyinLookup::train_result2:
* @keys: the pinyin keys of the guessed sentence.
* @constraints: the constraints on the guessed sentence.
* @results: the guessed sentence in the form of the phrase tokens.
* @returns: whether the train operation is successful.
*
* Self learning the guessed sentence based on the constraints.
*
*/
bool train_result2(ChewingKeyVector keys, CandidateConstraints constraints, MatchResults results);
/**
* PinyinLookup::convert_to_utf8:
* @results: the guessed sentence in the form of the phrase tokens.
* @result_string: the guessed sentence in the utf8 encoding.
* @returns: whether the convert operation is successful.
*
* Convert the guessed sentence from the phrase tokens to the utf8 string.
*
*/
bool convert_to_utf8(MatchResults results,
/* out */ char * & result_string)
{
return pinyin::convert_to_utf8(m_phrase_index, results,
NULL, result_string);
}
/**
* PinyinLookup::add_constraint:
* @constraints: the constraints on the guessed sentence.
* @index: the character offset in the guessed sentence.
* @token: the phrase token in the candidate list chosen by user.
* @returns: the number of the characters in the chosen token.
*
* Add one constraint to the constraints on the guessed sentence.
*
*/
guint8 add_constraint(CandidateConstraints constraints, size_t index, phrase_token_t token);
/**
* PinyinLookup::clear_constraint:
* @constraints: the constraints on the guessed sentence.
* @index: the character offset in the guessed sentence.
* @returns: whether the clear operation is successful.
*
* Clear one constraint in the constraints on the guessed sentence.
*
*/
bool clear_constraint(CandidateConstraints constraints, size_t index);
/**
* PinyinLookup::validate_constraint:
* @constraints: the constraints on the guessed sentence.
* @keys: the pinyin keys of the guessed sentence.
* @returns: whether the validate operation is successful.
*
* Validate the old constraints with the new pinyin keys.
*
*/
bool validate_constraint(CandidateConstraints constraints, ChewingKeyVector keys);
};
};
#endif
|