summaryrefslogtreecommitdiffstats
path: root/src/storage/phrase_index.h
blob: c1ad870071e588c3a52520cf2f6a9ff30d098144 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/* 
 *  libpinyin
 *  Library to deal with pinyin.
 *  
 *  Copyright (C) 2006-2007 Peng Wu
 *  
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 * 
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

#ifndef PHRASE_INDEX_H
#define PHRASE_INDEX_H

#include <stdio.h>
#include <string.h>
#include <glib.h>
#include "novel_types.h"
#include "chewing_key.h"
#include "pinyin_parser2.h"
#include "pinyin_phrase2.h"
#include "memory_chunk.h"
#include "phrase_index_logger.h"

/**
 * Phrase Index File Format
 *
 * Indirect Index: Index by Token
 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 * + Phrase Offset + Phrase Offset + Phrase Offset + ......  +
 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 * Phrase Content:
 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 * + Phrase Length + Number of Pronunciations + Uni-gram Frequency    +
 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 * + Phrase String (utf16_t) + n x (Pronunciation Keys + Frequency)   +
 * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 */

namespace pinyin{

class PinyinLookup;

/* Delta information is stored via the phrase index logger
 * in the user's home directory.
 */

/* Size in bytes of the fixed header at the start of every phrase item:
 * phrase length (guint8), number of pronunciations (guint8) and
 * uni-gram frequency (guint32).
 */
const size_t phrase_item_header = 2 * sizeof(guint8) + sizeof(guint32);

/**
 * PhraseItem:
 *
 * One phrase record kept in its serialized form inside a MemoryChunk.
 * The chunk begins with a header of phrase_item_header bytes: the phrase
 * length (guint8), the number of pronunciations (guint8) and the
 * uni-gram frequency (guint32).  The phrase string follows the header,
 * and after it come the pronunciation records; each record is
 * phrase_length ChewingKey values followed by a guint32 frequency.
 */
class PhraseItem{
    friend class SubPhraseIndex;
private:
    MemoryChunk m_chunk;
    bool set_n_pronunciation(guint8 n_prouns);

    /* Read a guint32 stored at an arbitrary byte address.
     * The frequency fields are not 4-byte aligned inside the chunk (the
     * uni-gram frequency sits at byte offset 2), so they are copied out
     * with memcpy instead of dereferencing a casted pointer, which would
     * be a misaligned access.
     */
    static guint32 load_guint32(const char * addr){
	guint32 value = 0;
	memcpy(&value, addr, sizeof(value));
	return value;
    }
public:
    /* Null Constructor: a zero-filled header and nothing else. */
    PhraseItem(){
	m_chunk.set_size(phrase_item_header);
	memset(m_chunk.begin(), 0, m_chunk.size());
    }

    /* functions */

    /* Number of characters in the phrase (first header byte). */
    guint8 get_phrase_length(){
	const char * buf_begin = (const char *)m_chunk.begin();
	return (*(const guint8 *)buf_begin);
    }

    /* Number of pronunciation records (second header byte). */
    guint8 get_n_pronunciation(){
	const char * buf_begin = (const char *)m_chunk.begin();
	return (*(const guint8 *)(buf_begin + sizeof(guint8)));
    }

    /* Uni-gram frequency stored after the two guint8 header fields. */
    guint32 get_unigram_frequency(){
	const char * buf_begin = (const char *)m_chunk.begin();
	return load_guint32(buf_begin + sizeof(guint8) + sizeof(guint8));
    }

    /* Fraction of the summed pronunciation frequencies that belongs to
     * the pronunciations matching keys, where a match means
     * pinyin_compare_with_ambiguities2 returns 0 under options.
     *
     * keys is compared over phrase_length entries.
     *
     * There is no guard for a zero frequency sum: zero frequencies are
     * expected to be avoided by preprocessing in gen_pinyin_table.  If
     * the sum is zero anyway (for example an item without any
     * pronunciation) the division below does not yield a valid
     * probability.
     */
    gfloat get_pronunciation_possibility(pinyin_option_t options,
				  ChewingKey * keys){
	const guint8 phrase_length = get_phrase_length();
	const guint8 npron = get_n_pronunciation();
	/* pronunciation records start right after the phrase string. */
	const size_t offset = phrase_item_header
	    + phrase_length * sizeof (utf16_t);
	const size_t keys_size = phrase_length * sizeof(ChewingKey);
	const size_t record_size = keys_size + sizeof(guint32);
	char * buf_begin = (char *)m_chunk.begin();
	guint32 matched = 0, total_freq = 0;
	for ( size_t i = 0 ; i < npron ; ++i){
	    char * chewing_begin = buf_begin + offset + i * record_size;
	    /* the frequency trails the keys of each record. */
	    const guint32 freq = load_guint32(chewing_begin + keys_size);
	    total_freq += freq;
	    if ( 0 == pinyin_compare_with_ambiguities2
                 (options,  keys,
                  (ChewingKey *)chewing_begin, phrase_length) ){
		matched += freq;
	    }
	}
	return matched / (gfloat) total_freq;
    }
    
    /* Defined out of line; judging by the name it adjusts the frequency
     * of the pronunciations matching keys by delta -- confirm against
     * the implementation.
     */
    void increase_pronunciation_possibility(pinyin_option_t options,
				     ChewingKey * keys,
				     gint32 delta);

    bool get_phrase_string(utf16_t * phrase);
    bool set_phrase_string(guint8 phrase_length, utf16_t * phrase);
    bool get_nth_pronunciation(size_t index, 
			       /* out */ ChewingKey * keys,
			       /* out */ guint32 & freq);
    /* Normally don't change the first pronunciation,
     * which decides the token number.
     */
    void append_pronunciation(ChewingKey * keys, guint32 freq);
    void remove_nth_pronunciation(size_t index);

    /* Two items are equal when their serialized chunks have the same
     * size and the same bytes.
     */
    bool operator == (const PhraseItem & rhs) const{
        if (m_chunk.size() != rhs.m_chunk.size())
            return false;
        return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
                      m_chunk.size()) == 0;
    }

    bool operator != (const PhraseItem & rhs) const{
        return ! (*this == rhs);
    }
};

/*
 *  In Sub Phrase Index, token == (token & PHRASE_MASK).
 */

class SubPhraseIndex{
private:
    guint32 m_total_freq;
    MemoryChunk m_phrase_index;
    MemoryChunk m_phrase_content;
    MemoryChunk * m_chunk;
public:
    SubPhraseIndex():m_total_freq(0){
	m_chunk = NULL;
    }

    ~SubPhraseIndex(){
	reset();
    }

    void reset(){
	if ( m_chunk ){
	    delete m_chunk;
	    m_chunk = NULL;
	}
    }    
    
    /* binary memory chunk load/store method */
    bool load(MemoryChunk * chunk, 
	      table_offset_t offset, table_offset_t end);
    bool store(MemoryChunk * new_chunk, 
	       table_offset_t offset, table_offset_t & end);

    /* switch to logger format to reduce user storage */
    bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger);
    bool merge(PhraseIndexLogger * logger);

    /* get token range in this sub phrase */
    int get_range(/* out */ PhraseIndexRange & range);
    
    /* Zero-gram */
    guint32 get_phrase_index_total_freq();
    int add_unigram_frequency(phrase_token_t token, guint32 delta);

    /* get_phrase_item function can't modify the phrase item size,
     * but can increment the freq of the special pronunciation,
     * or change the content without size increasing.
     */
    int get_phrase_item(phrase_token_t token, PhraseItem & item);
    int add_phrase_item(phrase_token_t token, PhraseItem * item);
    /* remove_phrase_item will substract item->get_unigram_frequency()
     * from m_total_freq
     */
    int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);

};

class FacadePhraseIndex{
    friend class PinyinLookup;
private:
    guint32 m_total_freq;
    SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
public:
    FacadePhraseIndex(){
	m_total_freq = 0;
	memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
    }

    ~FacadePhraseIndex(){
	for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
	    if ( m_sub_phrase_indices[i] ){
		delete m_sub_phrase_indices[i];
		m_sub_phrase_indices[i] = NULL;
	    }
	}
    }

    /* load/store single sub phrase index, according to the config files. */
    bool load_text(guint8 phrase_index, FILE * infile);
    bool load(guint8 phrase_index, MemoryChunk * chunk);
    bool store(guint8 phrase_index, MemoryChunk * new_chunk);
    bool unload(guint8 phrase_index);

    /* load/store logger format.
       the ownership of oldchunk and log is transfered to here. */
    bool diff(guint8 phrase_index, MemoryChunk * oldchunk,
              MemoryChunk * newlog);
    bool merge(guint8 phrase_index, MemoryChunk * log);

    /* compat all SubPhraseIndex m_phrase_content memory usage. */
    bool compat();

    /* get all available sub phrase indices. */
    int get_sub_phrase_range(guint8 & min_index, guint8 & max_index);

    /* get each sub phrase token range with phrase_index added */
    int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range);

    /* Zero-gram */
    guint32 get_phrase_index_total_freq(){
	return m_total_freq;
    }

    int add_unigram_frequency(phrase_token_t token, guint32 delta){
	guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
	SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
	if ( !sub_phrase )
	    return ERROR_NO_SUB_PHRASE_INDEX;
	m_total_freq += delta;
	return sub_phrase->add_unigram_frequency(token, delta);
    }

    /* get_phrase_item function can't modify the phrase item */
    int get_phrase_item(phrase_token_t token, PhraseItem & item){
	guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
	SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
	if ( !sub_phrase )
	    return ERROR_NO_SUB_PHRASE_INDEX;
	return sub_phrase->get_phrase_item(token, item);
    }

    int add_phrase_item(phrase_token_t token, PhraseItem * item){
	guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
	SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
	if ( !sub_phrase ){
	    sub_phrase = new SubPhraseIndex;
	}   
	m_total_freq += item->get_unigram_frequency();
	return sub_phrase->add_phrase_item(token, item);
    }

    int remove_phrase_item(phrase_token_t token, PhraseItem * & item){
	guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
	SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
	if ( !sub_phrase ){
	    return ERROR_NO_SUB_PHRASE_INDEX;
	}
	int result = sub_phrase->remove_phrase_item(token, item);
	if ( result )
	    return result;
	m_total_freq -= item->get_unigram_frequency();
	return result;
    }

};
 
};

#endif