summaryrefslogtreecommitdiffstats
path: root/src/storage/ngram.h
blob: c5e7bc8c702c4c95a4eec8de1359507251e9a429 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* 
 *  libpinyin
 *  Library to deal with pinyin.
 *  
 *  Copyright (C) 2006-2007 Peng Wu
 *  
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 * 
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#ifndef NGRAM_H
#define NGRAM_H

#include <db.h>

namespace pinyin{

class Bigram;

/* Note:
 * When transferring from the system ngram to the user ngram:
 *   if the user ngram doesn't exist,
 *     copy the total freq from the system ngram to the user ngram,
 *     so the total freq exists.
 *   if the item freq doesn't exist, copy the item freq from the system
 *     ngram to the user ngram, so the item freq exists.
 *     if the user ngram already exists (always true), increase the total freq,
 *     if the item ngram already exists (always true), increase the freq.
 */

/* SingleGram
 * Wraps one MemoryChunk and exposes per-token frequencies plus a total
 * frequency on top of it.  Bigram is a friend so that it can use the
 * private buffer-based constructor.
 * NOTE(review): every method returns bool; presumably true means success,
 * confirm against the implementation file.
 */
class SingleGram{
    friend class Bigram;

public:
    /* Null constructor: builds a SingleGram without an external buffer. */
    SingleGram();

    /* Retrieve all items into array. */
    bool retrieve_all(/* out */ BigramPhraseWithCountArray array);

    /* Search for the items selected by range.
     * The resulting array may contain many items.
     */
    bool search(/* in */ PhraseIndexRange * range,
                /* out */ BigramPhraseArray array);

    /* Read the frequency stored for token into freq. */
    bool get_freq(/* in */ phrase_token_t token,
                  /* out */ guint32 & freq);

    /* Store freq as the frequency of token. */
    bool set_freq(/* in */ phrase_token_t token,
                  guint32 freq);

    /* Read the total frequency into total.
     * Used in the user bigram table.
     */
    bool get_total_freq(guint32 & total);

    /* Store total as the total frequency.
     * Used in the user bigram table.
     */
    bool set_total_freq(guint32 total);

    /* Prune this single gram.
     * Only used in training.
     */
    bool prune();

private:
    /* Backing storage for the items. */
    MemoryChunk m_chunk;

    /* Build a SingleGram from a raw buffer of length bytes.
     * Private: reachable only from this class and its friend Bigram.
     * NOTE(review): whether the buffer is copied or adopted is not
     * visible in this header, confirm in the implementation.
     */
    SingleGram(void * buffer, size_t length);
};

class Bigram{
private:
    DB * m_system;
    DB * m_user;

    void reset(){
	if ( m_system ){
	    m_system->close(m_system, 0);
	    m_system = NULL;
	}
	if ( m_user ){
	    m_user->close(m_user, 0);
	    m_user = NULL;
	}
    }

public:
    Bigram(){
	m_system = NULL; m_user = NULL;
    }

    ~Bigram(){
	reset();
    }

    /* attach system and user bi-gram */
    /* when with training systemdb is NULL, only user_gram */
    bool attach(const char * systemfile, const char * userfile);

    /* load/store one single gram */
    bool load(/* in */ phrase_token_t index,
              /* out */ SingleGram * & system_gram,
              /* out */ SingleGram * & user_gram);

    bool store(/* in */ phrase_token_t index,
               /* in */ SingleGram * user_gram);

    /* array of phrase_token_t items, for parameter estimation. */
    bool get_all_items(/* out */ GArray * system,
                       /* out */ GArray * user);
};

};

/* NOTE(review): header-scope using-directive.  Every file that includes
 * this header gets the pinyin namespace imported into the global scope.
 * Kept as-is because existing callers may rely on the unqualified names.
 */
using namespace pinyin;


#endif