src/lookup/phrase_lookup.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

/* 
 *  libpinyin
 *  Library to deal with pinyin.
 *  
 *  Copyright (C) 2006-2007 Peng Wu
 *  
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 * 
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

#ifndef PHRASE_LOOKUP_H
#define PHRASE_LOOKUP_H

#include "novel_types.h"
#include "ngram.h"
#include "lookup.h"

/**
 * phrase_lookup.h
 *
 * The definitions of phrase lookup related classes and structs.
 *
 */

namespace pinyin{

/**
 * PhraseLookup:
 *
 * The phrase lookup class to convert the sentence to phrase tokens.
 *
 * An instance is built from a phrase table, a phrase index and two
 * bi-grams (system and user); get_best_match() segments a ucs4 sentence
 * into phrase tokens and convert_to_utf8() renders such a result as a
 * utf8 string.
 *
 * NOTE(review): the class stores raw pointers and declares a destructor,
 * but no copy constructor is declared here; whether copying an instance
 * is safe depends on the destructor in the implementation file - confirm
 * before copying instances.
 *
 */
class PhraseLookup{
private:
    /* Interpolation weights. Both are const, so they must be set in the
     * constructor's initializer list.
     * NOTE(review): presumably both are derived from the single @lambda
     * constructor argument - confirm in the implementation file.
     */
    const gfloat bigram_lambda;
    const gfloat unigram_lambda;

    /* NOTE(review): judging by the names, these look like scratch objects
     * reused across lookups - confirm in the implementation file.
     */
    PhraseItem m_cache_phrase_item;
    SingleGram m_merged_single_gram;
protected:
    //saved variables
    /* The types match the constructor's parameters.
     * NOTE(review): ownership of these pointers is not visible in this
     * header - presumably borrowed from the caller; confirm.
     */
    FacadePhraseTable2 * m_phrase_table;
    FacadePhraseIndex * m_phrase_index;
    Bigram * m_system_bigram;
    Bigram * m_user_bigram;

    //internal step data structure
    GPtrArray * m_steps_index;
    /* Array of LookupStepIndex */
    GPtrArray * m_steps_content;
    /* Array of LookupStepContent */

    /* Saved sentence */
    /* The types match the arguments of get_best_match().
     * NOTE(review): whether m_sentence is a copy or an alias of the
     * caller's buffer is not visible here - confirm.
     */
    int m_sentence_length;
    ucs4_t * m_sentence;

protected:
    /* Explicitly search the next phrase,
     *  to avoid double phrase lookup as the next token has only one.
     */
    /* NOTE(review): nstep is presumably an index into
     * m_steps_index / m_steps_content - confirm in the implementation file.
     */
    bool search_unigram2(int nstep, PhraseTokens tokens);
    bool search_bigram2(int nstep, PhraseTokens tokens);

    /* Helpers taking the current lookup value and a candidate token;
     * the bi-gram variant additionally receives the bi-gram possibility.
     * NOTE(review): presumably these build the candidate value for the
     * following step - confirm in the implementation file.
     */
    bool unigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token);
    bool bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss);

    /* NOTE(review): presumably records next_step at position
     * next_step_pos - confirm in the implementation file.
     */
    bool save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_step);

    /* NOTE(review): presumably fills @results from the completed steps -
     * confirm in the implementation file.
     */
    bool final_step(MatchResults & results);
public:
    /**
     * PhraseLookup::PhraseLookup:
     * @lambda: the lambda parameter for interpolation model.
     * @phrase_table: the phrase table.
     * @phrase_index: the phrase index.
     * @system_bigram: the system bi-gram.
     * @user_bigram: the user bi-gram.
     *
     * The constructor of the PhraseLookup.
     *
     */
    PhraseLookup(const gfloat lambda,
                 FacadePhraseTable2 * phrase_table,
                 FacadePhraseIndex * phrase_index,
                 Bigram * system_bigram,
                 Bigram * user_bigram);

    /**
     * PhraseLookup::~PhraseLookup:
     *
     * The destructor of the PhraseLookup.
     *
     */
    ~PhraseLookup();

    /**
     * PhraseLookup::get_best_match:
     * @sentence_length: the length of the sentence in ucs4 characters.
     * @sentence: the ucs4 characters of the sentence.
     * @results: the segmented sentence in the form of phrase tokens.
     * @returns: whether the segment operation is successful.
     *
     * Segment the sentence into phrase tokens.
     *
     * Note: this method only accepts the characters in phrase large table.
     *
     */
    bool get_best_match(int sentence_length, ucs4_t sentence[], MatchResults & results);

    /**
     * PhraseLookup::convert_to_utf8:
     * @results: the guessed sentence in the form of phrase tokens.
     * @result_string: the converted sentence in utf8 string.
     * @returns: whether the convert operation is successful.
     *
     * Convert the sentence from phrase tokens to the utf8 string.
     *
     * This is a thin inline wrapper: it forwards to the namespace-scope
     * pinyin::convert_to_utf8 using the phrase index saved in this object,
     * and returns that function's result.
     *
     * Note: free the result_string by g_free.
     *
     */
    bool convert_to_utf8(MatchResults results,
                         /* out */ char * & result_string)
    {
        /* The explicit pinyin:: qualification is required: inside the
         * class, the unqualified name would find this member function
         * and hide the namespace-scope function of the same name.
         * NOTE(review): "\n" and true are presumably the delimiter and a
         * flag of the free function - confirm against its declaration.
         */
        return pinyin::convert_to_utf8(m_phrase_index, results,
                                       "\n", true, result_string);
    }
};

};

#endif