/*
* libpinyin
* Library to deal with pinyin.
*
* Copyright (C) 2006-2007 Peng Wu
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef PHRASE_INDEX_H
#define PHRASE_INDEX_H
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <glib.h>
#include "novel_types.h"
#include "chewing_key.h"
#include "pinyin_parser2.h"
#include "pinyin_phrase3.h"
#include "memory_chunk.h"
#include "phrase_index_logger.h"
/**
* Phrase Index File Format
*
* Indirect Index: Index by Token
* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* + Phrase Offset + Phrase Offset + Phrase Offset + ...... +
* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Phrase Content:
* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* + Phrase Length + number of Pronunciations + Uni-gram Frequency+
* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* + Phrase String(UCS4) + n Pronunciations with Frequency +
* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
namespace pinyin{
/* Delta information is stored by the phrase index logger in the user's
 * home directory.
 */
/**
 * phrase_item_header:
 *
 * The size in bytes of the fixed header at the start of every phrase item:
 * two guint8 fields followed by one guint32 field (phrase length, number of
 * pronunciations and uni-gram frequency in the "Phrase Content" layout).
 */
const size_t phrase_item_header = 2 * sizeof(guint8) + sizeof(guint32);
/**
* PhraseItem:
*
* The PhraseItem to access the items in phrase index.
*
*/
class PhraseItem{
    friend class SubPhraseIndex;
    friend bool _compute_new_header(PhraseIndexLogger * logger,
                                    phrase_token_t mask,
                                    phrase_token_t value,
                                    guint32 & new_total_freq);
private:
    /* The raw bytes of the item: header, phrase string, pronunciations. */
    MemoryChunk m_chunk;

    bool set_n_pronunciation(guint8 n_prouns);

    /* Read a guint32 stored at an arbitrary byte address.
     *
     * The frequency fields are not guaranteed to be 4-byte aligned (the
     * header alone places the uni-gram frequency at byte offset 2), so the
     * bytes are copied out instead of dereferencing a casted pointer.
     */
    static guint32 read_guint32(const char * address){
        guint32 value = 0;
        memcpy(&value, address, sizeof(guint32));
        return value;
    }

public:
    /**
     * PhraseItem::PhraseItem:
     *
     * The constructor of the PhraseItem.
     *
     * Creates an item that only consists of a zero-filled header.
     *
     */
    PhraseItem(){
        m_chunk.set_size(phrase_item_header);
        memset(m_chunk.begin(), 0, m_chunk.size());
    }

#if 0
    /* Disabled: build a phrase item from the content of an existing chunk. */
    PhraseItem(MemoryChunk & chunk){
        m_chunk.set_content(0, chunk.begin(), chunk.size());
        assert ( m_chunk.size() >= phrase_item_header);
    }
#endif

    /**
     * PhraseItem::get_phrase_length:
     * @returns: the length of this phrase item.
     *
     * Get the length of this phrase item.
     *
     * The length is the first byte of the header.
     *
     */
    guint8 get_phrase_length() const{
        const char * buf_begin = (const char *) m_chunk.begin();
        return *(const guint8 *) buf_begin;
    }

    /**
     * PhraseItem::get_n_pronunciation:
     * @returns: the number of the pronunciations.
     *
     * Get the number of the pronunciations.
     *
     * The count is the second byte of the header.
     *
     */
    guint8 get_n_pronunciation() const{
        const char * buf_begin = (const char *) m_chunk.begin();
        return *(const guint8 *) (buf_begin + sizeof(guint8));
    }

    /**
     * PhraseItem::get_unigram_frequency:
     * @returns: the uni-gram frequency of this phrase item.
     *
     * Get the uni-gram frequency of this phrase item.
     *
     * The frequency follows the two guint8 header fields.
     *
     */
    guint32 get_unigram_frequency() const{
        const char * buf_begin = (const char *) m_chunk.begin();
        return read_guint32(buf_begin + sizeof(guint8) + sizeof(guint8));
    }

    /**
     * PhraseItem::get_pronunciation_possibility:
     * @keys: the pronunciation keys.
     * @returns: the possibility of this phrase item pronounces the pinyin.
     *
     * Get the possibility of this phrase item pronounces the pinyin.
     *
     * The result is the summed frequency of the stored pronunciations that
     * compare equal to @keys, divided by the summed frequency of all stored
     * pronunciations; 0 is returned when that total is zero.
     *
     */
    gfloat get_pronunciation_possibility(ChewingKey * keys) const{
        const guint8 phrase_length = get_phrase_length();
        const guint8 npron = get_n_pronunciation();

        /* each record is phrase_length keys followed by one frequency. */
        const size_t keys_size = phrase_length * sizeof(ChewingKey);
        const size_t record_size = keys_size + sizeof(guint32);

        /* the records start after the header and the ucs4 phrase string. */
        const char * record = (const char *) m_chunk.begin()
            + phrase_item_header + phrase_length * sizeof(ucs4_t);

        guint32 matched = 0, total_freq = 0;
        for (size_t i = 0; i < npron; ++i, record += record_size){
            const guint32 freq = read_guint32(record + keys_size);
            total_freq += freq;

            if (0 == pinyin_compare_with_tones(keys, (ChewingKey *) record,
                                               phrase_length))
                matched += freq;
        }

        /* an additional safe guard for chewing: avoid dividing by zero,
         * although gen_pinyin_table is expected to avoid zero frequencies. */
        if (0 == total_freq)
            return 0;

        return matched / (gfloat) total_freq;
    }

    /**
     * PhraseItem::increase_pronunciation_possibility:
     * @keys: the pronunciation keys.
     * @delta: the delta to be added to the pronunciation keys.
     *
     * Add the delta to the pronunciation of the pronunciation keys.
     *
     */
    void increase_pronunciation_possibility(ChewingKey * keys,
                                            gint32 delta);

    /**
     * PhraseItem::get_phrase_string:
     * @phrase: the ucs4 character buffer.
     * @returns: whether the get operation is successful.
     *
     * Get the ucs4 characters of this phrase item.
     *
     */
    bool get_phrase_string(ucs4_t * phrase);

    /**
     * PhraseItem::set_phrase_string:
     * @phrase_length: the ucs4 character length of this phrase item.
     * @phrase: the ucs4 character buffer.
     * @returns: whether the set operation is successful.
     *
     * Set the length and ucs4 characters of this phrase item.
     *
     */
    bool set_phrase_string(guint8 phrase_length, ucs4_t * phrase);

    /**
     * PhraseItem::get_nth_pronunciation:
     * @index: the pronunciation index.
     * @keys: the pronunciation keys.
     * @freq: the frequency of the pronunciation.
     * @returns: whether the get operation is successful.
     *
     * Get the nth pronunciation of this phrase item.
     *
     */
    bool get_nth_pronunciation(size_t index,
                               /* out */ ChewingKey * keys,
                               /* out */ guint32 & freq);

    /**
     * PhraseItem::add_pronunciation:
     * @keys: the pronunciation keys.
     * @delta: the delta of the frequency of the pronunciation.
     * @returns: whether the add operation is successful.
     *
     * Add one pronunciation.
     *
     */
    bool add_pronunciation(ChewingKey * keys, guint32 delta);

    /**
     * PhraseItem::remove_nth_pronunciation:
     * @index: the pronunciation index.
     *
     * Remove the nth pronunciation.
     *
     * Note: Normally don't change the first pronunciation,
     * which decides the token number.
     *
     */
    void remove_nth_pronunciation(size_t index);

    /* Two items are equal when their chunks have the same size and the
     * same bytes. */
    bool operator == (const PhraseItem & rhs) const{
        if (m_chunk.size() != rhs.m_chunk.size())
            return false;
        return memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
                      m_chunk.size()) == 0;
    }

    bool operator != (const PhraseItem & rhs) const{
        return ! (*this == rhs);
    }
};
/*
* In Sub Phrase Index, token == (token & PHRASE_MASK).
*/
/**
* SubPhraseIndex:
*
* The SubPhraseIndex class for internal usage.
*
*/
class SubPhraseIndex{
private:
guint32 m_total_freq;
MemoryChunk m_phrase_index;
MemoryChunk m_phrase_content;
MemoryChunk * m_chunk;
void reset(){
m_total_freq = 0;
m_phrase_index.set_size(0);
m_phrase_content.set_size(0);
if ( m_chunk ){
delete m_chunk;
m_chunk = NULL;
}
}
public:
/**
* SubPhraseIndex::SubPhraseIndex:
*
* The constructor of the SubPhraseIndex.
*
*/
SubPhraseIndex():m_total_freq(0){
m_chunk = NULL;
}
/**
* SubPhraseIndex::~SubPhraseIndex:
*
* The destructor of the SubPhraseIndex.
*
*/
~SubPhraseIndex(){
reset();
}
/**
* SubPhraseIndex::load:
* @chunk: the memory chunk of the binary sub phrase index.
* @offset: the begin of binary data in the memory chunk.
* @end: the end of binary data in the memory chunk.
* @returns: whether the load operation is successful.
*
* Load the sub phrase index from the memory chunk.
*
*/
bool load(MemoryChunk * chunk,
table_offset_t offset, table_offset_t end);
/**
* SubPhraseIndex::store:
* @new_chunk: the new memory chunk to store this sub phrase index.
* @offset: the begin of binary data in the memory chunk.
* @end: the end of stored binary data in the memory chunk.
* @returns: whether the store operation is successful.
*
* Store the sub phrase index to the new memory chunk.
*
*/
bool store(MemoryChunk * new_chunk,
table_offset_t offset, table_offset_t & end);
/**
* SubPhraseIndex::diff:
* @oldone: the original content of sub phrase index.
* @logger: the delta information of user self-learning data.
* @returns: whether the diff operation is successful.
*
* Compare this sub phrase index with the original content of the system
* sub phrase index to generate the logger of difference.
*
* Note: Switch to logger format to reduce user space storage.
*
*/
bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger);
/**
* SubPhraseIndex::merge:
* @logger: the logger of difference in user home directory.
* @returns: whether the merge operation is successful.
*
* Merge the user logger of difference with this sub phrase index.
*
*/
bool merge(PhraseIndexLogger * logger);
/**
* SubPhraseIndex::get_range:
* @range: the token range.
* @returns: whether the get operation is successful.
*
* Get the token range in this sub phrase index.
*
*/
int get_range(/* out */ PhraseIndexRange & range);
/**
* SubPhraseIndex::get_phrase_index_total_freq:
* @returns: the total frequency of this sub phrase index.
*
* Get the total frequency of this sub phrase index.
*
* Note: maybe call it "Zero-gram".
*
*/
guint32 get_phrase_index_total_freq();
/**
* SubPhraseIndex::add_unigram_frequency:
* @token: the phrase token.
* @delta: the delta value of the phrase token.
* @returns: the status of the add operation.
*
* Add delta value to the phrase of the token.
*
* Note: this method is a fast path to add delta value.
* Maybe use the get_phrase_item method instead in future.
*
*/
int add_unigram_frequency(phrase_token_t token, guint32 delta);
/**
* SubPhraseIndex::get_phrase_item:
* @token: the phrase token.
* @item: the phrase item of the token.
* @returns: the status of the get operation.
*
* Get the phrase item from this sub phrase index.
*
* Note:get_phrase_item function can't modify the phrase item size,
* but can increment the freq of the special pronunciation,
* or change the content without size increasing.
*
*/
int get_phrase_item(phrase_token_t token, PhraseItem & item);
/**
* SubPhraseIndex::add_phrase_item:
* @token: the phrase token.
* @item: the phrase item of the token.
* @returns: the status of the add operation.
*
* Add the phrase item to this sub phrase index.
*
*/
int add_phrase_item(phrase_token_t token, PhraseItem * item);
/**
* SubPhraseIndex::remove_phrase_item:
* @token: the phrase token.
* @item: the removed phrase item of the token.
* @returns: the status of the remove operation.
*
* Remove the phrase item of the token.
*
* Note: this remove_phrase_item method will substract the unigram
* frequency of the removed item from m_total_freq.
*
*/
int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item);
/**
* SubPhraseIndex::mask_out:
* @mask: the mask.
* @value: the value.
* @returns: whether the mask out operation is successful.
*
* Mask out the matched phrase items.
*
*/
bool mask_out(phrase_token_t mask, phrase_token_t value);
};
/**
* FacadePhraseIndex:
*
* The facade class of phrase index.
*
*/
class FacadePhraseIndex{
private:
guint32 m_total_freq;
SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT];
public:
/**
* FacadePhraseIndex::FacadePhraseIndex:
*
* The constructor of the FacadePhraseIndex.
*
*/
FacadePhraseIndex(){
m_total_freq = 0;
memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices));
}
/**
* FacadePhraseIndex::~FacadePhraseIndex:
*
* The destructor of the FacadePhraseIndex.
*
*/
~FacadePhraseIndex(){
for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
if ( m_sub_phrase_indices[i] ){
delete m_sub_phrase_indices[i];
m_sub_phrase_indices[i] = NULL;
}
}
}
/**
* FacadePhraseIndex::load_text:
* @phrase_index: the index of sub phrase index to be loaded.
* @infile: the textual format file of the phrase table.
* @returns: whether the load operation is successful.
*
* Load one sub phrase index from the textual format file.
* Note: load sub phrase index according to the config in future.
*
*/
bool load_text(guint8 phrase_index, FILE * infile);
/**
* FacadePhraseIndex::load:
* @phrase_index: the index of sub phrase index to be loaded.
* @chunk: the memory chunk of sub phrase index to be loaded.
* @returns: whether the load operation is successful.
*
* Load one sub phrase index from the memory chunk.
*
*/
bool load(guint8 phrase_index, MemoryChunk * chunk);
/**
* FacadePhraseIndex::store:
* @phrase_index: the index of sub phrase index to be stored.
* @new_chunk: the memory chunk of sub phrase index to be stored.
* @returns: whether the store operation is successful.
*
* Store one sub phrase index to the memory chunk.
*
*/
bool store(guint8 phrase_index, MemoryChunk * new_chunk);
/**
* FacadePhraseIndex::unload:
* @phrase_index: the index of sub phrase index to be unloaded.
* @returns: whether the unload operation is successful.
*
* Unload one sub phrase index.
*
*/
bool unload(guint8 phrase_index);
/**
* FacadePhraseIndex::diff:
* @phrase_index: the index of sub phrase index to be differed.
* @oldchunk: the original content of sub phrase index.
* @newlog: the delta information of user self-learning data.
* @returns: whether the diff operation is successful.
*
* Store user delta information in the logger format.
*
* Note: the ownership of oldchunk is transfered here.
*
*/
bool diff(guint8 phrase_index, MemoryChunk * oldchunk,
MemoryChunk * newlog);
/**
* FacadePhraseIndex::merge:
* @phrase_index: the index of sub phrase index to be merged.
* @log: the logger of difference in user home directory.
* @returns: whether the merge operation is successful.
*
* Merge the user logger of difference with the sub phrase index.
*
* Note: the ownership of log is transfered here.
*
*/
bool merge(guint8 phrase_index, MemoryChunk * log);
/**
* FacadePhraseIndex::merge_with_mask:
* @phrase_index: the index of sub phrase index to be merged.
* @log: the logger of difference in user home directory.
* @mask: the mask.
* @value: the value.
* @returns: whether the merge operation is successful.
*
* Merge the user logger of difference with mask operation.
*
* Note: the ownership of log is transfered here.
*
*/
bool merge_with_mask(guint8 phrase_index, MemoryChunk * log,
phrase_token_t mask, phrase_token_t value);
/**
* FacadePhraseIndex::compact:
* @returns: whether the compact operation is successful.
*
* Compat all sub phrase index memory usage.
*
*/
bool compact();
/**
* FacadePhraseIndex::mask_out:
* @phrase_index: the index of sub phrase index.
* @mask: the mask.
* @value: the value.
* @returns: whether the mask out operation is successful.
*
* Mask out the matched phrase items.
*
* Note: should call compact() after the mask out operation.
*
*/
bool mask_out(guint8 phrase_index,
phrase_token_t mask, phrase_token_t value);
/**
* FacadePhraseIndex::get_sub_phrase_range:
* @min_index: the minimal sub phrase index.
* @max_index: the maximal sub phrase index.
* @returns: the status of the get operation.
*
* Get the minimum and maximum of the sub phrase index.
*
*/
int get_sub_phrase_range(guint8 & min_index, guint8 & max_index);
/**
* FacadePhraseIndex::get_range:
* @phrase_index: the index of sub phrase index.
* @range: the token range of the sub phrase index.
* @returns: the status of the get operation.
*
* Get the token range of the sub phrase index.
*
*/
int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range);
/**
* FacadePhraseIndex::get_phrase_index_total_freq:
* @returns: the total freq of the facade phrase index.
*
* Get the total freq of the facade phrase index.
*
* Note: maybe call it "Zero-gram".
*
*/
guint32 get_phrase_index_total_freq(){
return m_total_freq;
}
/**
* FacadePhraseIndex::add_unigram_frequency:
* @token: the phrase token.
* @delta: the delta value of the phrase token.
* @returns: the status of the add operation.
*
* Add delta value to the phrase of the token.
*
*/
int add_unigram_frequency(phrase_token_t token, guint32 delta){
guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
if ( !sub_phrase )
return ERROR_NO_SUB_PHRASE_INDEX;
m_total_freq += delta;
return sub_phrase->add_unigram_frequency(token, delta);
}
/**
* FacadePhraseIndex::get_phrase_item:
* @token: the phrase token.
* @item: the phrase item of the token.
* @returns: the status of the get operation.
*
* Get the phrase item from the facade phrase index.
*
*/
int get_phrase_item(phrase_token_t token, PhraseItem & item){
guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index];
if ( !sub_phrase )
return ERROR_NO_SUB_PHRASE_INDEX;
return sub_phrase->get_phrase_item(token, item);
}
/**
* FacadePhraseIndex::add_phrase_item:
* @token: the phrase token.
* @item: the phrase item of the token.
* @returns: the status of the add operation.
*
* Add the phrase item to the facade phrase index.
*
*/
int add_phrase_item(phrase_token_t token, PhraseItem * item){
guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
if ( !sub_phrase ){
sub_phrase = new SubPhraseIndex;
}
m_total_freq += item->get_unigram_frequency();
return sub_phrase->add_phrase_item(token, item);
}
/**
* FacadePhraseIndex::remove_phrase_item:
* @token: the phrase token.
* @item: the removed phrase item of the token.
* @returns: the status of the remove operation.
*
* Remove the phrase item of the token.
*
*/
int remove_phrase_item(phrase_token_t token, PhraseItem * & item){
guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token);
SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
if ( !sub_phrase ){
return ERROR_NO_SUB_PHRASE_INDEX;
}
int result = sub_phrase->remove_phrase_item(token, item);
if ( result )
return result;
m_total_freq -= item->get_unigram_frequency();
return result;
}
/**
* FacadePhraseIndex::prepare_ranges:
* @ranges: the ranges to be prepared.
* @returns: whether the prepare operation is successful.
*
* Prepare the ranges.
*
*/
bool prepare_ranges(PhraseIndexRanges ranges) {
/* assume memset(ranges, 0, sizeof(ranges)); */
for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
GArray * & range = ranges[i];
assert(NULL == range);
SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
if (sub_phrase) {
range = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
}
}
return true;
}
/**
* FacadePhraseIndex::clear_ranges:
* @ranges: the ranges to be cleared.
* @returns: whether the clear operation is successful.
*
* Clear the ranges.
*
*/
bool clear_ranges(PhraseIndexRanges ranges) {
for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
GArray * range = ranges[i];
if (range) {
g_array_set_size(range, 0);
}
}
return true;
}
/**
* FacadePhraseIndex::destroy_ranges:
* @ranges: the ranges to be destroyed.
* @returns: whether the destroy operation is successful.
*
* Destroy the ranges.
*
*/
bool destroy_ranges(PhraseIndexRanges ranges) {
for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
GArray * & range = ranges[i];
if (range) {
g_array_free(range, TRUE);
range = NULL;
}
}
return true;
}
/**
* FacadePhraseIndex::prepare_tokens:
* @tokens: the tokens to be prepared.
* @returns: whether the prepare operation is successful.
*
* Prepare the tokens.
*
*/
bool prepare_tokens(PhraseTokens tokens) {
/* assume memset(tokens, 0, sizeof(tokens)); */
for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
GArray * & token = tokens[i];
assert(NULL == token);
SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i];
if (sub_phrase) {
token = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
}
}
return true;
}
/**
* FacadePhraseIndex::clear_tokens:
* @tokens: the tokens to be cleared.
* @return: whether the clear operation is successful.
*
* Clear the tokens.
*
*/
bool clear_tokens(PhraseTokens tokens) {
for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
GArray * token = tokens[i];
if (token) {
g_array_set_size(token, 0);
}
}
return true;
}
/**
* FacadePhraseIndex::destroy_tokens:
* @tokens: the tokens to be destroyed.
* @returns: whether the destroy operation is successful.
*
* Destroy the tokens.
*
*/
bool destroy_tokens(PhraseTokens tokens) {
for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
GArray * & token = tokens[i];
if (token) {
g_array_free(token, TRUE);
token = NULL;
}
}
return true;
}
/**
* FacadePhraseIndex::create_sub_phrase:
* @index: the phrase index to be created.
* @returns: the result of the create operation.
*
* Create the sub phrase index.
*
*/
int create_sub_phrase(guint8 index) {
SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index];
if (sub_phrase) {
return ERROR_ALREADY_EXISTS;
}
sub_phrase = new SubPhraseIndex;
return ERROR_OK;
}
};
/**
 * mask_out_phrase_index_logger:
 * @oldlogger: the phrase index logger to start from.
 * @mask: the mask.
 * @value: the value.
 * @returns: a pointer to a phrase index logger.
 *
 * NOTE(review): only the declaration is visible here. Presumably the
 * result is derived from @oldlogger with the items matching @mask and
 * @value masked out; confirm that, and who owns the returned logger,
 * against the implementation.
 *
 */
PhraseIndexLogger * mask_out_phrase_index_logger(PhraseIndexLogger * oldlogger,
                                                 phrase_token_t mask,
                                                 phrase_token_t value);
};
#endif