diff options
Diffstat (limited to 'src/storage/phrase_index.h')
-rw-r--r-- | src/storage/phrase_index.h | 839 |
1 files changed, 839 insertions, 0 deletions
diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h new file mode 100644 index 0000000..e1dad0b --- /dev/null +++ b/src/storage/phrase_index.h @@ -0,0 +1,839 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef PHRASE_INDEX_H +#define PHRASE_INDEX_H + +#include <stdio.h> +#include <glib.h> +#include "novel_types.h" +#include "chewing_key.h" +#include "pinyin_parser2.h" +#include "pinyin_phrase2.h" +#include "memory_chunk.h" +#include "phrase_index_logger.h" + +/** + * Phrase Index File Format + * + * Indirect Index: Index by Token + * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + * + Phrase Offset + Phrase Offset + Phrase Offset + ...... + + * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + * Phrase Content: + * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + * + Phrase Length + number of Pronunciations + Uni-gram Frequency+ + * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + * + Phrase String(UCS4) + n Pronunciations with Frequency + + * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + +namespace pinyin{ + +/* Store delta info by phrase index logger in user home directory. + */ + +const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint32); + +/** + * PhraseItem: + * + * The PhraseItem to access the items in phrase index. + * + */ +class PhraseItem{ + friend class SubPhraseIndex; + friend bool _compute_new_header(PhraseIndexLogger * logger, + phrase_token_t mask, + phrase_token_t value, + guint32 & new_total_freq); + +private: + MemoryChunk m_chunk; + bool set_n_pronunciation(guint8 n_prouns); +public: + /** + * PhraseItem::PhraseItem: + * + * The constructor of the PhraseItem. + * + */ + PhraseItem(){ + m_chunk.set_size(phrase_item_header); + memset(m_chunk.begin(), 0, m_chunk.size()); + } + +#if 0 + PhraseItem(MemoryChunk & chunk){ + m_chunk.set_content(0, chunk->begin(), chunk->size()); + assert ( m_chunk.size() >= phrase_item_header); + } +#endif + + /** + * PhraseItem::get_phrase_length: + * @returns: the length of this phrase item. + * + * Get the length of this phrase item. + * + */ + guint8 get_phrase_length(){ + char * buf_begin = (char *)m_chunk.begin(); + return (*(guint8 *)buf_begin); + } + + /** + * PhraseItem::get_n_pronunciation: + * @returns: the number of the pronunciations. + * + * Get the number of the pronunciations. + * + */ + guint8 get_n_pronunciation(){ + char * buf_begin = ( char *) m_chunk.begin(); + return (*(guint8 *)(buf_begin + sizeof(guint8))); + } + + /** + * PhraseItem::get_unigram_frequency: + * @returns: the uni-gram frequency of this phrase item. + * + * Get the uni-gram frequency of this phrase item. + * + */ + guint32 get_unigram_frequency(){ + char * buf_begin = (char *)m_chunk.begin(); + return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8))); + } + + /** + * PhraseItem::get_pronunciation_possibility: + * @options: the pinyin options. + * @keys: the pronunciation keys. + * @returns: the possibility of this phrase item pronounces the pinyin. + * + * Get the possibility of this phrase item pronounces the pinyin. + * + */ + gfloat get_pronunciation_possibility(pinyin_option_t options, + ChewingKey * keys){ + guint8 phrase_length = get_phrase_length(); + guint8 npron = get_n_pronunciation(); + size_t offset = phrase_item_header + phrase_length * sizeof (ucs4_t); + char * buf_begin = (char *)m_chunk.begin(); + guint32 matched = 0, total_freq =0; + for ( int i = 0 ; i < npron ; ++i){ + char * chewing_begin = buf_begin + offset + + i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); + guint32 * freq = (guint32 *)(chewing_begin + + phrase_length * sizeof(ChewingKey)); + total_freq += *freq; + if ( 0 == pinyin_compare_with_ambiguities2 + (options, keys, + (ChewingKey *)chewing_begin,phrase_length) ){ + matched += *freq; + } + } + +#if 1 + /* an additional safe guard for chewing. */ + if ( 0 == total_freq ) + return 0; +#endif + + /* used preprocessor to avoid zero freq, in gen_chewing_table. */ + gfloat retval = matched / (gfloat) total_freq; + return retval; + } + + /** + * PhraseItem::increase_pronunciation_possibility: + * @options: the pinyin options. + * @keys: the pronunciation keys. + * @delta: the delta to be added to the pronunciation keys. + * + * Add the delta to the pronunciation of the pronunciation keys. + * + */ + void increase_pronunciation_possibility(pinyin_option_t options, + ChewingKey * keys, + gint32 delta); + + /** + * PhraseItem::get_phrase_string: + * @phrase: the ucs4 character buffer. + * @returns: whether the get operation is successful. + * + * Get the ucs4 characters of this phrase item. + * + */ + bool get_phrase_string(ucs4_t * phrase); + + /** + * PhraseItem::set_phrase_string: + * @phrase_length: the ucs4 character length of this phrase item. + * @phrase: the ucs4 character buffer. + * @returns: whether the set operation is successful. + * + * Set the length and ucs4 characters of this phrase item. + * + */ + bool set_phrase_string(guint8 phrase_length, ucs4_t * phrase); + + /** + * PhraseItem::get_nth_pronunciation: + * @index: the pronunciation index. + * @keys: the pronunciation keys. + * @freq: the frequency of the pronunciation. + * @returns: whether the get operation is successful. + * + * Get the nth pronunciation of this phrase item. + * + */ + bool get_nth_pronunciation(size_t index, + /* out */ ChewingKey * keys, + /* out */ guint32 & freq); + + /** + * PhraseItem::add_pronunciation: + * @keys: the pronunciation keys. + * @delta: the delta of the frequency of the pronunciation. + * @returns: whether the add operation is successful. + * + * Add one pronunciation. + * + */ + bool add_pronunciation(ChewingKey * keys, guint32 delta); + + /** + * PhraseItem::remove_nth_pronunciation: + * @index: the pronunciation index. + * + * Remove the nth pronunciation. + * + * Note: Normally don't change the first pronunciation, + * which decides the token number. + * + */ + void remove_nth_pronunciation(size_t index); + + bool operator == (const PhraseItem & rhs) const{ + if (m_chunk.size() != rhs.m_chunk.size()) + return false; + return memcmp(m_chunk.begin(), rhs.m_chunk.begin(), + m_chunk.size()) == 0; + } + + bool operator != (const PhraseItem & rhs) const{ + return ! (*this == rhs); + } +}; + +/* + * In Sub Phrase Index, token == (token & PHRASE_MASK). + */ + +/** + * SubPhraseIndex: + * + * The SubPhraseIndex class for internal usage. + * + */ +class SubPhraseIndex{ +private: + guint32 m_total_freq; + MemoryChunk m_phrase_index; + MemoryChunk m_phrase_content; + MemoryChunk * m_chunk; + + void reset(){ + m_total_freq = 0; + m_phrase_index.set_size(0); + m_phrase_content.set_size(0); + if ( m_chunk ){ + delete m_chunk; + m_chunk = NULL; + } + } + +public: + /** + * SubPhraseIndex::SubPhraseIndex: + * + * The constructor of the SubPhraseIndex. + * + */ + SubPhraseIndex():m_total_freq(0){ + m_chunk = NULL; + } + + /** + * SubPhraseIndex::~SubPhraseIndex: + * + * The destructor of the SubPhraseIndex. + * + */ + ~SubPhraseIndex(){ + reset(); + } + + /** + * SubPhraseIndex::load: + * @chunk: the memory chunk of the binary sub phrase index. + * @offset: the begin of binary data in the memory chunk. + * @end: the end of binary data in the memory chunk. + * @returns: whether the load operation is successful. + * + * Load the sub phrase index from the memory chunk. + * + */ + bool load(MemoryChunk * chunk, + table_offset_t offset, table_offset_t end); + + /** + * SubPhraseIndex::store: + * @new_chunk: the new memory chunk to store this sub phrase index. + * @offset: the begin of binary data in the memory chunk. + * @end: the end of stored binary data in the memory chunk. + * @returns: whether the store operation is successful. + * + * Store the sub phrase index to the new memory chunk. + * + */ + bool store(MemoryChunk * new_chunk, + table_offset_t offset, table_offset_t & end); + + /** + * SubPhraseIndex::diff: + * @oldone: the original content of sub phrase index. + * @logger: the delta information of user self-learning data. + * @returns: whether the diff operation is successful. + * + * Compare this sub phrase index with the original content of the system + * sub phrase index to generate the logger of difference. + * + * Note: Switch to logger format to reduce user space storage. + * + */ + bool diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger); + + /** + * SubPhraseIndex::merge: + * @logger: the logger of difference in user home directory. + * @returns: whether the merge operation is successful. + * + * Merge the user logger of difference with this sub phrase index. + * + */ + bool merge(PhraseIndexLogger * logger); + + /** + * SubPhraseIndex::get_range: + * @range: the token range. + * @returns: whether the get operation is successful. + * + * Get the token range in this sub phrase index. + * + */ + int get_range(/* out */ PhraseIndexRange & range); + + /** + * SubPhraseIndex::get_phrase_index_total_freq: + * @returns: the total frequency of this sub phrase index. + * + * Get the total frequency of this sub phrase index. + * + * Note: maybe call it "Zero-gram". + * + */ + guint32 get_phrase_index_total_freq(); + + /** + * SubPhraseIndex::add_unigram_frequency: + * @token: the phrase token. + * @delta: the delta value of the phrase token. + * @returns: the status of the add operation. + * + * Add delta value to the phrase of the token. + * + * Note: this method is a fast path to add delta value. + * Maybe use the get_phrase_item method instead in future. + * + */ + int add_unigram_frequency(phrase_token_t token, guint32 delta); + + /** + * SubPhraseIndex::get_phrase_item: + * @token: the phrase token. + * @item: the phrase item of the token. + * @returns: the status of the get operation. + * + * Get the phrase item from this sub phrase index. + * + * Note:get_phrase_item function can't modify the phrase item size, + * but can increment the freq of the special pronunciation, + * or change the content without size increasing. + * + */ + int get_phrase_item(phrase_token_t token, PhraseItem & item); + + /** + * SubPhraseIndex::add_phrase_item: + * @token: the phrase token. + * @item: the phrase item of the token. + * @returns: the status of the add operation. + * + * Add the phrase item to this sub phrase index. + * + */ + int add_phrase_item(phrase_token_t token, PhraseItem * item); + + /** + * SubPhraseIndex::remove_phrase_item: + * @token: the phrase token. + * @item: the removed phrase item of the token. + * @returns: the status of the remove operation. + * + * Remove the phrase item of the token. + * + * Note: this remove_phrase_item method will substract the unigram + * frequency of the removed item from m_total_freq. + * + */ + int remove_phrase_item(phrase_token_t token, /* out */ PhraseItem * & item); + + /** + * SubPhraseIndex::mask_out: + * @mask: the mask. + * @value: the value. + * @returns: whether the mask out operation is successful. + * + * Mask out the matched phrase items. + * + */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + +/** + * FacadePhraseIndex: + * + * The facade class of phrase index. + * + */ +class FacadePhraseIndex{ +private: + guint32 m_total_freq; + SubPhraseIndex * m_sub_phrase_indices[PHRASE_INDEX_LIBRARY_COUNT]; +public: + /** + * FacadePhraseIndex::FacadePhraseIndex: + * + * The constructor of the FacadePhraseIndex. + * + */ + FacadePhraseIndex(){ + m_total_freq = 0; + memset(m_sub_phrase_indices, 0, sizeof(m_sub_phrase_indices)); + } + + /** + * FacadePhraseIndex::~FacadePhraseIndex: + * + * The destructor of the FacadePhraseIndex. + * + */ + ~FacadePhraseIndex(){ + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){ + if ( m_sub_phrase_indices[i] ){ + delete m_sub_phrase_indices[i]; + m_sub_phrase_indices[i] = NULL; + } + } + } + + /** + * FacadePhraseIndex::load_text: + * @phrase_index: the index of sub phrase index to be loaded. + * @infile: the textual format file of the phrase table. + * @returns: whether the load operation is successful. + * + * Load one sub phrase index from the textual format file. + * Note: load sub phrase index according to the config in future. + * + */ + bool load_text(guint8 phrase_index, FILE * infile); + + /** + * FacadePhraseIndex::load: + * @phrase_index: the index of sub phrase index to be loaded. + * @chunk: the memory chunk of sub phrase index to be loaded. + * @returns: whether the load operation is successful. + * + * Load one sub phrase index from the memory chunk. + * + */ + bool load(guint8 phrase_index, MemoryChunk * chunk); + + /** + * FacadePhraseIndex::store: + * @phrase_index: the index of sub phrase index to be stored. + * @new_chunk: the memory chunk of sub phrase index to be stored. + * @returns: whether the store operation is successful. + * + * Store one sub phrase index to the memory chunk. + * + */ + bool store(guint8 phrase_index, MemoryChunk * new_chunk); + + /** + * FacadePhraseIndex::unload: + * @phrase_index: the index of sub phrase index to be unloaded. + * @returns: whether the unload operation is successful. + * + * Unload one sub phrase index. + * + */ + bool unload(guint8 phrase_index); + + + /** + * FacadePhraseIndex::diff: + * @phrase_index: the index of sub phrase index to be differed. + * @oldchunk: the original content of sub phrase index. + * @newlog: the delta information of user self-learning data. + * @returns: whether the diff operation is successful. + * + * Store user delta information in the logger format. + * + * Note: the ownership of oldchunk is transfered here. + * + */ + bool diff(guint8 phrase_index, MemoryChunk * oldchunk, + MemoryChunk * newlog); + + /** + * FacadePhraseIndex::merge: + * @phrase_index: the index of sub phrase index to be merged. + * @log: the logger of difference in user home directory. + * @returns: whether the merge operation is successful. + * + * Merge the user logger of difference with the sub phrase index. + * + * Note: the ownership of log is transfered here. + * + */ + bool merge(guint8 phrase_index, MemoryChunk * log); + + /** + * FacadePhraseIndex::merge_with_mask: + * @phrase_index: the index of sub phrase index to be merged. + * @log: the logger of difference in user home directory. + * @mask: the mask. + * @value: the value. + * @returns: whether the merge operation is successful. + * + * Merge the user logger of difference with mask operation. + * + * Note: the ownership of log is transfered here. + * + */ + bool merge_with_mask(guint8 phrase_index, MemoryChunk * log, + phrase_token_t mask, phrase_token_t value); + + /** + * FacadePhraseIndex::compact: + * @returns: whether the compact operation is successful. + * + * Compat all sub phrase index memory usage. + * + */ + bool compact(); + + /** + * FacadePhraseIndex::mask_out: + * @phrase_index: the index of sub phrase index. + * @mask: the mask. + * @value: the value. + * @returns: whether the mask out operation is successful. + * + * Mask out the matched phrase items. + * + * Note: should call compact() after the mask out operation. + * + */ + bool mask_out(guint8 phrase_index, + phrase_token_t mask, phrase_token_t value); + + /** + * FacadePhraseIndex::get_sub_phrase_range: + * @min_index: the minimal sub phrase index. + * @max_index: the maximal sub phrase index. + * @returns: the status of the get operation. + * + * Get the minimum and maximum of the sub phrase index. + * + */ + int get_sub_phrase_range(guint8 & min_index, guint8 & max_index); + + /** + * FacadePhraseIndex::get_range: + * @phrase_index: the index of sub phrase index. + * @range: the token range of the sub phrase index. + * @returns: the status of the get operation. + * + * Get the token range of the sub phrase index. + * + */ + int get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range); + + /** + * FacadePhraseIndex::get_phrase_index_total_freq: + * @returns: the total freq of the facade phrase index. + * + * Get the total freq of the facade phrase index. + * + * Note: maybe call it "Zero-gram". + * + */ + guint32 get_phrase_index_total_freq(){ + return m_total_freq; + } + + /** + * FacadePhraseIndex::add_unigram_frequency: + * @token: the phrase token. + * @delta: the delta value of the phrase token. + * @returns: the status of the add operation. + * + * Add delta value to the phrase of the token. + * + */ + int add_unigram_frequency(phrase_token_t token, guint32 delta){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ) + return ERROR_NO_SUB_PHRASE_INDEX; + m_total_freq += delta; + return sub_phrase->add_unigram_frequency(token, delta); + } + + /** + * FacadePhraseIndex::get_phrase_item: + * @token: the phrase token. + * @item: the phrase item of the token. + * @returns: the status of the get operation. + * + * Get the phrase item from the facade phrase index. + * + */ + int get_phrase_item(phrase_token_t token, PhraseItem & item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ) + return ERROR_NO_SUB_PHRASE_INDEX; + return sub_phrase->get_phrase_item(token, item); + } + + /** + * FacadePhraseIndex::add_phrase_item: + * @token: the phrase token. + * @item: the phrase item of the token. + * @returns: the status of the add operation. + * + * Add the phrase item to the facade phrase index. + * + */ + int add_phrase_item(phrase_token_t token, PhraseItem * item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ){ + sub_phrase = new SubPhraseIndex; + } + m_total_freq += item->get_unigram_frequency(); + return sub_phrase->add_phrase_item(token, item); + } + + /** + * FacadePhraseIndex::remove_phrase_item: + * @token: the phrase token. + * @item: the removed phrase item of the token. + * @returns: the status of the remove operation. + * + * Remove the phrase item of the token. + * + */ + int remove_phrase_item(phrase_token_t token, PhraseItem * & item){ + guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); + SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index]; + if ( !sub_phrase ){ + return ERROR_NO_SUB_PHRASE_INDEX; + } + int result = sub_phrase->remove_phrase_item(token, item); + if ( result ) + return result; + m_total_freq -= item->get_unigram_frequency(); + return result; + } + + /** + * FacadePhraseIndex::prepare_ranges: + * @ranges: the ranges to be prepared. + * @returns: whether the prepare operation is successful. + * + * Prepare the ranges. + * + */ + bool prepare_ranges(PhraseIndexRanges ranges) { + /* assume memset(ranges, 0, sizeof(ranges)); */ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & range = ranges[i]; + assert(NULL == range); + + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i]; + if (sub_phrase) { + range = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange)); + } + } + return true; + } + + /** + * FacadePhraseIndex::clear_ranges: + * @ranges: the ranges to be cleared. + * @returns: whether the clear operation is successful. + * + * Clear the ranges. + * + */ + bool clear_ranges(PhraseIndexRanges ranges) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * range = ranges[i]; + if (range) { + g_array_set_size(range, 0); + } + } + return true; + } + + /** + * FacadePhraseIndex::destroy_ranges: + * @ranges: the ranges to be destroyed. + * @returns: whether the destroy operation is successful. + * + * Destroy the ranges. + * + */ + bool destroy_ranges(PhraseIndexRanges ranges) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & range = ranges[i]; + if (range) { + g_array_free(range, TRUE); + range = NULL; + } + } + return true; + } + + /** + * FacadePhraseIndex::prepare_tokens: + * @tokens: the tokens to be prepared. + * @returns: whether the prepare operation is successful. + * + * Prepare the tokens. + * + */ + bool prepare_tokens(PhraseTokens tokens) { + /* assume memset(tokens, 0, sizeof(tokens)); */ + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & token = tokens[i]; + assert(NULL == token); + + SubPhraseIndex * sub_phrase = m_sub_phrase_indices[i]; + if (sub_phrase) { + token = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + } + } + return true; + } + + /** + * FacadePhraseIndex::clear_tokens: + * @tokens: the tokens to be cleared. + * @return: whether the clear operation is successful. + * + * Clear the tokens. + * + */ + bool clear_tokens(PhraseTokens tokens) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * token = tokens[i]; + if (token) { + g_array_set_size(token, 0); + } + } + return true; + } + + /** + * FacadePhraseIndex::destroy_tokens: + * @tokens: the tokens to be destroyed. + * @returns: whether the destroy operation is successful. + * + * Destroy the tokens. + * + */ + bool destroy_tokens(PhraseTokens tokens) { + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + GArray * & token = tokens[i]; + if (token) { + g_array_free(token, TRUE); + token = NULL; + } + } + return true; + } + + /** + * FacadePhraseIndex::create_sub_phrase: + * @index: the phrase index to be created. + * @returns: the result of the create operation. + * + * Create the sub phrase index. + * + */ + int create_sub_phrase(guint8 index) { + SubPhraseIndex * & sub_phrase = m_sub_phrase_indices[index]; + if (sub_phrase) { + return ERROR_ALREADY_EXISTS; + } + + sub_phrase = new SubPhraseIndex; + + return ERROR_OK; + } +}; + +PhraseIndexLogger * mask_out_phrase_index_logger +(PhraseIndexLogger * oldlogger, phrase_token_t mask, phrase_token_t value); + +}; + +#endif |