diff options
author | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
commit | f41d1fdf83408e042ab07925710a8913bad0c27c (patch) | |
tree | 1757833ac4cdd0830834d2f9ef92be07c0bc1a5b /src/storage/phrase_index.cpp | |
parent | 34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff) | |
download | libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip |
import from pinyin.
Diffstat (limited to 'src/storage/phrase_index.cpp')
-rw-r--r-- | src/storage/phrase_index.cpp | 340 |
1 files changed, 340 insertions, 0 deletions
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp new file mode 100644 index 0000000..7dbecb3 --- /dev/null +++ b/src/storage/phrase_index.cpp @@ -0,0 +1,340 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "phrase_index.h" + +bool PhraseItem::set_n_pronunciation(guint8 n_prouns){ + m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8)); + return true; +} + +bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){ + guint8 phrase_length = get_phrase_length(); + table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32)); + bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey)); + if ( !retval ) + return retval; + return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32)); +} + +void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){ + guint8 phrase_length = get_phrase_length(); + set_n_pronunciation(get_n_pronunciation() + 1); + m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey)); + m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32)); +} + +void PhraseItem::remove_nth_pronunciation(size_t index){ + guint8 phrase_length = get_phrase_length(); + set_n_pronunciation(get_n_pronunciation() - 1); + size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32)); + m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32)); +} + +bool PhraseItem::get_phrase_string(utf16_t * phrase){ + guint8 phrase_length = get_phrase_length(); + return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t)); +} + +bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){ + m_chunk.set_content(0, &phrase_length, sizeof(guint8)); + m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t)); + return true; +} + +void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom, + PinyinKey * pinyin_keys, + gint32 delta){ + guint8 phrase_length = get_phrase_length(); + guint8 npron = get_n_pronunciation(); + size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ); + char * buf_begin = (char *) m_chunk.begin(); + guint32 total_freq = 0; + for ( int i = 0 ; i < npron ; ++i){ + char * pinyin_begin = buf_begin + offset + + i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) ); + guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey)); + total_freq += *freq; + if ( 0 == pinyin_compare_with_ambiguities(custom, + (PinyinKey *)pinyin_begin, + pinyin_keys, + phrase_length)){ + //protect against total_freq overflow. + if ( delta > 0 && total_freq > total_freq + delta ) + return; + *freq += delta; + total_freq += delta; + } + } +} + + +guint32 SubPhraseIndex::get_phrase_index_total_freq(){ + return m_total_freq; +} + +bool SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){ + table_offset_t offset; + guint32 freq; + bool result = m_phrase_index.get_content + ((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + + if ( !result) + return result; + + if ( 0 == offset ) + return false; + + result = m_phrase_content.get_content + (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32)); + //protect total_freq overflow + if ( delta > 0 && m_total_freq > m_total_freq + delta ) + return false; + freq += delta; + m_total_freq += delta; + return m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32)); +} + +bool SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){ + table_offset_t offset; + guint8 phrase_length; + guint8 n_prons; + + bool result = m_phrase_index.get_content + ((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + + if ( !result ) + return result; + + if ( 0 == offset ) + return false; + + result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8)); + if ( !result ) + return result; + + result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8)); + if ( !result ) + return result; + + size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) ); + item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL); + return true; +} + +bool SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){ + table_offset_t offset = m_phrase_content.size(); + if ( 0 == offset ) + offset = 8; + m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size()); + m_phrase_index.set_content((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + m_total_freq += item->get_unigram_frequency(); + return true; +} + +bool SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){ + table_offset_t offset; + guint8 phrase_length; + guint8 n_prons; + + bool result = m_phrase_index.get_content + ((token & PHRASE_MASK) + * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); + + if ( !result ) + return result; + + if ( 0 == offset ) + return false; + + result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8)); + if ( !result ) + return result; + + result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8)); + if ( !result ) + return result; + + size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) ); + item = new PhraseItem; + //implictly copy data from m_chunk_content. + item->m_chunk.set_content(0, (char *) m_phrase_content.begin() + offset, length); + + const table_offset_t zero_const = 0; + m_phrase_index.set_content((token & PHRASE_MASK) + * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t)); + m_total_freq -= item->get_unigram_frequency(); + return true; +} + +bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ){ + sub_phrases = new SubPhraseIndex; + } + + bool retval = sub_phrases->load(chunk, 0, chunk->size()); + if ( !retval ) + return retval; + m_total_freq += sub_phrases->get_phrase_index_total_freq(); + return retval; +} + +bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){ + table_offset_t end; + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + + sub_phrases->store(new_chunk, 0, end); + return true; +} + +bool FacadePhraseIndex::unload(guint8 phrase_index){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ) + return false; + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); + delete sub_phrases; + sub_phrases = NULL; + return true; +} + +bool SubPhraseIndex::load(MemoryChunk * chunk, + table_offset_t offset, table_offset_t end){ + //save the memory chunk + if ( m_chunk ){ + delete m_chunk; + m_chunk = NULL; + } + m_chunk = chunk; + + char * buf_begin = (char *)chunk->begin(); + chunk->get_content(offset, &m_total_freq, sizeof(guint32)); + offset += sizeof(guint32); + table_offset_t index_one, index_two, index_three; + chunk->get_content(offset, &index_one, sizeof(table_offset_t)); + offset += sizeof(table_offset_t); + chunk->get_content(offset, &index_two, sizeof(table_offset_t)); + offset += sizeof(table_offset_t); + chunk->get_content(offset, &index_three, sizeof(table_offset_t)); + offset += sizeof(table_offset_t); + g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE); + g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE); + g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE); + m_phrase_index.set_chunk(buf_begin + index_one, + index_two - 1 - index_one, NULL); + m_phrase_content.set_chunk(buf_begin + index_two, + index_three - 1 - index_two, NULL); + g_return_val_if_fail( index_three <= end, FALSE); + return true; +} + +bool SubPhraseIndex::store(MemoryChunk * new_chunk, + table_offset_t offset, table_offset_t& end){ + new_chunk->set_content(offset, &m_total_freq, sizeof(guint32)); + table_offset_t index = offset + sizeof(guint32); + + offset = index + sizeof(table_offset_t) * 3 ; + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size()); + offset += m_phrase_index.size(); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + + new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size()); + offset += m_phrase_content.size(); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + return true; +} + +bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){ + SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; + if ( !sub_phrases ){ + sub_phrases = new SubPhraseIndex; + } + + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + PhraseItem * item_ptr = new PhraseItem; + phrase_token_t cur_token = 0; + while ( !feof(infile)){ + fscanf(infile, "%s", pinyin); + fscanf(infile, "%s", phrase); + fscanf(infile, "%ld", &token); + fscanf(infile, "%ld", &freq); + if ( feof(infile) ) + break; + + glong written; + utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL, + &written, NULL); + + if ( 0 == cur_token ){ + cur_token = token; + item_ptr->set_phrase_string(written, phrase_utf16); + } + + if ( cur_token != token ){ + add_phrase_item( cur_token, item_ptr); + delete item_ptr; + item_ptr = new PhraseItem; + cur_token = token; + item_ptr->set_phrase_string(written, phrase_utf16); + } + + PinyinDefaultParser parser; + NullPinyinValidator validator; + PinyinKeyVector keys; + PinyinKeyPosVector poses; + + keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); + poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); + parser.parse(validator, keys, poses, pinyin); + + assert ( item_ptr->get_phrase_length() == keys->len ); + item_ptr->append_pronunciation((PinyinKey *)keys->data, freq); + + g_array_free(keys, TRUE); + g_array_free(poses, TRUE); + g_free(phrase_utf16); + } + + add_phrase_item( cur_token, item_ptr); + delete item_ptr; + m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq(); + return true; +} |