/* * libpinyin * Library to deal with pinyin. * * Copyright (C) 2006-2007 Peng Wu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "phrase_index.h" using namespace pinyin; bool PhraseItem::set_n_pronunciation(guint8 n_prouns){ m_chunk.set_content(sizeof(guint8), &n_prouns, sizeof(guint8)); return true; } bool PhraseItem::get_nth_pronunciation(size_t index, PinyinKey * pinyin, guint32 & freq){ guint8 phrase_length = get_phrase_length(); table_offset_t offset = phrase_item_header + phrase_length * sizeof( utf16_t) + index * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32)); bool retval = m_chunk.get_content(offset, pinyin, phrase_length * sizeof(PinyinKey)); if ( !retval ) return retval; return m_chunk.get_content(offset + phrase_length * sizeof(PinyinKey), &freq , sizeof(guint32)); } void PhraseItem::append_pronunciation(PinyinKey * pinyin, guint32 freq){ guint8 phrase_length = get_phrase_length(); set_n_pronunciation(get_n_pronunciation() + 1); m_chunk.set_content(m_chunk.size(), pinyin, phrase_length * sizeof(PinyinKey)); m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32)); } void PhraseItem::remove_nth_pronunciation(size_t index){ guint8 phrase_length = get_phrase_length(); set_n_pronunciation(get_n_pronunciation() - 1); size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ) + index * (phrase_length * sizeof (PinyinKey) + sizeof(guint32)); m_chunk.remove_content(offset, phrase_length * sizeof(PinyinKey) + sizeof(guint32)); } bool PhraseItem::get_phrase_string(utf16_t * phrase){ guint8 phrase_length = get_phrase_length(); return m_chunk.get_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t)); } bool PhraseItem::set_phrase_string(guint8 phrase_length, utf16_t * phrase){ m_chunk.set_content(0, &phrase_length, sizeof(guint8)); m_chunk.set_content(phrase_item_header, phrase, phrase_length * sizeof(utf16_t)); return true; } void PhraseItem::increase_pinyin_possibility(PinyinCustomSettings & custom, PinyinKey * pinyin_keys, gint32 delta){ guint8 phrase_length = get_phrase_length(); guint8 npron = get_n_pronunciation(); size_t offset = phrase_item_header + phrase_length * sizeof ( utf16_t ); char * buf_begin = (char *) m_chunk.begin(); guint32 total_freq = 0; for ( int i = 0 ; i < npron ; ++i){ char * pinyin_begin = buf_begin + offset + i * ( phrase_length * sizeof(PinyinKey) + sizeof(guint32) ); guint32 * freq = (guint32 *)(pinyin_begin + phrase_length * sizeof(PinyinKey)); total_freq += *freq; if ( 0 == pinyin_compare_with_ambiguities(custom, (PinyinKey *)pinyin_begin, pinyin_keys, phrase_length)){ //protect against total_freq overflow. if ( delta > 0 && total_freq > total_freq + delta ) return; *freq += delta; total_freq += delta; } } } guint32 SubPhraseIndex::get_phrase_index_total_freq(){ return m_total_freq; } int SubPhraseIndex::add_unigram_frequency(phrase_token_t token, guint32 delta){ table_offset_t offset; guint32 freq; bool result = m_phrase_index.get_content ((token & PHRASE_MASK) * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); if ( !result ) return ERROR_OUT_OF_RANGE; if ( 0 == offset ) return ERROR_NO_ITEM; result = m_phrase_content.get_content (offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32)); if ( !result ) return ERROR_FILE_CORRUPTION; //protect total_freq overflow if ( delta > 0 && m_total_freq > m_total_freq + delta ) return ERROR_INTEGER_OVERFLOW; freq += delta; m_total_freq += delta; m_phrase_content.set_content(offset + sizeof(guint8) + sizeof(guint8), &freq, sizeof(guint32)); return ERROR_OK; } int SubPhraseIndex::get_phrase_item(phrase_token_t token, PhraseItem & item){ table_offset_t offset; guint8 phrase_length; guint8 n_prons; bool result = m_phrase_index.get_content ((token & PHRASE_MASK) * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); if ( !result ) return ERROR_OUT_OF_RANGE; if ( 0 == offset ) return ERROR_NO_ITEM; result = m_phrase_content.get_content(offset, &phrase_length, sizeof(guint8)); if ( !result ) return ERROR_FILE_CORRUPTION; result = m_phrase_content.get_content(offset+sizeof(guint8), &n_prons, sizeof(guint8)); if ( !result ) return ERROR_FILE_CORRUPTION; size_t length = phrase_item_header + phrase_length * sizeof ( utf16_t ) + n_prons * ( phrase_length * sizeof (PinyinKey) + sizeof(guint32) ); item.m_chunk.set_chunk((char *)m_phrase_content.begin() + offset, length, NULL); return ERROR_OK; } int SubPhraseIndex::add_phrase_item(phrase_token_t token, PhraseItem * item){ table_offset_t offset = m_phrase_content.size(); if ( 0 == offset ) offset = 8; m_phrase_content.set_content(offset, item->m_chunk.begin(), item->m_chunk.size()); m_phrase_index.set_content((token & PHRASE_MASK) * sizeof(table_offset_t), &offset, sizeof(table_offset_t)); m_total_freq += item->get_unigram_frequency(); return ERROR_OK; } int SubPhraseIndex::remove_phrase_item(phrase_token_t token, PhraseItem * & item){ PhraseItem old_item; int result = get_phrase_item(token, old_item); if (result != ERROR_OK) return result; item = new PhraseItem; //implictly copy data from m_chunk_content. item->m_chunk.set_content(0, (char *) old_item.m_chunk.begin() , old_item.m_chunk.size()); const table_offset_t zero_const = 0; m_phrase_index.set_content((token & PHRASE_MASK) * sizeof(table_offset_t), &zero_const, sizeof(table_offset_t)); m_total_freq -= item->get_unigram_frequency(); return ERROR_OK; } bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; if ( !sub_phrases ){ sub_phrases = new SubPhraseIndex; } bool retval = sub_phrases->load(chunk, 0, chunk->size()); if ( !retval ) return retval; m_total_freq += sub_phrases->get_phrase_index_total_freq(); return retval; } bool FacadePhraseIndex::store(guint8 phrase_index, MemoryChunk * new_chunk){ table_offset_t end; SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; if ( !sub_phrases ) return false; sub_phrases->store(new_chunk, 0, end); return true; } bool FacadePhraseIndex::unload(guint8 phrase_index){ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; if ( !sub_phrases ) return false; m_total_freq -= sub_phrases->get_phrase_index_total_freq(); delete sub_phrases; sub_phrases = NULL; return true; } bool SubPhraseIndex::load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){ //save the memory chunk if ( m_chunk ){ delete m_chunk; m_chunk = NULL; } m_chunk = chunk; char * buf_begin = (char *)chunk->begin(); chunk->get_content(offset, &m_total_freq, sizeof(guint32)); offset += sizeof(guint32); table_offset_t index_one, index_two, index_three; chunk->get_content(offset, &index_one, sizeof(table_offset_t)); offset += sizeof(table_offset_t); chunk->get_content(offset, &index_two, sizeof(table_offset_t)); offset += sizeof(table_offset_t); chunk->get_content(offset, &index_three, sizeof(table_offset_t)); offset += sizeof(table_offset_t); g_return_val_if_fail(*(buf_begin + offset) == c_separate, FALSE); g_return_val_if_fail(*(buf_begin + index_two - 1) == c_separate, FALSE); g_return_val_if_fail(*(buf_begin + index_three - 1) == c_separate, FALSE); m_phrase_index.set_chunk(buf_begin + index_one, index_two - 1 - index_one, NULL); m_phrase_content.set_chunk(buf_begin + index_two, index_three - 1 - index_two, NULL); g_return_val_if_fail( index_three <= end, FALSE); return true; } bool SubPhraseIndex::store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t& end){ new_chunk->set_content(offset, &m_total_freq, sizeof(guint32)); table_offset_t index = offset + sizeof(guint32); offset = index + sizeof(table_offset_t) * 3 ; new_chunk->set_content(offset, &c_separate, sizeof(char)); offset += sizeof(char); new_chunk->set_content(index, &offset, sizeof(table_offset_t)); index += sizeof(table_offset_t); new_chunk->set_content(offset, m_phrase_index.begin(), m_phrase_index.size()); offset += m_phrase_index.size(); new_chunk->set_content(offset, &c_separate, sizeof(char)); offset += sizeof(char); new_chunk->set_content(index, &offset, sizeof(table_offset_t)); index += sizeof(table_offset_t); new_chunk->set_content(offset, m_phrase_content.begin(), m_phrase_content.size()); offset += m_phrase_content.size(); new_chunk->set_content(offset, &c_separate, sizeof(char)); offset += sizeof(char); new_chunk->set_content(index, &offset, sizeof(table_offset_t)); return true; } bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){ SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index]; if ( !sub_phrases ){ sub_phrases = new SubPhraseIndex; } char pinyin[256]; char phrase[256]; phrase_token_t token; size_t freq; PhraseItem * item_ptr = new PhraseItem; phrase_token_t cur_token = 0; while ( !feof(infile)){ fscanf(infile, "%s", pinyin); fscanf(infile, "%s", phrase); fscanf(infile, "%u", &token); fscanf(infile, "%ld", &freq); if ( feof(infile) ) break; assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index ); glong written; utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL, &written, NULL); if ( 0 == cur_token ){ cur_token = token; item_ptr->set_phrase_string(written, phrase_utf16); } if ( cur_token != token ){ add_phrase_item( cur_token, item_ptr); delete item_ptr; item_ptr = new PhraseItem; cur_token = token; item_ptr->set_phrase_string(written, phrase_utf16); } PinyinDefaultParser parser; NullPinyinValidator validator; PinyinKeyVector keys; PinyinKeyPosVector poses; keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); parser.parse(validator, keys, poses, pinyin); assert ( item_ptr->get_phrase_length() == keys->len ); item_ptr->append_pronunciation((PinyinKey *)keys->data, freq); g_array_free(keys, TRUE); g_array_free(poses, TRUE); g_free(phrase_utf16); } add_phrase_item( cur_token, item_ptr); delete item_ptr; m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq(); return true; } int FacadePhraseIndex::get_range(guint8 phrase_index, /* out */ PhraseIndexRange & range){ SubPhraseIndex * sub_phrase = m_sub_phrase_indices[phrase_index]; if ( !sub_phrase ) return ERROR_NO_SUB_PHRASE_INDEX; int result = sub_phrase->get_range(range); if ( result ) return result; range.m_range_begin = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_begin); range.m_range_end = PHRASE_INDEX_MAKE_TOKEN(phrase_index, range.m_range_end); return ERROR_OK; } int SubPhraseIndex::get_range(/* out */ PhraseIndexRange & range){ const table_offset_t * begin = (const table_offset_t *)m_phrase_index.begin(); const table_offset_t * end = (const table_offset_t *)m_phrase_index.end(); range.m_range_begin = 1; /* token starts with 1 in gen_pinyin_table. */ range.m_range_end = end - begin; return ERROR_OK; } bool FacadePhraseIndex::compat(){ for ( size_t index = 0; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) { SubPhraseIndex * sub_phrase = m_sub_phrase_indices[index]; if ( !sub_phrase ) continue; SubPhraseIndex * new_sub_phrase = new SubPhraseIndex; PhraseIndexRange range; int result = sub_phrase->get_range(range); if ( result != ERROR_OK ) { delete new_sub_phrase; continue; } PhraseItem item; for ( phrase_token_t token = range.m_range_begin; token < range.m_range_end; ++token ) { result = sub_phrase->get_phrase_item(token, item); if ( result != ERROR_OK ) continue; new_sub_phrase->add_phrase_item(token, &item); } delete sub_phrase; m_sub_phrase_indices[index] = new_sub_phrase; } return true; }