/* * libpinyin * Library to deal with pinyin. * * Copyright (C) 2006-2007 Peng Wu * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include "memory_chunk.h" #include "novel_types.h" #include "ngram.h" using namespace pinyin; struct SingleGramItem{ phrase_token_t m_token; guint32 m_freq; }; SingleGram::SingleGram(){ m_chunk.set_size(sizeof(guint32)); memset(m_chunk.begin(), 0, sizeof(guint32)); } SingleGram::SingleGram(void * buffer, size_t length, bool copy){ if (copy) m_chunk.set_content(0, buffer, length); else m_chunk.set_chunk(buffer, length, NULL); } bool SingleGram::get_total_freq(guint32 & total) const{ char * buf_begin = (char *)m_chunk.begin(); total = *((guint32 *)buf_begin); return true; } bool SingleGram::set_total_freq(guint32 total){ char * buf_begin = (char *)m_chunk.begin(); *((guint32 *)buf_begin) = total; return true; } guint32 SingleGram::get_length(){ /* get the number of items. */ const SingleGramItem * begin = (const SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); const guint32 length = end - begin; if (0 == length) { /* no items here, total freq should be zero. */ guint32 total_freq = 0; assert(get_total_freq(total_freq)); assert(0 == total_freq); } return length; } guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){ guint32 removed_items = 0; guint32 total_freq = 0; assert(get_total_freq(total_freq)); const SingleGramItem * begin = (const SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); for (const SingleGramItem * cur = begin; cur != end; ++cur) { if ((cur->m_token & mask) != value) continue; total_freq -= cur->m_freq; size_t offset = sizeof(guint32) + sizeof(SingleGramItem) * (cur - begin); m_chunk.remove_content(offset, sizeof(SingleGramItem)); /* update chunk end. */ end = (const SingleGramItem *) m_chunk.end(); ++removed_items; --cur; } assert(set_total_freq(total_freq)); return removed_items; } bool SingleGram::prune(){ assert(false); #if 0 SingleGramItem * begin = (SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); SingleGramItem * end = (SingleGramItem *)m_chunk.end(); size_t nitem = 0; for ( SingleGramItem * cur = begin; cur != end; ++cur){ cur->m_freq--; nitem++; if ( cur->m_freq == 0 ){ size_t offset = sizeof(guint32) + (cur - begin) * sizeof(SingleGramItem) ; m_chunk.remove_content(offset, sizeof(SingleGramItem)); } } guint32 total_freq; assert(get_total_freq(total_freq)); assert(set_total_freq(total_freq - nitem)); #endif return true; } static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){ return lhs.m_token < rhs.m_token; } bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array) const { const SingleGramItem * begin = (const SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); guint32 total_freq; BigramPhraseItemWithCount bigram_item_with_count; assert(get_total_freq(total_freq)); for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){ bigram_item_with_count.m_token = cur_item->m_token; bigram_item_with_count.m_count = cur_item->m_freq; bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq; g_array_append_val(array, bigram_item_with_count); } return true; } bool SingleGram::search(/* in */ PhraseIndexRange * range, /* out */ BigramPhraseArray array) const { const SingleGramItem * begin = (const SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); SingleGramItem compare_item; compare_item.m_token = range->m_range_begin; const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); guint32 total_freq; BigramPhraseItem bigram_item; assert(get_total_freq(total_freq)); for ( ; cur_item != end; ++cur_item){ if ( cur_item->m_token >= range->m_range_end ) break; bigram_item.m_token = cur_item->m_token; bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq; g_array_append_val(array, bigram_item); } return true; } bool SingleGram::insert_freq( /* in */ phrase_token_t token, /* in */ guint32 freq){ SingleGramItem * begin = (SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); SingleGramItem * end = (SingleGramItem *) m_chunk.end(); SingleGramItem compare_item; compare_item.m_token = token; SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); SingleGramItem insert_item; insert_item.m_token = token; insert_item.m_freq = freq; for ( ; cur_item != end; ++cur_item ){ if ( cur_item->m_token > token ){ size_t offset = sizeof(guint32) + sizeof(SingleGramItem) * (cur_item - begin); m_chunk.insert_content(offset, &insert_item, sizeof(SingleGramItem)); return true; } if ( cur_item->m_token == token ){ return false; } } m_chunk.insert_content(m_chunk.size(), &insert_item, sizeof(SingleGramItem)); return true; } bool SingleGram::remove_freq( /* in */ phrase_token_t token, /* out */ guint32 & freq){ freq = 0; const SingleGramItem * begin = (const SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); SingleGramItem compare_item; compare_item.m_token = token; const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); for ( ; cur_item != end; ++cur_item ){ if ( cur_item->m_token > token ) return false; if ( cur_item->m_token == token ){ freq = cur_item -> m_freq; size_t offset = sizeof(guint32) + sizeof(SingleGramItem) * (cur_item - begin); m_chunk.remove_content(offset, sizeof(SingleGramItem)); return true; } } return false; } bool SingleGram::get_freq(/* in */ phrase_token_t token, /* out */ guint32 & freq) const { freq = 0; const SingleGramItem * begin = (const SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); SingleGramItem compare_item; compare_item.m_token = token; const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); for ( ; cur_item != end; ++cur_item){ if ( cur_item->m_token > token ) return false; if ( cur_item->m_token == token ){ freq = cur_item -> m_freq; return true; } } return false; } bool SingleGram::set_freq( /* in */ phrase_token_t token, /* in */ guint32 freq){ SingleGramItem * begin = (SingleGramItem *) ((const char *)(m_chunk.begin()) + sizeof(guint32)); SingleGramItem * end = (SingleGramItem *)m_chunk.end(); SingleGramItem compare_item; compare_item.m_token = token; SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); for ( ;cur_item != end; ++cur_item){ if ( cur_item->m_token > token ){ return false; } if ( cur_item->m_token == token ){ cur_item -> m_freq = freq; return true; } } return false; } namespace pinyin{ /* merge origin system info and delta user info */ bool merge_single_gram(SingleGram * merged, const SingleGram * system, const SingleGram * user){ if (NULL == system && NULL == user) return false; MemoryChunk & merged_chunk = merged->m_chunk; if (NULL == system) { merged_chunk.set_chunk(user->m_chunk.begin(), user->m_chunk.size(), NULL); return true; } if (NULL == user) { merged_chunk.set_chunk(system->m_chunk.begin(), system->m_chunk.size(), NULL); return true; } /* clear merged. */ merged_chunk.set_size(sizeof(guint32)); /* merge the origin info and delta info */ guint32 system_total, user_total; assert(system->get_total_freq(system_total)); assert(user->get_total_freq(user_total)); const guint32 merged_total = system_total + user_total; merged_chunk.set_content(0, &merged_total, sizeof(guint32)); const SingleGramItem * cur_system = (const SingleGramItem *) (((const char *)(system->m_chunk.begin())) + sizeof(guint32)); const SingleGramItem * system_end = (const SingleGramItem *) system->m_chunk.end(); const SingleGramItem * cur_user = (const SingleGramItem *) (((const char *)(user->m_chunk.begin())) + sizeof(guint32)); const SingleGramItem * user_end = (const SingleGramItem *) user->m_chunk.end(); while (cur_system < system_end && cur_user < user_end) { if (cur_system->m_token < cur_user->m_token) { /* do append operation here */ merged_chunk.append_content(cur_system, sizeof(SingleGramItem)); cur_system++; } else if (cur_system->m_token > cur_user->m_token) { /* do append operation here */ merged_chunk.append_content(cur_user, sizeof(SingleGramItem)); cur_user++; } else { assert(cur_system->m_token == cur_user->m_token); SingleGramItem merged_item; merged_item.m_token = cur_system->m_token; merged_item.m_freq = cur_system->m_freq + cur_user->m_freq; merged_chunk.append_content(&merged_item, sizeof(SingleGramItem)); cur_system++; cur_user++; } } /* add remained items. */ while (cur_system < system_end) { merged_chunk.append_content(cur_system, sizeof(SingleGramItem)); cur_system++; } while (cur_user < user_end) { merged_chunk.append_content(cur_user, sizeof(SingleGramItem)); cur_user++; } return true; } };