/*
* libpinyin
* Library to deal with pinyin.
*
* Copyright (C) 2006-2007 Peng Wu
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include "memory_chunk.h"
#include "novel_types.h"
#include "ngram.h"
using namespace pinyin;
/* One record of a SingleGram chunk.
 * The chunk is read as a guint32 total followed by an array of these
 * records; the functions in this file cast the chunk memory directly to
 * SingleGramItem, so the field order and types define the stored layout. */
struct SingleGramItem{
/* phrase token; records are searched with lower_bound on this field. */
phrase_token_t m_token;
/* frequency stored for m_token. */
guint32 m_freq;
};
/* Build an empty single gram: the chunk holds only the guint32 total
 * frequency header, cleared to zero, and no items. */
SingleGram::SingleGram(){
    const size_t header_bytes = sizeof(guint32);

    m_chunk.set_size(header_bytes);
    memset(m_chunk.begin(), 0, header_bytes);
}
/* Build a single gram over an existing buffer of length bytes.
 * With copy set, the bytes are stored through set_content(); otherwise the
 * buffer is handed to set_chunk() with a NULL third argument. */
SingleGram::SingleGram(void * buffer, size_t length, bool copy){
    if (!copy) {
        m_chunk.set_chunk(buffer, length, NULL);
        return;
    }

    m_chunk.set_content(0, buffer, length);
}
/* Read the total frequency, which is the guint32 stored at the very start
 * of the chunk, into total. Always returns true. */
bool SingleGram::get_total_freq(guint32 & total) const{
    const guint32 * header = (const guint32 *) m_chunk.begin();

    total = header[0];
    return true;
}
/* Overwrite the total frequency, which is the guint32 stored at the very
 * start of the chunk, with total. Always returns true. */
bool SingleGram::set_total_freq(guint32 total){
    guint32 * header = (guint32 *) m_chunk.begin();

    header[0] = total;
    return true;
}
/* Return the number of SingleGramItem records stored after the guint32
 * total frequency header. */
guint32 SingleGram::get_length(){
    const char * payload = (const char *)(m_chunk.begin()) + sizeof(guint32);
    const SingleGramItem * first = (const SingleGramItem *) payload;
    const SingleGramItem * last = (const SingleGramItem *) m_chunk.end();

    const guint32 count = last - first;
    if (0 != count)
        return count;

    /* sanity check: without any item the total freq should be zero. */
    guint32 total_freq = 0;
    assert(get_total_freq(total_freq));
    assert(0 == total_freq);

    return count;
}
/* Remove every item whose token satisfies (token & mask) == value.
 *
 * The frequency of each removed item is subtracted from the total frequency
 * stored in the chunk header.
 *
 * Returns the number of removed items. */
guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
    guint32 removed_items = 0;

    /* The call is kept outside of assert(): when NDEBUG is defined the
     * assert expression is not evaluated at all, which would skip the read
     * of the total frequency. */
    guint32 total_freq = 0;
    const bool total_read = get_total_freq(total_freq);
    assert(total_read);
    (void) total_read;

    /* Walk by index and re-derive the item range on every step, so no
     * pointer is kept across remove_content() and no pointer before the
     * first item is ever formed. */
    size_t index = 0;
    for (;;) {
        const SingleGramItem * begin = (const SingleGramItem *)
            ((const char *)(m_chunk.begin()) + sizeof(guint32));
        const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
        const SingleGramItem * cur = begin + index;

        if (cur >= end)
            break;

        if ((cur->m_token & mask) != value) {
            ++index;
            continue;
        }

        total_freq -= cur->m_freq;

        const size_t offset = sizeof(guint32) +
            sizeof(SingleGramItem) * index;
        m_chunk.remove_content(offset, sizeof(SingleGramItem));
        ++removed_items;
        /* index is not advanced: the following item now occupies this
         * slot and must be examined too. */
    }

    /* Same as above: the store must happen in NDEBUG builds as well. */
    const bool total_written = set_total_freq(total_freq);
    assert(total_written);
    (void) total_written;

    return removed_items;
}
/* Pruning is currently disabled.
 * The only live statements are assert(false) and return true: a build with
 * assertions enabled aborts here, while a build with NDEBUG defined returns
 * true without touching the chunk. */
bool SingleGram::prune(){
assert(false);
/* Disabled reference implementation: decrement every item's frequency,
 * remove items that reach zero, and lower the total by the number of items
 * visited.
 * NOTE(review): after remove_content() neither cur nor end is adjusted, so
 * the item shifted into the freed slot looks like it would be skipped -
 * confirm before re-enabling. */
#if 0
SingleGramItem * begin = (SingleGramItem *)
((const char *)(m_chunk.begin()) + sizeof(guint32));
SingleGramItem * end = (SingleGramItem *)m_chunk.end();
size_t nitem = 0;
for ( SingleGramItem * cur = begin; cur != end; ++cur){
cur->m_freq--;
nitem++;
if ( cur->m_freq == 0 ){
size_t offset = sizeof(guint32) + (cur - begin)
* sizeof(SingleGramItem) ;
m_chunk.remove_content(offset, sizeof(SingleGramItem));
}
}
guint32 total_freq;
assert(get_total_freq(total_freq));
assert(set_total_freq(total_freq - nitem));
#endif
return true;
}
/* Strict ordering of items by token; used as the lower_bound comparator. */
static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
    return rhs.m_token > lhs.m_token;
}
/* Append every stored item to array.
 *
 * For each item the appended entry carries the token, the raw frequency as
 * m_count, and the frequency divided by the total frequency as m_freq.
 *
 * Always returns true. */
bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
    const {
    const SingleGramItem * begin = (const SingleGramItem *)
        ((const char *)(m_chunk.begin()) + sizeof(guint32));
    const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();

    /* The call is kept outside of assert(): when NDEBUG is defined the
     * assert expression is not evaluated, which would leave total_freq
     * uninitialized and then use it as the divisor below. */
    guint32 total_freq = 0;
    const bool total_read = get_total_freq(total_freq);
    assert(total_read);
    (void) total_read;

    BigramPhraseItemWithCount bigram_item_with_count;

    for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
        bigram_item_with_count.m_token = cur_item->m_token;
        bigram_item_with_count.m_count = cur_item->m_freq;
        bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
        g_array_append_val(array, bigram_item_with_count);
    }

    return true;
}
/* Append to array every item whose token lies in the half-open interval
 * [range->m_range_begin, range->m_range_end).
 *
 * The first candidate is located with lower_bound on the token. Each
 * appended entry carries the token and the item frequency divided by the
 * total frequency.
 *
 * Always returns true. */
bool SingleGram::search(/* in */ PhraseIndexRange * range,
                        /* out */ BigramPhraseArray array) const {
    const SingleGramItem * begin = (const SingleGramItem *)
        ((const char *)(m_chunk.begin()) + sizeof(guint32));
    const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();

    SingleGramItem compare_item;
    compare_item.m_token = range->m_range_begin;
    const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);

    /* The call is kept outside of assert(): when NDEBUG is defined the
     * assert expression is not evaluated, which would leave total_freq
     * uninitialized and then use it as the divisor below. */
    guint32 total_freq = 0;
    const bool total_read = get_total_freq(total_freq);
    assert(total_read);
    (void) total_read;

    BigramPhraseItem bigram_item;

    for ( ; cur_item != end; ++cur_item){
        /* stop at the first token past the requested range. */
        if ( cur_item->m_token >= range->m_range_end )
            break;
        bigram_item.m_token = cur_item->m_token;
        bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
        g_array_append_val(array, bigram_item);
    }

    return true;
}
/* Insert a new item (token, freq), keeping the items ordered by token.
 * Returns false, leaving the chunk untouched, when an item with the same
 * token is already stored; returns true after a successful insertion.
 * The total frequency in the header is not modified here. */
bool SingleGram::insert_freq( /* in */ phrase_token_t token,
                              /* in */ guint32 freq){
    SingleGramItem * first = (SingleGramItem *)
        ((const char *)(m_chunk.begin()) + sizeof(guint32));
    SingleGramItem * last = (SingleGramItem *) m_chunk.end();

    SingleGramItem key;
    key.m_token = token;
    SingleGramItem * pos = std_lite::lower_bound(first, last, key, token_less_than);

    SingleGramItem fresh;
    fresh.m_token = token;
    fresh.m_freq = freq;

    /* step over any item that still sorts before the token. */
    while (pos != last && pos->m_token < token)
        ++pos;

    if (pos == last) {
        /* nothing sorts after the token: append at the end of the chunk. */
        m_chunk.insert_content(m_chunk.size(), &fresh,
                               sizeof(SingleGramItem));
        return true;
    }

    /* the token is already present. */
    if (pos->m_token == token)
        return false;

    /* insert right before the first item with a larger token. */
    const size_t offset = sizeof(guint32) +
        sizeof(SingleGramItem) * (pos - first);
    m_chunk.insert_content(offset, &fresh, sizeof(SingleGramItem));
    return true;
}
/* Remove the item stored for token.
 * On success freq receives the removed item's frequency and true is
 * returned; when the token is not stored freq is set to 0 and false is
 * returned. The total frequency in the header is not modified here. */
bool SingleGram::remove_freq( /* in */ phrase_token_t token,
                              /* out */ guint32 & freq){
    freq = 0;

    const SingleGramItem * first = (const SingleGramItem *)
        ((const char *)(m_chunk.begin()) + sizeof(guint32));
    const SingleGramItem * last = (const SingleGramItem *)m_chunk.end();

    SingleGramItem key;
    key.m_token = token;
    const SingleGramItem * pos = std_lite::lower_bound(first, last, key, token_less_than);

    /* step over any item that still sorts before the token. */
    while (pos != last && pos->m_token < token)
        ++pos;

    /* ran off the end, or landed on a larger token: not stored. */
    if (pos == last || pos->m_token != token)
        return false;

    freq = pos->m_freq;
    const size_t offset = sizeof(guint32) +
        sizeof(SingleGramItem) * (pos - first);
    m_chunk.remove_content(offset, sizeof(SingleGramItem));
    return true;
}
/* Look up the frequency stored for token.
 * On success freq receives the stored value and true is returned; when the
 * token is not stored freq is set to 0 and false is returned. */
bool SingleGram::get_freq(/* in */ phrase_token_t token,
                          /* out */ guint32 & freq) const {
    freq = 0;

    const SingleGramItem * first = (const SingleGramItem *)
        ((const char *)(m_chunk.begin()) + sizeof(guint32));
    const SingleGramItem * last = (const SingleGramItem *)m_chunk.end();

    SingleGramItem key;
    key.m_token = token;
    const SingleGramItem * pos = std_lite::lower_bound(first, last, key, token_less_than);

    /* step over any item that still sorts before the token. */
    while (pos != last && pos->m_token < token)
        ++pos;

    /* ran off the end, or landed on a larger token: not stored. */
    if (pos == last || pos->m_token != token)
        return false;

    freq = pos->m_freq;
    return true;
}
/* Overwrite the frequency of an already stored token.
 * Returns true when the token was found and updated, false when it is not
 * stored. The total frequency in the header is not modified here. */
bool SingleGram::set_freq( /* in */ phrase_token_t token,
                           /* in */ guint32 freq){
    SingleGramItem * first = (SingleGramItem *)
        ((const char *)(m_chunk.begin()) + sizeof(guint32));
    SingleGramItem * last = (SingleGramItem *)m_chunk.end();

    SingleGramItem key;
    key.m_token = token;
    SingleGramItem * pos = std_lite::lower_bound(first, last, key, token_less_than);

    /* step over any item that still sorts before the token. */
    while (pos != last && pos->m_token < token)
        ++pos;

    /* ran off the end, or landed on a larger token: not stored. */
    if (pos == last || pos->m_token != token)
        return false;

    pos->m_freq = freq;
    return true;
}
namespace pinyin{

/* Merge the origin system info and the delta user info into merged.
 *
 * - Both inputs NULL: nothing is done and false is returned.
 * - Exactly one input NULL: merged is pointed at the other input's buffer
 *   through set_chunk(..., NULL) instead of being rebuilt item by item
 *   (presumably without copying or taking ownership - confirm against
 *   MemoryChunk).
 * - Both present: merged is rebuilt. Its total frequency is the sum of both
 *   totals; items are combined in ascending token order, and the
 *   frequencies of a token present in both inputs are added together.
 *
 * Returns true whenever at least one input is non-NULL. */
bool merge_single_gram(SingleGram * merged, const SingleGram * system,
                       const SingleGram * user){
    if (NULL == system && NULL == user)
        return false;

    MemoryChunk & merged_chunk = merged->m_chunk;

    if (NULL == system) {
        merged_chunk.set_chunk(user->m_chunk.begin(),
                               user->m_chunk.size(), NULL);
        return true;
    }

    if (NULL == user) {
        merged_chunk.set_chunk(system->m_chunk.begin(),
                               system->m_chunk.size(), NULL);
        return true;
    }

    /* clear merged. */
    merged_chunk.set_size(sizeof(guint32));

    /* merge the origin info and delta info */
    /* The calls are kept outside of assert(): when NDEBUG is defined the
     * assert expression is not evaluated, which would leave both totals
     * uninitialized before they are summed. */
    guint32 system_total = 0, user_total = 0;
    const bool system_read = system->get_total_freq(system_total);
    const bool user_read = user->get_total_freq(user_total);
    assert(system_read);
    assert(user_read);
    (void) system_read;
    (void) user_read;

    const guint32 merged_total = system_total + user_total;
    merged_chunk.set_content(0, &merged_total, sizeof(guint32));

    const SingleGramItem * cur_system = (const SingleGramItem *)
        (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
    const SingleGramItem * system_end = (const SingleGramItem *)
        system->m_chunk.end();

    const SingleGramItem * cur_user = (const SingleGramItem *)
        (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
    const SingleGramItem * user_end = (const SingleGramItem *)
        user->m_chunk.end();

    /* two-way merge of the token-ordered item arrays. */
    while (cur_system < system_end && cur_user < user_end) {
        if (cur_system->m_token < cur_user->m_token) {
            /* token only on the system side so far: copy it over. */
            merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
            cur_system++;
        } else if (cur_system->m_token > cur_user->m_token) {
            /* token only on the user side so far: copy it over. */
            merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
            cur_user++;
        } else {
            /* same token on both sides: add the frequencies. */
            assert(cur_system->m_token == cur_user->m_token);

            SingleGramItem merged_item;
            merged_item.m_token = cur_system->m_token;
            merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;

            merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
            cur_system++; cur_user++;
        }
    }

    /* add remained items. */
    while (cur_system < system_end) {
        merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
        cur_system++;
    }

    while (cur_user < user_end) {
        merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
        cur_user++;
    }

    return true;
}

};