From 8ff7710aabb59538eb0b9b87c865d7aac5413032 Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Mon, 5 Dec 2011 14:16:16 +0800
Subject: write merge single gram

---
 src/include/memory_chunk.h |  6 ++++
 src/storage/ngram.cpp      | 85 +++++++++++++++++++++++++++++++++++++++++++++-
 src/storage/ngram.h        |  9 ++++-
 3 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h
index 2c0b052..86a738a 100644
--- a/src/include/memory_chunk.h
+++ b/src/include/memory_chunk.h
@@ -169,6 +169,12 @@ public:
         m_data_end = m_data_begin + cursize;
         return true;
     }
+    /* append function
+     * Data are appended at the end; returns the result of set_content().
+     */
+    bool append_content(const void * data, size_t len){
+        return set_content(size(), data, len);
+    }
     /* insert function
      * Data are written to the memory area,
      * the original content are moved towards the rear.
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
index f7dd347..66cdc56 100644
--- a/src/storage/ngram.cpp
+++ b/src/storage/ngram.cpp
@@ -43,7 +43,7 @@ SingleGram::SingleGram(void * buffer, size_t length){
     m_chunk.set_chunk(buffer, length, NULL);
 }
 
-bool SingleGram::get_total_freq(guint32 & total){
+bool SingleGram::get_total_freq(guint32 & total) const{
     char * buf_begin = (char *)m_chunk.begin();
     total = *((guint32 *)buf_begin);
     return true;
@@ -404,3 +404,86 @@ bool Bigram::get_all_items(GArray * items){
 
     return true;
 }
+
+
+namespace pinyin{
+
+/* Merge the system single gram with the user (delta) single gram. */
+/* Note: keep the system and user single grams alive while merged is in
+ * use; when one input is NULL, merged is pointed at the other's buffer.
+ */
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+                       const SingleGram * user){
+    /* clear merged. */
+    MemoryChunk & merged_chunk = merged->m_chunk;
+    merged_chunk.set_size(0);
+    /* NOTE(review): appends now start at offset 0; confirm the total-freq header set below is not overwritten. */
+    if (NULL == system && NULL == user)
+        return false;
+
+    if (NULL == system) {
+        merged_chunk.set_chunk(user->m_chunk.begin(),
+                               user->m_chunk.size(), NULL);
+        return true;
+    }
+
+    if (NULL == user) {
+        merged_chunk.set_chunk(system->m_chunk.begin(),
+                               system->m_chunk.size(), NULL);
+        return true;
+    }
+
+    /* merge the origin info and delta info */
+    /* The calls are kept outside assert() so they still run under NDEBUG. */
+    guint32 system_total = 0, user_total = 0;
+    bool ok = system->get_total_freq(system_total);
+    ok = user->get_total_freq(user_total) && ok;
+    ok = merged->set_total_freq(system_total + user_total) && ok;
+    assert(ok);
+    const SingleGramItem * cur_system = (const SingleGramItem *)
+        ((const char *)(system->m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * system_end = (const SingleGramItem *)
+        system->m_chunk.end();
+
+    const SingleGramItem * cur_user = (const SingleGramItem *)
+        ((const char *)(user->m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * user_end = (const SingleGramItem *)
+        user->m_chunk.end();
+
+    while (cur_system < system_end && cur_user < user_end) {
+
+        if (cur_system->m_token < cur_user->m_token) {
+            /* token only in system: copy the item through unchanged */
+            merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+            cur_system++;
+        } else if (cur_system->m_token > cur_user->m_token) {
+            /* token only in user: copy the item through unchanged */
+            merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+            cur_user++;
+        } else {
+            assert(cur_system->m_token == cur_user->m_token);
+
+            SingleGramItem merged_item;
+            merged_item.m_token = cur_system->m_token;
+            merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
+
+            merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
+            cur_system++; cur_user++;
+        }
+    }
+
+    /* add remained items. */
+    while (cur_system < system_end) {
+        merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+        cur_system++;
+    }
+
+    while (cur_user < user_end) {
+        merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+        cur_user++;
+    }
+
+    return true;
+}
+
+};
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
index 537c1d6..8863392 100644
--- a/src/storage/ngram.h
+++ b/src/storage/ngram.h
@@ -41,6 +41,10 @@ class Bigram;
 
 class SingleGram{
     friend class Bigram;
+    friend bool merge_single_gram(SingleGram * merged,
+                                  const SingleGram * system,
+                                  const SingleGram * user);
+
 private:
     MemoryChunk m_chunk;
     SingleGram(void * buffer, size_t length);
@@ -78,7 +82,7 @@ public:
     /* get_total_freq method
      * used in user bigram table
      */
-    bool get_total_freq(guint32 & total);
+    bool get_total_freq(guint32 & total) const;
     /* set_total_freq method
      * used in user bigram table
      */
@@ -130,6 +134,9 @@ public:
     bool get_all_items(/* out */ GArray * items);
 };
 
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+                       const SingleGram * user);
+
 };
 
 #endif
-- 
cgit