summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-12-05 14:16:16 +0800
committerPeng Wu <alexepico@gmail.com>2011-12-05 14:16:16 +0800
commit8ff7710aabb59538eb0b9b87c865d7aac5413032 (patch)
tree97f7115b0beee75b1d7418a1d49df806d7674768
parent3a99dcfdbe50c4a5a7810f3f640e81be877abd9c (diff)
downloadlibpinyin-8ff7710aabb59538eb0b9b87c865d7aac5413032.tar.gz
libpinyin-8ff7710aabb59538eb0b9b87c865d7aac5413032.tar.xz
libpinyin-8ff7710aabb59538eb0b9b87c865d7aac5413032.zip
write merge single gram
-rw-r--r--src/include/memory_chunk.h6
-rw-r--r--src/storage/ngram.cpp85
-rw-r--r--src/storage/ngram.h9
3 files changed, 98 insertions, 2 deletions
diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h
index 2c0b052..86a738a 100644
--- a/src/include/memory_chunk.h
+++ b/src/include/memory_chunk.h
@@ -169,6 +169,12 @@ public:
m_data_end = m_data_begin + cursize;
return true;
}
+    /* append function
+     * Data are appended at the end; the result of set_content is returned.
+     */
+    bool append_content(const void * data, size_t len){
+        return set_content(size(), data, len);
+    }
/* insert function
* Data are written to the memory area,
* the original content are moved towards the rear.
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
index f7dd347..66cdc56 100644
--- a/src/storage/ngram.cpp
+++ b/src/storage/ngram.cpp
@@ -43,7 +43,7 @@ SingleGram::SingleGram(void * buffer, size_t length){
m_chunk.set_chunk(buffer, length, NULL);
}
-bool SingleGram::get_total_freq(guint32 & total){
+bool SingleGram::get_total_freq(guint32 & total) const{
char * buf_begin = (char *)m_chunk.begin();
total = *((guint32 *)buf_begin);
return true;
@@ -404,3 +404,86 @@ bool Bigram::get_all_items(GArray * items){
return true;
}
+
+
+namespace pinyin{
+
+/* merge_single_gram function
+ * Merges the system (origin) and user (delta) single grams into merged.
+ * Returns false when both inputs are NULL or the total frequency cannot
+ * be read or written, true otherwise.
+ * Note: Please keep the system and user single grams while using the
+ * merged one; with a single input, merged is only pointed at its buffer.
+ */
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+                       const SingleGram * user){
+    /* clear merged. */
+    MemoryChunk & merged_chunk = merged->m_chunk;
+    merged_chunk.set_size(0);
+    if (NULL == system && NULL == user)
+        return false;
+    if (NULL == system) {
+        merged_chunk.set_chunk(user->m_chunk.begin(),
+                               user->m_chunk.size(), NULL);
+        return true;
+    }
+    if (NULL == user) {
+        merged_chunk.set_chunk(system->m_chunk.begin(),
+                               system->m_chunk.size(), NULL);
+        return true;
+    }
+
+    /* merge the origin info and delta info. The calls are kept outside
+     * assert() so that they still run when NDEBUG disables assertions. */
+    guint32 system_total = 0, user_total = 0;
+    const bool ok = system->get_total_freq(system_total) &&
+        user->get_total_freq(user_total) &&
+        merged->set_total_freq(system_total + user_total);
+    assert(ok);
+    if (!ok)
+        return false;
+
+    /* items start right after the leading guint32 total frequency. */
+    const SingleGramItem * cur_system = (const SingleGramItem *)
+        ((const char *)(system->m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * system_end = (const SingleGramItem *)
+        system->m_chunk.end();
+    const SingleGramItem * cur_user = (const SingleGramItem *)
+        ((const char *)(user->m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * user_end = (const SingleGramItem *)
+        user->m_chunk.end();
+
+    /* assumes both item lists are sorted by ascending m_token. */
+    while (cur_system < system_end && cur_user < user_end) {
+        if (cur_system->m_token < cur_user->m_token) {
+            /* system token is smaller: copy the system item. */
+            merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+            cur_system++;
+        } else if (cur_system->m_token > cur_user->m_token) {
+            /* user token is smaller: copy the user item. */
+            merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+            cur_user++;
+        } else {
+            /* same token on both sides: add the frequencies. */
+            SingleGramItem merged_item;
+            merged_item.m_token = cur_system->m_token;
+            merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
+            merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
+            cur_system++; cur_user++;
+        }
+    }
+
+    /* add remained items. */
+    while (cur_system < system_end) {
+        merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+        cur_system++;
+    }
+    while (cur_user < user_end) {
+        merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+        cur_user++;
+    }
+
+    return true;
+}
+
+};
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
index 537c1d6..8863392 100644
--- a/src/storage/ngram.h
+++ b/src/storage/ngram.h
@@ -41,6 +41,10 @@ class Bigram;
class SingleGram{
friend class Bigram;
+ friend bool merge_single_gram(SingleGram * merged,
+ const SingleGram * system,
+ const SingleGram * user);
+
private:
MemoryChunk m_chunk;
SingleGram(void * buffer, size_t length);
@@ -78,7 +82,7 @@ public:
/* get_total_freq method
* used in user bigram table
*/
- bool get_total_freq(guint32 & total);
+ bool get_total_freq(guint32 & total) const;
/* set_total_freq method
* used in user bigram table
@@ -130,6 +134,9 @@ public:
bool get_all_items(/* out */ GArray * items);
};
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+ const SingleGram * user);
+
};
#endif