1 files changed, 602 insertions, 0 deletions
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
new file mode 100644
index 0000000..3964388
--- /dev/null
+++ b/src/storage/ngram.cpp
@@ -0,0 +1,602 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2006-2007 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <glib.h>
+#include <glib/gstdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "ngram.h"
+
+using namespace pinyin;
+
+struct SingleGramItem{
+    phrase_token_t m_token;
+    guint32 m_freq;
+};
+
+SingleGram::SingleGram(){
+    m_chunk.set_size(sizeof(guint32));
+    memset(m_chunk.begin(), 0, sizeof(guint32));
+}
+
+SingleGram::SingleGram(void * buffer, size_t length){
+    m_chunk.set_chunk(buffer, length, NULL);
+}
+
+bool SingleGram::get_total_freq(guint32 & total) const{
+    char * buf_begin = (char *)m_chunk.begin();
+    total = *((guint32 *)buf_begin);
+    return true;
+}
+
+bool SingleGram::set_total_freq(guint32 total){
+    char * buf_begin = (char *)m_chunk.begin();
+    *((guint32 *)buf_begin) = total;
+    return true;
+}
+
+guint32 SingleGram::get_length(){
+    /* get the number of items. */
+    const SingleGramItem * begin = (const SingleGramItem *)
+        ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+    const guint32 length = end - begin;
+
+    if (0 == length) {
+        /* no items here, total freq should be zero. */
+        guint32 total_freq = 0;
+        assert(get_total_freq(total_freq));
+        assert(0 == total_freq);
+    }
+
+    return length;
+}
+
+guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
+    guint32 removed_items = 0;
+
+    guint32 total_freq = 0;
+    assert(get_total_freq(total_freq));
+
+    const SingleGramItem * begin = (const SingleGramItem *)
+        ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+    for (const SingleGramItem * cur = begin; cur != end; ++cur) {
+        if ((cur->m_token & mask) != value)
+            continue;
+
+        total_freq -= cur->m_freq;
+        size_t offset = sizeof(guint32) +
+            sizeof(SingleGramItem) * (cur - begin);
+        m_chunk.remove_content(offset, sizeof(SingleGramItem));
+
+        /* update chunk end. */
+        end = (const SingleGramItem *) m_chunk.end();
+        ++removed_items;
+        --cur;
+    }
+
+    assert(set_total_freq(total_freq));
+    return removed_items;
+}
+
+bool SingleGram::prune(){
+    assert(false);
+#if 0
+    SingleGramItem * begin = (SingleGramItem *)
+	((const char *)(m_chunk.begin()) + sizeof(guint32));
+    SingleGramItem * end = (SingleGramItem *)m_chunk.end();
+    
+    size_t nitem = 0;
+    for ( SingleGramItem * cur = begin; cur != end; ++cur){
+	cur->m_freq--;
+	nitem++;
+	if ( cur->m_freq == 0 ){
+	    size_t offset = sizeof(guint32) + (cur - begin)
+		* sizeof(SingleGramItem) ;
+	    m_chunk.remove_content(offset, sizeof(SingleGramItem));
+	}
+    }
+    guint32 total_freq;
+    assert(get_total_freq(total_freq));
+    assert(set_total_freq(total_freq - nitem));
+#endif
+	return true;
+}
+
+static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
+    return lhs.m_token < rhs.m_token;
+}
+
+bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
+    const {
+    const SingleGramItem * begin = (const SingleGramItem *)
+        ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+    guint32 total_freq;
+    BigramPhraseItemWithCount bigram_item_with_count;
+    assert(get_total_freq(total_freq));
+
+    for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
+        bigram_item_with_count.m_token = cur_item->m_token;
+        bigram_item_with_count.m_count = cur_item->m_freq;
+        bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
+        g_array_append_val(array, bigram_item_with_count);
+    }
+
+    return true;
+}
+
+bool SingleGram::search(/* in */ PhraseIndexRange * range,
+			/* out */ BigramPhraseArray array) const {
+    const SingleGramItem * begin = (const SingleGramItem *)
+	((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+
+    SingleGramItem compare_item;
+    compare_item.m_token = range->m_range_begin;
+    const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+    guint32 total_freq;
+    BigramPhraseItem bigram_item;
+    assert(get_total_freq(total_freq));
+
+    for ( ; cur_item != end; ++cur_item){
+	if ( cur_item->m_token >= range->m_range_end )
+	    break;
+	bigram_item.m_token = cur_item->m_token;
+	bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
+	g_array_append_val(array, bigram_item);
+    }
+
+    return true;
+}
+
+bool SingleGram::insert_freq( /* in */ phrase_token_t token,
+                              /* in */ guint32 freq){
+    SingleGramItem * begin = (SingleGramItem *)
+        ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    SingleGramItem * end = (SingleGramItem *) m_chunk.end();
+    SingleGramItem compare_item;
+    compare_item.m_token = token;
+    SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+    SingleGramItem insert_item;
+    insert_item.m_token = token;
+    insert_item.m_freq = freq;
+    for ( ; cur_item != end; ++cur_item ){
+        if ( cur_item->m_token > token ){
+            size_t offset = sizeof(guint32) +
+                sizeof(SingleGramItem) * (cur_item - begin);
+            m_chunk.insert_content(offset, &insert_item,
+                                   sizeof(SingleGramItem));
+            return true;
+        }
+        if ( cur_item->m_token == token ){
+            return false;
+        }
+    }
+    m_chunk.insert_content(m_chunk.size(), &insert_item,
+                           sizeof(SingleGramItem));
+    return true;
+}
+
+bool SingleGram::remove_freq( /* in */ phrase_token_t token,
+                              /* out */ guint32 & freq){
+    freq = 0;
+    const SingleGramItem * begin = (const SingleGramItem *)
+        ((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+    SingleGramItem compare_item;
+    compare_item.m_token = token;
+    const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+    for ( ; cur_item != end; ++cur_item ){
+        if ( cur_item->m_token > token )
+            return false;
+        if ( cur_item->m_token == token ){
+            freq = cur_item -> m_freq;
+            size_t offset = sizeof(guint32) +
+                sizeof(SingleGramItem) * (cur_item - begin);
+            m_chunk.remove_content(offset, sizeof(SingleGramItem));
+            return true;
+        }
+    }
+    return false;
+}
+
+bool SingleGram::get_freq(/* in */ phrase_token_t token,
+                          /* out */ guint32 & freq) const {
+    freq = 0;
+    const SingleGramItem * begin = (const SingleGramItem *)
+	((const char *)(m_chunk.begin()) + sizeof(guint32));
+    const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+    SingleGramItem compare_item;
+    compare_item.m_token = token;
+    const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+    
+    for ( ; cur_item != end; ++cur_item){
+	if ( cur_item->m_token > token )
+	    return false;
+	if ( cur_item->m_token == token ){
+	    freq = cur_item -> m_freq;
+	    return true;
+	}
+    }
+    return false;
+}
+
+bool SingleGram::set_freq( /* in */ phrase_token_t token,
+			   /* in */ guint32 freq){
+    SingleGramItem * begin = (SingleGramItem *)
+	((const char *)(m_chunk.begin()) + sizeof(guint32));
+    SingleGramItem * end = (SingleGramItem *)m_chunk.end();
+    SingleGramItem compare_item;
+    compare_item.m_token = token;
+    SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+    
+    for ( ;cur_item != end; ++cur_item){
+	if ( cur_item->m_token > token ){
+	    return false;
+	}
+	if ( cur_item->m_token == token ){
+	    cur_item -> m_freq = freq;
+	    return true;
+	}
+    }
+    return false;
+}
+
+bool Bigram::load_db(const char * dbfile){
+    reset();
+
+    /* create in memory db. */
+    int ret = db_create(&m_db, NULL, 0);
+    assert(ret == 0);
+
+    ret = m_db->open(m_db, NULL, NULL, NULL,
+                     DB_HASH, DB_CREATE, 0600);
+    if ( ret != 0 )
+        return false;
+
+    /* load db into memory. */
+    DB * tmp_db = NULL;
+    ret = db_create(&tmp_db, NULL, 0);
+    assert(ret == 0);
+
+    if (NULL == tmp_db)
+        return false;
+
+    ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
+                       DB_HASH, DB_RDONLY, 0600);
+    if ( ret != 0 )
+        return false;
+
+    DBC * cursorp = NULL;
+    DBT key, data;
+
+    /* Get a cursor */
+    tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
+
+    if (NULL == cursorp)
+        return false;
+
+    /* Initialize our DBTs. */
+    memset(&key, 0, sizeof(DBT));
+    memset(&data, 0, sizeof(DBT));
+
+    /* Iterate over the database, retrieving each record in turn. */
+    while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+        int ret = m_db->put(m_db, NULL, &key, &data, 0);
+        assert(ret == 0);
+    }
+    assert (ret == DB_NOTFOUND);
+
+    /* Cursors must be closed */
+    if ( cursorp != NULL )
+        cursorp->c_close(cursorp);
+
+    if ( tmp_db != NULL )
+        tmp_db->close(tmp_db, 0);
+
+    return true;
+}
+
+bool Bigram::save_db(const char * dbfile){
+    DB * tmp_db = NULL;
+
+    int ret = unlink(dbfile);
+    if ( ret != 0 && errno != ENOENT)
+        return false;
+
+    ret = db_create(&tmp_db, NULL, 0);
+    assert(ret == 0);
+
+    if (NULL == tmp_db)
+        return false;
+
+    ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
+                       DB_HASH, DB_CREATE, 0600);
+    if ( ret != 0 )
+        return false;
+
+    DBC * cursorp = NULL;
+    DBT key, data;
+    /* Get a cursor */
+    m_db->cursor(m_db, NULL, &cursorp, 0);
+
+    if (NULL == cursorp)
+        return false;
+
+    /* Initialize our DBTs. */
+    memset(&key, 0, sizeof(DBT));
+    memset(&data, 0, sizeof(DBT));
+
+    /* Iterate over the database, retrieving each record in turn. */
+    while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+        int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
+        assert(ret == 0);
+    }
+    assert (ret == DB_NOTFOUND);
+
+    /* Cursors must be closed */
+    if ( cursorp != NULL )
+        cursorp->c_close(cursorp);
+
+    if ( tmp_db != NULL )
+        tmp_db->close(tmp_db, 0);
+
+    return true;
+}
+
+bool Bigram::attach(const char * dbfile, guint32 flags){
+    reset();
+    u_int32_t db_flags = 0;
+
+    if ( flags & ATTACH_READONLY )
+        db_flags |= DB_RDONLY;
+    if ( flags & ATTACH_READWRITE )
+        assert( !( flags & ATTACH_READONLY ) );
+    if ( flags & ATTACH_CREATE )
+        db_flags |= DB_CREATE;
+
+    if ( !dbfile )
+        return false;
+    int ret = db_create(&m_db, NULL, 0);
+    if ( ret != 0 )
+        assert(false);
+	
+    ret = m_db->open(m_db, NULL, dbfile, NULL,
+                     DB_HASH, db_flags, 0644);
+    if ( ret != 0)
+        return false;
+
+    return true;
+}
+
+bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
+    single_gram = NULL;
+    if ( !m_db )
+        return false;
+
+    DBT db_key;
+    memset(&db_key, 0, sizeof(DBT));
+    db_key.data = &index;
+    db_key.size = sizeof(phrase_token_t);
+
+    DBT db_data;
+    memset(&db_data, 0, sizeof(DBT));
+    int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+    if ( ret != 0 )
+        return false;
+
+    single_gram = new SingleGram(db_data.data, db_data.size);
+    return true;
+}
+
+bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
+    if ( !m_db )
+	return false;
+
+    DBT db_key;
+    memset(&db_key, 0, sizeof(DBT));
+    db_key.data = &index;
+    db_key.size = sizeof(phrase_token_t);
+    DBT db_data;
+    memset(&db_data, 0, sizeof(DBT));
+    db_data.data = single_gram->m_chunk.begin();
+    db_data.size = single_gram->m_chunk.size();
+    
+    int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+    return ret == 0;
+}
+
+bool Bigram::remove(/* in */ phrase_token_t index){
+    if ( !m_db )
+        return false;
+
+    DBT db_key;
+    memset(&db_key, 0, sizeof(DBT));
+    db_key.data = &index;
+    db_key.size = sizeof(phrase_token_t);
+
+    int ret = m_db->del(m_db, NULL, &db_key, 0);
+    return 0 == ret;
+}
+
+bool Bigram::get_all_items(GArray * items){
+    g_array_set_size(items, 0);
+
+    if ( !m_db )
+        return false;
+
+    DBC * cursorp = NULL;
+    DBT key, data;
+    int ret;
+    /* Get a cursor */
+    m_db->cursor(m_db, NULL, &cursorp, 0);
+
+    if (NULL == cursorp)
+        return false;
+
+    /* Initialize our DBTs. */
+    memset(&key, 0, sizeof(DBT));
+    memset(&data, 0, sizeof(DBT));
+	
+    /* Iterate over the database, retrieving each record in turn. */
+    while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+        assert(key.size == sizeof(phrase_token_t));
+        phrase_token_t * token = (phrase_token_t *)key.data;
+        g_array_append_val(items, *token);
+    }
+
+    assert (ret == DB_NOTFOUND);
+
+    /* Cursors must be closed */
+    if (cursorp != NULL) 
+        cursorp->c_close(cursorp); 
+
+    return true;
+}
+
+bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    if (!get_all_items(items)) {
+        g_array_free(items, TRUE);
+        return false;
+    }
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t index = g_array_index(items, phrase_token_t, i);
+
+        if ((index & mask) == value) {
+            assert(remove(index));
+            continue;
+        }
+
+        SingleGram * gram = NULL;
+        assert(load(index, gram));
+
+        int num = gram->mask_out(mask, value);
+        if (0 == num) {
+            delete gram;
+            continue;
+        }
+
+        if (0 == gram->get_length()) {
+            assert(remove(index));
+        } else {
+            assert(store(index, gram));
+        }
+
+        delete gram;
+    }
+
+    g_array_free(items, TRUE);
+    return true;
+}
+
+
+namespace pinyin{
+
+/* merge origin system info and delta user info */
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+                       const SingleGram * user){
+    if (NULL == system && NULL == user)
+        return false;
+
+    MemoryChunk & merged_chunk = merged->m_chunk;
+
+    if (NULL == system) {
+        merged_chunk.set_chunk(user->m_chunk.begin(),
+                               user->m_chunk.size(), NULL);
+        return true;
+    }
+
+    if (NULL == user) {
+        merged_chunk.set_chunk(system->m_chunk.begin(),
+                               system->m_chunk.size(), NULL);
+        return true;
+    }
+
+    /* clear merged. */
+    merged_chunk.set_size(sizeof(guint32));
+
+    /* merge the origin info and delta info */
+    guint32 system_total, user_total;
+    assert(system->get_total_freq(system_total));
+    assert(user->get_total_freq(user_total));
+    const guint32 merged_total = system_total + user_total;
+    merged_chunk.set_content(0, &merged_total, sizeof(guint32));
+
+    const SingleGramItem * cur_system = (const SingleGramItem *)
+        (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
+    const SingleGramItem * system_end = (const SingleGramItem *)
+        system->m_chunk.end();
+
+    const SingleGramItem * cur_user = (const SingleGramItem *)
+        (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
+    const SingleGramItem * user_end = (const SingleGramItem *)
+        user->m_chunk.end();
+
+    while (cur_system < system_end && cur_user < user_end) {
+
+        if (cur_system->m_token < cur_user->m_token) {
+            /* do append operation here */
+            merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+            cur_system++;
+        } else if (cur_system->m_token > cur_user->m_token) {
+            /* do append operation here */
+            merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+            cur_user++;
+        } else {
+            assert(cur_system->m_token == cur_user->m_token);
+
+            SingleGramItem merged_item;
+            merged_item.m_token = cur_system->m_token;
+            merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
+
+            merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
+            cur_system++; cur_user++;
+        }
+    }
+
+    /* add remained items. */
+    while (cur_system < system_end) {
+        merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+        cur_system++;
+    }
+
+    while (cur_user < user_end) {
+        merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+        cur_user++;
+    }
+
+    return true;
+}
+
+};