summaryrefslogtreecommitdiffstats
path: root/src/storage/ngram.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/storage/ngram.cpp')
-rw-r--r--src/storage/ngram.cpp602
1 files changed, 602 insertions, 0 deletions
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
new file mode 100644
index 0000000..3964388
--- /dev/null
+++ b/src/storage/ngram.cpp
@@ -0,0 +1,602 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <glib.h>
+#include <glib/gstdio.h>
+#include "memory_chunk.h"
+#include "novel_types.h"
+#include "ngram.h"
+
+using namespace pinyin;
+
+struct SingleGramItem{
+ phrase_token_t m_token;
+ guint32 m_freq;
+};
+
+SingleGram::SingleGram(){
+ m_chunk.set_size(sizeof(guint32));
+ memset(m_chunk.begin(), 0, sizeof(guint32));
+}
+
+SingleGram::SingleGram(void * buffer, size_t length){
+ m_chunk.set_chunk(buffer, length, NULL);
+}
+
+bool SingleGram::get_total_freq(guint32 & total) const{
+ char * buf_begin = (char *)m_chunk.begin();
+ total = *((guint32 *)buf_begin);
+ return true;
+}
+
+bool SingleGram::set_total_freq(guint32 total){
+ char * buf_begin = (char *)m_chunk.begin();
+ *((guint32 *)buf_begin) = total;
+ return true;
+}
+
+guint32 SingleGram::get_length(){
+ /* get the number of items. */
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+ const guint32 length = end - begin;
+
+ if (0 == length) {
+ /* no items here, total freq should be zero. */
+ guint32 total_freq = 0;
+ assert(get_total_freq(total_freq));
+ assert(0 == total_freq);
+ }
+
+ return length;
+}
+
+guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
+ guint32 removed_items = 0;
+
+ guint32 total_freq = 0;
+ assert(get_total_freq(total_freq));
+
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+ for (const SingleGramItem * cur = begin; cur != end; ++cur) {
+ if ((cur->m_token & mask) != value)
+ continue;
+
+ total_freq -= cur->m_freq;
+ size_t offset = sizeof(guint32) +
+ sizeof(SingleGramItem) * (cur - begin);
+ m_chunk.remove_content(offset, sizeof(SingleGramItem));
+
+ /* update chunk end. */
+ end = (const SingleGramItem *) m_chunk.end();
+ ++removed_items;
+ --cur;
+ }
+
+ assert(set_total_freq(total_freq));
+ return removed_items;
+}
+
+bool SingleGram::prune(){
+ assert(false);
+#if 0
+ SingleGramItem * begin = (SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ SingleGramItem * end = (SingleGramItem *)m_chunk.end();
+
+ size_t nitem = 0;
+ for ( SingleGramItem * cur = begin; cur != end; ++cur){
+ cur->m_freq--;
+ nitem++;
+ if ( cur->m_freq == 0 ){
+ size_t offset = sizeof(guint32) + (cur - begin)
+ * sizeof(SingleGramItem) ;
+ m_chunk.remove_content(offset, sizeof(SingleGramItem));
+ }
+ }
+ guint32 total_freq;
+ assert(get_total_freq(total_freq));
+ assert(set_total_freq(total_freq - nitem));
+#endif
+ return true;
+}
+
+static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
+ return lhs.m_token < rhs.m_token;
+}
+
+bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
+ const {
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
+
+ guint32 total_freq;
+ BigramPhraseItemWithCount bigram_item_with_count;
+ assert(get_total_freq(total_freq));
+
+ for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
+ bigram_item_with_count.m_token = cur_item->m_token;
+ bigram_item_with_count.m_count = cur_item->m_freq;
+ bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
+ g_array_append_val(array, bigram_item_with_count);
+ }
+
+ return true;
+}
+
+bool SingleGram::search(/* in */ PhraseIndexRange * range,
+ /* out */ BigramPhraseArray array) const {
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+
+ SingleGramItem compare_item;
+ compare_item.m_token = range->m_range_begin;
+ const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ guint32 total_freq;
+ BigramPhraseItem bigram_item;
+ assert(get_total_freq(total_freq));
+
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token >= range->m_range_end )
+ break;
+ bigram_item.m_token = cur_item->m_token;
+ bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
+ g_array_append_val(array, bigram_item);
+ }
+
+ return true;
+}
+
+bool SingleGram::insert_freq( /* in */ phrase_token_t token,
+ /* in */ guint32 freq){
+ SingleGramItem * begin = (SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ SingleGramItem * end = (SingleGramItem *) m_chunk.end();
+ SingleGramItem compare_item;
+ compare_item.m_token = token;
+ SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ SingleGramItem insert_item;
+ insert_item.m_token = token;
+ insert_item.m_freq = freq;
+ for ( ; cur_item != end; ++cur_item ){
+ if ( cur_item->m_token > token ){
+ size_t offset = sizeof(guint32) +
+ sizeof(SingleGramItem) * (cur_item - begin);
+ m_chunk.insert_content(offset, &insert_item,
+ sizeof(SingleGramItem));
+ return true;
+ }
+ if ( cur_item->m_token == token ){
+ return false;
+ }
+ }
+ m_chunk.insert_content(m_chunk.size(), &insert_item,
+ sizeof(SingleGramItem));
+ return true;
+}
+
+bool SingleGram::remove_freq( /* in */ phrase_token_t token,
+ /* out */ guint32 & freq){
+ freq = 0;
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+ SingleGramItem compare_item;
+ compare_item.m_token = token;
+ const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item ){
+ if ( cur_item->m_token > token )
+ return false;
+ if ( cur_item->m_token == token ){
+ freq = cur_item -> m_freq;
+ size_t offset = sizeof(guint32) +
+ sizeof(SingleGramItem) * (cur_item - begin);
+ m_chunk.remove_content(offset, sizeof(SingleGramItem));
+ return true;
+ }
+ }
+ return false;
+}
+
+bool SingleGram::get_freq(/* in */ phrase_token_t token,
+ /* out */ guint32 & freq) const {
+ freq = 0;
+ const SingleGramItem * begin = (const SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
+ SingleGramItem compare_item;
+ compare_item.m_token = token;
+ const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token > token )
+ return false;
+ if ( cur_item->m_token == token ){
+ freq = cur_item -> m_freq;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool SingleGram::set_freq( /* in */ phrase_token_t token,
+ /* in */ guint32 freq){
+ SingleGramItem * begin = (SingleGramItem *)
+ ((const char *)(m_chunk.begin()) + sizeof(guint32));
+ SingleGramItem * end = (SingleGramItem *)m_chunk.end();
+ SingleGramItem compare_item;
+ compare_item.m_token = token;
+ SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
+
+ for ( ;cur_item != end; ++cur_item){
+ if ( cur_item->m_token > token ){
+ return false;
+ }
+ if ( cur_item->m_token == token ){
+ cur_item -> m_freq = freq;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool Bigram::load_db(const char * dbfile){
+ reset();
+
+ /* create in memory db. */
+ int ret = db_create(&m_db, NULL, 0);
+ assert(ret == 0);
+
+ ret = m_db->open(m_db, NULL, NULL, NULL,
+ DB_HASH, DB_CREATE, 0600);
+ if ( ret != 0 )
+ return false;
+
+ /* load db into memory. */
+ DB * tmp_db = NULL;
+ ret = db_create(&tmp_db, NULL, 0);
+ assert(ret == 0);
+
+ if (NULL == tmp_db)
+ return false;
+
+ ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
+ DB_HASH, DB_RDONLY, 0600);
+ if ( ret != 0 )
+ return false;
+
+ DBC * cursorp = NULL;
+ DBT key, data;
+
+ /* Get a cursor */
+ tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
+
+ if (NULL == cursorp)
+ return false;
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* Iterate over the database, retrieving each record in turn. */
+ while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+ int ret = m_db->put(m_db, NULL, &key, &data, 0);
+ assert(ret == 0);
+ }
+ assert (ret == DB_NOTFOUND);
+
+ /* Cursors must be closed */
+ if ( cursorp != NULL )
+ cursorp->c_close(cursorp);
+
+ if ( tmp_db != NULL )
+ tmp_db->close(tmp_db, 0);
+
+ return true;
+}
+
+bool Bigram::save_db(const char * dbfile){
+ DB * tmp_db = NULL;
+
+ int ret = unlink(dbfile);
+ if ( ret != 0 && errno != ENOENT)
+ return false;
+
+ ret = db_create(&tmp_db, NULL, 0);
+ assert(ret == 0);
+
+ if (NULL == tmp_db)
+ return false;
+
+ ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
+ DB_HASH, DB_CREATE, 0600);
+ if ( ret != 0 )
+ return false;
+
+ DBC * cursorp = NULL;
+ DBT key, data;
+ /* Get a cursor */
+ m_db->cursor(m_db, NULL, &cursorp, 0);
+
+ if (NULL == cursorp)
+ return false;
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* Iterate over the database, retrieving each record in turn. */
+ while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+ int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
+ assert(ret == 0);
+ }
+ assert (ret == DB_NOTFOUND);
+
+ /* Cursors must be closed */
+ if ( cursorp != NULL )
+ cursorp->c_close(cursorp);
+
+ if ( tmp_db != NULL )
+ tmp_db->close(tmp_db, 0);
+
+ return true;
+}
+
+bool Bigram::attach(const char * dbfile, guint32 flags){
+ reset();
+ u_int32_t db_flags = 0;
+
+ if ( flags & ATTACH_READONLY )
+ db_flags |= DB_RDONLY;
+ if ( flags & ATTACH_READWRITE )
+ assert( !( flags & ATTACH_READONLY ) );
+ if ( flags & ATTACH_CREATE )
+ db_flags |= DB_CREATE;
+
+ if ( !dbfile )
+ return false;
+ int ret = db_create(&m_db, NULL, 0);
+ if ( ret != 0 )
+ assert(false);
+
+ ret = m_db->open(m_db, NULL, dbfile, NULL,
+ DB_HASH, db_flags, 0644);
+ if ( ret != 0)
+ return false;
+
+ return true;
+}
+
+bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
+ single_gram = NULL;
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+
+ single_gram = new SingleGram(db_data.data, db_data.size);
+ return true;
+}
+
+bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = single_gram->m_chunk.begin();
+ db_data.size = single_gram->m_chunk.size();
+
+ int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+}
+
+bool Bigram::remove(/* in */ phrase_token_t index){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ int ret = m_db->del(m_db, NULL, &db_key, 0);
+ return 0 == ret;
+}
+
+bool Bigram::get_all_items(GArray * items){
+ g_array_set_size(items, 0);
+
+ if ( !m_db )
+ return false;
+
+ DBC * cursorp = NULL;
+ DBT key, data;
+ int ret;
+ /* Get a cursor */
+ m_db->cursor(m_db, NULL, &cursorp, 0);
+
+ if (NULL == cursorp)
+ return false;
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* Iterate over the database, retrieving each record in turn. */
+ while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+ assert(key.size == sizeof(phrase_token_t));
+ phrase_token_t * token = (phrase_token_t *)key.data;
+ g_array_append_val(items, *token);
+ }
+
+ assert (ret == DB_NOTFOUND);
+
+ /* Cursors must be closed */
+ if (cursorp != NULL)
+ cursorp->c_close(cursorp);
+
+ return true;
+}
+
+bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+ if (!get_all_items(items)) {
+ g_array_free(items, TRUE);
+ return false;
+ }
+
+ for (size_t i = 0; i < items->len; ++i) {
+ phrase_token_t index = g_array_index(items, phrase_token_t, i);
+
+ if ((index & mask) == value) {
+ assert(remove(index));
+ continue;
+ }
+
+ SingleGram * gram = NULL;
+ assert(load(index, gram));
+
+ int num = gram->mask_out(mask, value);
+ if (0 == num) {
+ delete gram;
+ continue;
+ }
+
+ if (0 == gram->get_length()) {
+ assert(remove(index));
+ } else {
+ assert(store(index, gram));
+ }
+
+ delete gram;
+ }
+
+ g_array_free(items, TRUE);
+ return true;
+}
+
+
+namespace pinyin{
+
+/* merge origin system info and delta user info */
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+ const SingleGram * user){
+ if (NULL == system && NULL == user)
+ return false;
+
+ MemoryChunk & merged_chunk = merged->m_chunk;
+
+ if (NULL == system) {
+ merged_chunk.set_chunk(user->m_chunk.begin(),
+ user->m_chunk.size(), NULL);
+ return true;
+ }
+
+ if (NULL == user) {
+ merged_chunk.set_chunk(system->m_chunk.begin(),
+ system->m_chunk.size(), NULL);
+ return true;
+ }
+
+ /* clear merged. */
+ merged_chunk.set_size(sizeof(guint32));
+
+ /* merge the origin info and delta info */
+ guint32 system_total, user_total;
+ assert(system->get_total_freq(system_total));
+ assert(user->get_total_freq(user_total));
+ const guint32 merged_total = system_total + user_total;
+ merged_chunk.set_content(0, &merged_total, sizeof(guint32));
+
+ const SingleGramItem * cur_system = (const SingleGramItem *)
+ (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
+ const SingleGramItem * system_end = (const SingleGramItem *)
+ system->m_chunk.end();
+
+ const SingleGramItem * cur_user = (const SingleGramItem *)
+ (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
+ const SingleGramItem * user_end = (const SingleGramItem *)
+ user->m_chunk.end();
+
+ while (cur_system < system_end && cur_user < user_end) {
+
+ if (cur_system->m_token < cur_user->m_token) {
+ /* do append operation here */
+ merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+ cur_system++;
+ } else if (cur_system->m_token > cur_user->m_token) {
+ /* do append operation here */
+ merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+ cur_user++;
+ } else {
+ assert(cur_system->m_token == cur_user->m_token);
+
+ SingleGramItem merged_item;
+ merged_item.m_token = cur_system->m_token;
+ merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
+
+ merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
+ cur_system++; cur_user++;
+ }
+ }
+
+ /* add remained items. */
+ while (cur_system < system_end) {
+ merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
+ cur_system++;
+ }
+
+ while (cur_user < user_end) {
+ merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
+ cur_user++;
+ }
+
+ return true;
+}
+
+};