diff options
Diffstat (limited to 'src/storage/ngram.cpp')
-rw-r--r-- | src/storage/ngram.cpp | 602 |
1 files changed, 602 insertions, 0 deletions
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp new file mode 100644 index 0000000..3964388 --- /dev/null +++ b/src/storage/ngram.cpp @@ -0,0 +1,602 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <errno.h> +#include <glib.h> +#include <glib/gstdio.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "ngram.h" + +using namespace pinyin; + +struct SingleGramItem{ + phrase_token_t m_token; + guint32 m_freq; +}; + +SingleGram::SingleGram(){ + m_chunk.set_size(sizeof(guint32)); + memset(m_chunk.begin(), 0, sizeof(guint32)); +} + +SingleGram::SingleGram(void * buffer, size_t length){ + m_chunk.set_chunk(buffer, length, NULL); +} + +bool SingleGram::get_total_freq(guint32 & total) const{ + char * buf_begin = (char *)m_chunk.begin(); + total = *((guint32 *)buf_begin); + return true; +} + +bool SingleGram::set_total_freq(guint32 total){ + char * buf_begin = (char *)m_chunk.begin(); + *((guint32 *)buf_begin) = total; + return true; +} + +guint32 SingleGram::get_length(){ + /* get the number of items. */ + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); + + const guint32 length = end - begin; + + if (0 == length) { + /* no items here, total freq should be zero. */ + guint32 total_freq = 0; + assert(get_total_freq(total_freq)); + assert(0 == total_freq); + } + + return length; +} + +guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){ + guint32 removed_items = 0; + + guint32 total_freq = 0; + assert(get_total_freq(total_freq)); + + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); + + for (const SingleGramItem * cur = begin; cur != end; ++cur) { + if ((cur->m_token & mask) != value) + continue; + + total_freq -= cur->m_freq; + size_t offset = sizeof(guint32) + + sizeof(SingleGramItem) * (cur - begin); + m_chunk.remove_content(offset, sizeof(SingleGramItem)); + + /* update chunk end. */ + end = (const SingleGramItem *) m_chunk.end(); + ++removed_items; + --cur; + } + + assert(set_total_freq(total_freq)); + return removed_items; +} + +bool SingleGram::prune(){ + assert(false); +#if 0 + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *)m_chunk.end(); + + size_t nitem = 0; + for ( SingleGramItem * cur = begin; cur != end; ++cur){ + cur->m_freq--; + nitem++; + if ( cur->m_freq == 0 ){ + size_t offset = sizeof(guint32) + (cur - begin) + * sizeof(SingleGramItem) ; + m_chunk.remove_content(offset, sizeof(SingleGramItem)); + } + } + guint32 total_freq; + assert(get_total_freq(total_freq)); + assert(set_total_freq(total_freq - nitem)); +#endif + return true; +} + +static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){ + return lhs.m_token < rhs.m_token; +} + +bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array) + const { + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *) m_chunk.end(); + + guint32 total_freq; + BigramPhraseItemWithCount bigram_item_with_count; + assert(get_total_freq(total_freq)); + + for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){ + bigram_item_with_count.m_token = cur_item->m_token; + bigram_item_with_count.m_count = cur_item->m_freq; + bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq; + g_array_append_val(array, bigram_item_with_count); + } + + return true; +} + +bool SingleGram::search(/* in */ PhraseIndexRange * range, + /* out */ BigramPhraseArray array) const { + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + + SingleGramItem compare_item; + compare_item.m_token = range->m_range_begin; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + guint32 total_freq; + BigramPhraseItem bigram_item; + assert(get_total_freq(total_freq)); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token >= range->m_range_end ) + break; + bigram_item.m_token = cur_item->m_token; + bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq; + g_array_append_val(array, bigram_item); + } + + return true; +} + +bool SingleGram::insert_freq( /* in */ phrase_token_t token, + /* in */ guint32 freq){ + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *) m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + SingleGramItem insert_item; + insert_item.m_token = token; + insert_item.m_freq = freq; + for ( ; cur_item != end; ++cur_item ){ + if ( cur_item->m_token > token ){ + size_t offset = sizeof(guint32) + + sizeof(SingleGramItem) * (cur_item - begin); + m_chunk.insert_content(offset, &insert_item, + sizeof(SingleGramItem)); + return true; + } + if ( cur_item->m_token == token ){ + return false; + } + } + m_chunk.insert_content(m_chunk.size(), &insert_item, + sizeof(SingleGramItem)); + return true; +} + +bool SingleGram::remove_freq( /* in */ phrase_token_t token, + /* out */ guint32 & freq){ + freq = 0; + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item ){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + freq = cur_item -> m_freq; + size_t offset = sizeof(guint32) + + sizeof(SingleGramItem) * (cur_item - begin); + m_chunk.remove_content(offset, sizeof(SingleGramItem)); + return true; + } + } + return false; +} + +bool SingleGram::get_freq(/* in */ phrase_token_t token, + /* out */ guint32 & freq) const { + freq = 0; + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + freq = cur_item -> m_freq; + return true; + } + } + return false; +} + +bool SingleGram::set_freq( /* in */ phrase_token_t token, + /* in */ guint32 freq){ + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + for ( ;cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ){ + return false; + } + if ( cur_item->m_token == token ){ + cur_item -> m_freq = freq; + return true; + } + } + return false; +} + +bool Bigram::load_db(const char * dbfile){ + reset(); + + /* create in memory db. */ + int ret = db_create(&m_db, NULL, 0); + assert(ret == 0); + + ret = m_db->open(m_db, NULL, NULL, NULL, + DB_HASH, DB_CREATE, 0600); + if ( ret != 0 ) + return false; + + /* load db into memory. */ + DB * tmp_db = NULL; + ret = db_create(&tmp_db, NULL, 0); + assert(ret == 0); + + if (NULL == tmp_db) + return false; + + ret = tmp_db->open(tmp_db, NULL, dbfile, NULL, + DB_HASH, DB_RDONLY, 0600); + if ( ret != 0 ) + return false; + + DBC * cursorp = NULL; + DBT key, data; + + /* Get a cursor */ + tmp_db->cursor(tmp_db, NULL, &cursorp, 0); + + if (NULL == cursorp) + return false; + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + int ret = m_db->put(m_db, NULL, &key, &data, 0); + assert(ret == 0); + } + assert (ret == DB_NOTFOUND); + + /* Cursors must be closed */ + if ( cursorp != NULL ) + cursorp->c_close(cursorp); + + if ( tmp_db != NULL ) + tmp_db->close(tmp_db, 0); + + return true; +} + +bool Bigram::save_db(const char * dbfile){ + DB * tmp_db = NULL; + + int ret = unlink(dbfile); + if ( ret != 0 && errno != ENOENT) + return false; + + ret = db_create(&tmp_db, NULL, 0); + assert(ret == 0); + + if (NULL == tmp_db) + return false; + + ret = tmp_db->open(tmp_db, NULL, dbfile, NULL, + DB_HASH, DB_CREATE, 0600); + if ( ret != 0 ) + return false; + + DBC * cursorp = NULL; + DBT key, data; + /* Get a cursor */ + m_db->cursor(m_db, NULL, &cursorp, 0); + + if (NULL == cursorp) + return false; + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0); + assert(ret == 0); + } + assert (ret == DB_NOTFOUND); + + /* Cursors must be closed */ + if ( cursorp != NULL ) + cursorp->c_close(cursorp); + + if ( tmp_db != NULL ) + tmp_db->close(tmp_db, 0); + + return true; +} + +bool Bigram::attach(const char * dbfile, guint32 flags){ + reset(); + u_int32_t db_flags = 0; + + if ( flags & ATTACH_READONLY ) + db_flags |= DB_RDONLY; + if ( flags & ATTACH_READWRITE ) + assert( !( flags & ATTACH_READONLY ) ); + if ( flags & ATTACH_CREATE ) + db_flags |= DB_CREATE; + + if ( !dbfile ) + return false; + int ret = db_create(&m_db, NULL, 0); + if ( ret != 0 ) + assert(false); + + ret = m_db->open(m_db, NULL, dbfile, NULL, + DB_HASH, db_flags, 0644); + if ( ret != 0) + return false; + + return true; +} + +bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){ + single_gram = NULL; + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); + if ( ret != 0 ) + return false; + + single_gram = new SingleGram(db_data.data, db_data.size); + return true; +} + +bool Bigram::store(phrase_token_t index, SingleGram * single_gram){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = single_gram->m_chunk.begin(); + db_data.size = single_gram->m_chunk.size(); + + int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); + return ret == 0; +} + +bool Bigram::remove(/* in */ phrase_token_t index){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + int ret = m_db->del(m_db, NULL, &db_key, 0); + return 0 == ret; +} + +bool Bigram::get_all_items(GArray * items){ + g_array_set_size(items, 0); + + if ( !m_db ) + return false; + + DBC * cursorp = NULL; + DBT key, data; + int ret; + /* Get a cursor */ + m_db->cursor(m_db, NULL, &cursorp, 0); + + if (NULL == cursorp) + return false; + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + assert(key.size == sizeof(phrase_token_t)); + phrase_token_t * token = (phrase_token_t *)key.data; + g_array_append_val(items, *token); + } + + assert (ret == DB_NOTFOUND); + + /* Cursors must be closed */ + if (cursorp != NULL) + cursorp->c_close(cursorp); + + return true; +} + +bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){ + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + if (!get_all_items(items)) { + g_array_free(items, TRUE); + return false; + } + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t index = g_array_index(items, phrase_token_t, i); + + if ((index & mask) == value) { + assert(remove(index)); + continue; + } + + SingleGram * gram = NULL; + assert(load(index, gram)); + + int num = gram->mask_out(mask, value); + if (0 == num) { + delete gram; + continue; + } + + if (0 == gram->get_length()) { + assert(remove(index)); + } else { + assert(store(index, gram)); + } + + delete gram; + } + + g_array_free(items, TRUE); + return true; +} + + +namespace pinyin{ + +/* merge origin system info and delta user info */ +bool merge_single_gram(SingleGram * merged, const SingleGram * system, + const SingleGram * user){ + if (NULL == system && NULL == user) + return false; + + MemoryChunk & merged_chunk = merged->m_chunk; + + if (NULL == system) { + merged_chunk.set_chunk(user->m_chunk.begin(), + user->m_chunk.size(), NULL); + return true; + } + + if (NULL == user) { + merged_chunk.set_chunk(system->m_chunk.begin(), + system->m_chunk.size(), NULL); + return true; + } + + /* clear merged. */ + merged_chunk.set_size(sizeof(guint32)); + + /* merge the origin info and delta info */ + guint32 system_total, user_total; + assert(system->get_total_freq(system_total)); + assert(user->get_total_freq(user_total)); + const guint32 merged_total = system_total + user_total; + merged_chunk.set_content(0, &merged_total, sizeof(guint32)); + + const SingleGramItem * cur_system = (const SingleGramItem *) + (((const char *)(system->m_chunk.begin())) + sizeof(guint32)); + const SingleGramItem * system_end = (const SingleGramItem *) + system->m_chunk.end(); + + const SingleGramItem * cur_user = (const SingleGramItem *) + (((const char *)(user->m_chunk.begin())) + sizeof(guint32)); + const SingleGramItem * user_end = (const SingleGramItem *) + user->m_chunk.end(); + + while (cur_system < system_end && cur_user < user_end) { + + if (cur_system->m_token < cur_user->m_token) { + /* do append operation here */ + merged_chunk.append_content(cur_system, sizeof(SingleGramItem)); + cur_system++; + } else if (cur_system->m_token > cur_user->m_token) { + /* do append operation here */ + merged_chunk.append_content(cur_user, sizeof(SingleGramItem)); + cur_user++; + } else { + assert(cur_system->m_token == cur_user->m_token); + + SingleGramItem merged_item; + merged_item.m_token = cur_system->m_token; + merged_item.m_freq = cur_system->m_freq + cur_user->m_freq; + + merged_chunk.append_content(&merged_item, sizeof(SingleGramItem)); + cur_system++; cur_user++; + } + } + + /* add remained items. */ + while (cur_system < system_end) { + merged_chunk.append_content(cur_system, sizeof(SingleGramItem)); + cur_system++; + } + + while (cur_user < user_end) { + merged_chunk.append_content(cur_user, sizeof(SingleGramItem)); + cur_user++; + } + + return true; +} + +}; |