summaryrefslogtreecommitdiffstats
path: root/src/storage/flexible_ngram.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/storage/flexible_ngram.h')
-rw-r--r--src/storage/flexible_ngram.h719
1 files changed, 719 insertions, 0 deletions
diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h
new file mode 100644
index 0000000..6cff7ff
--- /dev/null
+++ b/src/storage/flexible_ngram.h
@@ -0,0 +1,719 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+
+#ifndef FLEXIBLE_NGRAM_H
+#define FLEXIBLE_NGRAM_H
+
+#include <db.h>
+#include <errno.h>
+
+/* Note: the signature of the template parameters.
+ * struct MagicHeader, ArrayHeader, ArrayItem.
+ */
+
+namespace pinyin{
+
+typedef GArray * FlexibleBigramPhraseArray;
+
+/**
+ * FlexibleSingleGram:
+ * @ArrayHeader: the struct ArrayHeader.
+ * @ArrayItem: the struct ArrayItem.
+ *
+ * The flexible single gram is mainly used for training purpose.
+ *
+ */
+
+template<typename ArrayHeader, typename ArrayItem>
+class FlexibleSingleGram{
+ template<typename MH, typename AH,
+ typename AI>
+ friend class FlexibleBigram;
+private:
+ MemoryChunk m_chunk;
+ FlexibleSingleGram(void * buffer, size_t length){
+ m_chunk.set_chunk(buffer, length, NULL);
+ }
+public:
+ /**
+ * ArrayItemWithToken:
+ *
+ * Define the struct ArrayItemWithToken type.
+ *
+ */
+ typedef struct{
+ phrase_token_t m_token;
+ ArrayItem m_item;
+ } ArrayItemWithToken;
+
+private:
+ static bool token_less_than(const ArrayItemWithToken & lhs,
+ const ArrayItemWithToken & rhs){
+ return lhs.m_token < rhs.m_token;
+ }
+
+public:
+ /**
+ * FlexibleSingleGram::FlexibleSingleGram:
+ *
+ * The constructor of the FlexibleSingleGram.
+ *
+ */
+ FlexibleSingleGram(){
+ m_chunk.set_size(sizeof(ArrayHeader));
+ memset(m_chunk.begin(), 0, sizeof(ArrayHeader));
+ }
+
+ /**
+ * FlexibleSingleGram::retrieve_all:
+ * @array: the array to store all items in this single gram.
+ * @returns: whether the retrieve operation is successful.
+ *
+ * Retrieve all items in this single gram.
+ *
+ */
+ bool retrieve_all(/* out */ FlexibleBigramPhraseArray array){
+ const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ const ArrayItemWithToken * end = (const ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken item;
+ for ( const ArrayItemWithToken * cur_item = begin;
+ cur_item != end;
+ ++cur_item){
+ /* Note: optimize this with g_array_append_vals? */
+ item.m_token = cur_item->m_token;
+ item.m_item = cur_item->m_item;
+ g_array_append_val(array, item);
+ }
+
+ return true;
+ }
+
+ /**
+ * FlexibleSingleGram::search:
+ * @range: the token range.
+ * @array: the array to store the array items with token in the range.
+ * @returns: whether the search operation is successful.
+ *
+ * Search the array items with token in the range.
+ *
+ * Note: The array result may contain many items.
+ *
+ */
+ bool search(/* in */ PhraseIndexRange * range,
+ /* out */ FlexibleBigramPhraseArray array){
+ const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ const ArrayItemWithToken * end = (const ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = range->m_range_begin;
+ const ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ ArrayItemWithToken item;
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token >= range->m_range_end )
+ break;
+ item.m_token = cur_item->m_token;
+ item.m_item = cur_item->m_item;
+ g_array_append_val(array, item);
+ }
+
+ return true;
+ }
+
+ /**
+ * FlexibleSingleGram::insert_array_item:
+ * @token: the phrase token to be inserted.
+ * @item: the array item of this token.
+ * @returns: whether the insert operation is successful.
+ *
+ * Insert the array item of the token.
+ *
+ */
+ bool insert_array_item(/* in */ phrase_token_t token,
+ /* in */ const ArrayItem & item){
+ ArrayItemWithToken * begin = (ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ ArrayItemWithToken * end = (ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = token;
+ ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ ArrayItemWithToken insert_item;
+ insert_item.m_token = token;
+ insert_item.m_item = item;
+
+ for ( ; cur_item != end; ++cur_item ){
+ if ( cur_item->m_token > token ){
+ size_t offset = sizeof(ArrayHeader) +
+ sizeof(ArrayItemWithToken) * (cur_item - begin);
+ m_chunk.insert_content(offset, &insert_item,
+ sizeof(ArrayItemWithToken));
+ return true;
+ }
+ if ( cur_item->m_token == token ){
+ return false;
+ }
+ }
+ m_chunk.insert_content(m_chunk.size(), &insert_item,
+ sizeof(ArrayItemWithToken));
+ return true;
+ }
+
+ /**
+ * FlexibleSingleGram::remove_array_item:
+ * @token: the phrase token to be removed.
+ * @item: the content of the removed array item.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove the array item of the token.
+ *
+ */
+ bool remove_array_item(/* in */ phrase_token_t token,
+ /* out */ ArrayItem & item)
+ {
+ /* clear retval */
+ memset(&item, 0, sizeof(ArrayItem));
+
+ const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ const ArrayItemWithToken * end = (const ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = token;
+ const ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token > token )
+ return false;
+ if ( cur_item->m_token == token ){
+ memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem));
+ size_t offset = sizeof(ArrayHeader) +
+ sizeof(ArrayItemWithToken) * (cur_item - begin);
+ m_chunk.remove_content(offset, sizeof(ArrayItemWithToken));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * FlexibleSingleGram::get_array_item:
+ * @token: the phrase token.
+ * @item: the array item of the token.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array item of the token.
+ *
+ */
+ bool get_array_item(/* in */ phrase_token_t token,
+ /* out */ ArrayItem & item)
+ {
+ /* clear retval */
+ memset(&item, 0, sizeof(ArrayItem));
+
+ const ArrayItemWithToken * begin = (const ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ const ArrayItemWithToken * end = (const ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = token;
+ const ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item){
+ if ( cur_item->m_token > token )
+ return false;
+ if ( cur_item->m_token == token ){
+ memcpy(&item, &(cur_item->m_item), sizeof(ArrayItem));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * FlexibleSingleGram::set_array_item:
+ * @token: the phrase token.
+ * @item: the array item of the token.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the array item of the token.
+ *
+ */
+ bool set_array_item(/* in */ phrase_token_t token,
+ /* in */ const ArrayItem & item){
+ ArrayItemWithToken * begin = (ArrayItemWithToken *)
+ ((const char *)(m_chunk.begin()) + sizeof(ArrayHeader));
+ ArrayItemWithToken * end = (ArrayItemWithToken *)
+ m_chunk.end();
+
+ ArrayItemWithToken compare_item;
+ compare_item.m_token = token;
+ ArrayItemWithToken * cur_item = std_lite::lower_bound
+ (begin, end, compare_item, token_less_than);
+
+ for ( ; cur_item != end; ++cur_item ){
+ if ( cur_item->m_token > token ){
+ return false;
+ }
+ if ( cur_item->m_token == token ){
+ memcpy(&(cur_item->m_item), &item, sizeof(ArrayItem));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * FlexibleSingleGram::get_array_header:
+ * @header: the array header of this single gram.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array header of this single gram.
+ *
+ */
+ bool get_array_header(/* out */ ArrayHeader & header){
+ /* clear retval */
+ memset(&header, 0, sizeof(ArrayHeader));
+ char * buf_begin = (char *)m_chunk.begin();
+ memcpy(&header, buf_begin, sizeof(ArrayHeader));
+ return true;
+ }
+
+ /**
+ * FlexibleSingleGram::set_array_header:
+ * @header: the array header of this single gram.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the array header of this single gram.
+ *
+ */
+ bool set_array_header(/* in */ const ArrayHeader & header){
+ char * buf_begin = (char *)m_chunk.begin();
+ memcpy(buf_begin, &header, sizeof(ArrayHeader));
+ return true;
+ }
+};
+
+/**
+ * FlexibleBigram:
+ * @MagicHeader: the struct type of the magic header.
+ * @ArrayHeader: the struct type of the array header.
+ * @ArrayItem: the struct type of the array item.
+ *
+ * The flexible bi-gram is mainly used for training purpose.
+ *
+ */
+template<typename MagicHeader, typename ArrayHeader,
+ typename ArrayItem>
+class FlexibleBigram{
+ /* Note: some flexible bi-gram file format check should be here. */
+private:
+ DB * m_db;
+
+ phrase_token_t m_magic_header_index[2];
+
+ char m_magic_number[4];
+
+ void reset(){
+ if ( m_db ){
+ m_db->sync(m_db, 0);
+ m_db->close(m_db, 0);
+ m_db = NULL;
+ }
+ }
+
+public:
+ /**
+ * FlexibleBigram::FlexibleBigram:
+ * @magic_number: the 4 bytes magic number of the flexible bi-gram.
+ *
+ * The constructor of the FlexibleBigram.
+ *
+ */
+ FlexibleBigram(const char * magic_number){
+ m_db = NULL;
+ m_magic_header_index[0] = null_token;
+ m_magic_header_index[1] = null_token;
+
+ memcpy(m_magic_number, magic_number, sizeof(m_magic_number));
+ }
+
+ /**
+ * FlexibleBigram::~FlexibleBigram:
+ *
+ * The destructor of the FlexibleBigram.
+ *
+ */
+ ~FlexibleBigram(){
+ reset();
+ }
+
+ /**
+ * FlexibleBigram::attach:
+ * @dbfile: the path name of the flexible bi-gram.
+ * @flags: the attach flags for the Berkeley DB.
+ * @returns: whether the attach operation is successful.
+ *
+ * Attach Berkeley DB on filesystem for training purpose.
+ *
+ */
+ bool attach(const char * dbfile, guint32 flags){
+ reset();
+ u_int32_t db_flags = 0;
+
+ if ( flags & ATTACH_READONLY )
+ db_flags |= DB_RDONLY;
+ if ( flags & ATTACH_READWRITE )
+ assert( !(flags & ATTACH_READONLY ) );
+
+ if ( !dbfile )
+ return false;
+ int ret = db_create(&m_db, NULL, 0);
+ if ( ret != 0 )
+ assert(false);
+
+ ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
+ if ( ret != 0 && (flags & ATTACH_CREATE) ) {
+ db_flags |= DB_CREATE;
+ /* Create database file here, and write the signature. */
+ ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
+ if ( ret != 0 )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = m_magic_number;
+ db_data.size = sizeof(m_magic_number);
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(m_magic_number);
+
+ ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+ }
+
+ /* check the signature. */
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(m_magic_number);
+ ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+ if ( sizeof(m_magic_number) != db_data.size )
+ return false;
+ if ( memcmp(db_data.data, m_magic_number,
+ sizeof(m_magic_number)) == 0 )
+ return true;
+ return false;
+ }
+
+ /**
+ * FlexibleBigram::load:
+ * @index: the previous token in the flexible bi-gram.
+ * @single_gram: the single gram of the previous token.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the single gram of the previous token.
+ *
+ */
+ bool load(phrase_token_t index,
+ FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ single_gram = NULL;
+
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0)
+ return false;
+
+ single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem>
+ (db_data.data, db_data.size);
+
+ return true;
+ }
+
+ /**
+ * FlexibleBigram::store:
+ * @index: the previous token in the flexible bi-gram.
+ * @single_gram: the single gram of the previous token.
+ * @returns: whether the store operation is successful.
+ *
+ * Store the single gram of the previous token.
+ *
+ */
+ bool store(phrase_token_t index,
+ FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = single_gram->m_chunk.begin();
+ db_data.size = single_gram->m_chunk.size();
+
+ int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+ }
+
+ /**
+ * FlexibleBigram::remove:
+ * @index: the previous token in the flexible bi-gram.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove the single gram of the previous token.
+ *
+ */
+ bool remove(phrase_token_t index){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ int ret = m_db->del(m_db, NULL, &db_key, 0);
+ return ret == 0;
+ }
+
+ /**
+ * FlexibleBigram::get_all_items:
+ * @items: the GArray to store all previous tokens.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array of all previous tokens for parameter estimation.
+ *
+ */
+ bool get_all_items(GArray * items){
+ g_array_set_size(items, 0);
+
+ if ( !m_db )
+ return false;
+
+ DBC * cursorp;
+ DBT key, data;
+ int ret;
+
+ /* Get a cursor */
+ m_db->cursor(m_db, NULL, &cursorp, 0);
+
+ if (NULL == cursorp)
+ return false;
+
+ /* Initialize our DBTs. */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* Iterate over the database, retrieving each record in turn. */
+ while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){
+ if (key.size != sizeof(phrase_token_t)){
+ /* skip magic header. */
+ continue;
+ }
+ phrase_token_t * token = (phrase_token_t *) key.data;
+ g_array_append_val(items, *token);
+ }
+
+ if ( ret != DB_NOTFOUND ){
+ fprintf(stderr, "training db error, exit!");
+
+ if (cursorp != NULL)
+ cursorp->c_close(cursorp);
+
+ exit(EIO);
+ }
+
+ /* Cursors must be closed */
+ if (cursorp != NULL)
+ cursorp->c_close(cursorp);
+ return true;
+ }
+
+ /**
+ * FlexibleBigram::get_magic_header:
+ * @header: the magic header.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the magic header of the flexible bi-gram.
+ *
+ */
+ bool get_magic_header(MagicHeader & header){
+ /* clear retval */
+ memset(&header, 0, sizeof(MagicHeader));
+
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = sizeof(m_magic_number);
+ db_data.dlen = sizeof(MagicHeader);
+
+ int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+
+ if ( sizeof(MagicHeader) != db_data.size )
+ return false;
+
+ memcpy(&header, db_data.data, sizeof(MagicHeader));
+ return true;
+ }
+
+ /**
+ * FlexibleBigram::set_magic_header:
+ * @header: the magic header.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the magic header of the flexible bi-gram.
+ *
+ */
+ bool set_magic_header(const MagicHeader & header){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = (void *) &header;
+ db_data.size = sizeof(MagicHeader);
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = sizeof(m_magic_number);
+ db_data.dlen = sizeof(MagicHeader);
+
+ int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+ }
+
+ /**
+ * FlexibleBigram::get_array_header:
+ * @index: the previous token in the flexible bi-gram.
+ * @header: the array header in the single gram of the previous token.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array header in the single gram of the previous token.
+ *
+ */
+ bool get_array_header(phrase_token_t index, ArrayHeader & header){
+ /* clear retval */
+ memset(&header, 0, sizeof(ArrayHeader));
+
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(ArrayHeader);
+ int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+
+ assert(db_data.size == sizeof(ArrayHeader));
+ memcpy(&header, db_data.data, sizeof(ArrayHeader));
+ return true;
+ }
+
+ /**
+ * FlexibleBigram::set_array_header:
+ * @index: the previous token of the flexible bi-gram.
+ * @header: the array header in the single gram of the previous token.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the array header in the single gram of the previous token.
+ *
+ */
+ bool set_array_header(phrase_token_t index, const ArrayHeader & header){
+ if ( !m_db )
+ return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = &index;
+ db_key.size = sizeof(phrase_token_t);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = (void *)&header;
+ db_data.size = sizeof(ArrayHeader);
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(ArrayHeader);
+
+ int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
+ }
+
+};
+
+};
+
+#endif