summary | refs | log | tree | commit | diff | stats
path: root/src/storage/flexible_ngram.h
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2015-04-16 13:45:57 +0800
committer: Peng Wu <alexepico@gmail.com> 2015-04-16 13:45:57 +0800
commit: b54e2c1991d62f852f42e7689c6c156e4ca0cf47 (patch)
tree: a66cf328f0f4d499635d9a41c483840847c9522b /src/storage/flexible_ngram.h
parent: 999d77bd238b4f2dbadfca682483939a5f39bf69 (diff)
download: libpinyin-b54e2c1991d62f852f42e7689c6c156e4ca0cf47.tar.gz
libpinyin-b54e2c1991d62f852f42e7689c6c156e4ca0cf47.tar.xz
libpinyin-b54e2c1991d62f852f42e7689c6c156e4ca0cf47.zip
add flexible_ngram_bdb.h
Diffstat (limited to 'src/storage/flexible_ngram.h')
-rw-r--r-- src/storage/flexible_ngram.h 391
1 files changed, 3 insertions, 388 deletions
diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h
index 6cff7ff..9589d45 100644
--- a/src/storage/flexible_ngram.h
+++ b/src/storage/flexible_ngram.h
@@ -24,8 +24,6 @@
#ifndef FLEXIBLE_NGRAM_H
#define FLEXIBLE_NGRAM_H
-#include <db.h>
-#include <errno.h>
/* Note: the signature of the template parameters.
* struct MagicHeader, ArrayHeader, ArrayItem.
@@ -327,393 +325,10 @@ public:
}
};
-/**
- * FlexibleBigram:
- * @MagicHeader: the struct type of the magic header.
- * @ArrayHeader: the struct type of the array header.
- * @ArrayItem: the struct type of the array item.
- *
- * The flexible bi-gram is mainly used for training purpose.
- *
- */
-template<typename MagicHeader, typename ArrayHeader,
- typename ArrayItem>
-class FlexibleBigram{
- /* Note: some flexible bi-gram file format check should be here. */
-private:
- DB * m_db;
-
- phrase_token_t m_magic_header_index[2];
-
- char m_magic_number[4];
-
- void reset(){
- if ( m_db ){
- m_db->sync(m_db, 0);
- m_db->close(m_db, 0);
- m_db = NULL;
- }
- }
-
-public:
- /**
- * FlexibleBigram::FlexibleBigram:
- * @magic_number: the 4 bytes magic number of the flexible bi-gram.
- *
- * The constructor of the FlexibleBigram.
- *
- */
- FlexibleBigram(const char * magic_number){
- m_db = NULL;
- m_magic_header_index[0] = null_token;
- m_magic_header_index[1] = null_token;
-
- memcpy(m_magic_number, magic_number, sizeof(m_magic_number));
- }
-
- /**
- * FlexibleBigram::~FlexibleBigram:
- *
- * The destructor of the FlexibleBigram.
- *
- */
- ~FlexibleBigram(){
- reset();
- }
-
- /**
- * FlexibleBigram::attach:
- * @dbfile: the path name of the flexible bi-gram.
- * @flags: the attach flags for the Berkeley DB.
- * @returns: whether the attach operation is successful.
- *
- * Attach Berkeley DB on filesystem for training purpose.
- *
- */
- bool attach(const char * dbfile, guint32 flags){
- reset();
- u_int32_t db_flags = 0;
-
- if ( flags & ATTACH_READONLY )
- db_flags |= DB_RDONLY;
- if ( flags & ATTACH_READWRITE )
- assert( !(flags & ATTACH_READONLY ) );
-
- if ( !dbfile )
- return false;
- int ret = db_create(&m_db, NULL, 0);
- if ( ret != 0 )
- assert(false);
-
- ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
- if ( ret != 0 && (flags & ATTACH_CREATE) ) {
- db_flags |= DB_CREATE;
- /* Create database file here, and write the signature. */
- ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
- if ( ret != 0 )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = m_magic_header_index;
- db_key.size = sizeof(m_magic_header_index);
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- db_data.data = m_magic_number;
- db_data.size = sizeof(m_magic_number);
- db_data.flags = DB_DBT_PARTIAL;
- db_data.doff = 0;
- db_data.dlen = sizeof(m_magic_number);
-
- ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
- return ret == 0;
- }
-
- /* check the signature. */
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = m_magic_header_index;
- db_key.size = sizeof(m_magic_header_index);
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- db_data.flags = DB_DBT_PARTIAL;
- db_data.doff = 0;
- db_data.dlen = sizeof(m_magic_number);
- ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
- if ( ret != 0 )
- return false;
- if ( sizeof(m_magic_number) != db_data.size )
- return false;
- if ( memcmp(db_data.data, m_magic_number,
- sizeof(m_magic_number)) == 0 )
- return true;
- return false;
- }
-
- /**
- * FlexibleBigram::load:
- * @index: the previous token in the flexible bi-gram.
- * @single_gram: the single gram of the previous token.
- * @returns: whether the load operation is successful.
- *
- * Load the single gram of the previous token.
- *
- */
- bool load(phrase_token_t index,
- FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = &index;
- db_key.size = sizeof(phrase_token_t);
-
- single_gram = NULL;
-
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
- if ( ret != 0)
- return false;
-
- single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem>
- (db_data.data, db_data.size);
-
- return true;
- }
-
- /**
- * FlexibleBigram::store:
- * @index: the previous token in the flexible bi-gram.
- * @single_gram: the single gram of the previous token.
- * @returns: whether the store operation is successful.
- *
- * Store the single gram of the previous token.
- *
- */
- bool store(phrase_token_t index,
- FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = &index;
- db_key.size = sizeof(phrase_token_t);
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- db_data.data = single_gram->m_chunk.begin();
- db_data.size = single_gram->m_chunk.size();
-
- int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
- return ret == 0;
- }
-
- /**
- * FlexibleBigram::remove:
- * @index: the previous token in the flexible bi-gram.
- * @returns: whether the remove operation is successful.
- *
- * Remove the single gram of the previous token.
- *
- */
- bool remove(phrase_token_t index){
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = &index;
- db_key.size = sizeof(phrase_token_t);
-
- int ret = m_db->del(m_db, NULL, &db_key, 0);
- return ret == 0;
- }
-
- /**
- * FlexibleBigram::get_all_items:
- * @items: the GArray to store all previous tokens.
- * @returns: whether the get operation is successful.
- *
- * Get the array of all previous tokens for parameter estimation.
- *
- */
- bool get_all_items(GArray * items){
- g_array_set_size(items, 0);
-
- if ( !m_db )
- return false;
-
- DBC * cursorp;
- DBT key, data;
- int ret;
-
- /* Get a cursor */
- m_db->cursor(m_db, NULL, &cursorp, 0);
-
- if (NULL == cursorp)
- return false;
-
- /* Initialize our DBTs. */
- memset(&key, 0, sizeof(DBT));
- memset(&data, 0, sizeof(DBT));
-
- /* Iterate over the database, retrieving each record in turn. */
- while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){
- if (key.size != sizeof(phrase_token_t)){
- /* skip magic header. */
- continue;
- }
- phrase_token_t * token = (phrase_token_t *) key.data;
- g_array_append_val(items, *token);
- }
-
- if ( ret != DB_NOTFOUND ){
- fprintf(stderr, "training db error, exit!");
-
- if (cursorp != NULL)
- cursorp->c_close(cursorp);
-
- exit(EIO);
- }
-
- /* Cursors must be closed */
- if (cursorp != NULL)
- cursorp->c_close(cursorp);
- return true;
- }
-
- /**
- * FlexibleBigram::get_magic_header:
- * @header: the magic header.
- * @returns: whether the get operation is successful.
- *
- * Get the magic header of the flexible bi-gram.
- *
- */
- bool get_magic_header(MagicHeader & header){
- /* clear retval */
- memset(&header, 0, sizeof(MagicHeader));
-
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = m_magic_header_index;
- db_key.size = sizeof(m_magic_header_index);
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- db_data.flags = DB_DBT_PARTIAL;
- db_data.doff = sizeof(m_magic_number);
- db_data.dlen = sizeof(MagicHeader);
-
- int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
- if ( ret != 0 )
- return false;
-
- if ( sizeof(MagicHeader) != db_data.size )
- return false;
-
- memcpy(&header, db_data.data, sizeof(MagicHeader));
- return true;
- }
-
- /**
- * FlexibleBigram::set_magic_header:
- * @header: the magic header.
- * @returns: whether the set operation is successful.
- *
- * Set the magic header of the flexible bi-gram.
- *
- */
- bool set_magic_header(const MagicHeader & header){
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = m_magic_header_index;
- db_key.size = sizeof(m_magic_header_index);
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- db_data.data = (void *) &header;
- db_data.size = sizeof(MagicHeader);
- db_data.flags = DB_DBT_PARTIAL;
- db_data.doff = sizeof(m_magic_number);
- db_data.dlen = sizeof(MagicHeader);
-
- int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
- return ret == 0;
- }
-
- /**
- * FlexibleBigram::get_array_header:
- * @index: the previous token in the flexible bi-gram.
- * @header: the array header in the single gram of the previous token.
- * @returns: whether the get operation is successful.
- *
- * Get the array header in the single gram of the previous token.
- *
- */
- bool get_array_header(phrase_token_t index, ArrayHeader & header){
- /* clear retval */
- memset(&header, 0, sizeof(ArrayHeader));
-
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = &index;
- db_key.size = sizeof(phrase_token_t);
-
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- db_data.flags = DB_DBT_PARTIAL;
- db_data.doff = 0;
- db_data.dlen = sizeof(ArrayHeader);
- int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
- if ( ret != 0 )
- return false;
-
- assert(db_data.size == sizeof(ArrayHeader));
- memcpy(&header, db_data.data, sizeof(ArrayHeader));
- return true;
- }
-
- /**
- * FlexibleBigram::set_array_header:
- * @index: the previous token of the flexible bi-gram.
- * @header: the array header in the single gram of the previous token.
- * @returns: whether the set operation is successful.
- *
- * Set the array header in the single gram of the previous token.
- *
- */
- bool set_array_header(phrase_token_t index, const ArrayHeader & header){
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = &index;
- db_key.size = sizeof(phrase_token_t);
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- db_data.data = (void *)&header;
- db_data.size = sizeof(ArrayHeader);
- db_data.flags = DB_DBT_PARTIAL;
- db_data.doff = 0;
- db_data.dlen = sizeof(ArrayHeader);
-
- int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
- return ret == 0;
- }
-
};
-};
+#ifdef HAVE_BERKELEY_DB
+#include "flexible_ngram_bdb.h"
+#endif
#endif