summaryrefslogtreecommitdiffstats
path: root/src/storage/flexible_ngram.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/storage/flexible_ngram.h')
-rw-r--r--src/storage/flexible_ngram.h719
1 files changed, 0 insertions, 719 deletions
diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h
deleted file mode 100644
index 3cfb338..0000000
--- a/src/storage/flexible_ngram.h
+++ /dev/null
@@ -1,719 +0,0 @@
-/*
- * libzhuyin
- * Library to deal with zhuyin.
- *
- * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-
-
-#ifndef FLEXIBLE_NGRAM_H
-#define FLEXIBLE_NGRAM_H
-
-#include <db.h>
-#include <errno.h>
-
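-/* Note on the template parameters used below:
- * MagicHeader, ArrayHeader and ArrayItem are structs supplied by the caller.
- * They are cleared with memset and copied with memcpy throughout this file,
- * so they should be plain data types.
- */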
-
-namespace zhuyin{
-
-/* GArray filled by FlexibleSingleGram::retrieve_all and
- * FlexibleSingleGram::search; each appended element is a
- * FlexibleSingleGram<...>::ArrayItemWithToken. */
-typedef GArray * FlexibleBigramPhraseArray;
-
-/**
- * FlexibleSingleGram:
- * @ArrayHeader: the struct ArrayHeader.
- * @ArrayItem: the struct ArrayItem.
- *
- * The flexible single gram is mainly used for training purpose.
- *
- */
-
-template<typename ArrayHeader, typename ArrayItem>
-class FlexibleSingleGram{
-    template<typename MH, typename AH,
-             typename AI>
-    friend class FlexibleBigram;
-private:
-    /* Storage layout: one ArrayHeader, immediately followed by an array of
-     * ArrayItemWithToken records kept sorted by ascending m_token. */
-    MemoryChunk m_chunk;
-
-    /* Build a single gram from a raw record; used by FlexibleBigram::load.
-     * The record is copied into m_chunk: load passes the buffer returned by
-     * DB->get with default DBT flags, and Berkeley DB only keeps that memory
-     * valid until the next call made through the same DB handle. */
-    FlexibleSingleGram(void * buffer, size_t length){
-        if ( buffer && length > 0 ){
-            m_chunk.set_size(length);
-            memcpy(m_chunk.begin(), buffer, length);
-        }
-    }
-public:
-    /**
-     * ArrayItemWithToken:
-     *
-     * One stored record: the phrase token followed by its ArrayItem.
-     *
-     */
-    typedef struct{
-        phrase_token_t m_token;
-        ArrayItem m_item;
-    } ArrayItemWithToken;
-
-private:
-    /* Ordering predicate for lower_bound: compares the tokens only. */
-    static bool token_less_than(const ArrayItemWithToken & lhs,
-                                const ArrayItemWithToken & rhs){
-        return lhs.m_token < rhs.m_token;
-    }
-
-    /* Number of complete records stored after the header.  A chunk that is
-     * shorter than the header, or that ends in a partial record, yields only
-     * the complete records so that no accessor reads past the buffer. */
-    size_t item_count(){
-        size_t total = m_chunk.size();
-        if ( total < sizeof(ArrayHeader) )
-            return 0;
-        return (total - sizeof(ArrayHeader)) / sizeof(ArrayItemWithToken);
-    }
-
-    /* Address of the first record (just past the header). */
-    ArrayItemWithToken * items_begin(){
-        return (ArrayItemWithToken *)
-            ((char *)(m_chunk.begin()) + sizeof(ArrayHeader));
-    }
-
-    /* Address one past the last complete record. */
-    ArrayItemWithToken * items_end(){
-        return items_begin() + item_count();
-    }
-
-    /* First record whose token is >= @token, or items_end() if none. */
-    ArrayItemWithToken * lower_bound_token(phrase_token_t token){
-        ArrayItemWithToken key;
-        key.m_token = token;
-        return std_lite::lower_bound
-            (items_begin(), items_end(), key, token_less_than);
-    }
-
-    /* Byte offset inside m_chunk of the record at @pos. */
-    size_t offset_of(const ArrayItemWithToken * pos){
-        return sizeof(ArrayHeader) +
-            sizeof(ArrayItemWithToken) * (pos - items_begin());
-    }
-
-public:
-    /**
-     * FlexibleSingleGram::FlexibleSingleGram:
-     *
-     * The constructor of the FlexibleSingleGram.
-     * Creates a single gram holding a zero-filled header and no items.
-     *
-     */
-    FlexibleSingleGram(){
-        m_chunk.set_size(sizeof(ArrayHeader));
-        memset(m_chunk.begin(), 0, sizeof(ArrayHeader));
-    }
-
-    /**
-     * FlexibleSingleGram::retrieve_all:
-     * @array: the array to store all items in this single gram.
-     * @returns: whether the retrieve operation is successful.
-     *
-     * Append every record of this single gram to @array, in token order.
-     * Each appended element is an ArrayItemWithToken.
-     *
-     */
-    bool retrieve_all(/* out */ FlexibleBigramPhraseArray array){
-        size_t count = item_count();
-        /* One bulk append instead of one g_array_append_val per record. */
-        if ( count > 0 )
-            g_array_append_vals(array, items_begin(), (guint) count);
-
-        return true;
-    }
-
-    /**
-     * FlexibleSingleGram::search:
-     * @range: the token range.
-     * @array: the array to store the array items with token in the range.
-     * @returns: whether the search operation is successful.
-     *
-     * Append to @array every record whose token lies in the half-open
-     * range [m_range_begin, m_range_end).
-     *
-     * Note: The array result may contain many items.
-     *
-     */
-    bool search(/* in */ PhraseIndexRange * range,
-                /* out */ FlexibleBigramPhraseArray array){
-        const ArrayItemWithToken * end = items_end();
-        const ArrayItemWithToken * cur_item =
-            lower_bound_token(range->m_range_begin);
-
-        for ( ; cur_item != end &&
-                  cur_item->m_token < range->m_range_end; ++cur_item){
-            ArrayItemWithToken item;
-            item.m_token = cur_item->m_token;
-            item.m_item = cur_item->m_item;
-            g_array_append_val(array, item);
-        }
-
-        return true;
-    }
-
-    /**
-     * FlexibleSingleGram::insert_array_item:
-     * @token: the phrase token to be inserted.
-     * @item: the array item of this token.
-     * @returns: whether the insert operation is successful.
-     *
-     * Insert the array item of the token, keeping the records sorted.
-     * Returns false when the token is already present, or when the chunk
-     * is too short to contain the array header.
-     *
-     */
-    bool insert_array_item(/* in */ phrase_token_t token,
-                           /* in */ const ArrayItem & item){
-        if ( m_chunk.size() < sizeof(ArrayHeader) )
-            return false;
-
-        ArrayItemWithToken * pos = lower_bound_token(token);
-        if ( pos != items_end() && pos->m_token == token )
-            return false;
-
-        ArrayItemWithToken insert_item;
-        insert_item.m_token = token;
-        insert_item.m_item = item;
-
-        /* pos is either the first record with a larger token or the end of
-         * the records; inserting there preserves the sort order. */
-        m_chunk.insert_content(offset_of(pos), &insert_item,
-                               sizeof(ArrayItemWithToken));
-        return true;
-    }
-
-    /**
-     * FlexibleSingleGram::remove_array_item:
-     * @token: the phrase token to be removed.
-     * @item: the content of the removed array item.
-     * @returns: whether the remove operation is successful.
-     *
-     * Remove the array item of the token.
-     * @item is zero-filled first and receives the removed content on
-     * success; false is returned when the token is not present.
-     *
-     */
-    bool remove_array_item(/* in */ phrase_token_t token,
-                           /* out */ ArrayItem & item)
-    {
-        /* clear retval */
-        memset(&item, 0, sizeof(ArrayItem));
-
-        ArrayItemWithToken * pos = lower_bound_token(token);
-        if ( pos == items_end() || pos->m_token != token )
-            return false;
-
-        /* Copy the payload out before the record is removed. */
-        memcpy(&item, &(pos->m_item), sizeof(ArrayItem));
-        m_chunk.remove_content(offset_of(pos), sizeof(ArrayItemWithToken));
-        return true;
-    }
-
-    /**
-     * FlexibleSingleGram::get_array_item:
-     * @token: the phrase token.
-     * @item: the array item of the token.
-     * @returns: whether the get operation is successful.
-     *
-     * Get the array item of the token.
-     * @item is zero-filled first; false is returned when the token is not
-     * present.
-     *
-     */
-    bool get_array_item(/* in */ phrase_token_t token,
-                        /* out */ ArrayItem & item)
-    {
-        /* clear retval */
-        memset(&item, 0, sizeof(ArrayItem));
-
-        const ArrayItemWithToken * pos = lower_bound_token(token);
-        if ( pos == items_end() || pos->m_token != token )
-            return false;
-
-        memcpy(&item, &(pos->m_item), sizeof(ArrayItem));
-        return true;
-    }
-
-    /**
-     * FlexibleSingleGram::set_array_item:
-     * @token: the phrase token.
-     * @item: the array item of the token.
-     * @returns: whether the set operation is successful.
-     *
-     * Overwrite the array item of an existing token.
-     * Returns false when the token is not present; nothing is inserted.
-     *
-     */
-    bool set_array_item(/* in */ phrase_token_t token,
-                        /* in */ const ArrayItem & item){
-        ArrayItemWithToken * pos = lower_bound_token(token);
-        if ( pos == items_end() || pos->m_token != token )
-            return false;
-
-        memcpy(&(pos->m_item), &item, sizeof(ArrayItem));
-        return true;
-    }
-
-    /**
-     * FlexibleSingleGram::get_array_header:
-     * @header: the array header of this single gram.
-     * @returns: whether the get operation is successful.
-     *
-     * Get the array header of this single gram.
-     * @header is zero-filled first; false is returned when the chunk is
-     * too short to contain a header.
-     *
-     */
-    bool get_array_header(/* out */ ArrayHeader & header){
-        /* clear retval */
-        memset(&header, 0, sizeof(ArrayHeader));
-
-        if ( m_chunk.size() < sizeof(ArrayHeader) )
-            return false;
-
-        memcpy(&header, m_chunk.begin(), sizeof(ArrayHeader));
-        return true;
-    }
-
-    /**
-     * FlexibleSingleGram::set_array_header:
-     * @header: the array header of this single gram.
-     * @returns: whether the set operation is successful.
-     *
-     * Set the array header of this single gram.
-     * Returns false when the chunk is too short to contain a header.
-     *
-     */
-    bool set_array_header(/* in */ const ArrayHeader & header){
-        if ( m_chunk.size() < sizeof(ArrayHeader) )
-            return false;
-
-        memcpy(m_chunk.begin(), &header, sizeof(ArrayHeader));
-        return true;
-    }
-};
-
-/**
- * FlexibleBigram:
- * @MagicHeader: the struct type of the magic header.
- * @ArrayHeader: the struct type of the array header.
- * @ArrayItem: the struct type of the array item.
- *
- * The flexible bi-gram is mainly used for training purpose.
- *
- */
-template<typename MagicHeader, typename ArrayHeader,
-         typename ArrayItem>
-class FlexibleBigram{
-    /* Note: some flexible bi-gram file format check should be here. */
-private:
-    /* The attached Berkeley DB handle, or NULL when nothing is attached. */
-    DB * m_db;
-
-    /* Key of the magic record (two null tokens).  Its size differs from
-     * the sizeof(phrase_token_t) keys used for single grams. */
-    phrase_token_t m_magic_header_index[2];
-
-    /* The magic record starts with these 4 bytes; the MagicHeader is
-     * stored right after them. */
-    char m_magic_number[4];
-
-    /* Not copyable: a copy would share m_db and close it twice.
-     * Declared but intentionally left undefined. */
-    FlexibleBigram(const FlexibleBigram &);
-    FlexibleBigram & operator=(const FlexibleBigram &);
-
-    /* Zero a DBT and point it at @data / @size. */
-    static void init_dbt(DBT & dbt, void * data, u_int32_t size){
-        memset(&dbt, 0, sizeof(DBT));
-        dbt.data = data;
-        dbt.size = size;
-    }
-
-    /* Restrict a DBT to @length bytes starting at @offset of the record. */
-    static void set_partial(DBT & dbt, u_int32_t offset, u_int32_t length){
-        dbt.flags = DB_DBT_PARTIAL;
-        dbt.doff = offset;
-        dbt.dlen = length;
-    }
-
-    /* Flush and close the attached database, if any. */
-    void reset(){
-        if ( m_db ){
-            m_db->sync(m_db, 0);
-            m_db->close(m_db, 0);
-            m_db = NULL;
-        }
-    }
-
-    /* Create a fresh handle and open @dbfile with @db_flags.
-     * On failure the handle is closed and m_db is left NULL: Berkeley DB
-     * requires a handle to be discarded after a failed DB->open. */
-    bool open_handle(const char * dbfile, u_int32_t db_flags){
-        int ret = db_create(&m_db, NULL, 0);
-        if ( ret != 0 ){
-            m_db = NULL;
-            return false;
-        }
-
-        ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
-        if ( ret != 0 ){
-            m_db->close(m_db, 0);
-            m_db = NULL;
-            return false;
-        }
-        return true;
-    }
-
-    /* Write the magic number at the start of the magic record. */
-    bool write_signature(){
-        DBT db_key;
-        init_dbt(db_key, m_magic_header_index, sizeof(m_magic_header_index));
-        DBT db_data;
-        init_dbt(db_data, m_magic_number, sizeof(m_magic_number));
-        set_partial(db_data, 0, sizeof(m_magic_number));
-
-        return 0 == m_db->put(m_db, NULL, &db_key, &db_data, 0);
-    }
-
-    /* Check that the magic record starts with the expected magic number. */
-    bool check_signature(){
-        DBT db_key;
-        init_dbt(db_key, m_magic_header_index, sizeof(m_magic_header_index));
-        DBT db_data;
-        init_dbt(db_data, NULL, 0);
-        set_partial(db_data, 0, sizeof(m_magic_number));
-
-        int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
-        if ( ret != 0 )
-            return false;
-        if ( sizeof(m_magic_number) != db_data.size )
-            return false;
-        return 0 == memcmp(db_data.data, m_magic_number,
-                           sizeof(m_magic_number));
-    }
-
-public:
-    /**
-     * FlexibleBigram::FlexibleBigram:
-     * @magic_number: the 4 bytes magic number of the flexible bi-gram.
-     *
-     * The constructor of the FlexibleBigram.
-     * Exactly 4 bytes are copied from @magic_number; no database is
-     * attached yet.
-     *
-     */
-    FlexibleBigram(const char * magic_number){
-        m_db = NULL;
-        m_magic_header_index[0] = null_token;
-        m_magic_header_index[1] = null_token;
-
-        memcpy(m_magic_number, magic_number, sizeof(m_magic_number));
-    }
-
-    /**
-     * FlexibleBigram::~FlexibleBigram:
-     *
-     * The destructor of the FlexibleBigram.
-     * Syncs and closes the attached database, if any.
-     *
-     */
-    ~FlexibleBigram(){
-        reset();
-    }
-
-    /**
-     * FlexibleBigram::attach:
-     * @dbfile: the path name of the flexible bi-gram.
-     * @flags: the attach flags for the Berkeley DB.
-     * @returns: whether the attach operation is successful.
-     *
-     * Attach Berkeley DB on filesystem for training purpose.
-     *
-     * Any previously attached database is closed first.  An existing file
-     * must carry the expected magic number.  If the file cannot be opened
-     * and ATTACH_CREATE is set, it is created and the magic number is
-     * written.  On every failure the object is left detached.
-     *
-     */
-    bool attach(const char * dbfile, guint32 flags){
-        reset();
-        u_int32_t db_flags = 0;
-
-        if ( flags & ATTACH_READONLY )
-            db_flags |= DB_RDONLY;
-        if ( flags & ATTACH_READWRITE )
-            assert( !(flags & ATTACH_READONLY ) );
-
-        if ( !dbfile )
-            return false;
-
-        if ( open_handle(dbfile, db_flags) ){
-            /* Existing database: check the signature. */
-            if ( check_signature() )
-                return true;
-            reset();
-            return false;
-        }
-
-        if ( !(flags & ATTACH_CREATE) )
-            return false;
-
-        /* Create database file here, and write the signature.
-         * A new handle is used because the failed one was discarded. */
-        if ( !open_handle(dbfile, db_flags | DB_CREATE) )
-            return false;
-
-        if ( write_signature() )
-            return true;
-        reset();
-        return false;
-    }
-
-    /**
-     * FlexibleBigram::load:
-     * @index: the previous token in the flexible bi-gram.
-     * @single_gram: the single gram of the previous token.
-     * @returns: whether the load operation is successful.
-     *
-     * Load the single gram of the previous token.
-     *
-     * On success @single_gram points to an object allocated with new;
-     * this class never deletes it, so the caller must.  On failure
-     * @single_gram is set to NULL.
-     *
-     */
-    bool load(phrase_token_t index,
-              FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){
-        single_gram = NULL;
-
-        if ( !m_db )
-            return false;
-
-        DBT db_key;
-        init_dbt(db_key, &index, sizeof(phrase_token_t));
-        DBT db_data;
-        init_dbt(db_data, NULL, 0);
-
-        int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
-        if ( ret != 0 )
-            return false;
-
-        single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem>
-            (db_data.data, db_data.size);
-
-        return true;
-    }
-
-    /**
-     * FlexibleBigram::store:
-     * @index: the previous token in the flexible bi-gram.
-     * @single_gram: the single gram of the previous token.
-     * @returns: whether the store operation is successful.
-     *
-     * Store the single gram of the previous token, replacing the whole
-     * record.  Returns false when nothing is attached or @single_gram is
-     * NULL.
-     *
-     */
-    bool store(phrase_token_t index,
-               FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){
-        if ( !m_db || !single_gram )
-            return false;
-
-        DBT db_key;
-        init_dbt(db_key, &index, sizeof(phrase_token_t));
-        DBT db_data;
-        init_dbt(db_data, single_gram->m_chunk.begin(),
-                 single_gram->m_chunk.size());
-
-        int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
-        return ret == 0;
-    }
-
-    /**
-     * FlexibleBigram::remove:
-     * @index: the previous token in the flexible bi-gram.
-     * @returns: whether the remove operation is successful.
-     *
-     * Remove the single gram of the previous token.
-     *
-     */
-    bool remove(phrase_token_t index){
-        if ( !m_db )
-            return false;
-
-        DBT db_key;
-        init_dbt(db_key, &index, sizeof(phrase_token_t));
-
-        int ret = m_db->del(m_db, NULL, &db_key, 0);
-        return ret == 0;
-    }
-
-    /**
-     * FlexibleBigram::get_all_items:
-     * @items: the GArray to store all previous tokens.
-     * @returns: whether the get operation is successful.
-     *
-     * Get the array of all previous tokens for parameter estimation.
-     *
-     * @items is emptied first and receives one phrase_token_t per record.
-     * Note: a cursor read error other than end-of-data prints a message
-     * and terminates the process with exit(EIO).
-     *
-     */
-    bool get_all_items(GArray * items){
-        g_array_set_size(items, 0);
-
-        if ( !m_db )
-            return false;
-
-        /* Get a cursor; initialised so a failed call cannot leave garbage. */
-        DBC * cursorp = NULL;
-        int ret = m_db->cursor(m_db, NULL, &cursorp, 0);
-        if ( ret != 0 || NULL == cursorp )
-            return false;
-
-        /* Initialize our DBTs. */
-        DBT key, data;
-        memset(&key, 0, sizeof(DBT));
-        memset(&data, 0, sizeof(DBT));
-
-        /* Iterate over the database, retrieving each record in turn. */
-        while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){
-            if (key.size != sizeof(phrase_token_t)){
-                /* skip magic header. */
-                continue;
-            }
-            /* Copy the key out instead of dereferencing the DB buffer,
-             * which carries no alignment guarantee. */
-            phrase_token_t token;
-            memcpy(&token, key.data, sizeof(phrase_token_t));
-            g_array_append_val(items, token);
-        }
-
-        /* Cursors must be closed */
-        cursorp->c_close(cursorp);
-
-        if ( ret != DB_NOTFOUND ){
-            fprintf(stderr, "training db error, exit!");
-            exit(EIO);
-        }
-
-        return true;
-    }
-
-    /**
-     * FlexibleBigram::get_magic_header:
-     * @header: the magic header.
-     * @returns: whether the get operation is successful.
-     *
-     * Get the magic header of the flexible bi-gram.
-     * @header is zero-filled first; false is returned when the stored
-     * header is missing or does not have sizeof(MagicHeader) bytes.
-     *
-     */
-    bool get_magic_header(MagicHeader & header){
-        /* clear retval */
-        memset(&header, 0, sizeof(MagicHeader));
-
-        if ( !m_db )
-            return false;
-
-        DBT db_key;
-        init_dbt(db_key, m_magic_header_index, sizeof(m_magic_header_index));
-        DBT db_data;
-        init_dbt(db_data, NULL, 0);
-        /* The header lives right after the magic number. */
-        set_partial(db_data, sizeof(m_magic_number), sizeof(MagicHeader));
-
-        int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
-        if ( ret != 0 )
-            return false;
-
-        if ( sizeof(MagicHeader) != db_data.size )
-            return false;
-
-        memcpy(&header, db_data.data, sizeof(MagicHeader));
-        return true;
-    }
-
-    /**
-     * FlexibleBigram::set_magic_header:
-     * @header: the magic header.
-     * @returns: whether the set operation is successful.
-     *
-     * Set the magic header of the flexible bi-gram.
-     * Only the bytes after the magic number are written.
-     *
-     */
-    bool set_magic_header(const MagicHeader & header){
-        if ( !m_db )
-            return false;
-
-        DBT db_key;
-        init_dbt(db_key, m_magic_header_index, sizeof(m_magic_header_index));
-        DBT db_data;
-        init_dbt(db_data, (void *) &header, sizeof(MagicHeader));
-        set_partial(db_data, sizeof(m_magic_number), sizeof(MagicHeader));
-
-        int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
-        return ret == 0;
-    }
-
-    /**
-     * FlexibleBigram::get_array_header:
-     * @index: the previous token in the flexible bi-gram.
-     * @header: the array header in the single gram of the previous token.
-     * @returns: whether the get operation is successful.
-     *
-     * Get the array header in the single gram of the previous token.
-     * @header is zero-filled first; false is returned when the record is
-     * missing or shorter than sizeof(ArrayHeader).
-     *
-     */
-    bool get_array_header(phrase_token_t index, ArrayHeader & header){
-        /* clear retval */
-        memset(&header, 0, sizeof(ArrayHeader));
-
-        if ( !m_db )
-            return false;
-
-        DBT db_key;
-        init_dbt(db_key, &index, sizeof(phrase_token_t));
-        DBT db_data;
-        init_dbt(db_data, NULL, 0);
-        set_partial(db_data, 0, sizeof(ArrayHeader));
-
-        int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
-        if ( ret != 0 )
-            return false;
-
-        /* Checked at run time, as get_magic_header does, so that a short
-         * record is rejected even when assertions are compiled out. */
-        if ( sizeof(ArrayHeader) != db_data.size )
-            return false;
-
-        memcpy(&header, db_data.data, sizeof(ArrayHeader));
-        return true;
-    }
-
-    /**
-     * FlexibleBigram::set_array_header:
-     * @index: the previous token of the flexible bi-gram.
-     * @header: the array header in the single gram of the previous token.
-     * @returns: whether the set operation is successful.
-     *
-     * Set the array header in the single gram of the previous token.
-     * Only the first sizeof(ArrayHeader) bytes of the record are written.
-     *
-     */
-    bool set_array_header(phrase_token_t index, const ArrayHeader & header){
-        if ( !m_db )
-            return false;
-
-        DBT db_key;
-        init_dbt(db_key, &index, sizeof(phrase_token_t));
-        DBT db_data;
-        init_dbt(db_data, (void *)&header, sizeof(ArrayHeader));
-        set_partial(db_data, 0, sizeof(ArrayHeader));
-
-        int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
-        return ret == 0;
-    }
-
-};
-
-};
-
-#endif