From 3a64180cf94cd36641f485b94fdf2e8831b63e53 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Fri, 17 Apr 2015 13:59:59 +0800 Subject: begin to write class FlexibleBigram --- src/storage/flexible_ngram_kyotodb.h | 123 +++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) (limited to 'src') diff --git a/src/storage/flexible_ngram_kyotodb.h b/src/storage/flexible_ngram_kyotodb.h index ed8bc2a..9cf866b 100644 --- a/src/storage/flexible_ngram_kyotodb.h +++ b/src/storage/flexible_ngram_kyotodb.h @@ -26,7 +26,130 @@ #include #endif +#include "memory_chunk.h" + namespace pinyin{ + +/** + * FlexibleBigram: + * @MagicHeader: the struct type of the magic header. + * @ArrayHeader: the struct type of the array header. + * @ArrayItem: the struct type of the array item. + * + * The flexible bi-gram is mainly used for training purpose. + * + */ +template +class FlexibleBigram{ + using kyotocabinet::BasicDB; + + /* Note: some flexible bi-gram file format check should be here. */ +private: + BasicDB * m_db; + + MemoryChunk m_chunk; + + phrase_token_t m_magic_header_index[2]; + + char m_magic_number[4]; + + void reset(){ + if ( m_db ){ + m_db->synchronize(); + m_db->close(); + m_db = NULL; + } + } + +public: + /** + * FlexibleBigram::FlexibleBigram: + * @magic_number: the 4 bytes magic number of the flexible bi-gram. + * + * The constructor of the FlexibleBigram. + * + */ + FlexibleBigram(const char * magic_number){ + m_db = NULL; + m_magic_header_index[0] = null_token; + m_magic_header_index[1] = null_token; + + memcpy(m_magic_number, magic_number, sizeof(m_magic_number)); + } + + /** + * FlexibleBigram::~FlexibleBigram: + * + * The destructor of the FlexibleBigram. + * + */ + ~FlexibleBigram(){ + reset(); + } + + /** + * FlexibleBigram::attach: + * @dbfile: the path name of the flexible bi-gram. + * @flags: the attach flags for the Berkeley DB. + * @returns: whether the attach operation is successful. + * + * Attach Berkeley DB on filesystem for training purpose. + * + */ + bool attach(const char * dbfile, guint32 flags){ + reset(); + uint32_t mode = 0; + + if (flags & ATTACH_READONLY) + mode |= BasicDB::OREADER; + if (flags & ATTACH_READWRITE) { + assert( !( flags & ATTACH_READONLY ) ); + mode |= BasicDB::OREADER | BasicDB::OWRITER; + } + + if (!dbfile) + return false; + + m_db = new HashDB; + + if (!m_db->open(dbfile, mode)) { + if (!(flags & ATTACH_CREATE)) { + delete m_db; + m_db = NULL; + return false; + } + + mode |= BasicDB::OCREATE; + /* Create database file here, and write the signature. */ + if (!m_db->open(dbfile, mode)) + return false; + + const char * kbuf = (char *) m_magic_header_index; + const size_t ksiz = sizeof(m_magic_header_index); + const char * vbuf = (char *) m_magic_number; + const size_t vsiz = sizeof(m_magic_number); + m_db->set(kbuf, ksiz, vbuf, vsiz); + return true; + } + + /* check the signature. */ + const char * kbuf = (char *) m_magic_header_index; + const size_t ksiz = sizeof(m_magic_header_index); + const int32_t vsiz = m_db->check(kbuf, ksiz); + + m_chunk.set_size(vsiz); + char * vbuf = (char *) m_chunk.begin(); + assert (vsiz == m_db->get(kbuf, sizeof(phrase_token_t), + vbuf, vsiz)); + + if ( memcmp(vbuf, m_magic_number, + sizeof(m_magic_number)) == 0 ) + return true; + return false; + } +}; + }; #endif -- cgit