From f18291a36a316496e63142cc34b01377c6b5a58e Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 14 Apr 2015 16:45:14 +0800 Subject: write ngram_kyotodb.cpp in progress --- src/storage/Makefile.am | 5 ++ src/storage/ngram.h | 6 ++ src/storage/ngram_bdb.cpp | 5 +- src/storage/ngram_bdb.h | 2 - src/storage/ngram_kyotodb.cpp | 175 ++++++++++++++++++++++++++++++++++++++++++ src/storage/ngram_kyotodb.h | 6 +- 6 files changed, 192 insertions(+), 7 deletions(-) create mode 100644 src/storage/ngram_kyotodb.cpp diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am index e660608..e54ca8e 100644 --- a/src/storage/Makefile.am +++ b/src/storage/Makefile.am @@ -32,6 +32,7 @@ noinst_HEADERS = chewing_enum.h \ phrase_large_table2.h \ ngram.h \ ngram_bdb.h \ + ngram_kyotodb.h \ flexible_ngram.h \ tag_utility.h \ pinyin_parser_table.h \ @@ -61,3 +62,7 @@ libstorage_la_SOURCES = phrase_index.cpp \ if BERKELEYDB libstorage_la_SOURCES += ngram_bdb.cpp endif + +if KYOTOCABINET +libstorage_la_SOURCES += ngram_kyotodb.cpp +endif diff --git a/src/storage/ngram.h b/src/storage/ngram.h index 7466def..9b26f5d 100644 --- a/src/storage/ngram.h +++ b/src/storage/ngram.h @@ -23,11 +23,17 @@ #define NGRAM_H #include +#include +#include "novel_types.h" #ifdef HAVE_BERKELEY_DB #include "ngram_bdb.h" #endif +#ifdef HAVE_KYOTO_CABINET +#include "ngram_kyotodb.h" +#endif + namespace pinyin{ class Bigram; diff --git a/src/storage/ngram_bdb.cpp b/src/storage/ngram_bdb.cpp index e07127b..9d696ff 100644 --- a/src/storage/ngram_bdb.cpp +++ b/src/storage/ngram_bdb.cpp @@ -140,8 +140,10 @@ bool Bigram::save_db(const char * dbfile){ if ( cursorp != NULL ) cursorp->c_close(cursorp); - if ( tmp_db != NULL ) + if ( tmp_db != NULL ) { + tmp_db->sync(m_db, 0); tmp_db->close(tmp_db, 0); + } return true; } @@ -256,6 +258,7 @@ bool Bigram::get_all_items(GArray * items){ return true; } +/* Note: sync mask_out code with ngram_kyotodb.cpp. */ bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); diff --git a/src/storage/ngram_bdb.h b/src/storage/ngram_bdb.h index f1c15f2..a1e70d3 100644 --- a/src/storage/ngram_bdb.h +++ b/src/storage/ngram_bdb.h @@ -38,8 +38,6 @@ class Bigram{ private: DB * m_db; - /* Note: sync mask_out code with ngram_kyotodb.cpp. */ - void reset(); public: diff --git a/src/storage/ngram_kyotodb.cpp b/src/storage/ngram_kyotodb.cpp new file mode 100644 index 0000000..72b0c76 --- /dev/null +++ b/src/storage/ngram_kyotodb.cpp @@ -0,0 +1,175 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2013 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "ngram.h" +#include +#include +#include +#include + + +/* Use DB interface, first check, second reserve the memory chunk, + third get value into the chunk. */ + +/* Use DB::visitor to get_all_items. */ + +using namespace pinyin; +using namespace kyotocabinet; + + +Bigram::Bigram(){ + m_db = NULL; +} + +Bigram::~Bigram(){ + reset(); +} + +void Bigram::reset(){ + if ( m_db ){ + m_db->synchronize(); + m_db->close(); + delete m_db; + } +} + +class CopyVisitor : public DB::Visitor { +private: + BasicDB * m_db; +public: + CopyVisitor(BasicDB * db) { + m_db = db; + } + + virtual const char* visit_full(const char* kbuf, size_t ksiz, + const char* vbuf, size_t vsiz, size_t* sp) { + m_db->set(kbuf, ksiz, vbuf, vsiz); + return NOP; + } + + virtual const char* visit_empty(const char* kbuf, size_t ksiz, size_t* sp) { + /* assume no empty record. */ + assert (FALSE); + return NOP; + } +}; + +/* Use ProtoHashDB for load_db/save_db methods. */ +bool Bigram::load_db(const char * dbfile){ + reset(); + + /* create on-memory db. */ + m_db = new ProtoHashDB; + + /* load db into memory. */ + BasicDB * tmp_db = new HashDB; + tmp_db->open(dbfile, BasicDB::OREADER); + + CopyVisitor visitor(m_db); + tmp_db->iterate(&visitor, false); + + if (tmp_db != NULL) + tmp_db->close(); + + return true; +} + +bool Bigram::save_db(const char * dbfile){ + + int ret = unlink(dbfile); + if ( ret != 0 && errno != ENOENT) + return false; + + BasicDB * tmp_db = new HashDB; + + if ( !tmp_db->open(dbfile, BasicDB::OWRITER|BasicDB::OCREATE) ) + return false; + + CopyVisitor visitor(tmp_db); + m_db->iterate(&visitor, false); + + if (tmp_db != NULL) { + tmp_db->synchronize(); + tmp_db->close(); + } + + return true; +} + +bool Bigram::attach(const char * dbfile, guint32 flags){ + reset(); + uint32_t mode = 0; + + if (flags & ATTACH_READONLY) + mode |= BasicDB::OREADER; + if (flags & ATTACH_READWRITE) { + assert( !( flags & ATTACH_READONLY ) ); + mode |= BasicDB::OREADER | BasicDB::OWRITER; + } + if (flags & ATTACH_CREATE) + mode |= BasicDB::OCREATE; + + if (!dbfile) + return false; + + m_db = new HashDB; + + return m_db->open(dbfile, mode); +} + +/* Note: sync mask_out code with ngram_bdb.cpp. */ +bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){ + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + if (!get_all_items(items)) { + g_array_free(items, TRUE); + return false; + } + + for (size_t i = 0; i < items->len; ++i) { + phrase_token_t index = g_array_index(items, phrase_token_t, i); + + if ((index & mask) == value) { + assert(remove(index)); + continue; + } + + SingleGram * gram = NULL; + assert(load(index, gram)); + + int num = gram->mask_out(mask, value); + if (0 == num) { + delete gram; + continue; + } + + if (0 == gram->get_length()) { + assert(remove(index)); + } else { + assert(store(index, gram)); + } + + delete gram; + } + + g_array_free(items, TRUE); + return true; +} diff --git a/src/storage/ngram_kyotodb.h b/src/storage/ngram_kyotodb.h index 6c5eea0..2c156de 100644 --- a/src/storage/ngram_kyotodb.h +++ b/src/storage/ngram_kyotodb.h @@ -22,7 +22,7 @@ #ifndef NGRAM_KYOTODB_H #define NGRAM_KYOTODB_H -#include +#include #include "memory_chunk.h" namespace pinyin{ @@ -37,13 +37,11 @@ class SingleGram; */ class Bigram{ private: - kyotocabinet::DB * m_db; + kyotocabinet::BasicDB * m_db; /* memory chunk for Kyoto Cabinet. */ MemoryChunk m_chunk; - /* Note: sync mask_out code with ngram_bdb.cpp. */ - void reset(); public: -- cgit