summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2015-04-14 16:45:14 +0800
committerPeng Wu <alexepico@gmail.com>2015-04-14 16:45:14 +0800
commitf18291a36a316496e63142cc34b01377c6b5a58e (patch)
tree7de7e101c7859dfb72bef5e3b10ab51d0ac80596
parente8d9463f45a7214e0eb36cfe09f45cce3b7baeb9 (diff)
downloadlibpinyin-f18291a36a316496e63142cc34b01377c6b5a58e.tar.gz
libpinyin-f18291a36a316496e63142cc34b01377c6b5a58e.tar.xz
libpinyin-f18291a36a316496e63142cc34b01377c6b5a58e.zip
write ngram_kyotodb.cpp in progress
-rw-r--r--src/storage/Makefile.am5
-rw-r--r--src/storage/ngram.h6
-rw-r--r--src/storage/ngram_bdb.cpp5
-rw-r--r--src/storage/ngram_bdb.h2
-rw-r--r--src/storage/ngram_kyotodb.cpp175
-rw-r--r--src/storage/ngram_kyotodb.h6
6 files changed, 192 insertions, 7 deletions
diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am
index e660608..e54ca8e 100644
--- a/src/storage/Makefile.am
+++ b/src/storage/Makefile.am
@@ -32,6 +32,7 @@ noinst_HEADERS = chewing_enum.h \
phrase_large_table2.h \
ngram.h \
ngram_bdb.h \
+ ngram_kyotodb.h \
flexible_ngram.h \
tag_utility.h \
pinyin_parser_table.h \
@@ -61,3 +62,7 @@ libstorage_la_SOURCES = phrase_index.cpp \
if BERKELEYDB
libstorage_la_SOURCES += ngram_bdb.cpp
endif
+
+if KYOTOCABINET
+libstorage_la_SOURCES += ngram_kyotodb.cpp
+endif
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
index 7466def..9b26f5d 100644
--- a/src/storage/ngram.h
+++ b/src/storage/ngram.h
@@ -23,11 +23,17 @@
#define NGRAM_H
#include <config.h>
+#include <glib.h>
+#include "novel_types.h"
#ifdef HAVE_BERKELEY_DB
#include "ngram_bdb.h"
#endif
+#ifdef HAVE_KYOTO_CABINET
+#include "ngram_kyotodb.h"
+#endif
+
namespace pinyin{
class Bigram;
diff --git a/src/storage/ngram_bdb.cpp b/src/storage/ngram_bdb.cpp
index e07127b..9d696ff 100644
--- a/src/storage/ngram_bdb.cpp
+++ b/src/storage/ngram_bdb.cpp
@@ -140,8 +140,10 @@ bool Bigram::save_db(const char * dbfile){
if ( cursorp != NULL )
cursorp->c_close(cursorp);
- if ( tmp_db != NULL )
+ if ( tmp_db != NULL ) {
+ tmp_db->sync(m_db, 0);
tmp_db->close(tmp_db, 0);
+ }
return true;
}
@@ -256,6 +258,7 @@ bool Bigram::get_all_items(GArray * items){
return true;
}
+/* Note: sync mask_out code with ngram_kyotodb.cpp. */
bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
diff --git a/src/storage/ngram_bdb.h b/src/storage/ngram_bdb.h
index f1c15f2..a1e70d3 100644
--- a/src/storage/ngram_bdb.h
+++ b/src/storage/ngram_bdb.h
@@ -38,8 +38,6 @@ class Bigram{
private:
DB * m_db;
- /* Note: sync mask_out code with ngram_kyotodb.cpp. */
-
void reset();
public:
diff --git a/src/storage/ngram_kyotodb.cpp b/src/storage/ngram_kyotodb.cpp
new file mode 100644
index 0000000..72b0c76
--- /dev/null
+++ b/src/storage/ngram_kyotodb.cpp
@@ -0,0 +1,175 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2013 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "ngram.h"
+#include <assert.h>
+#include <errno.h>
+#include <kchashdb.h>
+#include <kcprotodb.h>
+
+
+/* Use DB interface, first check, second reserve the memory chunk,
+ third get value into the chunk. */
+
+/* Use DB::visitor to get_all_items. */
+
+using namespace pinyin;
+using namespace kyotocabinet;
+
+
+/* Construct a Bigram that has no database attached yet. */
+Bigram::Bigram()
+    : m_db(NULL)
+{
+}
+
+/* Release the database handle, if any, by delegating to reset(). */
+Bigram::~Bigram()
+{
+    this->reset();
+}
+
+/* Flush, close and free the current database handle, if one is held.
+ * m_db is cleared afterwards so that a later reset() -- for example the
+ * one run by the destructor after attach() returned early -- does not
+ * touch or delete the already freed object again. */
+void Bigram::reset(){
+    if ( m_db ){
+        m_db->synchronize();
+        m_db->close();
+        delete m_db;
+        m_db = NULL;
+    }
+}
+
+/* Visitor that copies every record it is shown into another database. */
+class CopyVisitor : public DB::Visitor {
+private:
+    /* destination database; the visitor never frees it. */
+    BasicDB * m_db;
+
+public:
+    CopyVisitor(BasicDB * db) : m_db(db) {}
+
+    /* called for each existing record: store the key/value pair in the
+       destination and return NOP so the source record is not modified. */
+    virtual const char* visit_full(const char* kbuf, size_t ksiz,
+                                   const char* vbuf, size_t vsiz, size_t* sp) {
+        m_db->set(kbuf, ksiz, vbuf, vsiz);
+        return NOP;
+    }
+
+    /* called for a missing record: a full iteration is not expected
+       to report one, so treat it as a programming error. */
+    virtual const char* visit_empty(const char* kbuf, size_t ksiz, size_t* sp) {
+        assert (FALSE);
+        return NOP;
+    }
+};
+
+/* Use ProtoHashDB for load_db/save_db methods. */
+/* Load every record of the on-disk hash database dbfile into a fresh
+ * in-memory database held in m_db.
+ *   dbfile: path of the database file to read; NULL makes the call fail.
+ * Returns true when the in-memory database was opened, the file was
+ * opened, iterated and closed successfully; false otherwise.  On failure
+ * m_db still refers to the newly created in-memory database. */
+bool Bigram::load_db(const char * dbfile){
+    reset();
+
+    /* create on-memory db; it has to be opened before records can be
+       stored in it (the path is only a label for a ProtoHashDB). */
+    m_db = new ProtoHashDB;
+    if ( !m_db->open("-", BasicDB::OREADER|BasicDB::OWRITER|BasicDB::OCREATE) )
+        return false;
+
+    if ( !dbfile )
+        return false;
+
+    /* load db into memory; the temporary handle lives on the stack so it
+       is released on every return path. */
+    HashDB tmp_db;
+    if ( !tmp_db.open(dbfile, BasicDB::OREADER) )
+        return false;
+
+    CopyVisitor visitor(m_db);
+    bool ok = tmp_db.iterate(&visitor, false);
+
+    if ( !tmp_db.close() )
+        ok = false;
+
+    return ok;
+}
+
+/* Write every record of the current database into a newly created
+ * on-disk hash database at dbfile, replacing any existing file.
+ *   dbfile: path of the database file to write; NULL makes the call fail.
+ * Returns false when no database is loaded, when the old file cannot be
+ * removed, or when opening, copying, synchronizing or closing the new
+ * database fails; true otherwise. */
+bool Bigram::save_db(const char * dbfile){
+    if ( !dbfile || !m_db )
+        return false;
+
+    /* start from an empty file; a missing file is not an error. */
+    int ret = unlink(dbfile);
+    if ( ret != 0 && errno != ENOENT)
+        return false;
+
+    /* the temporary handle lives on the stack so it is released on
+       every return path. */
+    HashDB tmp_db;
+
+    if ( !tmp_db.open(dbfile, BasicDB::OWRITER|BasicDB::OCREATE) )
+        return false;
+
+    CopyVisitor visitor(&tmp_db);
+    bool ok = m_db->iterate(&visitor, false);
+
+    if ( !tmp_db.synchronize() )
+        ok = false;
+    if ( !tmp_db.close() )
+        ok = false;
+
+    return ok;
+}
+
+/* Close any current database and open dbfile as an on-disk hash database.
+ *   dbfile: path of the database file; NULL makes the call fail.
+ *   flags:  combination of ATTACH_READONLY, ATTACH_READWRITE and
+ *           ATTACH_CREATE; ATTACH_READONLY and ATTACH_READWRITE must not
+ *           be combined.
+ * Returns the result of opening the database, or false when dbfile is
+ * NULL. */
+bool Bigram::attach(const char * dbfile, guint32 flags){
+    reset();
+    /* the old handle has been released by reset(); drop the stale pointer
+       so the early return below cannot leave m_db dangling. */
+    m_db = NULL;
+
+    uint32_t mode = 0;
+
+    if (flags & ATTACH_READONLY)
+        mode |= BasicDB::OREADER;
+    if (flags & ATTACH_READWRITE) {
+        assert( !( flags & ATTACH_READONLY ) );
+        mode |= BasicDB::OREADER | BasicDB::OWRITER;
+    }
+    if (flags & ATTACH_CREATE)
+        mode |= BasicDB::OCREATE;
+
+    if (!dbfile)
+        return false;
+
+    m_db = new HashDB;
+
+    return m_db->open(dbfile, mode);
+}
+
+/* Note: sync mask_out code with ngram_bdb.cpp. */
+/* Remove all data that matches (token & mask) == value.
+ * Every stored index is visited: an index that matches is removed
+ * outright; otherwise its SingleGram is loaded, asked to mask out its
+ * own matching entries, and then stored back or removed when it became
+ * empty.
+ * The database calls are made outside assert() so that they still run
+ * when assertions are compiled out with NDEBUG.
+ * Returns false when the index list cannot be read or when any remove,
+ * load or store call fails; true otherwise. */
+bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    if (!get_all_items(items)) {
+        g_array_free(items, TRUE);
+        return false;
+    }
+
+    bool success = true;
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t index = g_array_index(items, phrase_token_t, i);
+
+        if ((index & mask) == value) {
+            const bool removed = remove(index);
+            assert(removed);
+            success = success && removed;
+            continue;
+        }
+
+        SingleGram * gram = NULL;
+        const bool loaded = load(index, gram);
+        assert(loaded);
+        if (!loaded || NULL == gram) {
+            /* only reachable with assertions disabled: skip the entry
+               instead of dereferencing a missing gram. */
+            success = false;
+            continue;
+        }
+
+        int num = gram->mask_out(mask, value);
+        if (0 == num) {
+            /* nothing was masked out, the stored gram is still valid. */
+            delete gram;
+            continue;
+        }
+
+        bool updated = false;
+        if (0 == gram->get_length()) {
+            updated = remove(index);
+        } else {
+            updated = store(index, gram);
+        }
+        assert(updated);
+        success = success && updated;
+
+        delete gram;
+    }
+
+    g_array_free(items, TRUE);
+    return success;
+}
diff --git a/src/storage/ngram_kyotodb.h b/src/storage/ngram_kyotodb.h
index 6c5eea0..2c156de 100644
--- a/src/storage/ngram_kyotodb.h
+++ b/src/storage/ngram_kyotodb.h
@@ -22,7 +22,7 @@
#ifndef NGRAM_KYOTODB_H
#define NGRAM_KYOTODB_H
-#include <kchashdb.h>
+#include <kcdb.h>
#include "memory_chunk.h"
namespace pinyin{
@@ -37,13 +37,11 @@ class SingleGram;
*/
class Bigram{
private:
- kyotocabinet::DB * m_db;
+ kyotocabinet::BasicDB * m_db;
/* memory chunk for Kyoto Cabinet. */
MemoryChunk m_chunk;
- /* Note: sync mask_out code with ngram_bdb.cpp. */
-
void reset();
public: