summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2025-11-11 14:38:38 +0800
committerPeng Wu <alexepico@gmail.com>2025-11-12 15:12:27 +0800
commit426e5e78c99c481fd9afd9928dfd613eba5e14fc (patch)
tree74e3f4a3984b96817c238cb12cf5dcf97189fd03
parent80e817c6080c31a9718576143a4c4d8adcc0fc54 (diff)
downloadlibpinyin-426e5e78c99c481fd9afd9928dfd613eba5e14fc.tar.gz
libpinyin-426e5e78c99c481fd9afd9928dfd613eba5e14fc.tar.xz
libpinyin-426e5e78c99c481fd9afd9928dfd613eba5e14fc.zip
Write ngram_tkrzwdb.cpp
-rw-r--r--src/storage/ngram_tkrzwdb.cpp213
1 files changed, 213 insertions, 0 deletions
diff --git a/src/storage/ngram_tkrzwdb.cpp b/src/storage/ngram_tkrzwdb.cpp
new file mode 100644
index 0000000..be0a3c4
--- /dev/null
+++ b/src/storage/ngram_tkrzwdb.cpp
@@ -0,0 +1,213 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2025 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "ngram.h"
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <tkrzw_dbm_hash.h>
+#include <tkrzw_dbm_tiny.h>
+#include <tkrzw_str_util.h>
+#include "tkrzwdb_utils.h"
+
+using namespace pinyin;
+using namespace tkrzw;
+
+/* Construct a Bigram that has no database attached yet. */
+Bigram::Bigram()
+    : m_db(NULL)
+{
+}
+
+/* Tear down the object; reset() closes and frees any database still held. */
+Bigram::~Bigram()
+{
+    this->reset();
+}
+
+/* Close and free the current database, if any, and leave m_db as NULL. */
+void Bigram::reset()
+{
+    /* nothing attached, nothing to release. */
+    if (NULL == m_db)
+        return;
+
+    m_db->Close();
+    delete m_db;
+    m_db = NULL;
+}
+
+/*
+ * Drop any database currently held, create a fresh in-memory TinyDBM and
+ * fill it with the records copied from the on-disk hash database dbfile.
+ *
+ * dbfile: path of the hash database to read; may be NULL.
+ *
+ * Returns true when dbfile was opened and handed to copy_tkrzwdb();
+ * returns false when dbfile is NULL or cannot be opened.  In the failure
+ * case m_db is still the newly created, empty in-memory database.
+ */
+bool Bigram::load_db(const char * dbfile){
+    reset();
+
+    /* create in-memory db. */
+    m_db = new TinyDBM;
+
+    /* Open() takes its path as a std::string; building one from NULL is
+       undefined behaviour, so reject it here like attach() does. */
+    if (!dbfile)
+        return false;
+
+    HashDBM tmp_db;
+    if (tmp_db.Open(dbfile, false, File::OPEN_DEFAULT) != Status::SUCCESS)
+        return false;
+
+    copy_tkrzwdb(&tmp_db, m_db);
+
+    tmp_db.Close();
+
+    return true;
+}
+
+/*
+ * Write the records of the current database into a new on-disk hash
+ * database at dbfile, replacing any file already there.
+ *
+ * dbfile: destination path; may be NULL.
+ *
+ * Returns false when no database is held, when dbfile is NULL, when the
+ * old file cannot be removed (a missing file is not an error), when the
+ * new file cannot be opened, or when flushing or closing it fails.
+ * Returns true otherwise.
+ */
+bool Bigram::save_db(const char * dbfile){
+    /* a NULL path would be turned into a std::string, which is undefined. */
+    if (!m_db || !dbfile)
+        return false;
+
+    Status status = RemoveFile(dbfile);
+
+    if (status != Status::SUCCESS && status != Status::NOT_FOUND_ERROR)
+        return false;
+
+    HashDBM tmp_db;
+    if (tmp_db.Open(dbfile, true, File::OPEN_DEFAULT) != Status::SUCCESS)
+        return false;
+
+    copy_tkrzwdb(m_db, &tmp_db);
+
+    /* always close the file, but only report success when both the
+       flush and the close went through. */
+    const bool synced = tmp_db.Synchronize(false).IsOK();
+    const bool closed = tmp_db.Close().IsOK();
+
+    return synced && closed;
+}
+
+/*
+ * Release the current database and open dbfile directly as a HashDBM.
+ *
+ * dbfile: path of the database file; NULL makes the call fail.
+ * flags:  passed to attach_options(), which yields the open options and
+ *         sets the writable flag.
+ *
+ * Returns whether the database was opened successfully.
+ */
+bool Bigram::attach(const char * dbfile, guint32 flags){
+    reset();
+
+    bool writable = false;
+    const int32_t options = attach_options(flags, writable);
+
+    if (NULL == dbfile)
+        return false;
+
+    m_db = new HashDBM;
+
+    const Status status = m_db->Open(dbfile, writable, options);
+    return status.IsOK();
+}
+
+/* Use DB interface. */
+/*
+ * Look up the record stored under index and wrap it in a new SingleGram.
+ *
+ * index:       token used as the raw record key.
+ * single_gram: set to NULL first; on success receives a newly allocated
+ *              SingleGram built over m_chunk.
+ * copy:        forwarded to the SingleGram constructor.
+ *
+ * Returns false when no database is held or the lookup fails.
+ */
+bool Bigram::load(phrase_token_t index, SingleGram * & single_gram,
+                  bool copy){
+    single_gram = NULL;
+
+    if (NULL == m_db)
+        return false;
+
+    /* the key is the in-memory bytes of the token itself. */
+    const std::string_view token_key(reinterpret_cast<const char*>(&index), sizeof(phrase_token_t));
+
+    std::string record;
+    if (!m_db->Get(token_key, &record).IsOK())
+        return false;
+
+    /* stage the record bytes in m_chunk before handing them over. */
+    const size_t length = record.size();
+    m_chunk.set_size(length);
+    memcpy(m_chunk.begin(), record.data(), length);
+
+    single_gram = new SingleGram(m_chunk.begin(), length, copy);
+    return true;
+}
+
+/*
+ * Store the chunk of single_gram as the record for index.
+ * Returns false when no database is held or the write fails.
+ */
+bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
+    if (NULL == m_db)
+        return false;
+
+    /* key: raw bytes of the token; value: raw bytes of the gram's chunk. */
+    const char * key_bytes = reinterpret_cast<const char*>(&index);
+    const char * value_bytes = reinterpret_cast<const char*>(single_gram->m_chunk.begin());
+
+    const std::string_view key(key_bytes, sizeof(phrase_token_t));
+    const std::string_view payload(value_bytes, single_gram->m_chunk.size());
+
+    const Status status = m_db->Set(key, payload);
+    return status.IsOK();
+}
+
+/*
+ * Delete the record stored under index.
+ * Returns false when no database is held or the removal fails.
+ */
+bool Bigram::remove(/* in */ phrase_token_t index){
+    if (NULL == m_db)
+        return false;
+
+    const char * key_bytes = reinterpret_cast<const char*>(&index);
+    const std::string_view key(key_bytes, sizeof(phrase_token_t));
+
+    const Status status = m_db->Remove(key);
+    return status.IsOK();
+}
+
+/*
+ * Record processor that appends the key of every visited record, read as
+ * a phrase_token_t, to a GArray supplied by the caller.  It never modifies
+ * the records: both callbacks return NOOP.
+ */
+class KeyCollectProcessor : public DBM::RecordProcessor {
+private:
+    /* destination array; not owned by the processor. */
+    GArray * m_items;
+public:
+    KeyCollectProcessor(GArray * items) : m_items(items) {}
+
+    /* Called for each existing record; collects its key. */
+    std::string_view ProcessFull(std::string_view key, std::string_view value) override {
+        assert(key.size() == sizeof(phrase_token_t));
+        /* never read past a malformed key when asserts are compiled out. */
+        if (key.size() != sizeof(phrase_token_t))
+            return NOOP;
+
+        /* the key buffer carries no alignment guarantee, so copy the bytes
+           out instead of dereferencing a casted pointer. */
+        phrase_token_t token = 0;
+        memcpy(&token, key.data(), sizeof(phrase_token_t));
+        g_array_append_val(m_items, token);
+        return NOOP;
+    }
+
+    /* DBM::ProcessEach() invokes ProcessEmpty once before and once after
+       the iteration, so reaching this is normal: nothing to collect. */
+    std::string_view ProcessEmpty(std::string_view key) override {
+        return NOOP;
+    }
+};
+
+/*
+ * Replace the contents of items with the key of every record in the
+ * database.  items is emptied even when no database is held.
+ * Returns false when no database is held or the traversal fails.
+ */
+bool Bigram::get_all_items(GArray * items){
+    /* start from an empty array whatever happens next. */
+    g_array_set_size(items, 0);
+
+    if (NULL == m_db)
+        return false;
+
+    /* read-only traversal: the collector never edits a record. */
+    KeyCollectProcessor collector(items);
+    return m_db->ProcessEach(&collector, false).IsOK();
+}
+
+/* Note: sync mask_out code with ngram_bdb.cpp. */
+/*
+ * Remove every record whose key matches (key & mask) == value, and apply
+ * SingleGram::mask_out() with the same arguments to each remaining record.
+ * A record left with length zero is removed; a record that was changed is
+ * written back.
+ * Returns false only when the keys cannot be listed.
+ */
+bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
+    GArray * tokens = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    const bool collected = get_all_items(tokens);
+
+    if (collected) {
+        for (size_t pos = 0; pos < tokens->len; ++pos) {
+            const phrase_token_t token = g_array_index(tokens, phrase_token_t, pos);
+
+            /* the key itself is masked out: drop the whole record. */
+            if ((token & mask) == value) {
+                check_result(remove(token));
+                continue;
+            }
+
+            SingleGram * single_gram = NULL;
+            check_result(load(token, single_gram));
+
+            /* only touch the database when the gram reports a change. */
+            const int masked = single_gram->mask_out(mask, value);
+            if (0 != masked) {
+                if (0 == single_gram->get_length()) {
+                    check_result(remove(token));
+                } else {
+                    check_result(store(token, single_gram));
+                }
+            }
+
+            delete single_gram;
+        }
+    }
+
+    g_array_free(tokens, TRUE);
+    return collected;
+}