summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2015-04-09 16:02:05 +0800
committerPeng Wu <alexepico@gmail.com>2015-04-09 16:02:05 +0800
commit941d6c364906a932b4f2874177962e50188e38de (patch)
treebc19ab189d9e2e140c535a627e77b3bdc94c472a /src
parent9a6dfc762edd7ce29a9ef3d225dc8bd51bc6fcb9 (diff)
downloadlibpinyin-941d6c364906a932b4f2874177962e50188e38de.tar.gz
libpinyin-941d6c364906a932b4f2874177962e50188e38de.tar.xz
libpinyin-941d6c364906a932b4f2874177962e50188e38de.zip
re-factor Bigram
Diffstat (limited to 'src')
-rw-r--r--src/storage/ngram.cpp250
-rw-r--r--src/storage/ngram_bdb.cpp272
2 files changed, 272 insertions, 250 deletions
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
index 3964388..4f4e787 100644
--- a/src/storage/ngram.cpp
+++ b/src/storage/ngram.cpp
@@ -270,256 +270,6 @@ bool SingleGram::set_freq( /* in */ phrase_token_t token,
return false;
}
-bool Bigram::load_db(const char * dbfile){
- reset();
-
- /* create in memory db. */
- int ret = db_create(&m_db, NULL, 0);
- assert(ret == 0);
-
- ret = m_db->open(m_db, NULL, NULL, NULL,
- DB_HASH, DB_CREATE, 0600);
- if ( ret != 0 )
- return false;
-
- /* load db into memory. */
- DB * tmp_db = NULL;
- ret = db_create(&tmp_db, NULL, 0);
- assert(ret == 0);
-
- if (NULL == tmp_db)
- return false;
-
- ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
- DB_HASH, DB_RDONLY, 0600);
- if ( ret != 0 )
- return false;
-
- DBC * cursorp = NULL;
- DBT key, data;
-
- /* Get a cursor */
- tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
-
- if (NULL == cursorp)
- return false;
-
- /* Initialize our DBTs. */
- memset(&key, 0, sizeof(DBT));
- memset(&data, 0, sizeof(DBT));
-
- /* Iterate over the database, retrieving each record in turn. */
- while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
- int ret = m_db->put(m_db, NULL, &key, &data, 0);
- assert(ret == 0);
- }
- assert (ret == DB_NOTFOUND);
-
- /* Cursors must be closed */
- if ( cursorp != NULL )
- cursorp->c_close(cursorp);
-
- if ( tmp_db != NULL )
- tmp_db->close(tmp_db, 0);
-
- return true;
-}
-
-bool Bigram::save_db(const char * dbfile){
- DB * tmp_db = NULL;
-
- int ret = unlink(dbfile);
- if ( ret != 0 && errno != ENOENT)
- return false;
-
- ret = db_create(&tmp_db, NULL, 0);
- assert(ret == 0);
-
- if (NULL == tmp_db)
- return false;
-
- ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
- DB_HASH, DB_CREATE, 0600);
- if ( ret != 0 )
- return false;
-
- DBC * cursorp = NULL;
- DBT key, data;
- /* Get a cursor */
- m_db->cursor(m_db, NULL, &cursorp, 0);
-
- if (NULL == cursorp)
- return false;
-
- /* Initialize our DBTs. */
- memset(&key, 0, sizeof(DBT));
- memset(&data, 0, sizeof(DBT));
-
- /* Iterate over the database, retrieving each record in turn. */
- while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
- int ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
- assert(ret == 0);
- }
- assert (ret == DB_NOTFOUND);
-
- /* Cursors must be closed */
- if ( cursorp != NULL )
- cursorp->c_close(cursorp);
-
- if ( tmp_db != NULL )
- tmp_db->close(tmp_db, 0);
-
- return true;
-}
-
-bool Bigram::attach(const char * dbfile, guint32 flags){
- reset();
- u_int32_t db_flags = 0;
-
- if ( flags & ATTACH_READONLY )
- db_flags |= DB_RDONLY;
- if ( flags & ATTACH_READWRITE )
- assert( !( flags & ATTACH_READONLY ) );
- if ( flags & ATTACH_CREATE )
- db_flags |= DB_CREATE;
-
- if ( !dbfile )
- return false;
- int ret = db_create(&m_db, NULL, 0);
- if ( ret != 0 )
- assert(false);
-
- ret = m_db->open(m_db, NULL, dbfile, NULL,
- DB_HASH, db_flags, 0644);
- if ( ret != 0)
- return false;
-
- return true;
-}
-
-bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
- single_gram = NULL;
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = &index;
- db_key.size = sizeof(phrase_token_t);
-
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
- if ( ret != 0 )
- return false;
-
- single_gram = new SingleGram(db_data.data, db_data.size);
- return true;
-}
-
-bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = &index;
- db_key.size = sizeof(phrase_token_t);
- DBT db_data;
- memset(&db_data, 0, sizeof(DBT));
- db_data.data = single_gram->m_chunk.begin();
- db_data.size = single_gram->m_chunk.size();
-
- int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
- return ret == 0;
-}
-
-bool Bigram::remove(/* in */ phrase_token_t index){
- if ( !m_db )
- return false;
-
- DBT db_key;
- memset(&db_key, 0, sizeof(DBT));
- db_key.data = &index;
- db_key.size = sizeof(phrase_token_t);
-
- int ret = m_db->del(m_db, NULL, &db_key, 0);
- return 0 == ret;
-}
-
-bool Bigram::get_all_items(GArray * items){
- g_array_set_size(items, 0);
-
- if ( !m_db )
- return false;
-
- DBC * cursorp = NULL;
- DBT key, data;
- int ret;
- /* Get a cursor */
- m_db->cursor(m_db, NULL, &cursorp, 0);
-
- if (NULL == cursorp)
- return false;
-
- /* Initialize our DBTs. */
- memset(&key, 0, sizeof(DBT));
- memset(&data, 0, sizeof(DBT));
-
- /* Iterate over the database, retrieving each record in turn. */
- while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
- assert(key.size == sizeof(phrase_token_t));
- phrase_token_t * token = (phrase_token_t *)key.data;
- g_array_append_val(items, *token);
- }
-
- assert (ret == DB_NOTFOUND);
-
- /* Cursors must be closed */
- if (cursorp != NULL)
- cursorp->c_close(cursorp);
-
- return true;
-}
-
-bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
- GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
-
- if (!get_all_items(items)) {
- g_array_free(items, TRUE);
- return false;
- }
-
- for (size_t i = 0; i < items->len; ++i) {
- phrase_token_t index = g_array_index(items, phrase_token_t, i);
-
- if ((index & mask) == value) {
- assert(remove(index));
- continue;
- }
-
- SingleGram * gram = NULL;
- assert(load(index, gram));
-
- int num = gram->mask_out(mask, value);
- if (0 == num) {
- delete gram;
- continue;
- }
-
- if (0 == gram->get_length()) {
- assert(remove(index));
- } else {
- assert(store(index, gram));
- }
-
- delete gram;
- }
-
- g_array_free(items, TRUE);
- return true;
-}
-
namespace pinyin{
diff --git a/src/storage/ngram_bdb.cpp b/src/storage/ngram_bdb.cpp
new file mode 100644
index 0000000..954e701
--- /dev/null
+++ b/src/storage/ngram_bdb.cpp
@@ -0,0 +1,272 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2015 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "ngram.h"
+
+/*
+ * Bigram::load_db:
+ * @dbfile: path of the Berkeley DB hash file to read.
+ *
+ * Calls reset(), creates an unnamed (in-memory) hash database in m_db
+ * and copies every key/data pair found in @dbfile into it.
+ *
+ * Returns true only when the whole file was copied.  On failure m_db
+ * may be left holding an empty or partially filled database.
+ */
+bool Bigram::load_db(const char * dbfile){
+    reset();
+
+    /* create in memory db. */
+    int ret = db_create(&m_db, NULL, 0);
+    assert(ret == 0);
+
+    ret = m_db->open(m_db, NULL, NULL, NULL,
+                     DB_HASH, DB_CREATE, 0600);
+    if ( ret != 0 )
+        return false;
+
+    /* load db into memory. */
+    DB * tmp_db = NULL;
+    ret = db_create(&tmp_db, NULL, 0);
+    assert(ret == 0);
+
+    if (NULL == tmp_db)
+        return false;
+
+    ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
+                       DB_HASH, DB_RDONLY, 0600);
+    if ( ret != 0 ) {
+        /* a handle whose open failed still has to be discarded. */
+        tmp_db->close(tmp_db, 0);
+        return false;
+    }
+
+    /* Get a cursor */
+    DBC * cursorp = NULL;
+    ret = tmp_db->cursor(tmp_db, NULL, &cursorp, 0);
+    if (ret != 0 || NULL == cursorp) {
+        tmp_db->close(tmp_db, 0);
+        return false;
+    }
+
+    /* Initialize our DBTs. */
+    DBT key, data;
+    memset(&key, 0, sizeof(DBT));
+    memset(&data, 0, sizeof(DBT));
+
+    /* Iterate over the database, retrieving each record in turn. */
+    while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+        const int put_ret = m_db->put(m_db, NULL, &key, &data, 0);
+        assert(put_ret == 0);
+        /* leaving the loop with ret == 0 marks the copy as incomplete. */
+        if (put_ret != 0)
+            break;
+    }
+
+    /* the scan is complete only if it stopped because records ran out. */
+    const bool complete = (DB_NOTFOUND == ret);
+    assert(complete);
+
+    /* Cursors must be closed */
+    cursorp->c_close(cursorp);
+    tmp_db->close(tmp_db, 0);
+
+    return complete;
+}
+
+/*
+ * Bigram::save_db:
+ * @dbfile: path of the Berkeley DB hash file to write.
+ *
+ * Unlinks any existing @dbfile, creates a new hash database at that
+ * path and copies every key/data pair of m_db into it.
+ *
+ * Returns true only when every record was written; returns false
+ * when no database is open, the old file cannot be removed, the new
+ * file cannot be opened, or the copy stops early.
+ */
+bool Bigram::save_db(const char * dbfile){
+    /* nothing to save without an open database. */
+    if ( !m_db )
+        return false;
+
+    int ret = unlink(dbfile);
+    /* a missing file is fine, any other unlink error is not. */
+    if ( ret != 0 && errno != ENOENT)
+        return false;
+
+    DB * tmp_db = NULL;
+    ret = db_create(&tmp_db, NULL, 0);
+    assert(ret == 0);
+
+    if (NULL == tmp_db)
+        return false;
+
+    ret = tmp_db->open(tmp_db, NULL, dbfile, NULL,
+                       DB_HASH, DB_CREATE, 0600);
+    if ( ret != 0 ) {
+        /* a handle whose open failed still has to be discarded. */
+        tmp_db->close(tmp_db, 0);
+        return false;
+    }
+
+    /* Get a cursor */
+    DBC * cursorp = NULL;
+    ret = m_db->cursor(m_db, NULL, &cursorp, 0);
+    if (ret != 0 || NULL == cursorp) {
+        tmp_db->close(tmp_db, 0);
+        return false;
+    }
+
+    /* Initialize our DBTs. */
+    DBT key, data;
+    memset(&key, 0, sizeof(DBT));
+    memset(&data, 0, sizeof(DBT));
+
+    /* Iterate over the database, retrieving each record in turn. */
+    while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+        const int put_ret = tmp_db->put(tmp_db, NULL, &key, &data, 0);
+        assert(put_ret == 0);
+        /* leaving the loop with ret == 0 marks the copy as incomplete. */
+        if (put_ret != 0)
+            break;
+    }
+
+    /* the scan is complete only if it stopped because records ran out. */
+    const bool complete = (DB_NOTFOUND == ret);
+    assert(complete);
+
+    /* Cursors must be closed */
+    cursorp->c_close(cursorp);
+    tmp_db->close(tmp_db, 0);
+
+    return complete;
+}
+
+/*
+ * Bigram::attach:
+ * @dbfile: path of the Berkeley DB hash file to open.
+ * @flags: combination of ATTACH_READONLY, ATTACH_READWRITE and
+ *         ATTACH_CREATE.
+ *
+ * Calls reset() and opens @dbfile directly as m_db.  ATTACH_READONLY
+ * maps to DB_RDONLY and ATTACH_CREATE to DB_CREATE; ATTACH_READWRITE
+ * adds no flag and must not be combined with ATTACH_READONLY.
+ *
+ * Returns true when the database was opened.  On failure m_db is
+ * left NULL.
+ */
+bool Bigram::attach(const char * dbfile, guint32 flags){
+    reset();
+    u_int32_t db_flags = 0;
+
+    if ( flags & ATTACH_READONLY )
+        db_flags |= DB_RDONLY;
+    if ( flags & ATTACH_READWRITE )
+        assert( !( flags & ATTACH_READONLY ) );
+    if ( flags & ATTACH_CREATE )
+        db_flags |= DB_CREATE;
+
+    if ( !dbfile )
+        return false;
+
+    int ret = db_create(&m_db, NULL, 0);
+    assert(ret == 0);
+    if ( ret != 0 || NULL == m_db ) {
+        /* do not fall through to m_db->open when assert is compiled out. */
+        m_db = NULL;
+        return false;
+    }
+
+    ret = m_db->open(m_db, NULL, dbfile, NULL,
+                     DB_HASH, db_flags, 0644);
+    if ( ret != 0 ) {
+        /* a handle whose open failed is unusable; discard it so the
+           other methods see "no database" instead of a dead handle. */
+        m_db->close(m_db, 0);
+        m_db = NULL;
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Bigram::load:
+ * @index: token whose record is looked up.
+ * @single_gram: set to a newly allocated SingleGram built from the
+ *               stored record, or to NULL when nothing is loaded.
+ *
+ * Returns false when no database is open or the lookup fails.  The
+ * SingleGram is created with new, so the caller has to delete it.
+ */
+bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
+    single_gram = NULL;
+
+    if ( NULL == m_db )
+        return false;
+
+    /* the key is the raw bytes of the token. */
+    DBT lookup_key;
+    memset(&lookup_key, 0, sizeof(DBT));
+    lookup_key.size = sizeof(phrase_token_t);
+    lookup_key.data = &index;
+
+    DBT record;
+    memset(&record, 0, sizeof(DBT));
+
+    if ( 0 != m_db->get(m_db, NULL, &lookup_key, &record, 0) )
+        return false;
+
+    single_gram = new SingleGram(record.data, record.size);
+    return true;
+}
+
+/*
+ * Bigram::store:
+ * @index: token used as the record key.
+ * @single_gram: gram whose m_chunk bytes become the record data.
+ *
+ * Writes the record with DB->put.  Returns false when no database
+ * is open or the put fails.
+ */
+bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
+    if ( NULL == m_db )
+        return false;
+
+    /* the value is the gram's chunk, passed by pointer and size. */
+    DBT record;
+    memset(&record, 0, sizeof(DBT));
+    record.data = single_gram->m_chunk.begin();
+    record.size = single_gram->m_chunk.size();
+
+    /* the key is the raw bytes of the token. */
+    DBT record_key;
+    memset(&record_key, 0, sizeof(DBT));
+    record_key.data = &index;
+    record_key.size = sizeof(phrase_token_t);
+
+    return 0 == m_db->put(m_db, NULL, &record_key, &record, 0);
+}
+
+/*
+ * Bigram::remove:
+ * @index: token whose record is deleted.
+ *
+ * Returns true when DB->del reports success, false when no database
+ * is open or the delete fails.
+ */
+bool Bigram::remove(/* in */ phrase_token_t index){
+    if ( NULL == m_db )
+        return false;
+
+    /* the key is the raw bytes of the token. */
+    DBT doomed_key;
+    memset(&doomed_key, 0, sizeof(DBT));
+    doomed_key.size = sizeof(phrase_token_t);
+    doomed_key.data = &index;
+
+    const int status = m_db->del(m_db, NULL, &doomed_key, 0);
+    return status == 0;
+}
+
+/*
+ * Bigram::get_all_items:
+ * @items: GArray of phrase_token_t; it is emptied first and then
+ *         receives the key of every record in the database.
+ *
+ * Returns false when no database is open or no cursor could be
+ * obtained, true once the whole database has been walked.
+ */
+bool Bigram::get_all_items(GArray * items){
+    g_array_set_size(items, 0);
+
+    if ( NULL == m_db )
+        return false;
+
+    /* Get a cursor */
+    DBC * cursor = NULL;
+    m_db->cursor(m_db, NULL, &cursor, 0);
+    if (NULL == cursor)
+        return false;
+
+    /* Initialize our DBTs. */
+    DBT db_key, db_data;
+    memset(&db_key, 0, sizeof(DBT));
+    memset(&db_data, 0, sizeof(DBT));
+
+    /* Walk the records one by one, collecting only the keys. */
+    int status = cursor->c_get(cursor, &db_key, &db_data, DB_NEXT);
+    while (0 == status) {
+        assert(db_key.size == sizeof(phrase_token_t));
+        phrase_token_t * token = (phrase_token_t *)db_key.data;
+        g_array_append_val(items, *token);
+
+        status = cursor->c_get(cursor, &db_key, &db_data, DB_NEXT);
+    }
+
+    /* the walk must end because the records ran out. */
+    assert (DB_NOTFOUND == status);
+
+    /* Cursors must be closed; the cursor is known to be non-NULL here. */
+    cursor->c_close(cursor);
+
+    return true;
+}
+
+/*
+ * Bigram::mask_out:
+ * @mask: bit mask applied to each token.
+ * @value: tokens with (token & mask) == value are dropped.
+ *
+ * Removes every record whose key matches @mask/@value.  For each
+ * remaining record it runs SingleGram::mask_out, then stores the
+ * gram back if anything changed, or removes the record if the gram
+ * became empty.
+ *
+ * The database calls are made outside assert() so they still run
+ * when NDEBUG is defined.  Returns false when the keys cannot be
+ * listed or any load/store/remove fails, true otherwise.
+ */
+bool Bigram::mask_out(phrase_token_t mask, phrase_token_t value){
+    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    if (!get_all_items(items)) {
+        g_array_free(items, TRUE);
+        return false;
+    }
+
+    bool success = true;
+
+    for (size_t i = 0; i < items->len; ++i) {
+        phrase_token_t index = g_array_index(items, phrase_token_t, i);
+
+        /* the whole record belongs to a masked-out token. */
+        if ((index & mask) == value) {
+            const bool removed = remove(index);
+            assert(removed);
+            if (!removed)
+                success = false;
+            continue;
+        }
+
+        SingleGram * gram = NULL;
+        const bool loaded = load(index, gram);
+        assert(loaded);
+        if (!loaded) {
+            /* gram is NULL here; skip instead of dereferencing it. */
+            success = false;
+            continue;
+        }
+
+        /* nothing inside this gram matched, leave the record alone. */
+        int num = gram->mask_out(mask, value);
+        if (0 == num) {
+            delete gram;
+            continue;
+        }
+
+        bool updated = false;
+        if (0 == gram->get_length()) {
+            updated = remove(index);
+        } else {
+            updated = store(index, gram);
+        }
+        assert(updated);
+        if (!updated)
+            success = false;
+
+        delete gram;
+    }
+
+    g_array_free(items, TRUE);
+    return success;
+}