/*
 *  libpinyin
 *  Library to deal with pinyin.
 *
 *  Copyright (C) 2016 Peng Wu
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "chewing_large_table2.h"
#include <errno.h>
#include "bdb_utils.h"

namespace pinyin{

/* keep dbm key compare function inside the corresponding dbm file
   to get more flexibility. */

/**
 * bdb_chewing_continue_search:
 * @dbt1: the prefix key supplied by the application.
 * @dbt2: the key of the record the cursor currently points at.
 * @returns: true when @dbt2 is strictly longer than @dbt1 and its leading
 *           keys compare equal to @dbt1 under pinyin_exact_compare2().
 *
 * Decides whether a cursor walk should keep visiting records.
 */
static bool bdb_chewing_continue_search(const DBT *dbt1,
                                        const DBT *dbt2) {
    ChewingKey * lhs_chewing = (ChewingKey *) dbt1->data;
    int lhs_chewing_length = dbt1->size / sizeof(ChewingKey);
    ChewingKey * rhs_chewing = (ChewingKey *) dbt2->data;
    int rhs_chewing_length = dbt2->size / sizeof(ChewingKey);

    /* Only continue when the key in dbm is longer than
       the key in application. */
    if (lhs_chewing_length >= rhs_chewing_length)
        return false;

    int min_chewing_length = lhs_chewing_length;

    int result = pinyin_exact_compare2
        (lhs_chewing, rhs_chewing, min_chewing_length);
    if (0 != result)
        return false;

    /* continue the longer chewing search. */
    return true;
}

/**
 * ChewingLargeTable2::ChewingLargeTable2:
 *
 * Creates an in-memory B-tree database (no backing file name is passed
 * to open) and initializes the entries array through init_entries().
 */
ChewingLargeTable2::ChewingLargeTable2() {
    /* create in-memory db. */
    m_db = NULL;
    int ret = db_create(&m_db, NULL, 0);
    assert(0 == ret);

    ret = m_db->open(m_db, NULL, NULL, NULL,
                     DB_BTREE, DB_CREATE, 0600);
    assert(0 == ret);

    m_entries = NULL;
    init_entries();
}

/**
 * ChewingLargeTable2::reset:
 *
 * Syncs and closes the current database handle (if any), sets m_db to
 * NULL and releases the entries through fini_entries().
 */
void ChewingLargeTable2::reset() {
    if (m_db) {
        m_db->sync(m_db, 0);
        m_db->close(m_db, 0);
        m_db = NULL;
    }

    fini_entries();
}

/* attach method */
/**
 * ChewingLargeTable2::attach:
 * @dbfile: path of the database file to open; NULL is rejected.
 * @flags: attach flags, translated to Berkeley DB open flags by
 *         attach_options().
 * @returns: true when the database file was opened successfully.
 *
 * Drops the current database and opens @dbfile in place.
 * Note that the table is reset even when @dbfile is NULL.
 */
bool ChewingLargeTable2::attach(const char * dbfile, guint32 flags) {
    reset();

    init_entries();

    u_int32_t db_flags = attach_options(flags);

    if (!dbfile)
        return false;

    int ret = db_create(&m_db, NULL, 0);
    assert(0 == ret);

    ret = m_db->open(m_db, NULL, dbfile, NULL,
                     DB_BTREE, db_flags, 0644);
    if (ret != 0)
        return false;

    return true;
}

/* load/store method */
/**
 * ChewingLargeTable2::load_db:
 * @filename: path of the database file to read.
 * @returns: true when the file was opened and copy_bdb() succeeded.
 *
 * Replaces the current database with a fresh in-memory one and fills it
 * from @filename using copy_bdb().  The temporary handle on @filename is
 * closed on every path, including failures.
 */
bool ChewingLargeTable2::load_db(const char * filename) {
    reset();

    init_entries();

    /* create in-memory db. */
    int ret = db_create(&m_db, NULL, 0);
    assert(0 == ret);

    ret = m_db->open(m_db, NULL, NULL, NULL,
                     DB_BTREE, DB_CREATE, 0600);
    if (ret != 0)
        return false;

    /* load db into memory. */
    DB * tmp_db = NULL;
    ret = db_create(&tmp_db, NULL, 0);
    assert(0 == ret);

    if (NULL == tmp_db)
        return false;

    ret = tmp_db->open(tmp_db, NULL, filename, NULL,
                       DB_BTREE, DB_RDONLY, 0600);

    bool success = (0 == ret) && copy_bdb(tmp_db, m_db);

    /* a handle must be discarded with close even when open failed. */
    tmp_db->close(tmp_db, 0);

    return success;
}

/**
 * ChewingLargeTable2::store_db:
 * @new_filename: path of the database file to write.
 * @returns: true when the file was created and copy_bdb() succeeded.
 *
 * Removes any existing @new_filename, creates a new B-tree database file
 * there and copies the current database into it with copy_bdb().
 * The temporary handle is closed on every path, including failures.
 */
bool ChewingLargeTable2::store_db(const char * new_filename) {
    DB * tmp_db = NULL;

    /* a missing file is fine, any other unlink failure is not. */
    int ret = unlink(new_filename);
    if (ret != 0 && errno != ENOENT)
        return false;

    ret = db_create(&tmp_db, NULL, 0);
    assert(0 == ret);

    if (NULL == tmp_db)
        return false;

    ret = tmp_db->open(tmp_db, NULL, new_filename, NULL,
                       DB_BTREE, DB_CREATE, 0600);
    if (ret != 0) {
        tmp_db->close(tmp_db, 0);
        return false;
    }

    if (!copy_bdb(m_db, tmp_db)) {
        tmp_db->close(tmp_db, 0);
        return false;
    }

    /* flush the handle being written, not m_db. */
    tmp_db->sync(tmp_db, 0);
    tmp_db->close(tmp_db, 0);

    return true;
}

/**
 * ChewingLargeTable2::search_internal:
 * @index: the index keys used as the database key.
 * @keys: the keys passed on to the entry search.
 * @ranges: receives the results of the entry search.
 * @returns: SEARCH_NONE when no record exists for @index; otherwise
 *           SEARCH_CONTINUED combined with the result of entry->search().
 *
 * Looks up the record for @index with length phrase_length and searches
 * @keys inside it.
 */
template<int phrase_length>
int ChewingLargeTable2::search_internal(/* in */ const ChewingKey index[],
                                        /* in */ const ChewingKey keys[],
                                        /* out */ PhraseIndexRanges ranges) const {
    int result = SEARCH_NONE;

    ChewingTableEntry<phrase_length> * entry =
        (ChewingTableEntry<phrase_length> *)
        g_ptr_array_index(m_entries, phrase_length);
    assert(NULL != entry);

    DBT db_key;
    memset(&db_key, 0, sizeof(DBT));
    db_key.data = (void *) index;
    db_key.size = phrase_length * sizeof(ChewingKey);

    DBT db_data;
    memset(&db_data, 0, sizeof(DBT));
    int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
    if (ret != 0)
        return result;

    /* continue searching. */
    result |= SEARCH_CONTINUED;

    entry->m_chunk.set_chunk(db_data.data, db_data.size, NULL);

    result = entry->search(keys, ranges) | result;

    return result;
}

/**
 * ChewingLargeTable2::search_internal:
 * @phrase_length: number of keys in @index and @keys, from 1 to 16;
 *                 any other value aborts.
 *
 * Run-time dispatcher to the search_internal template.
 */
int ChewingLargeTable2::search_internal(int phrase_length,
                                        /* in */ const ChewingKey index[],
                                        /* in */ const ChewingKey keys[],
                                        /* out */ PhraseIndexRanges ranges) const {
#define CASE(len) case len:                                     \
    {                                                           \
        return search_internal<len>(index, keys, ranges);       \
    }

    switch(phrase_length) {
        CASE(1);
        CASE(2);
        CASE(3);
        CASE(4);
        CASE(5);
        CASE(6);
        CASE(7);
        CASE(8);
        CASE(9);
        CASE(10);
        CASE(11);
        CASE(12);
        CASE(13);
        CASE(14);
        CASE(15);
        CASE(16);
    default:
        abort();
    }

#undef CASE

    return SEARCH_NONE;
}

/**
 * ChewingLargeTable2::search_suggestion_internal:
 * @db_data: the record content fetched from the database.
 * @prefix_len: number of keys in @prefix_keys.
 * @prefix_keys: the prefix to match.
 * @tokens: receives the results of the entry search.
 * @returns: the result of entry->search_suggestion().
 *
 * Runs the suggestion search over one record of length phrase_length.
 */
template<int phrase_length>
int ChewingLargeTable2::search_suggestion_internal
(/* in */ const DBT & db_data,
 int prefix_len,
 /* in */ const ChewingKey prefix_keys[],
 /* out */ PhraseTokens tokens) const {
    int result = SEARCH_NONE;

    ChewingTableEntry<phrase_length> * entry =
        (ChewingTableEntry<phrase_length> *)
        g_ptr_array_index(m_entries, phrase_length);
    assert(NULL != entry);

    entry->m_chunk.set_chunk(db_data.data, db_data.size, NULL);

    result = entry->search_suggestion
        (prefix_len, prefix_keys, tokens) | result;

    return result;
}

/**
 * ChewingLargeTable2::search_suggestion_internal:
 * @phrase_length: key count of the record, from 1 to 16 and greater
 *                 than @prefix_len; any other value aborts.
 *
 * Run-time dispatcher to the search_suggestion_internal template.
 */
int ChewingLargeTable2::search_suggestion_internal
(int phrase_length,
 /* in */ const DBT & db_data,
 int prefix_len,
 /* in */ const ChewingKey prefix_keys[],
 /* out */ PhraseTokens tokens) const {
    assert(prefix_len < phrase_length);

#define CASE(len) case len:                                     \
    {                                                           \
        return search_suggestion_internal<len>                  \
            (db_data, prefix_len, prefix_keys, tokens);         \
    }

    switch(phrase_length) {
        CASE(1);
        CASE(2);
        CASE(3);
        CASE(4);
        CASE(5);
        CASE(6);
        CASE(7);
        CASE(8);
        CASE(9);
        CASE(10);
        CASE(11);
        CASE(12);
        CASE(13);
        CASE(14);
        CASE(15);
        CASE(16);
    default:
        abort();
    }

#undef CASE

    return SEARCH_NONE;
}

/**
 * ChewingLargeTable2::add_index_internal:
 * @index: the index keys used as the database key.
 * @keys: the keys stored inside the entry.
 * @token: the phrase token to add.
 * @returns: ERROR_FILE_CORRUPTION when a database put fails; ERROR_OK
 *           when a new record was created; otherwise the result of
 *           entry->add_index().
 *
 * Adds @token under @index.  When the record is new, empty records are
 * also stored for every shorter prefix of @index that does not exist yet,
 * stopping at the first prefix that is already present.
 */
template<int phrase_length>
int ChewingLargeTable2::add_index_internal(/* in */ const ChewingKey index[],
                                           /* in */ const ChewingKey keys[],
                                           /* in */ phrase_token_t token) {
    ChewingTableEntry<phrase_length> * entry =
        (ChewingTableEntry<phrase_length> *)
        g_ptr_array_index(m_entries, phrase_length);
    assert(NULL != entry);

    /* load chewing table entry. */
    DBT db_key;
    memset(&db_key, 0, sizeof(DBT));
    db_key.data = (void *) index;
    db_key.size = phrase_length * sizeof(ChewingKey);

    DBT db_data;
    memset(&db_data, 0, sizeof(DBT));
    int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
    if (ret != 0) {
        /* new entry. */
        ChewingTableEntry<phrase_length> new_entry;
        new_entry.add_index(keys, token);

        memset(&db_data, 0, sizeof(DBT));
        db_data.data = new_entry.m_chunk.begin();
        db_data.size = new_entry.m_chunk.size();

        ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
        if (ret != 0)
            return ERROR_FILE_CORRUPTION;

        /* recursively add keys for continued information. */
        for (size_t len = phrase_length - 1; len > 0; --len) {
            memset(&db_key, 0, sizeof(DBT));
            db_key.data = (void *) index;
            db_key.size = len * sizeof(ChewingKey);

            memset(&db_data, 0, sizeof(DBT));
            ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
            /* found entry. */
            if (0 == ret)
                return ERROR_OK;

            /* new entry with empty content. */
            memset(&db_data, 0, sizeof(DBT));
            ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
            if (ret != 0)
                return ERROR_FILE_CORRUPTION;
        }

        return ERROR_OK;
    }

    /* already have keys. */
    entry->m_chunk.set_chunk(db_data.data, db_data.size, NULL);

    int result = entry->add_index(keys, token);

    /* store the entry. */
    memset(&db_data, 0, sizeof(DBT));
    db_data.data = entry->m_chunk.begin();
    db_data.size = entry->m_chunk.size();
    ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
    if (ret != 0)
        return ERROR_FILE_CORRUPTION;

    return result;
}

/**
 * ChewingLargeTable2::add_index_internal:
 * @phrase_length: number of keys in @index and @keys, from 1 to 16;
 *                 any other value aborts.
 *
 * Run-time dispatcher to the add_index_internal template.
 */
int ChewingLargeTable2::add_index_internal(int phrase_length,
                                           /* in */ const ChewingKey index[],
                                           /* in */ const ChewingKey keys[],
                                           /* in */ phrase_token_t token) {
#define CASE(len) case len:                                     \
    {                                                           \
        return add_index_internal<len>(index, keys, token);     \
    }

    switch(phrase_length) {
        CASE(1);
        CASE(2);
        CASE(3);
        CASE(4);
        CASE(5);
        CASE(6);
        CASE(7);
        CASE(8);
        CASE(9);
        CASE(10);
        CASE(11);
        CASE(12);
        CASE(13);
        CASE(14);
        CASE(15);
        CASE(16);
    default:
        abort();
    }

#undef CASE

    return ERROR_FILE_CORRUPTION;
}

/**
 * ChewingLargeTable2::remove_index_internal:
 * @index: the index keys used as the database key.
 * @keys: the keys identifying the item inside the entry.
 * @token: the phrase token to remove.
 * @returns: ERROR_REMOVE_ITEM_DONOT_EXISTS when no record exists for
 *           @index; the result of entry->remove_index() when that is not
 *           ERROR_OK; ERROR_FILE_CORRUPTION when the write back fails;
 *           ERROR_OK otherwise.
 *
 * Removes @token from the record stored under @index and writes the
 * record back.
 */
template<int phrase_length>
int ChewingLargeTable2::remove_index_internal(/* in */ const ChewingKey index[],
                                              /* in */ const ChewingKey keys[],
                                              /* in */ phrase_token_t token) {
    ChewingTableEntry<phrase_length> * entry =
        (ChewingTableEntry<phrase_length> *)
        g_ptr_array_index(m_entries, phrase_length);
    assert(NULL != entry);

    DBT db_key;
    memset(&db_key, 0, sizeof(DBT));
    db_key.data = (void *) index;
    db_key.size = phrase_length * sizeof(ChewingKey);

    DBT db_data;
    memset(&db_data, 0, sizeof(DBT));
    int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
    if (ret != 0)
        return ERROR_REMOVE_ITEM_DONOT_EXISTS;

    entry->m_chunk.set_chunk(db_data.data, db_data.size, NULL);

    int result = entry->remove_index(keys, token);
    if (ERROR_OK != result)
        return result;

    /* removed the token. */
    memset(&db_data, 0, sizeof(DBT));
    db_data.data = entry->m_chunk.begin();
    db_data.size = entry->m_chunk.size();
    ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
    if (ret != 0)
        return ERROR_FILE_CORRUPTION;

    return ERROR_OK;
}

/**
 * ChewingLargeTable2::remove_index_internal:
 * @phrase_length: number of keys in @index and @keys, from 1 to 16;
 *                 any other value aborts.
 *
 * Run-time dispatcher to the remove_index_internal template.
 */
int ChewingLargeTable2::remove_index_internal(int phrase_length,
                                              /* in */ const ChewingKey index[],
                                              /* in */ const ChewingKey keys[],
                                              /* in */ phrase_token_t token) {
#define CASE(len) case len:                                     \
    {                                                           \
        return remove_index_internal<len>(index, keys, token);  \
    }

    switch(phrase_length) {
        CASE(1);
        CASE(2);
        CASE(3);
        CASE(4);
        CASE(5);
        CASE(6);
        CASE(7);
        CASE(8);
        CASE(9);
        CASE(10);
        CASE(11);
        CASE(12);
        CASE(13);
        CASE(14);
        CASE(15);
        CASE(16);
    default:
        abort();
    }

#undef CASE

    return ERROR_FILE_CORRUPTION;
}

/* mask out method */
/**
 * ChewingLargeTable2::mask_out:
 * @mask: passed through to the mask_out method of every entry.
 * @value: passed through to the mask_out method of every entry.
 * @returns: false when there is no database or no cursor could be
 *           obtained; true after every record has been processed.
 *
 * Walks all records with a cursor, applies mask_out(@mask, @value) to
 * each one, writes it back in place and finally syncs the database.
 * A record whose key length is outside 1 to 16 aborts.
 */
bool ChewingLargeTable2::mask_out(phrase_token_t mask,
                                  phrase_token_t value) {
    DBC * cursorp = NULL;
    DBT db_key, db_data;

    if (NULL == m_db)
        return false;

    /* Get a cursor */
    int ret = m_db->cursor(m_db, NULL, &cursorp, 0);
    if (ret != 0 || NULL == cursorp)
        return false;

    /* Initialize our DBTs. */
    memset(&db_key, 0, sizeof(DBT));
    memset(&db_data, 0, sizeof(DBT));

    /* Iterate over the database, retrieving each record in turn. */
    while((ret = cursorp->c_get(cursorp, &db_key, &db_data, DB_NEXT)) == 0) {
        int phrase_length = db_key.size / sizeof(ChewingKey);

#define CASE(len) case len:                                             \
        {                                                               \
            ChewingTableEntry<len> * entry =                            \
                (ChewingTableEntry<len> *)                              \
                g_ptr_array_index(m_entries, phrase_length);            \
            assert(NULL != entry);                                      \
                                                                        \
            entry->m_chunk.set_chunk(db_data.data, db_data.size, NULL); \
                                                                        \
            entry->mask_out(mask, value);                               \
                                                                        \
            memset(&db_data, 0, sizeof(DBT));                           \
            db_data.data = entry->m_chunk.begin();                      \
            db_data.size = entry->m_chunk.size();                       \
            int put_ret = cursorp->put                                  \
                (cursorp, &db_key, &db_data, DB_CURRENT);               \
            assert(put_ret == 0);                                       \
            break;                                                      \
        }

        switch(phrase_length) {
            CASE(1);
            CASE(2);
            CASE(3);
            CASE(4);
            CASE(5);
            CASE(6);
            CASE(7);
            CASE(8);
            CASE(9);
            CASE(10);
            CASE(11);
            CASE(12);
            CASE(13);
            CASE(14);
            CASE(15);
            CASE(16);
        default:
            abort();
        }

#undef CASE

        /* Initialize our DBTs. */
        memset(&db_key, 0, sizeof(DBT));
        memset(&db_data, 0, sizeof(DBT));
    }
    /* the walk is expected to stop only at the end of the database. */
    assert(ret == DB_NOTFOUND);

    /* Cursors must be closed */
    if (cursorp != NULL)
        cursorp->c_close(cursorp);

    m_db->sync(m_db, 0);

    return true;
}

/* search_suggesion method */
/**
 * ChewingLargeTable2::search_suggestion:
 * @prefix_len: number of keys in @prefix_keys.
 * @prefix_keys: the prefix to look up.
 * @tokens: receives the tokens collected from the matching records.
 * @returns: SEARCH_NONE when there is no database, no cursor, or no
 *           record for the prefix; otherwise the OR of the results of
 *           search_suggestion_internal() over the visited records.
 *
 * Positions a cursor on the record of the prefix, then visits the
 * following records for as long as bdb_chewing_continue_search() accepts
 * their keys.
 */
int ChewingLargeTable2::search_suggestion
(int prefix_len,
 /* in */ const ChewingKey prefix_keys[],
 /* out */ PhraseTokens tokens) const {
    ChewingKey index[MAX_PHRASE_LENGTH];
    int result = SEARCH_NONE;

    if (NULL == m_db)
        return result;

    if (contains_incomplete_pinyin(prefix_keys, prefix_len))
        compute_incomplete_chewing_index(prefix_keys, index, prefix_len);
    else
        compute_chewing_index(prefix_keys, index, prefix_len);

    DBC * cursorp = NULL;

    /* Get a cursor */
    int ret = m_db->cursor(m_db, NULL, &cursorp, 0);
    if (ret != 0)
        return result;

    DBT db_key1;
    memset(&db_key1, 0, sizeof(DBT));
    db_key1.data = (void *) index;
    db_key1.size = prefix_len * sizeof(ChewingKey);

    DBT db_data;
    memset(&db_data, 0, sizeof(DBT));

    /* Get the prefix entry */
    ret = cursorp->c_get(cursorp, &db_key1, &db_data, DB_SET);
    if (ret != 0) {
        cursorp->c_close(cursorp);
        return result;
    }

    /* Get the next entry */
    DBT db_key2;
    memset(&db_key2, 0, sizeof(DBT));
    memset(&db_data, 0, sizeof(DBT));
    ret = cursorp->c_get(cursorp, &db_key2, &db_data, DB_NEXT);
    if (ret != 0) {
        cursorp->c_close(cursorp);
        return result;
    }

    while(bdb_chewing_continue_search(&db_key1, &db_key2)) {
        int phrase_length = db_key2.size / sizeof(ChewingKey);

        result = search_suggestion_internal
            (phrase_length, db_data, prefix_len, prefix_keys, tokens) | result;

        memset(&db_key2, 0, sizeof(DBT));
        memset(&db_data, 0, sizeof(DBT));
        ret = cursorp->c_get(cursorp, &db_key2, &db_data, DB_NEXT);
        /* stop at the end of the database or on any cursor error. */
        if (ret != 0)
            break;
    }

    cursorp->c_close(cursorp);
    return result;
}

};