From 647b365bbf25bc1e8db10aa26427fddbbbaf4626 Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Mon, 18 Apr 2011 16:16:59 +0800
Subject: refine bi-gram

---
 src/storage/ngram.cpp | 153 ++++++++++++++++++++------------------------------
 src/storage/ngram.h   |  38 +++++++------
 2 files changed, 80 insertions(+), 111 deletions(-)

(limited to 'src/storage')

diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
index e836ee5..49e7c6f 100644
--- a/src/storage/ngram.cpp
+++ b/src/storage/ngram.cpp
@@ -217,132 +217,99 @@ bool SingleGram::set_freq( /* in */ phrase_token_t token,
 }
 
 
-bool Bigram::attach(const char * systemfile, const char * userfile){
+bool Bigram::attach(const char * dbfile, guint32 flags){
     reset();
-    if ( systemfile ){
-        int ret = db_create(&m_system, NULL, 0);
+    u_int32_t db_flags = 0;
+
+    if ( flags & ATTACH_READONLY )
+        db_flags |= DB_RDONLY;
+    if ( flags & ATTACH_READWRITE )
+        /* nothing */;
+    if ( flags & ATTACH_CREATE )
+        db_flags |= DB_CREATE;
+
+    if ( dbfile ){
+        int ret = db_create(&m_db, NULL, 0);
         if ( ret != 0 )
             assert(false);
 
-        m_system->open(m_system, NULL, systemfile, NULL,
-                       DB_HASH, DB_RDONLY, 0664);
+        ret = m_db->open(m_db, NULL, dbfile, NULL,
+                         DB_HASH, db_flags, 0664);
         if ( ret != 0)
             return false;
     }
-    if ( userfile ){
-        int ret = db_create(&m_user, NULL, 0);
-        if ( ret != 0 )
-            assert(false);
-
-        m_user->open(m_user, NULL, userfile, NULL, DB_HASH, DB_CREATE, 0664);
-        if ( ret != 0)
-            return false;
-    }
     return true;
 }
 
-bool Bigram::load(phrase_token_t index, SingleGram * & system_gram, SingleGram * & user_gram){
+bool Bigram::load(phrase_token_t index, SingleGram * & single_gram){
     DBT db_key;
     memset(&db_key, 0, sizeof(DBT));
     db_key.data = &index;
     db_key.size = sizeof(phrase_token_t);
 
-    system_gram = NULL; user_gram = NULL;
-    if ( m_system ){
-        DBT db_data;
-        memset(&db_data, 0, sizeof(DBT));
-        int ret = m_system->get(m_system, NULL, &db_key, &db_data, 0);
-        if ( ret == 0 )
-            system_gram = new SingleGram(db_data.data, db_data.size);
-    }
-    if ( m_user ){
-        DBT db_data;
-        memset(&db_data, 0, sizeof(DBT));
-        int ret = m_user->get(m_user, NULL, &db_key, &db_data, 0);
-        if ( ret == 0 )
-            user_gram = new SingleGram(db_data.data, db_data.size);
-    }
+    single_gram = NULL;
+    if ( !m_db )
+        return false;
+
+    DBT db_data;
+    memset(&db_data, 0, sizeof(DBT));
+    int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+    if ( ret != 0 )
+        return false;
+
+    single_gram = new SingleGram(db_data.data, db_data.size);
     return true;
 }
 
-bool Bigram::store(phrase_token_t index, SingleGram * user_gram){
-    if ( !m_user )
+bool Bigram::store(phrase_token_t index, SingleGram * single_gram){
+    if ( !m_db )
         return false;
+
     DBT db_key;
     memset(&db_key, 0, sizeof(DBT));
     db_key.data = &index;
     db_key.size = sizeof(phrase_token_t);
     DBT db_data;
     memset(&db_data, 0, sizeof(DBT));
-    db_data.data = user_gram->m_chunk.begin();
-    db_data.size = user_gram->m_chunk.size();
+    db_data.data = single_gram->m_chunk.begin();
+    db_data.size = single_gram->m_chunk.size();
 
-    int ret = m_user->put(m_user, NULL, &db_key, &db_data, 0);
+    int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
     return ret == 0;
 }
 
-bool Bigram::get_all_items(GArray * system, GArray * user){
-    bool retval = false;
-    g_array_set_size(system, 0);
-    g_array_set_size(user, 0);
-    if ( m_system ){
-        DBC * cursorp;
-        DBT key, data;
-        int ret;
-        /* Get a cursor */
-        m_system->cursor(m_system, NULL, &cursorp, 0);
-
-        /* Initialize our DBTs. */
-        memset(&key, 0, sizeof(DBT));
-        memset(&data, 0, sizeof(DBT));
-
-        /* Iterate over the database, retrieving each record in turn. */
-        while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
-            assert(key.size == sizeof(phrase_token_t));
-            phrase_token_t * token = (phrase_token_t *)key.data;
-            g_array_append_val(system, *token);
-        }
-
-        if (ret != DB_NOTFOUND) {
-            fprintf(stderr, "system db error, exit!");
-            exit(1);
-        }
+bool Bigram::get_all_items(GArray * items){
+    g_array_set_size(items, 0);
 
-        /* Cursors must be closed */
-        if (cursorp != NULL)
-            cursorp->c_close(cursorp);
+    if ( !m_db )
+        return false;
 
-        retval = true;
-    }
-    if ( m_user ){
-        DBC * cursorp;
-        DBT key, data;
-        int ret;
-        /* Get a cursor */
-        m_user->cursor(m_user, NULL, &cursorp, 0);
-
-        /* Initialize out DBTs. */
-        memset(&key, 0, sizeof(DBT));
-        memset(&data, 0, sizeof(DBT));
-
-        /* Iterate over the database, retrieving each record in turn. */
-        while((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
-            assert(key.size == sizeof(phrase_token_t));
-            phrase_token_t * token = (phrase_token_t *) key.data;
-            g_array_append_val(user, *token);
-        }
-
-        if (ret != DB_NOTFOUND){
-            fprintf(stderr, "user db error, exit!");
-            exit(1);
-        }
+    DBC * cursorp;
+    DBT key, data;
+    int ret;
+    /* Get a cursor */
+    m_db->cursor(m_db, NULL, &cursorp, 0);
+
+    /* Initialize our DBTs. */
+    memset(&key, 0, sizeof(DBT));
+    memset(&data, 0, sizeof(DBT));
 
-        /* Cursor must be closed */
-        if ( cursorp != NULL)
-            cursorp->c_close(cursorp);
+    /* Iterate over the database, retrieving each record in turn. */
+    while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
+        assert(key.size == sizeof(phrase_token_t));
+        phrase_token_t * token = (phrase_token_t *)key.data;
+        g_array_append_val(items, *token);
+    }
 
-        retval = true;
+    if (ret != DB_NOTFOUND) {
+        fprintf(stderr, "system db error, exit!");
+        exit(1);
     }
-    return retval;
+
+    /* Cursors must be closed */
+    if (cursorp != NULL)
+        cursorp->c_close(cursorp);
+
+    return true;
 }
 
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
index 290a0bc..93e6ad7 100644
--- a/src/storage/ngram.h
+++ b/src/storage/ngram.h
@@ -26,6 +26,12 @@
 
 namespace pinyin{
 
+enum {
+    ATTACH_READONLY = 1,
+    ATTACH_READWRITE = 0x1 << 1,
+    ATTACH_CREATE = 0x1 << 2,
+};
+
 class Bigram;
 
 /* Note:
@@ -93,44 +99,40 @@ public:
 
 class Bigram{
 private:
-    DB * m_system;
-    DB * m_user;
+    DB * m_db;
 
     void reset(){
-        if ( m_system ){
-            m_system->close(m_system, 0);
-            m_system = NULL;
-        }
-        if ( m_user ){
-            m_user->close(m_user, 0);
-            m_user = NULL;
+        if ( m_db ){
+            m_db->close(m_db, 0);
+            m_db = NULL;
         }
     }
 
 public:
     Bigram(){
-        m_system = NULL; m_user = NULL;
+        m_db = NULL;
     }
 
     ~Bigram(){
         reset();
     }
 
-    /* attach system and user bi-gram */
-    /* when with training systemdb is NULL, only user_gram */
-    bool attach(const char * systemfile, const char * userfile);
+    /* load/save berkeley db in memory. */
+    bool load_db(const char * dbfile);
+    bool save_db(const char * dbfile);
+
+    /* attach bi-gram */
+    bool attach(const char * dbfile, guint32 flags);
 
     /* load/store one single gram */
     bool load(/* in */ phrase_token_t index,
-              /* out */ SingleGram * & system_gram,
-              /* out */ SingleGram * & user_gram);
+              /* out */ SingleGram * & single_gram);
 
     bool store(/* in */ phrase_token_t index,
-               /* in */ SingleGram * user_gram);
+               /* in */ SingleGram * single_gram);
 
     /* array of phrase_token_t items, for parameter estimation. */
-    bool get_all_items(/* out */ GArray * system,
-                       /* out */ GArray * user);
+    bool get_all_items(/* out */ GArray * items);
 };
 
 };
--
cgit