diff options
author | Peng Wu <alexepico@gmail.com> | 2011-04-07 13:47:05 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-04-07 13:47:05 +0800 |
commit | 7d46595e8d2391fc70be12f3ecf8119bdaa47c7b (patch) | |
tree | 01fec3008487cc2c36955b6d2b8610c8b942f81c /src/storage/flexible_ngram.h | |
parent | 74721fab7449e1fb9e655812a067cd0dd3d7c26d (diff) | |
download | libpinyin-7d46595e8d2391fc70be12f3ecf8119bdaa47c7b.tar.gz libpinyin-7d46595e8d2391fc70be12f3ecf8119bdaa47c7b.tar.xz libpinyin-7d46595e8d2391fc70be12f3ecf8119bdaa47c7b.zip |
write flexible n-gram
Diffstat (limited to 'src/storage/flexible_ngram.h')
-rw-r--r-- | src/storage/flexible_ngram.h | 136 |
1 files changed, 130 insertions, 6 deletions
diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h index e393a91..ff366da 100644 --- a/src/storage/flexible_ngram.h +++ b/src/storage/flexible_ngram.h @@ -189,9 +189,12 @@ public: template<typename MagicHeader, typename ArrayHeader, typename ArrayItem> class FlexibleBigram{ + /* Note: some flexible bi-gram file format check should be here. */ private: DB * m_db; + phrase_token_t m_magic_header_index[2]; + void reset(){ if ( m_db ){ m_db->close(m_db, 0); @@ -202,6 +205,8 @@ private: public: FlexibleBigram(){ m_db = NULL; + m_magic_header_index[0] = null_token; + m_magic_header_index[1] = null_token; } ~FlexibleBigram(){ @@ -209,18 +214,137 @@ public: } /* attach berkeley db on filesystem for training purpose. */ - bool attach(const char * dbfile); + bool attach(const char * dbfile){ + reset(); + if ( dbfile ){ + int ret = db_create(&m_db, NULL, 0); + if ( ret != 0 ) + assert(false); + + m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, DB_CREATE, 0644); + if ( ret != 0 ) + return false; + } + return true; + } + /* load/store one array. */ bool load(phrase_token_t index, - FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram); + FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){ + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + single_gram = NULL; + if ( !m_db ) + return false; + + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); + if ( ret == 0) + single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem> + (db_data.data, db_data.size); + + return true; + } + bool store(phrase_token_t index, - FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram); + FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram){ + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = single_gram->m_chunk.begin(); + db_data.size = single_gram->m_chunk.size(); + + int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); + return ret == 0; + } + /* array of phrase_token_t items, for parameter estimation. */ - bool get_all_items(GArray * items); + bool get_all_items(GArray * items){ + g_array_set_size(items, 0); + if ( !m_db ) + return false; + + DBC * cursorp; + DBT key, data; + int ret; + + /* Get a cursor */ + m_db->cursor(m_db, NULL, &cursorp, 0); + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){ + assert(key.size == sizeof(phrase_token_t)); + phrase_token_t * token = (phrase_token_t *) key.data; + g_array_append_val(items, *token); + } + + if ( ret != DB_NOTFOUND ){ + fprintf(stderr, "training db error, exit!"); + exit(1); + } + + /* Cursors must be closed */ + if (cursorp != NULL) + cursorp->c_close(cursorp); + return true; + } /* get/set magic header. */ - bool get_magic_header(MagicHeader & header); - bool set_magic_header(const MagicHeader & header); + bool get_magic_header(MagicHeader & header){ + /* Note: remove the below statement later? */ + assert(sizeof(m_magic_header_index) == 2 * sizeof(phrase_token_t)); + + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = m_magic_header_index; + db_key.size = sizeof(m_magic_header_index); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + + int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); + if ( ret != 0 ) + return false; + assert(sizeof(MagicHeader) == db_data.size); + memcpy(&header, db_data.data, sizeof(MagicHeader)); + return true; + } + + bool set_magic_header(const MagicHeader & header){ + /* Note: remove the below statement later? */ + assert(sizeof(m_magic_header_index) == 2 * sizeof(phrase_token_t)); + + if ( !m_db ) + return false; + + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = m_magic_header_index; + db_key.size = sizeof(m_magic_header_index); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = &header; + db_data.size = sizeof(MagicHeader); + + int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); + return ret == 0; + } }; #endif |