summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com>, 2011-05-03 15:51:48 +0800
committer: Peng Wu <alexepico@gmail.com>, 2011-05-03 15:51:48 +0800
commit: 8c6cfe9243013e32ee0b3a8e78dd68e6de4df77f (patch)
tree: cd68bf9529b712bbe46db564a93ed086b7f2a0ed
parent: 881d876d71098dce20b8df12bc893a243027c339 (diff)
download: libpinyin-8c6cfe9243013e32ee0b3a8e78dd68e6de4df77f.tar.gz
libpinyin-8c6cfe9243013e32ee0b3a8e78dd68e6de4df77f.tar.xz
libpinyin-8c6cfe9243013e32ee0b3a8e78dd68e6de4df77f.zip
add flexible n-gram signature check
-rw-r--r-- src/storage/flexible_ngram.h | 70
-rw-r--r-- src/storage/ngram.cpp | 2
-rw-r--r-- tests/storage/test_flexible_ngram.cpp | 2
-rw-r--r-- utils/training/estimate_k_mixture_model.cpp | 8
-rw-r--r-- utils/training/k_mixture_model.h | 3
5 files changed, 70 insertions, 15 deletions
diff --git a/src/storage/flexible_ngram.h b/src/storage/flexible_ngram.h
index 9b08a0d..ec55a76 100644
--- a/src/storage/flexible_ngram.h
+++ b/src/storage/flexible_ngram.h
@@ -252,6 +252,9 @@ private:
phrase_token_t m_magic_header_index[2];
+ char m_magic_number[4];
+ const size_t m_magic_number_length;
+
void reset(){
if ( m_db ){
m_db->close(m_db, 0);
@@ -260,10 +263,15 @@ private:
}
public:
- FlexibleBigram(){
+ FlexibleBigram(const char * magic_number)
+ : m_magic_number_length(sizeof(m_magic_number)){
m_db = NULL;
m_magic_header_index[0] = null_token;
m_magic_header_index[1] = null_token;
+
+ /* Note: remove the below line? */
+ assert(sizeof(m_magic_number) == 4 * sizeof(char) );
+ memcpy(m_magic_number, magic_number, sizeof(m_magic_number));
}
~FlexibleBigram(){
@@ -273,16 +281,54 @@ public:
/* attach berkeley db on filesystem for training purpose. */
bool attach(const char * dbfile){
reset();
- if ( dbfile ){
- int ret = db_create(&m_db, NULL, 0);
- if ( ret != 0 )
- assert(false);
+ if ( !dbfile )
+ return false;
+ int ret = db_create(&m_db, NULL, 0);
+ if ( ret != 0 )
+ assert(false);
- m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, DB_CREATE, 0644);
+ ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, 0, 0644);
+ if ( ret != 0 ) {
+ /* Create database file here, and write the signature. */
+ ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, DB_CREATE, 0644);
if ( ret != 0 )
return false;
+
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.data = m_magic_number;
+ db_data.size = sizeof(m_magic_number);
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(m_magic_number);
+
+ ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
+ return ret == 0;
}
- return true;
+
+ /* check the signature. */
+ DBT db_key;
+ memset(&db_key, 0, sizeof(DBT));
+ db_key.data = m_magic_header_index;
+ db_key.size = sizeof(m_magic_header_index);
+ DBT db_data;
+ memset(&db_data, 0, sizeof(DBT));
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = 0;
+ db_data.dlen = sizeof(m_magic_number);
+ ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
+ if ( ret != 0 )
+ return false;
+ if ( sizeof(m_magic_number) != db_data.size )
+ return false;
+ if ( memcmp(db_data.data, m_magic_number,
+ sizeof(m_magic_number)) == 0 )
+ return true;
+ return false;
}
/* load/store one array. */
@@ -391,10 +437,17 @@ public:
db_key.size = sizeof(m_magic_header_index);
DBT db_data;
memset(&db_data, 0, sizeof(DBT));
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = m_magic_number_length;
+ db_data.dlen = sizeof(MagicHeader);
int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
if ( ret != 0 )
return false;
+
+ if ( 0 == db_data.size )
+ return false;
+
assert(sizeof(MagicHeader) == db_data.size);
memcpy(&header, db_data.data, sizeof(MagicHeader));
return true;
@@ -412,6 +465,9 @@ public:
memset(&db_data, 0, sizeof(DBT));
db_data.data = (void *) &header;
db_data.size = sizeof(MagicHeader);
+ db_data.flags = DB_DBT_PARTIAL;
+ db_data.doff = m_magic_number_length;
+ db_data.dlen = sizeof(MagicHeader);
int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
return ret == 0;
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
index 5b1a7d0..35aea0c 100644
--- a/src/storage/ngram.cpp
+++ b/src/storage/ngram.cpp
@@ -324,7 +324,7 @@ bool Bigram::attach(const char * dbfile, guint32 flags){
assert(false);
ret = m_db->open(m_db, NULL, dbfile, NULL,
- DB_HASH, db_flags, 0664);
+ DB_HASH, db_flags, 0644);
if ( ret != 0)
return false;
diff --git a/tests/storage/test_flexible_ngram.cpp b/tests/storage/test_flexible_ngram.cpp
index 4e48715..8852f59 100644
--- a/tests/storage/test_flexible_ngram.cpp
+++ b/tests/storage/test_flexible_ngram.cpp
@@ -35,7 +35,7 @@ int main(int argc, char * argv[]) {
assert(single_gram.get_array_header(freq));
assert(freq == total_freq);
- FlexibleBigram<guint32, guint32, guint32> bigram;
+ FlexibleBigram<guint32, guint32, guint32> bigram("TEST");
assert(bigram.attach("/tmp/training.db"));
bigram.store(1, &single_gram);
assert(single_gram.insert_array_item(5, 8));
diff --git a/utils/training/estimate_k_mixture_model.cpp b/utils/training/estimate_k_mixture_model.cpp
index 37a88df..12c6ac5 100644
--- a/utils/training/estimate_k_mixture_model.cpp
+++ b/utils/training/estimate_k_mixture_model.cpp
@@ -44,7 +44,7 @@ parameter_t compute_interpolation(KMixtureModelSingleGram * deleted_bigram,
FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
deleted_bigram->retrieve_all(array);
- for ( int i = 0; i < array->len; ++i){
+ for ( size_t i = 0; i < array->len; ++i){
KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, i);
//get the phrase token
phrase_token_t token = item->m_token;
@@ -120,10 +120,10 @@ int main(int argc, char * argv[]){
}
/* TODO: magic header signature check here. */
- KMixtureModelBigram bigram;
+ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
bigram.attach(bigram_filename);
- KMixtureModelBigram deleted_bigram;
+ KMixtureModelBigram deleted_bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
deleted_bigram.attach(deleted_bigram_filename);
GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
@@ -132,7 +132,7 @@ int main(int argc, char * argv[]){
parameter_t lambda_sum = 0;
int lambda_count = 0;
- for( int i = 0; i < deleted_items->len; ++i ){
+ for( size_t i = 0; i < deleted_items->len; ++i ){
phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
KMixtureModelSingleGram * single_gram = NULL;
bigram.load(*token, single_gram);
diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h
index dffece6..af613f5 100644
--- a/utils/training/k_mixture_model.h
+++ b/utils/training/k_mixture_model.h
@@ -106,10 +106,9 @@ static inline parameter_t compute_Pr_G_2_with_count(corpus_count_t k,
return compute_Pr_G_2(k, alpha, B);
}
-#define K_MIXTURE_MODEL_MAGIC_NUMBER "KMMP";
+#define K_MIXTURE_MODEL_MAGIC_NUMBER "KMMP"
typedef struct{
- char m_magic_number[4];
/* the total number of instances of all words. */
guint32 m_WC;
/* the total number of documents. */