summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2011-04-18 16:59:03 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-04-18 16:59:03 +0800
commit: c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5 (patch)
tree: 6833bd52722b0655858f060e8b7fe0df7e0568ce
parent: 0b113f330653b82be8a87af8b8b4ac826e72b296 (diff)
download: libpinyin-c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5.tar.gz
libpinyin-c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5.tar.xz
libpinyin-c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5.zip
use new bi-gram
-rw-r--r-- tests/lookup/test_simple_lookup.cpp 9
-rw-r--r-- tests/storage/test_ngram.cpp 41
-rw-r--r-- utils/segment/ngseg.cpp 11
-rw-r--r-- utils/storage/export_interpolation.cpp 22
-rw-r--r-- utils/storage/import_interpolation.cpp 13
-rw-r--r-- utils/training/estimate_interpolation.cpp 36
-rw-r--r-- utils/training/gen_ngram.cpp 30
7 files changed, 68 insertions, 94 deletions
diff --git a/tests/lookup/test_simple_lookup.cpp b/tests/lookup/test_simple_lookup.cpp
index 147d8d6..77fa797 100644
--- a/tests/lookup/test_simple_lookup.cpp
+++ b/tests/lookup/test_simple_lookup.cpp
@@ -44,10 +44,13 @@ int main( int argc, char * argv[]){
new_chunk->load("../../data/gbk_char.bin");
phrase_index.load(2, new_chunk);
- Bigram bigram;
- bigram.attach("../../data/bigram.db", "/tmp/bigram.db");
+ Bigram system_bigram;
+ system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
+ Bigram user_bigram;
+ user_bigram.attach("/tmp/bigram.db", ATTACH_CREATE|ATTACH_READWRITE);
- PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index, &bigram);
+ PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index,
+ &system_bigram, &user_bigram);
char* linebuf = (char *)malloc ( 1024 * sizeof (char) );
size_t size = 1024;
diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp
index b93a55d..044cf59 100644
--- a/tests/storage/test_ngram.cpp
+++ b/tests/storage/test_ngram.cpp
@@ -39,7 +39,7 @@ int main(int argc, char * argv[]){
Bigram bigram;
- assert(bigram.attach(NULL, "/tmp/system.db"));
+ assert(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE));
bigram.store(1, &single_gram);
assert(single_gram.insert_freq(5, 8));
single_gram.set_total_freq(32);
@@ -47,37 +47,20 @@ int main(int argc, char * argv[]){
bigram.store(2, &single_gram);
- SingleGram * system, * user;
+ SingleGram * gram = NULL;
for ( int m = 1; m <= 2; ++m ){
printf("--------------------------------------------------------\n");
- bigram.load(m, system, user);
- assert(NULL == system);
+ bigram.load(m, gram);
g_array_set_size(array, 0);
range.m_range_begin = 0; range.m_range_end = 8;
- user->search(&range,array);
+ gram->search(&range,array);
for ( size_t i = 0; i < array->len; ++i){
BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
printf("item:%d:%f\n", item->m_token, item->m_freq);
}
- delete user;
+ delete gram;
}
- bigram.attach("/tmp/system.db", NULL);
-
- for ( int m = 1; m <=2; ++m ){
- printf("--------------------------------------------------------\n");
- bigram.load(m, system, user);
- assert(NULL == user);
- g_array_set_size(array, 0);
- range.m_range_begin = 0; range.m_range_end = 8;
- system->search(&range,array);
- for ( size_t i = 0; i < array->len; ++i){
- BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i);
- printf("item:%d:%f\n", item->m_token, item->m_freq);
- }
- delete system;
- }
-
printf("--------------------------------------------------------\n");
single_gram.prune();
g_array_set_size(array, 0);
@@ -92,18 +75,12 @@ int main(int argc, char * argv[]){
g_array_free(array, TRUE);
- GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
- GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
- bigram.get_all_items(system_items, user_items);
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ bigram.get_all_items(items);
printf("----------------------system----------------------------\n");
- for ( size_t i = 0; i < system_items->len; ++i){
- phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i);
- printf("item:%d\n", *token);
- }
- printf("-----------------------user-----------------------------\n");
- for ( size_t i = 0; i < user_items->len; ++i){
- phrase_token_t * token = &g_array_index(user_items, phrase_token_t, i);
+ for ( size_t i = 0; i < items->len; ++i){
+ phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
printf("item:%d\n", *token);
}
}
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
index ccb88d7..504cacc 100644
--- a/utils/segment/ngseg.cpp
+++ b/utils/segment/ngseg.cpp
@@ -39,7 +39,8 @@
PhraseLargeTable * g_phrase_table = NULL;
FacadePhraseIndex * g_phrase_index = NULL;
-Bigram * g_bigram = NULL;
+Bigram * g_system_bigram = NULL;
+Bigram * g_user_bigram = NULL;
PhraseLookup * g_phrase_lookup = NULL;
enum CONTEXT_STATE{
@@ -114,12 +115,14 @@ int main(int argc, char * argv[]){
g_phrase_index->load(2, chunk);
//init bi-gram
- g_bigram = new Bigram;
- g_bigram->attach("../../data/bigram.db", NULL);
+ g_system_bigram = new Bigram;
+ g_system_bigram->attach("../../data/bigram.db", ATTACH_READONLY);
+ g_user_bigram = new Bigram;
+
//init phrase lookup
g_phrase_lookup = new PhraseLookup(g_phrase_table, g_phrase_index,
- g_bigram);
+ g_system_bigram, g_user_bigram);
CONTEXT_STATE state, next_state;
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
index 9fc6cde..333470e 100644
--- a/utils/storage/export_interpolation.cpp
+++ b/utils/storage/export_interpolation.cpp
@@ -57,7 +57,7 @@ int main(int argc, char * argv[]){
phrase_index.load(2, chunk);
Bigram bigram;
- bigram.attach(bigram_filename, NULL);
+ bigram.attach(bigram_filename, ATTACH_READONLY);
begin_data(output);
@@ -99,23 +99,19 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram
fprintf(output, "\\2-gram\n");
/* Retrieve all user items. */
- GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
- GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
- bigram->get_all_items(system_items, user_items);
- assert(0 == user_items->len);
- g_array_free(user_items, TRUE);
+ bigram->get_all_items(items);
PhraseItem item;
- for(size_t i = 0; i < system_items->len; i++){
- phrase_token_t token = g_array_index(system_items, phrase_token_t, i);
- SingleGram * system = NULL, * user = NULL;
- bigram->load(token, system, user);
- assert(NULL == user);
+ for(size_t i = 0; i < items->len; i++){
+ phrase_token_t token = g_array_index(items, phrase_token_t, i);
+ SingleGram * single_gram = NULL;
+ bigram->load(token, single_gram);
BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
- system->retrieve_all(array);
+ single_gram->retrieve_all(array);
for(size_t j = 0; j < array->len; j++) {
BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
@@ -132,7 +128,7 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram
g_array_free(array, TRUE);
}
- g_array_free(system_items, TRUE);
+ g_array_free(items, TRUE);
}
static const char * special_token_to_string(phrase_token_t token){
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index d53c3e8..d8d32e4 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -157,15 +157,14 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
last_token = 0;
last_single_gram = NULL;
}
- SingleGram * system_gram = NULL, * user_gram = NULL;
- bigram->load(token1, system_gram, user_gram);
- assert(system_gram == NULL);
+ SingleGram * single_gram = NULL;
+ bigram->load(token1, single_gram);
//create the new single gram
- if ( user_gram == NULL )
- user_gram = new SingleGram;
+ if ( single_gram == NULL )
+ single_gram = new SingleGram;
last_token = token1;
- last_single_gram = user_gram;
+ last_single_gram = single_gram;
}
//save the freq
guint32 total_freq = 0;
@@ -220,7 +219,7 @@ int main(int argc, char * argv[]){
phrase_index.load(2, chunk);
Bigram bigram;
- bigram.attach(NULL, bigram_filename);
+ bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
taglib_init();
diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp
index b7a56f1..feec95e 100644
--- a/utils/training/estimate_interpolation.cpp
+++ b/utils/training/estimate_interpolation.cpp
@@ -102,42 +102,38 @@ int main(int argc, char * argv[]){
phrase_index.load(2, chunk);
Bigram bigram;
- bigram.attach("../../data/bigram.db", NULL);
+ bigram.attach("../../data/bigram.db", ATTACH_READONLY);
Bigram deleted_bigram;
- deleted_bigram.attach("../../data/deleted_bigram.db", NULL);
+ deleted_bigram.attach("../../data/deleted_bigram.db", ATTACH_READONLY);
- GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
- GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
-
- deleted_bigram.get_all_items(system_items, user_items);
- assert(0 == user_items->len);
- g_array_free(user_items, TRUE);
+ GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+ deleted_bigram.get_all_items(deleted_items);
parameter_t lambda_sum = 0;
int lambda_count = 0;
- for ( int i = 0; i < system_items->len; ++i ){
- phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i);
- SingleGram * system = NULL, * user = NULL;
- bigram.load(*token, system, user);
- assert(NULL == user);
- SingleGram * deleted_system = NULL, * deleted_user = NULL;
- deleted_bigram.load(*token, deleted_system, deleted_user);
- assert(NULL == deleted_user);
+ for ( int i = 0; i < deleted_items->len; ++i ){
+ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
+ SingleGram * single_gram = NULL;
+ bigram.load(*token, single_gram);
+
+ SingleGram * deleted_single_gram = NULL;
+ deleted_bigram.load(*token, deleted_single_gram);
- parameter_t lambda = compute_interpolation(deleted_system, &phrase_index, system);
+ parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram);
printf("lambda:%f\n", lambda);
lambda_sum += lambda;
lambda_count ++;
- if (system) delete system;
- delete deleted_system;
+ if (single_gram)
+ delete single_gram;
+ delete deleted_single_gram;
}
printf("average lambda:%f\n", (lambda_sum/lambda_count));
- g_array_free(system_items, TRUE);
+ g_array_free(deleted_items, TRUE);
}
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 3233cee..6109e92 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -86,7 +86,7 @@ int main(int argc, char * argv[]){
phrase_index.load(2, chunk);
Bigram bigram;
- bigram.attach(NULL, bigram_filename);
+ bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
char* linebuf = (char *)malloc ( 1024 * sizeof (char) );
@@ -113,31 +113,31 @@ int main(int argc, char * argv[]){
phrase_index.add_unigram_frequency(cur_token, 1);
}
if ( cur_token ){
- SingleGram * system = NULL, * user = NULL;
+ SingleGram * single_gram = NULL;
if ( 0 == last_token ){
if (train_pi_gram)
- bigram.load(sentence_start, system, user);
+ bigram.load(sentence_start, single_gram);
} else
- bigram.load(last_token, system, user);
- assert(NULL == system);
- if ( NULL == user ){
- user = new SingleGram;
+ bigram.load(last_token, single_gram);
+
+ if ( NULL == single_gram ){
+ single_gram = new SingleGram;
}
guint32 freq, total_freq;
//increase freq
- if (user->get_freq(cur_token, freq))
- user->set_freq(cur_token, freq + 1);
+ if (single_gram->get_freq(cur_token, freq))
+ single_gram->set_freq(cur_token, freq + 1);
else
- user->insert_freq(cur_token, 1);
+ single_gram->insert_freq(cur_token, 1);
//increase total freq
- user->get_total_freq(total_freq);
- user->set_total_freq(total_freq + 1);
+ single_gram->get_total_freq(total_freq);
+ single_gram->set_total_freq(total_freq + 1);
if ( 0 == last_token ){
if ( train_pi_gram )
- bigram.store(sentence_start, user);
+ bigram.store(sentence_start, single_gram);
}else
- bigram.store(last_token, user);
- delete user;
+ bigram.store(last_token, single_gram);
+ delete single_gram;
}
}