From c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Mon, 18 Apr 2011 16:59:03 +0800 Subject: use new bi-gram --- tests/lookup/test_simple_lookup.cpp | 9 ++++--- tests/storage/test_ngram.cpp | 41 +++++++------------------------ utils/segment/ngseg.cpp | 11 ++++++--- utils/storage/export_interpolation.cpp | 22 +++++++---------- utils/storage/import_interpolation.cpp | 13 +++++----- utils/training/estimate_interpolation.cpp | 36 ++++++++++++--------------- utils/training/gen_ngram.cpp | 30 +++++++++++----------- 7 files changed, 68 insertions(+), 94 deletions(-) diff --git a/tests/lookup/test_simple_lookup.cpp b/tests/lookup/test_simple_lookup.cpp index 147d8d6..77fa797 100644 --- a/tests/lookup/test_simple_lookup.cpp +++ b/tests/lookup/test_simple_lookup.cpp @@ -44,10 +44,13 @@ int main( int argc, char * argv[]){ new_chunk->load("../../data/gbk_char.bin"); phrase_index.load(2, new_chunk); - Bigram bigram; - bigram.attach("../../data/bigram.db", "/tmp/bigram.db"); + Bigram system_bigram; + system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); + Bigram user_bigram; + user_bigram.attach("/tmp/bigram.db", ATTACH_CREATE|ATTACH_READWRITE); - PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index, &bigram); + PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index, + &system_bigram, &user_bigram); char* linebuf = (char *)malloc ( 1024 * sizeof (char) ); size_t size = 1024; diff --git a/tests/storage/test_ngram.cpp b/tests/storage/test_ngram.cpp index b93a55d..044cf59 100644 --- a/tests/storage/test_ngram.cpp +++ b/tests/storage/test_ngram.cpp @@ -39,7 +39,7 @@ int main(int argc, char * argv[]){ Bigram bigram; - assert(bigram.attach(NULL, "/tmp/system.db")); + assert(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE)); bigram.store(1, &single_gram); assert(single_gram.insert_freq(5, 8)); single_gram.set_total_freq(32); @@ -47,37 +47,20 @@ int main(int argc, char * argv[]){ bigram.store(2, &single_gram); - SingleGram * system, * user; + SingleGram * gram = NULL; for ( int m = 1; m <= 2; ++m ){ printf("--------------------------------------------------------\n"); - bigram.load(m, system, user); - assert(NULL == system); + bigram.load(m, gram); g_array_set_size(array, 0); range.m_range_begin = 0; range.m_range_end = 8; - user->search(&range,array); + gram->search(&range,array); for ( size_t i = 0; i < array->len; ++i){ BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); printf("item:%d:%f\n", item->m_token, item->m_freq); } - delete user; + delete gram; } - bigram.attach("/tmp/system.db", NULL); - - for ( int m = 1; m <=2; ++m ){ - printf("--------------------------------------------------------\n"); - bigram.load(m, system, user); - assert(NULL == user); - g_array_set_size(array, 0); - range.m_range_begin = 0; range.m_range_end = 8; - system->search(&range,array); - for ( size_t i = 0; i < array->len; ++i){ - BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); - printf("item:%d:%f\n", item->m_token, item->m_freq); - } - delete system; - } - printf("--------------------------------------------------------\n"); single_gram.prune(); g_array_set_size(array, 0); @@ -92,18 +75,12 @@ int main(int argc, char * argv[]){ g_array_free(array, TRUE); - GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - bigram.get_all_items(system_items, user_items); + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + bigram.get_all_items(items); printf("----------------------system----------------------------\n"); - for ( size_t i = 0; i < system_items->len; ++i){ - phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i); - printf("item:%d\n", *token); - } - printf("-----------------------user-----------------------------\n"); - for ( size_t i = 0; i < user_items->len; ++i){ - phrase_token_t * token = &g_array_index(user_items, phrase_token_t, i); + for ( size_t i = 0; i < items->len; ++i){ + phrase_token_t * token = &g_array_index(items, phrase_token_t, i); printf("item:%d\n", *token); } } diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp index ccb88d7..504cacc 100644 --- a/utils/segment/ngseg.cpp +++ b/utils/segment/ngseg.cpp @@ -39,7 +39,8 @@ PhraseLargeTable * g_phrase_table = NULL; FacadePhraseIndex * g_phrase_index = NULL; -Bigram * g_bigram = NULL; +Bigram * g_system_bigram = NULL; +Bigram * g_user_bigram = NULL; PhraseLookup * g_phrase_lookup = NULL; enum CONTEXT_STATE{ @@ -114,12 +115,14 @@ int main(int argc, char * argv[]){ g_phrase_index->load(2, chunk); //init bi-gram - g_bigram = new Bigram; - g_bigram->attach("../../data/bigram.db", NULL); + g_system_bigram = new Bigram; + g_system_bigram->attach("../../data/bigram.db", ATTACH_READONLY); + g_user_bigram = new Bigram; + //init phrase lookup g_phrase_lookup = new PhraseLookup(g_phrase_table, g_phrase_index, - g_bigram); + g_system_bigram, g_user_bigram); CONTEXT_STATE state, next_state; diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp index 9fc6cde..333470e 100644 --- a/utils/storage/export_interpolation.cpp +++ b/utils/storage/export_interpolation.cpp @@ -57,7 +57,7 @@ int main(int argc, char * argv[]){ phrase_index.load(2, chunk); Bigram bigram; - bigram.attach(bigram_filename, NULL); + bigram.attach(bigram_filename, ATTACH_READONLY); begin_data(output); @@ -99,23 +99,19 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram fprintf(output, "\\2-gram\n"); /* Retrieve all user items. */ - GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - bigram->get_all_items(system_items, user_items); - assert(0 == user_items->len); - g_array_free(user_items, TRUE); + bigram->get_all_items(items); PhraseItem item; - for(size_t i = 0; i < system_items->len; i++){ - phrase_token_t token = g_array_index(system_items, phrase_token_t, i); - SingleGram * system = NULL, * user = NULL; - bigram->load(token, system, user); - assert(NULL == user); + for(size_t i = 0; i < items->len; i++){ + phrase_token_t token = g_array_index(items, phrase_token_t, i); + SingleGram * single_gram = NULL; + bigram->load(token, single_gram); BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); - system->retrieve_all(array); + single_gram->retrieve_all(array); for(size_t j = 0; j < array->len; j++) { BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j); @@ -132,7 +128,7 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram g_array_free(array, TRUE); } - g_array_free(system_items, TRUE); + g_array_free(items, TRUE); } static const char * special_token_to_string(phrase_token_t token){ diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp index d53c3e8..d8d32e4 100644 --- a/utils/storage/import_interpolation.cpp +++ b/utils/storage/import_interpolation.cpp @@ -157,15 +157,14 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases, last_token = 0; last_single_gram = NULL; } - SingleGram * system_gram = NULL, * user_gram = NULL; - bigram->load(token1, system_gram, user_gram); - assert(system_gram == NULL); + SingleGram * single_gram = NULL; + bigram->load(token1, single_gram); //create the new single gram - if ( user_gram == NULL ) - user_gram = new SingleGram; + if ( single_gram == NULL ) + single_gram = new SingleGram; last_token = token1; - last_single_gram = user_gram; + last_single_gram = single_gram; } //save the freq guint32 total_freq = 0; @@ -220,7 +219,7 @@ int main(int argc, char * argv[]){ phrase_index.load(2, chunk); Bigram bigram; - bigram.attach(NULL, bigram_filename); + bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); taglib_init(); diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp index b7a56f1..feec95e 100644 --- a/utils/training/estimate_interpolation.cpp +++ b/utils/training/estimate_interpolation.cpp @@ -102,42 +102,38 @@ int main(int argc, char * argv[]){ phrase_index.load(2, chunk); Bigram bigram; - bigram.attach("../../data/bigram.db", NULL); + bigram.attach("../../data/bigram.db", ATTACH_READONLY); Bigram deleted_bigram; - deleted_bigram.attach("../../data/deleted_bigram.db", NULL); + deleted_bigram.attach("../../data/deleted_bigram.db", ATTACH_READONLY); - GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - - deleted_bigram.get_all_items(system_items, user_items); - assert(0 == user_items->len); - g_array_free(user_items, TRUE); + GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + deleted_bigram.get_all_items(deleted_items); parameter_t lambda_sum = 0; int lambda_count = 0; - for ( int i = 0; i < system_items->len; ++i ){ - phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i); - SingleGram * system = NULL, * user = NULL; - bigram.load(*token, system, user); - assert(NULL == user); - SingleGram * deleted_system = NULL, * deleted_user = NULL; - deleted_bigram.load(*token, deleted_system, deleted_user); - assert(NULL == deleted_user); + for ( int i = 0; i < deleted_items->len; ++i ){ + phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); + SingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + SingleGram * deleted_single_gram = NULL; + deleted_bigram.load(*token, deleted_single_gram); - parameter_t lambda = compute_interpolation(deleted_system, &phrase_index, system); + parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram); printf("lambda:%f\n", lambda); lambda_sum += lambda; lambda_count ++; - if (system) delete system; - delete deleted_system; + if (single_gram) + delete single_gram; + delete deleted_single_gram; } printf("average lambda:%f\n", (lambda_sum/lambda_count)); - g_array_free(system_items, TRUE); + g_array_free(deleted_items, TRUE); } diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp index 3233cee..6109e92 100644 --- a/utils/training/gen_ngram.cpp +++ b/utils/training/gen_ngram.cpp @@ -86,7 +86,7 @@ int main(int argc, char * argv[]){ phrase_index.load(2, chunk); Bigram bigram; - bigram.attach(NULL, bigram_filename); + bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = (char *)malloc ( 1024 * sizeof (char) ); @@ -113,31 +113,31 @@ int main(int argc, char * argv[]){ phrase_index.add_unigram_frequency(cur_token, 1); } if ( cur_token ){ - SingleGram * system = NULL, * user = NULL; + SingleGram * single_gram = NULL; if ( 0 == last_token ){ if (train_pi_gram) - bigram.load(sentence_start, system, user); + bigram.load(sentence_start, single_gram); } else - bigram.load(last_token, system, user); - assert(NULL == system); - if ( NULL == user ){ - user = new SingleGram; + bigram.load(last_token, single_gram); + + if ( NULL == single_gram ){ + single_gram = new SingleGram; } guint32 freq, total_freq; //increase freq - if (user->get_freq(cur_token, freq)) - user->set_freq(cur_token, freq + 1); + if (single_gram->get_freq(cur_token, freq)) + single_gram->set_freq(cur_token, freq + 1); else - user->insert_freq(cur_token, 1); + single_gram->insert_freq(cur_token, 1); //increase total freq - user->get_total_freq(total_freq); - user->set_total_freq(total_freq + 1); + single_gram->get_total_freq(total_freq); + single_gram->set_total_freq(total_freq + 1); if ( 0 == last_token ){ if ( train_pi_gram ) - bigram.store(sentence_start, user); + bigram.store(sentence_start, single_gram); }else - bigram.store(last_token, user); - delete user; + bigram.store(last_token, single_gram); + delete single_gram; } } -- cgit