diff options
author | Peng Wu <alexepico@gmail.com> | 2011-04-18 16:59:03 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-04-18 16:59:03 +0800 |
commit | c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5 (patch) | |
tree | 6833bd52722b0655858f060e8b7fe0df7e0568ce /utils | |
parent | 0b113f330653b82be8a87af8b8b4ac826e72b296 (diff) | |
download | libpinyin-c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5.tar.gz libpinyin-c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5.tar.xz libpinyin-c418ddb0e8ec1eb8e79cb03e83ed3ebc202d02e5.zip |
use new bi-gram
Diffstat (limited to 'utils')
-rw-r--r-- | utils/segment/ngseg.cpp | 11 | ||||
-rw-r--r-- | utils/storage/export_interpolation.cpp | 22 | ||||
-rw-r--r-- | utils/storage/import_interpolation.cpp | 13 | ||||
-rw-r--r-- | utils/training/estimate_interpolation.cpp | 36 | ||||
-rw-r--r-- | utils/training/gen_ngram.cpp | 30 |
5 files changed, 53 insertions, 59 deletions
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp index ccb88d7..504cacc 100644 --- a/utils/segment/ngseg.cpp +++ b/utils/segment/ngseg.cpp @@ -39,7 +39,8 @@ PhraseLargeTable * g_phrase_table = NULL; FacadePhraseIndex * g_phrase_index = NULL; -Bigram * g_bigram = NULL; +Bigram * g_system_bigram = NULL; +Bigram * g_user_bigram = NULL; PhraseLookup * g_phrase_lookup = NULL; enum CONTEXT_STATE{ @@ -114,12 +115,14 @@ int main(int argc, char * argv[]){ g_phrase_index->load(2, chunk); //init bi-gram - g_bigram = new Bigram; - g_bigram->attach("../../data/bigram.db", NULL); + g_system_bigram = new Bigram; + g_system_bigram->attach("../../data/bigram.db", ATTACH_READONLY); + g_user_bigram = new Bigram; + //init phrase lookup g_phrase_lookup = new PhraseLookup(g_phrase_table, g_phrase_index, - g_bigram); + g_system_bigram, g_user_bigram); CONTEXT_STATE state, next_state; diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp index 9fc6cde..333470e 100644 --- a/utils/storage/export_interpolation.cpp +++ b/utils/storage/export_interpolation.cpp @@ -57,7 +57,7 @@ int main(int argc, char * argv[]){ phrase_index.load(2, chunk); Bigram bigram; - bigram.attach(bigram_filename, NULL); + bigram.attach(bigram_filename, ATTACH_READONLY); begin_data(output); @@ -99,23 +99,19 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram fprintf(output, "\\2-gram\n"); /* Retrieve all user items. */ - GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - bigram->get_all_items(system_items, user_items); - assert(0 == user_items->len); - g_array_free(user_items, TRUE); + bigram->get_all_items(items); PhraseItem item; - for(size_t i = 0; i < system_items->len; i++){ - phrase_token_t token = g_array_index(system_items, phrase_token_t, i); - SingleGram * system = NULL, * user = NULL; - bigram->load(token, system, user); - assert(NULL == user); + for(size_t i = 0; i < items->len; i++){ + phrase_token_t token = g_array_index(items, phrase_token_t, i); + SingleGram * single_gram = NULL; + bigram->load(token, single_gram); BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); - system->retrieve_all(array); + single_gram->retrieve_all(array); for(size_t j = 0; j < array->len; j++) { BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j); @@ -132,7 +128,7 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram g_array_free(array, TRUE); } - g_array_free(system_items, TRUE); + g_array_free(items, TRUE); } static const char * special_token_to_string(phrase_token_t token){ diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp index d53c3e8..d8d32e4 100644 --- a/utils/storage/import_interpolation.cpp +++ b/utils/storage/import_interpolation.cpp @@ -157,15 +157,14 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases, last_token = 0; last_single_gram = NULL; } - SingleGram * system_gram = NULL, * user_gram = NULL; - bigram->load(token1, system_gram, user_gram); - assert(system_gram == NULL); + SingleGram * single_gram = NULL; + bigram->load(token1, single_gram); //create the new single gram - if ( user_gram == NULL ) - user_gram = new SingleGram; + if ( single_gram == NULL ) + single_gram = new SingleGram; last_token = token1; - last_single_gram = user_gram; + last_single_gram = single_gram; } //save the freq guint32 total_freq = 0; @@ -220,7 +219,7 @@ int main(int argc, char * argv[]){ phrase_index.load(2, chunk); Bigram bigram; - bigram.attach(NULL, bigram_filename); + bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); taglib_init(); diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp index b7a56f1..feec95e 100644 --- a/utils/training/estimate_interpolation.cpp +++ b/utils/training/estimate_interpolation.cpp @@ -102,42 +102,38 @@ int main(int argc, char * argv[]){ phrase_index.load(2, chunk); Bigram bigram; - bigram.attach("../../data/bigram.db", NULL); + bigram.attach("../../data/bigram.db", ATTACH_READONLY); Bigram deleted_bigram; - deleted_bigram.attach("../../data/deleted_bigram.db", NULL); + deleted_bigram.attach("../../data/deleted_bigram.db", ATTACH_READONLY); - GArray * system_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - GArray * user_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - - deleted_bigram.get_all_items(system_items, user_items); - assert(0 == user_items->len); - g_array_free(user_items, TRUE); + GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + deleted_bigram.get_all_items(deleted_items); parameter_t lambda_sum = 0; int lambda_count = 0; - for ( int i = 0; i < system_items->len; ++i ){ - phrase_token_t * token = &g_array_index(system_items, phrase_token_t, i); - SingleGram * system = NULL, * user = NULL; - bigram.load(*token, system, user); - assert(NULL == user); - SingleGram * deleted_system = NULL, * deleted_user = NULL; - deleted_bigram.load(*token, deleted_system, deleted_user); - assert(NULL == deleted_user); + for ( int i = 0; i < deleted_items->len; ++i ){ + phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); + SingleGram * single_gram = NULL; + bigram.load(*token, single_gram); + + SingleGram * deleted_single_gram = NULL; + deleted_bigram.load(*token, deleted_single_gram); - parameter_t lambda = compute_interpolation(deleted_system, &phrase_index, system); + parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram); printf("lambda:%f\n", lambda); lambda_sum += lambda; lambda_count ++; - if (system) delete system; - delete deleted_system; + if (single_gram) + delete single_gram; + delete deleted_single_gram; } printf("average lambda:%f\n", (lambda_sum/lambda_count)); - g_array_free(system_items, TRUE); + g_array_free(deleted_items, TRUE); } diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp index 3233cee..6109e92 100644 --- a/utils/training/gen_ngram.cpp +++ b/utils/training/gen_ngram.cpp @@ -86,7 +86,7 @@ int main(int argc, char * argv[]){ phrase_index.load(2, chunk); Bigram bigram; - bigram.attach(NULL, bigram_filename); + bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = (char *)malloc ( 1024 * sizeof (char) ); @@ -113,31 +113,31 @@ int main(int argc, char * argv[]){ phrase_index.add_unigram_frequency(cur_token, 1); } if ( cur_token ){ - SingleGram * system = NULL, * user = NULL; + SingleGram * single_gram = NULL; if ( 0 == last_token ){ if (train_pi_gram) - bigram.load(sentence_start, system, user); + bigram.load(sentence_start, single_gram); } else - bigram.load(last_token, system, user); - assert(NULL == system); - if ( NULL == user ){ - user = new SingleGram; + bigram.load(last_token, single_gram); + + if ( NULL == single_gram ){ + single_gram = new SingleGram; } guint32 freq, total_freq; //increase freq - if (user->get_freq(cur_token, freq)) - user->set_freq(cur_token, freq + 1); + if (single_gram->get_freq(cur_token, freq)) + single_gram->set_freq(cur_token, freq + 1); else - user->insert_freq(cur_token, 1); + single_gram->insert_freq(cur_token, 1); //increase total freq - user->get_total_freq(total_freq); - user->set_total_freq(total_freq + 1); + single_gram->get_total_freq(total_freq); + single_gram->set_total_freq(total_freq + 1); if ( 0 == last_token ){ if ( train_pi_gram ) - bigram.store(sentence_start, user); + bigram.store(sentence_start, single_gram); }else - bigram.store(last_token, user); - delete user; + bigram.store(last_token, single_gram); + delete single_gram; } } |