diff options
author | Peng Wu <alexepico@gmail.com> | 2011-07-25 18:38:28 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-07-25 18:45:57 +0800 |
commit | 63e1d37800e2f2ecc71e347c79f85c772042d8d2 (patch) | |
tree | e472e3723c3a2d78c6a2d7eb7ef39ac4d0005c4a | |
parent | b7ecd0860c793f25ed97fc717be3b4f07cbd8f3e (diff) | |
download | libpinyin-63e1d37800e2f2ecc71e347c79f85c772042d8d2.tar.gz libpinyin-63e1d37800e2f2ecc71e347c79f85c772042d8d2.tar.xz libpinyin-63e1d37800e2f2ecc71e347c79f85c772042d8d2.zip |
write eval correction rate
-rw-r--r-- | utils/training/eval_correction_rate.cpp | 147 |
1 files changed, 146 insertions, 1 deletions
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp index 6009251..81aadaa 100644 --- a/utils/training/eval_correction_rate.cpp +++ b/utils/training/eval_correction_rate.cpp @@ -20,9 +20,154 @@ */ - #include "pinyin.h" +typedef MatchResults TokenVector; + +void print_help(){ + printf("Usage: eval_correction_rate\n"); +} + +bool get_possible_pinyin(FacadePhraseIndex * phrase_index, + TokenVector tokens, PinyinKeyVector pinyins){ + PinyinKey buffer[MAX_PHRASE_LENGTH]; + size_t pinyin_index; guint32 max_freq; + guint32 freq; + g_array_set_size(pinyins, 0); + + for (size_t i = 0; i < tokens->len; ++i){ + phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i); + PhraseItem item; + phrase_index->get_phrase_item(*token, item); + pinyin_index = 0; max_freq = 0; + for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) { + freq = 0; + assert(item.get_nth_pronunciation(m, buffer, freq)); + if ( freq > max_freq ) { + pinyin_index = m; + max_freq = freq; + } + } + + assert(item.get_nth_pronunciation(pinyin_index, buffer, freq)); + assert(max_freq == freq); + guint8 len = item.get_phrase_length(); + g_array_append_vals(pinyins, buffer, len); + } + return true; +} + +bool get_best_match(PinyinLookup * pinyin_lookup, + PinyinKeyVector pinyins, TokenVector tokens){ + /* initialize constraints. */ + CandidateConstraints constraints = g_array_new + (FALSE, FALSE, sizeof(lookup_constraint_t)); + g_array_set_size(constraints, pinyins->len); + for ( size_t i = 0; i < constraints->len; ++i ) { + lookup_constraint_t * constraint = &g_array_index + (constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + return pinyin_lookup->get_best_match(pinyins, constraints, tokens); +} + int main(int argc, char * argv[]){ + const char * evals_text = "../../data/evals.text"; + + PinyinCustomSettings custom; + PinyinLargeTable largetable(&custom); + + MemoryChunk * new_chunk = new MemoryChunk; + new_chunk->load("../../data/pinyin_index.bin"); + largetable.load(new_chunk); + + FacadePhraseIndex phrase_index; + new_chunk = new MemoryChunk; + new_chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, new_chunk); + new_chunk = new MemoryChunk; + new_chunk->load("../../data/gbk_char.bin"); + phrase_index.load(2, new_chunk); + + PhraseLargeTable phrases; + new_chunk = new MemoryChunk; + new_chunk->load("../../data/phrase_index.bin"); + phrases.load(new_chunk); + + Bigram system_bigram; + system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); + Bigram user_bigram; + user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); + + PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index, + &system_bigram, &user_bigram); + + /* open evals.text. */ + FILE * evals_file = fopen(evals_text, "r"); + if ( NULL == evals_file ) { + fprintf(stderr, "Can't open file:%s\n", evals_text); + exit(ENOENT); + } + + /* Evaluates the correction rate of test text documents. */ + size_t tested_count = 0; size_t passed_count = 0; + char* linebuf = NULL; size_t size = 0; + TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t)); + TokenVector guessed_tokens = g_array_new + (FALSE, TRUE, sizeof(phrase_token_t)); + PinyinKeyVector pinyins = g_array_new(FALSE, TRUE, sizeof(PinyinKey)); + + phrase_token_t token; + while( getline(&linebuf, &size, evals_file) ) { + if ( feof(evals_file) ) + break; + if ( '\n' == linebuf[strlen(linebuf)-1] ) + linebuf[strlen(linebuf)-1] = '\0'; + + glong phrase_len = 0; + utf16_t * phrase = g_utf8_to_utf16(linebuf, -1, NULL, &phrase_len, NULL); + + token = 0; + if ( 0 != phrase_len ) { + int result = phrases.search( phrase_len, phrase, token); + if ( ! (result & SEARCH_OK) ) + token = 0; + g_free(phrase); + phrase = NULL; + } + + if ( 0 == token ) { + if ( tokens->len ) { /* one test. */ + get_possible_pinyin(&phrase_index, tokens, pinyins); + get_best_match(&pinyin_lookup, pinyins, guessed_tokens); + /* compare the results */ + char * sentence = NULL; char * guessed_sentence = NULL; + pinyin_lookup.convert_to_utf8(tokens, sentence); + pinyin_lookup.convert_to_utf8 + (guessed_tokens, guessed_sentence); + + if (strcmp(sentence, guessed_sentence) != 0){ + fprintf(stderr, "test sentence:%s\n", sentence); + fprintf(stderr, "guessed sentence:%s\n", guessed_sentence); + fprintf(stderr, "the result mis-matches.\n"); + tested_count ++; + } else { + tested_count ++; passed_count ++; + } + + g_free(sentence); g_free(guessed_sentence); + g_array_set_size(tokens, 0); + } + } else { + g_array_append_val(tokens, token); + } + } + + parameter_t rate = passed_count / (parameter_t) tested_count; + printf("correction rate:%f\n", rate); + + fclose(evals_file); + free(linebuf); return 0; } |