summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2011-07-29 20:01:21 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-07-29 20:01:34 +0800
commit: 20ee4a647744f4ae4327754e01301351146c440b (patch)
tree: a455658f487af63c1dd0e807644560fd6535e8d6
parent: 2f1e4e17542d7e91261e8d05321b2886fbe38298 (diff)
download: libpinyin-20ee4a647744f4ae4327754e01301351146c440b.tar.gz
libpinyin-20ee4a647744f4ae4327754e01301351146c440b.tar.xz
libpinyin-20ee4a647744f4ae4327754e01301351146c440b.zip
improve eval correction rate
-rw-r--r-- utils/training/eval_correction_rate.cpp 63
1 files changed, 44 insertions, 19 deletions
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp
index 81aadaa..f301ddb 100644
--- a/utils/training/eval_correction_rate.cpp
+++ b/utils/training/eval_correction_rate.cpp
@@ -72,6 +72,38 @@ bool get_best_match(PinyinLookup * pinyin_lookup,
return pinyin_lookup->get_best_match(pinyins, constraints, tokens);
}
+bool do_one_test(PinyinLookup * pinyin_lookup,
+ FacadePhraseIndex * phrase_index,
+ TokenVector tokens){
+ bool retval = false;
+
+ PinyinKeyVector pinyins = g_array_new(FALSE, TRUE, sizeof(PinyinKey));
+ TokenVector guessed_tokens = g_array_new
+ (FALSE, TRUE, sizeof(phrase_token_t));
+
+ get_possible_pinyin(phrase_index, tokens, pinyins);
+ get_best_match(pinyin_lookup, pinyins, guessed_tokens);
+ /* compare the results */
+ char * sentence = NULL; char * guessed_sentence = NULL;
+ pinyin_lookup->convert_to_utf8(tokens, sentence);
+ pinyin_lookup->convert_to_utf8
+ (guessed_tokens, guessed_sentence);
+
+ if ( strcmp(sentence, guessed_sentence) != 0 ) {
+ fprintf(stderr, "test sentence:%s\n", sentence);
+ fprintf(stderr, "guessed sentence:%s\n", guessed_sentence);
+ fprintf(stderr, "the result mis-matches.\n");
+ retval = false;
+ } else {
+ retval = true;
+ }
+
+ g_free(sentence); g_free(guessed_sentence);
+ g_array_free(pinyins, TRUE);
+ g_array_free(guessed_tokens, TRUE);
+ return retval;
+}
+
int main(int argc, char * argv[]){
const char * evals_text = "../../data/evals.text";
@@ -114,9 +146,6 @@ int main(int argc, char * argv[]){
size_t tested_count = 0; size_t passed_count = 0;
char* linebuf = NULL; size_t size = 0;
TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t));
- TokenVector guessed_tokens = g_array_new
- (FALSE, TRUE, sizeof(phrase_token_t));
- PinyinKeyVector pinyins = g_array_new(FALSE, TRUE, sizeof(PinyinKey));
phrase_token_t token;
while( getline(&linebuf, &size, evals_file) ) {
@@ -139,24 +168,11 @@ int main(int argc, char * argv[]){
if ( 0 == token ) {
if ( tokens->len ) { /* one test. */
- get_possible_pinyin(&phrase_index, tokens, pinyins);
- get_best_match(&pinyin_lookup, pinyins, guessed_tokens);
- /* compare the results */
- char * sentence = NULL; char * guessed_sentence = NULL;
- pinyin_lookup.convert_to_utf8(tokens, sentence);
- pinyin_lookup.convert_to_utf8
- (guessed_tokens, guessed_sentence);
-
- if (strcmp(sentence, guessed_sentence) != 0){
- fprintf(stderr, "test sentence:%s\n", sentence);
- fprintf(stderr, "guessed sentence:%s\n", guessed_sentence);
- fprintf(stderr, "the result mis-matches.\n");
- tested_count ++;
- } else {
+ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
tested_count ++; passed_count ++;
+ } else {
+ tested_count ++;
}
-
- g_free(sentence); g_free(guessed_sentence);
g_array_set_size(tokens, 0);
}
} else {
@@ -164,9 +180,18 @@ int main(int argc, char * argv[]){
}
}
+ if ( tokens->len ) { /* one test. */
+ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
+ tested_count ++; passed_count ++;
+ } else {
+ tested_count ++;
+ }
+ }
+
parameter_t rate = passed_count / (parameter_t) tested_count;
printf("correction rate:%f\n", rate);
+ g_array_free(tokens, TRUE);
fclose(evals_file);
free(linebuf);
return 0;