summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-07-25 18:38:28 +0800
committerPeng Wu <alexepico@gmail.com>2011-07-25 18:45:57 +0800
commit63e1d37800e2f2ecc71e347c79f85c772042d8d2 (patch)
treee472e3723c3a2d78c6a2d7eb7ef39ac4d0005c4a
parentb7ecd0860c793f25ed97fc717be3b4f07cbd8f3e (diff)
downloadlibpinyin-63e1d37800e2f2ecc71e347c79f85c772042d8d2.tar.gz
libpinyin-63e1d37800e2f2ecc71e347c79f85c772042d8d2.tar.xz
libpinyin-63e1d37800e2f2ecc71e347c79f85c772042d8d2.zip
write eval correction rate
-rw-r--r--utils/training/eval_correction_rate.cpp147
1 files changed, 146 insertions, 1 deletions
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp
index 6009251..81aadaa 100644
--- a/utils/training/eval_correction_rate.cpp
+++ b/utils/training/eval_correction_rate.cpp
@@ -20,9 +20,154 @@
*/
-
#include "pinyin.h"
+/* A token sequence: used below as a GArray whose elements are phrase_token_t. */
+typedef MatchResults TokenVector;
+
+/* Writes the one-line usage message to standard output. */
+void print_help(){
+    fputs("Usage: eval_correction_rate\n", stdout);
+}
+
+/**
+ * get_possible_pinyin:
+ * @phrase_index: phrase index used to fetch the phrase item of each token.
+ * @tokens: input array of phrase_token_t.
+ * @pinyins: output array of PinyinKey; it is emptied first and then filled.
+ *
+ * For every token, selects the pronunciation with the highest frequency
+ * (the first pronunciation is kept on ties and when every frequency is
+ * zero) and appends get_phrase_length() keys of it to @pinyins.
+ *
+ * Returns: always true; a failed pronunciation lookup trips an assert.
+ */
+bool get_possible_pinyin(FacadePhraseIndex * phrase_index,
+                         TokenVector tokens, PinyinKeyVector pinyins){
+    PinyinKey buffer[MAX_PHRASE_LENGTH];
+    g_array_set_size(pinyins, 0);
+
+    for (size_t i = 0; i < tokens->len; ++i){
+        phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i);
+        PhraseItem item;
+        phrase_index->get_phrase_item(*token, item);
+
+        size_t pinyin_index = 0;
+        guint32 max_freq = 0;
+        guint32 freq = 0;
+        bool found = false;
+
+        for (size_t m = 0; m < item.get_n_pronunciation(); ++m) {
+            freq = 0;
+            /* The lookup must stay outside assert(): with NDEBUG defined
+             * the assert argument is not evaluated at all. */
+            found = item.get_nth_pronunciation(m, buffer, freq);
+            assert(found);
+            if (freq > max_freq) {
+                pinyin_index = m;
+                max_freq = freq;
+            }
+        }
+
+        /* buffer holds whichever pronunciation was read last, so fetch
+         * the selected one again before copying it out. */
+        found = item.get_nth_pronunciation(pinyin_index, buffer, freq);
+        assert(found);
+        assert(max_freq == freq);
+        (void) found; /* only read by assert(); silences NDEBUG builds. */
+
+        guint8 len = item.get_phrase_length();
+        g_array_append_vals(pinyins, buffer, len);
+    }
+    return true;
+}
+
+/**
+ * get_best_match:
+ * @pinyin_lookup: lookup object that performs the search.
+ * @pinyins: input array of pinyin keys.
+ * @tokens: array handed to the lookup to receive the result.
+ *
+ * Runs PinyinLookup::get_best_match() with one NO_CONSTRAINT entry per
+ * pinyin key.
+ *
+ * Returns: the value returned by the lookup.
+ */
+bool get_best_match(PinyinLookup * pinyin_lookup,
+                    PinyinKeyVector pinyins, TokenVector tokens){
+    /* initialize constraints: one unconstrained entry per key. */
+    CandidateConstraints constraints = g_array_new
+        (FALSE, FALSE, sizeof(lookup_constraint_t));
+    g_array_set_size(constraints, pinyins->len);
+    for (size_t i = 0; i < constraints->len; ++i) {
+        lookup_constraint_t * constraint = &g_array_index
+            (constraints, lookup_constraint_t, i);
+        constraint->m_type = NO_CONSTRAINT;
+    }
+
+    bool retval = pinyin_lookup->get_best_match(pinyins, constraints, tokens);
+
+    /* The array is created on every call, so it has to be released here.
+     * NOTE(review): assumes the lookup does not read the constraints
+     * after get_best_match() returns - confirm against PinyinLookup. */
+    g_array_free(constraints, TRUE);
+    return retval;
+}
+
+/*
+ * Evaluates the correction rate over ../../data/evals.text.
+ *
+ * Each input line is looked up in the phrase table. Consecutive lines
+ * that resolve to a token form one test sentence; a line that does not
+ * resolve (or the end of the input) closes the sentence. For every
+ * sentence the most frequent pinyin of each token is fed back through
+ * the pinyin lookup and the guessed sentence is compared with the
+ * original one. Mismatches are reported on stderr and the ratio
+ * passed/tested is printed on stdout.
+ */
int main(int argc, char * argv[]){
+    const char * evals_text = "../../data/evals.text";
+
+    PinyinCustomSettings custom;
+    PinyinLargeTable largetable(&custom);
+
+    /* NOTE(review): the chunks are never deleted here; presumably each
+     * load() takes ownership of the chunk it receives - confirm. */
+    MemoryChunk * new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/pinyin_index.bin");
+    largetable.load(new_chunk);
+
+    FacadePhraseIndex phrase_index;
+    new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, new_chunk);
+    new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, new_chunk);
+
+    PhraseLargeTable phrases;
+    new_chunk = new MemoryChunk;
+    new_chunk->load("../../data/phrase_index.bin");
+    phrases.load(new_chunk);
+
+    Bigram system_bigram;
+    system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
+    Bigram user_bigram;
+    user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE);
+
+    PinyinLookup pinyin_lookup(&custom, &largetable, &phrase_index,
+                               &system_bigram, &user_bigram);
+
+    /* open evals.text. */
+    FILE * evals_file = fopen(evals_text, "r");
+    if ( NULL == evals_file ) {
+        fprintf(stderr, "Can't open file:%s\n", evals_text);
+        exit(ENOENT);
+    }
+
+    /* Evaluates the correction rate of test text documents. */
+    size_t tested_count = 0; size_t passed_count = 0;
+    char * linebuf = NULL; size_t size = 0;
+    TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t));
+    TokenVector guessed_tokens = g_array_new
+        (FALSE, TRUE, sizeof(phrase_token_t));
+    PinyinKeyVector pinyins = g_array_new(FALSE, TRUE, sizeof(PinyinKey));
+
+    for (;;) {
+        /* getline() returns -1 on end of file and on error. Either one
+         * ends the input, and is treated like a separator line so that
+         * a sentence still pending at that point is evaluated too. */
+        bool at_end = (-1 == getline(&linebuf, &size, evals_file));
+
+        phrase_token_t token = 0;
+        if ( !at_end ) {
+            size_t line_len = strlen(linebuf);
+            if ( line_len > 0 && '\n' == linebuf[line_len - 1] )
+                linebuf[line_len - 1] = '\0';
+
+            glong phrase_len = 0;
+            utf16_t * phrase = g_utf8_to_utf16
+                (linebuf, -1, NULL, &phrase_len, NULL);
+
+            if ( NULL != phrase && 0 != phrase_len ) {
+                int result = phrases.search(phrase_len, phrase, token);
+                if ( ! (result & SEARCH_OK) )
+                    token = 0;
+            }
+            /* released on every path, including the empty-line case. */
+            g_free(phrase);
+        }
+
+        if ( 0 == token ) {
+            if ( tokens->len ) { /* one test. */
+                get_possible_pinyin(&phrase_index, tokens, pinyins);
+                get_best_match(&pinyin_lookup, pinyins, guessed_tokens);
+                /* compare the results */
+                char * sentence = NULL; char * guessed_sentence = NULL;
+                pinyin_lookup.convert_to_utf8(tokens, sentence);
+                pinyin_lookup.convert_to_utf8
+                    (guessed_tokens, guessed_sentence);
+
+                tested_count ++;
+                if ( strcmp(sentence, guessed_sentence) != 0 ) {
+                    fprintf(stderr, "test sentence:%s\n", sentence);
+                    fprintf(stderr, "guessed sentence:%s\n", guessed_sentence);
+                    fprintf(stderr, "the result mis-matches.\n");
+                } else {
+                    passed_count ++;
+                }
+
+                g_free(sentence); g_free(guessed_sentence);
+                g_array_set_size(tokens, 0);
+            }
+        } else {
+            g_array_append_val(tokens, token);
+        }
+
+        if ( at_end )
+            break;
+    }
+
+    /* report 0 rather than 0/0 when no sentence was tested. */
+    parameter_t rate = 0;
+    if ( tested_count )
+        rate = passed_count / (parameter_t) tested_count;
+    printf("correction rate:%f\n", rate);
+
+    fclose(evals_file);
+    free(linebuf);
+    g_array_free(pinyins, TRUE);
+    g_array_free(guessed_tokens, TRUE);
+    g_array_free(tokens, TRUE);
return 0;
}