From b328718959470e2a104e5aa492e13fd71ff14162 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 17 Aug 2010 12:19:00 +0800 Subject: write gen_unigram --- utils/storage/export_interpolation.cpp | 79 ++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'utils/storage/export_interpolation.cpp') diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp index a4ef9ec..b6dd86f 100644 --- a/utils/storage/export_interpolation.cpp +++ b/utils/storage/export_interpolation.cpp @@ -1,5 +1,84 @@ #include +#include +#include +#include "memory_chunk.h" +#include "novel_types.h" +#include "phrase_index.h" +#include "ngram.h" + +/* export interpolation model as textual format */ + +void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index); +void gen_bigram(FILE * output, Bigram * bigram); +const char * token_to_string(phrase_token_t token); + +void begin_data(FILE * file){ + fprintf(file, "\\data\n"); +} + +void end_data(FILE * file){ + fprintf(file, "\\end\n"); +} int main(int argc, char * argv[]){ + FILE * file = stdout; + const char * bigram_filename = "../../data/bigram.db"; + + FacadePhraseIndex phrase_index; + + //gb_char binary file + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + + //gbk_char binary file + chunk = new MemoryChunk; + chunk->load("../../data/gbk_char.bin"); + phrase_index.load(2, chunk); + + Bigram bigram; + bigram.attach(NULL, bigram_filename); + + begin_data(file); + + gen_unigram(stdout, &phrase_index); + gen_bigram(stdout, &bigram); + + end_data(stdout); return 0; } + +void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) { + fprintf(output, "\\1-gram\n"); + for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) { + /* Generate each phrase index library */ + const phrase_token_t min = PHRASE_INDEX_MAKE_TOKEN(i, token_min); + const phrase_token_t max = PHRASE_INDEX_MAKE_TOKEN(i, token_max); + + PhraseItem item; + utf16_t buffer[MAX_PHRASE_LENGTH]; + for ( size_t j = min; j < max; j++) { + int result = phrase_index->get_phrase_item(j, item); + if ( result == ERROR_NO_SUB_PHRASE_INDEX || + result == ERROR_OUT_OF_RANGE) + break; + if ( result == ERROR_NO_ITEM ) + continue; + assert( result != ERROR_FILE_CORRUPTION ); + /* when get_phrase_item, the next error is impossible */ + assert( result != ERROR_INTEGER_OVERFLOW ); + assert( result == ERROR_OK); + + size_t freq = item.get_unigram_frequency(); + item.get_phrase_string(buffer); + guint8 length = item.get_phrase_length(); + gchar * phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL); + fprintf(output, "\\item %s %d\n", phrase, freq); + g_free(phrase); + } + } +} + +void gen_bigram(FILE * output, Bigram * bigram){ + +} -- cgit