diff options
author | Peng Wu <alexepico@gmail.com> | 2010-10-11 15:06:20 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2010-10-11 15:06:20 +0800 |
commit | 774f0353ae5a8f84b228b9a91a0b09e27e065f2d (patch) | |
tree | cdeb7b09a739dd5fe40c8f1d13e3f2dbc0f330d7 /utils/storage/import_interpolation.cpp | |
parent | ed9e811631b71ec835a9f25243c354f6050c885e (diff) | |
download | libpinyin-774f0353ae5a8f84b228b9a91a0b09e27e065f2d.tar.gz libpinyin-774f0353ae5a8f84b228b9a91a0b09e27e065f2d.tar.xz libpinyin-774f0353ae5a8f84b228b9a91a0b09e27e065f2d.zip |
write import interpolation in progress
Diffstat (limited to 'utils/storage/import_interpolation.cpp')
-rw-r--r-- | utils/storage/import_interpolation.cpp | 122 |
1 files changed, 122 insertions, 0 deletions
```diff
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 536219d..6e88fdb 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -20,7 +20,129 @@
  */
 
 #include <stdio.h>
+#include <glib.h>
+#include "novel_types.h"
+#include "memory_chunk.h"
+#include "phrase_index.h"
+#include "ngram.h"
+#include "phrase_large_table.h"
+#include "tag_utility.h"
+
+enum LINE_TYPE{
+    BEGIN_LINE = 1,
+    END_LINE,
+    GRAM_1_LINE,
+    GRAM_2_LINE,
+    GRAM_1_ITEM_LINE,
+    GRAM_2_ITEM_LINE
+};
+
+static int line_type = 0;
+static GPtrArray * values = NULL;
+static GHashTable * required = NULL;
+/* variables for line buffer. */
+static char * linebuf = NULL;
+static size_t len = 0;
+
+bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
+                   FacadePhraseIndex * phrase_index);
+
+bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
+                  FacadePhraseIndex * phrase_index,
+                  Bigram * bigram);
+
+bool parse_body(FILE * input, PhraseLargeTable * phrases,
+                FacadePhraseIndex * phrase_index,
+                Bigram * bigram){
+    taglib_push_state();
+
+    assert(taglib_add_tag(END_LINE, "\\end", 0, "", ""));
+    assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
+    assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
+
+    do{
+    retry:
+        assert(taglib_read(linebuf, line_type, values, required));
+        switch(line_type) {
+        case END_LINE:
+            goto end;
+            break;
+        case GRAM_1_LINE:
+            parse_unigram(input, phrases, phrase_index);
+            goto retry;
+            break;
+        case GRAM_2_LINE:
+            parse_bigram(input, phrases, phrase_index, bigram);
+            goto retry;
+            break;
+        default:
+            assert(false);
+        }
+    } while (getline(&linebuf, &len, input) != -1) ;
+
+ end:
+    taglib_pop_state();
+    return true;
+}
+
+bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
+                   FacadePhraseIndex * phrase_index){
+
+    return true;
+}
+
+bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
+                  FacadePhraseIndex * phrase_index,
+                  Bigram * bigram){
+
+    return true;
+}
 
 int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    const char * bigram_filename = "../../data/bigram.db";
+
+    PhraseLargeTable phrases;
+
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load("../../data/phrase_index.bin");
+    phrases.load(chunk);
+
+    FacadePhraseIndex phrase_index;
+
+    //gb_char binary file
+    chunk = new MemoryChunk;
+    chunk->load("../../data/gb_char.bin");
+    phrase_index.load(1, chunk);
+
+    //gbk_char binary file
+    chunk = new MemoryChunk;
+    chunk->load("../../data/gbk_char.bin");
+    phrase_index.load(2, chunk);
+
+    Bigram bigram;
+    bigram.attach(NULL, bigram_filename);
+
+    taglib_init();
+
+    values = g_ptr_array_new();
+    required = g_hash_table_new(g_str_hash, g_str_equal);
+
+    assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
+    getline(&linebuf, &len, input);
+    linebuf[strlen(linebuf) - 1] = '\0';
+
+    assert(taglib_read(linebuf, line_type, values, required));
+    assert(line_type == BEGIN_LINE);
+    char * value = NULL;
+    assert(g_hash_table_lookup_extended(required, "model", NULL, (gpointer *)&value));
+    assert(strcmp("interpolation", value) == 0);
+
+    getline(&linebuf, &len, input);
+    linebuf[strlen(linebuf) - 1] = '\0';
+    parse_body(input, &phrases, &phrase_index, &bigram);
+
+    taglib_fini();
+
     return 0;
 }
```