diff options
-rw-r--r-- | utils/storage/import_interpolation.cpp | 128 |
1 files changed, 118 insertions, 10 deletions
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 6e88fdb..fb5a18a 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -44,6 +44,9 @@ static GHashTable * required = NULL;
 static char * linebuf = NULL;
 static size_t len = 0;
 
+phrase_token_t string_to_token(PhraseLargeTable * phrases,
+                               const char * string);
+
 bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
                    FacadePhraseIndex * phrase_index);
 
@@ -51,6 +54,20 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
                   FacadePhraseIndex * phrase_index,
                   Bigram * bigram);
 
+/* Read the next line of input into the shared linebuf and strip its
+ * trailing newline, if any.  Returns getline()'s result, which is -1
+ * when no further line could be read. */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    if ( result == -1 )
+        return result;
+
+    /* a final line without '\n' must keep its last character */
+    if ( result > 0 && linebuf[result - 1] == '\n' )
+        linebuf[result - 1] = '\0';
+    return result;
+}
+
 bool parse_body(FILE * input, PhraseLargeTable * phrases,
                 FacadePhraseIndex * phrase_index,
                 Bigram * bigram){
@@ -60,25 +77,24 @@ bool parse_body(FILE * input, PhraseLargeTable * phrases,
     assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
     assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
 
-    do{
+    do {
     retry:
         assert(taglib_read(linebuf, line_type, values, required));
         switch(line_type) {
         case END_LINE:
             goto end;
-            break;
         case GRAM_1_LINE:
+            my_getline(input);
             parse_unigram(input, phrases, phrase_index);
             goto retry;
-            break;
         case GRAM_2_LINE:
+            my_getline(input);
             parse_bigram(input, phrases, phrase_index, bigram);
             goto retry;
-            break;
         default:
             assert(false);
         }
-    } while (getline(&linebuf, &len, input) != -1) ;
+    } while (my_getline(input) != -1) ;
 
  end:
     taglib_pop_state();
@@ -87,14 +103,46 @@ bool parse_body(FILE * input, PhraseLargeTable * phrases,
 
 bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
                    FacadePhraseIndex * phrase_index){
+    taglib_push_state();
+
+    assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 1, "count", ""));
+
+    do {
+        assert(taglib_read(linebuf, line_type, values, required));
+        switch(line_type) {
+        case GRAM_1_ITEM_LINE:{
+            /* handle \item in \1-gram */
+            assert(values->len == 1);
+            const char * string = (const char *)g_ptr_array_index(values, 0);
+            phrase_token_t token = string_to_token(phrases, string);
+            char * value = NULL;
+            assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
+            glong count = atol(value);
+            phrase_index->add_unigram_frequency(token, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1) ;
 
+ end:
+    taglib_pop_state();
     return true;
 }
 
 bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
                   FacadePhraseIndex * phrase_index,
                   Bigram * bigram){
+    taglib_push_state();
 
+    assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2, "count", ""));
+
+    taglib_pop_state();
     return true;
 }
 
@@ -128,21 +176,81 @@ int main(int argc, char * argv[]){
     values = g_ptr_array_new();
     required = g_hash_table_new(g_str_hash, g_str_equal);
 
+    //enter "\data" line
     assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
-    getline(&linebuf, &len, input);
-    linebuf[strlen(linebuf) - 1] = '\0';
+    my_getline(input);
 
+    //read "\data" line
     assert(taglib_read(linebuf, line_type, values, required));
     assert(line_type == BEGIN_LINE);
     char * value = NULL;
     assert(g_hash_table_lookup_extended(required, "model", NULL, (gpointer *)&value));
-    assert(strcmp("interpolation", value) == 0);
+    if ( !( strcmp("interpolation", value) == 0 ) ) {
+        fprintf(stderr, "error: interpolation model expected.\n");
+        exit(1);
+    }
 
-    getline(&linebuf, &len, input);
-    linebuf[strlen(linebuf) - 1] = '\0';
+    my_getline(input);
     parse_body(input, &phrases, &phrase_index, &bigram);
 
     taglib_fini();
 
+    chunk = new MemoryChunk;
+    phrase_index.store(1, chunk);
+    chunk->save("../../data/gb_char.bin");
+    phrase_index.load(1, chunk);
+
+    chunk = new MemoryChunk;
+    phrase_index.store(2, chunk);
+    chunk->save("../../data/gbk_char.bin");
+    phrase_index.load(2, chunk);
+
     return 0;
 }
+
+/* Map a reserved "<...>" marker to its phrase token by scanning the
+ * NULL-terminated table below.  An unknown marker is reported on
+ * stderr and yields 0. */
+static phrase_token_t special_string_to_token(const char * string){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    const token_pair * pair = tokens;
+    while (pair->string) {
+        if ( strcmp(string, pair->string ) == 0 ){
+            return pair->token;
+        }
+        /* advance, or a non-matching string would spin here forever */
+        pair++;
+    }
+
+    fprintf(stderr, "error: unknown token:%s.\n", string);
+    return 0;
+}
+
+/* Resolve a UTF-8 string to a phrase token.  Strings starting with '<'
+ * are handed to special_string_to_token(); anything else is converted
+ * to UTF-16 and searched for in phrases.  A search that does not
+ * report SEARCH_OK is logged on stderr. */
+phrase_token_t string_to_token(PhraseLargeTable * phrases, const char * string){
+    phrase_token_t token = 0;
+    if ( string[0] == '<' ) {
+        return special_string_to_token(string);
+    }
+
+    glong phrase_len = g_utf8_strlen(string, -1);
+    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
+    int result = phrases->search(phrase_len, phrase, token);
+    if ( !(result & SEARCH_OK) )
+        fprintf(stderr, "error: unknown token:%s.\n", string);
+
+    g_free(phrase);
+    return token;
+}