/* * libpinyin * Library to deal with pinyin. * * Copyright (C) 2010 Peng Wu * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include "pinyin_internal.h" #include "utils_helper.h" static const gchar * table_dir = "."; static GOptionEntry entries[] = { {"table-dir", 0, 0, G_OPTION_ARG_FILENAME, &table_dir, "table directory", NULL}, {NULL} }; enum LINE_TYPE{ BEGIN_LINE = 1, END_LINE, GRAM_1_LINE, GRAM_2_LINE, GRAM_1_ITEM_LINE, GRAM_2_ITEM_LINE }; static int line_type = 0; static GPtrArray * values = NULL; static GHashTable * required = NULL; /* variables for line buffer. */ static char * linebuf = NULL; static size_t len = 0; bool parse_headline(); bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index); bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index, Bigram * bigram); static ssize_t my_getline(FILE * input){ ssize_t result = getline(&linebuf, &len, input); if ( result == -1 ) return result; if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } return result; } bool parse_headline(){ /* enter "\data" line */ assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "")); /* read "\data" line */ if ( !taglib_read(linebuf, line_type, values, required) ) { fprintf(stderr, "error: interpolation model expected.\n"); return false; } assert(line_type == BEGIN_LINE); /* check header */ TAGLIB_GET_TAGVALUE(const char *, model, (const char *)); if ( !( strcmp("interpolation", model) == 0 ) ) { fprintf(stderr, "error: interpolation model expected.\n"); return false; } return true; } bool parse_body(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index, Bigram * bigram){ taglib_push_state(); assert(taglib_add_tag(END_LINE, "\\end", 0, "", "")); assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "")); assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "")); do { retry: assert(taglib_read(linebuf, line_type, values, required)); switch(line_type) { case END_LINE: goto end; case GRAM_1_LINE: my_getline(input); parse_unigram(input, phrase_table, phrase_index); goto retry; case GRAM_2_LINE: my_getline(input); parse_bigram(input, phrase_table, phrase_index, bigram); goto retry; default: assert(false); } } while (my_getline(input) != -1) ; end: taglib_pop_state(); return true; } bool parse_unigram(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index){ taglib_push_state(); assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 2, "count", "")); do { assert(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_1_ITEM_LINE:{ /* handle \item in \1-gram */ TAGLIB_GET_TOKEN(token, 0); TAGLIB_GET_PHRASE_STRING(word, 1); assert(taglib_validate_token_with_string (phrase_index, token, word)); TAGLIB_GET_TAGVALUE(glong, count, atol); phrase_index->add_unigram_frequency(token, count); break; } case END_LINE: case GRAM_1_LINE: case GRAM_2_LINE: goto end; default: assert(false); } } while (my_getline(input) != -1); end: taglib_pop_state(); return true; } bool parse_bigram(FILE * input, PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index, Bigram * bigram){ taglib_push_state(); assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 4, "count", "")); phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL; do { assert(taglib_read(linebuf, line_type, values, required)); switch (line_type) { case GRAM_2_ITEM_LINE:{ /* handle \item in \2-gram */ /* two tokens */ TAGLIB_GET_TOKEN(token1, 0); TAGLIB_GET_PHRASE_STRING(word1, 1); assert(taglib_validate_token_with_string (phrase_index, token1, word1)); TAGLIB_GET_TOKEN(token2, 2); TAGLIB_GET_PHRASE_STRING(word2, 3); assert(taglib_validate_token_with_string (phrase_index, token2, word2)); TAGLIB_GET_TAGVALUE(glong, count, atol); if ( last_token != token1 ) { if ( last_token && last_single_gram ) { bigram->store(last_token, last_single_gram); delete last_single_gram; /* safe guard */ last_token = null_token; last_single_gram = NULL; } SingleGram * single_gram = NULL; bigram->load(token1, single_gram); /* create the new single gram */ if ( single_gram == NULL ) single_gram = new SingleGram; last_token = token1; last_single_gram = single_gram; } /* save the freq */ assert(NULL != last_single_gram); guint32 total_freq = 0; assert(last_single_gram->get_total_freq(total_freq)); assert(last_single_gram->insert_freq(token2, count)); total_freq += count; assert(last_single_gram->set_total_freq(total_freq)); break; } case END_LINE: case GRAM_1_LINE: case GRAM_2_LINE: goto end; default: assert(false); } } while (my_getline(input) != -1); end: if ( last_token && last_single_gram ) { bigram->store(last_token, last_single_gram); delete last_single_gram; //safe guard last_token = 0; last_single_gram = NULL; } taglib_pop_state(); return true; } int main(int argc, char * argv[]){ FILE * input = stdin; const char * bigram_filename = SYSTEM_BIGRAM; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- import interpolation model"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } SystemTableInfo2 system_table_info; gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL); bool retval = system_table_info.load(filename); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } g_free(filename); PhraseLargeTable3 phrase_table; retval = phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY); if (!retval) { fprintf(stderr, "open %s failed!\n", SYSTEM_PHRASE_INDEX); exit(ENOENT); } FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); Bigram bigram; retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); if (!retval) { fprintf(stderr, "open %s failed!\n", bigram_filename); exit(ENOENT); } taglib_init(); values = g_ptr_array_new(); required = g_hash_table_new(g_str_hash, g_str_equal); /* read first line */ ssize_t result = my_getline(input); if ( result == -1 ) { fprintf(stderr, "empty file input.\n"); exit(ENODATA); } if (!parse_headline()) exit(ENODATA); result = my_getline(input); if ( result != -1 ) parse_body(input, &phrase_table, &phrase_index, &bigram); taglib_fini(); if (!save_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); return 0; }