/* * libpinyin * Library to deal with pinyin. * * Copyright (C) 2010,2013 Peng Wu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include #include "pinyin_internal.h" #include "utils_helper.h" void print_help(){ printf("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n"); } static gboolean gen_extra_enter = FALSE; static gchar * outputfile = NULL; static GOptionEntry entries[] = { {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL}, {NULL} }; /* graph shortest path sentence segment. */ /* Note: * Currently libpinyin only supports ucs4 characters, as this is a * pre-processor tool for raw corpus, it will skip all sentences * which contains non-ucs4 characters. */ enum CONTEXT_STATE{ CONTEXT_INIT, CONTEXT_SEGMENTABLE, CONTEXT_UNKNOWN }; struct SegmentStep{ phrase_token_t m_handle; ucs4_t * m_phrase; size_t m_phrase_len; //use formula W = number of words. Zero handle means one word. guint m_nword; //backtrace information, -1 one step backward. gint m_backward_nstep; public: SegmentStep(){ m_handle = null_token; m_phrase = NULL; m_phrase_len = 0; m_nword = UINT_MAX; m_backward_nstep = -0; } }; bool backtrace(GArray * steps, glong phrase_len, GArray * strings); /* Note: do not free phrase, as it is used by strings (array of segment). */ bool segment(FacadePhraseTable3 * phrase_table, FacadePhraseIndex * phrase_index, GArray * current_ucs4, GArray * strings /* Array of SegmentStep. */){ ucs4_t * phrase = (ucs4_t *)current_ucs4->data; guint phrase_len = current_ucs4->len; /* Prepare for shortest path segment dynamic programming. */ GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); SegmentStep step; for ( glong i = 0; i < phrase_len + 1; ++i ){ g_array_append_val(steps, step); } SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0); first_step->m_nword = 0; PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index->prepare_tokens(tokens); for ( glong i = 0; i < phrase_len + 1; ++i ) { SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i); size_t nword = step_begin->m_nword; for ( glong k = i + 1; k < phrase_len + 1; ++k ) { size_t len = k - i; ucs4_t * cur_phrase = phrase + i; phrase_token_t token = null_token; int result = phrase_table->search(len, cur_phrase, tokens); int num = get_first_token(tokens, token); if ( !(result & SEARCH_OK) ){ token = null_token; if ( 1 != len ) continue; } ++nword; SegmentStep * step_end = &g_array_index(steps, SegmentStep, k); if ( nword < step_end->m_nword ) { step_end->m_handle = token; step_end->m_phrase = cur_phrase; step_end->m_phrase_len = len; step_end->m_nword = nword; step_end->m_backward_nstep = i - k; } if ( !(result & SEARCH_CONTINUED) ) break; } } phrase_index->destroy_tokens(tokens); return backtrace(steps, phrase_len, strings); } bool backtrace(GArray * steps, glong phrase_len, GArray * strings){ /* backtracing to get the result. */ size_t cur_step = phrase_len; g_array_set_size(strings, 0); while ( cur_step ){ SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step); g_array_append_val(strings, *step); cur_step = cur_step + step->m_backward_nstep; /* intended to avoid leaking internal informations. */ step->m_nword = 0; step->m_backward_nstep = 0; } /* reverse the strings. */ for ( size_t i = 0; i < strings->len / 2; ++i ) { SegmentStep * head, * tail; head = &g_array_index(strings, SegmentStep, i); tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i ); SegmentStep tmp; tmp = *head; *head = *tail; *tail = tmp; } g_array_free(steps, TRUE); return true; } bool deal_with_segmentable(FacadePhraseTable3 * phrase_table, FacadePhraseIndex * phrase_index, GArray * current_ucs4, FILE * output){ /* do segment stuff. */ GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); segment(phrase_table, phrase_index, current_ucs4, strings); /* print out the split phrase. */ for ( glong i = 0; i < strings->len; ++i ) { SegmentStep * step = &g_array_index(strings, SegmentStep, i); char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL); fprintf(output, "%d %s\n", step->m_handle, string); g_free(string); } g_array_free(strings, TRUE); return true; } bool deal_with_unknown(GArray * current_ucs4, FILE * output){ char * result_string = g_ucs4_to_utf8 ( (ucs4_t *) current_ucs4->data, current_ucs4->len, NULL, NULL, NULL); fprintf(output, "%d %s\n", null_token, result_string); g_free(result_string); return true; } int main(int argc, char * argv[]){ FILE * input = stdin; FILE * output = stdout; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- shortest path segment"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } if (outputfile) { output = fopen(outputfile, "w"); if (NULL == output) { perror("open file failed"); exit(EINVAL); } } if (argc > 2) { fprintf(stderr, "too many arguments.\n"); exit(EINVAL); } if (2 == argc) { input = fopen(argv[1], "r"); if (NULL == input) { perror("open file failed"); exit(EINVAL); } } SystemTableInfo2 system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } /* init phrase table */ FacadePhraseTable3 phrase_table; phrase_table.load(SYSTEM_PHRASE_INDEX, NULL); /* init phrase index */ FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); CONTEXT_STATE state, next_state; GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, input)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } /* check non-ucs4 characters. */ const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); fprintf(output, "%d \n", null_token); continue; } /* only new-line persists. */ if ( 0 == num_of_chars ) { fprintf(output, "%d \n", null_token); continue; } state = CONTEXT_INIT; int result = phrase_table.search( 1, sentence, tokens); g_array_append_val( current_ucs4, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; else state = CONTEXT_UNKNOWN; for ( int i = 1; i < num_of_chars; ++i) { int result = phrase_table.search( 1, sentence + i, tokens); if ( result & SEARCH_OK ) next_state = CONTEXT_SEGMENTABLE; else next_state = CONTEXT_UNKNOWN; if ( state == next_state ){ g_array_append_val(current_ucs4, sentence[i]); continue; } assert ( state != next_state ); if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_table, &phrase_index, current_ucs4, output); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4, output); /* save the current character */ g_array_set_size(current_ucs4, 0); g_array_append_val(current_ucs4, sentence[i]); state = next_state; } if ( current_ucs4->len ) { /* this seems always true. */ if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_table, &phrase_index, current_ucs4, output); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4, output); g_array_set_size(current_ucs4, 0); } /* print extra enter */ if ( gen_extra_enter ) fprintf(output, "%d \n", null_token); g_free(sentence); } phrase_index.destroy_tokens(tokens); /* print enter at file tail */ fprintf(output, "%d \n", null_token); g_array_free(current_ucs4, TRUE); free(linebuf); fclose(input); fclose(output); return 0; }