From 2eba4a047a2639150c9448b04dc0af568f9b7538 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Wed, 17 Apr 2013 15:29:13 +0800 Subject: write main function --- utils/segment/mergeseq.cpp | 148 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 123 insertions(+), 25 deletions(-) diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp index 9064b45..1f33ccf 100644 --- a/utils/segment/mergeseq.cpp +++ b/utils/segment/mergeseq.cpp @@ -19,7 +19,11 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ +#include +#include +#include #include "pinyin_internal.h" +#include "utils_helper.h" void print_help(){ @@ -27,6 +31,17 @@ void print_help(){ } +static gboolean gen_extra_enter = FALSE; +static gchar * outputfile = NULL; + +static GOptionEntry entries[] = +{ + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, + {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL}, + {NULL} +}; + + /* data structure definition. */ typedef struct{ phrase_token_t m_token; @@ -39,12 +54,12 @@ typedef GArray * UnicodeCharVector; /* GArray of TokenInfo. 
*/ typedef GArray * TokenInfoVector; -gint calculate_sequence_length(TokenInfoVector * tokens) { +gint calculate_sequence_length(TokenInfoVector * tokeninfos) { gint len = 0; size_t i = 0; - for (i = 0; i < tokens->len; ++i) { - TokenInfo * token_info = &g_array_index(tokens, TokenInfo, i); + for (i = 0; i < tokeninfos->len; ++i) { + TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, i); len += token_info->len; } @@ -58,8 +73,8 @@ gint calculate_sequence_length(TokenInfoVector * tokens) { bool merge_sequence(PhraseLargeTable2 * phrase_table, FacadePhraseIndex * phrase_index, UnicodeCharVector * unichars, - TokenInfoVector * tokens) { - assert(tokens->len > 0); + TokenInfoVector * tokeninfos) { + assert(tokeninfos->len > 0); bool found = false; TokenInfo * token_info = NULL; @@ -73,8 +88,8 @@ bool merge_sequence(PhraseLargeTable2 * phrase_table, phrase_index->prepare_tokens(tokens); /* search the merge sequence. */ - size_t index = tokens->len; - gint seq_len = calculate_sequence_length(tokens); + size_t index = tokeninfos->len; + gint seq_len = calculate_sequence_length(tokeninfos); while (seq_len > 0) { /* do phrase table search. */ phrase_index->clear_tokens(tokens); @@ -87,7 +102,7 @@ bool merge_sequence(PhraseLargeTable2 * phrase_table, } --index; - token_info = &g_array_index(tokens, TokenInfo, index); + token_info = &g_array_index(tokeninfos, TokenInfo, index); seq_len -= token_info->m_token_len; } @@ -96,24 +111,24 @@ bool merge_sequence(PhraseLargeTable2 * phrase_table, /* push the merged sequence back. */ if (found) { /* pop up the origin sequence. 
*/ - g_array_remove_range(tokens, 0, index); + g_array_remove_range(tokeninfos, 0, index); TokenInfo info; info.m_token = token; info.m_token_len = seq_len; - g_array_prepend_val(tokens, info); + g_array_prepend_val(tokeninfos, info); } return found; } bool pop_first_token(UnicodeCharVector * unichars, - TokenInfoVector * tokens, + TokenInfoVector * tokeninfos, FILE * output) { const gunichar * ucs4_str = (const gunichar *)unichars->data; /* pop it. */ - token_info = &g_array_index(tokens, TokenInfo, 0); + token_info = &g_array_index(tokeninfos, TokenInfo, 0); token = token_info->m_token; token_len = token_info->m_token_len; @@ -124,7 +139,7 @@ bool pop_first_token(UnicodeCharVector * unichars, g_free(utf8_str); g_array_remove_range(unichars, 0, token_len); - g_array_remove_index(tokens, 0); + g_array_remove_index(tokeninfos, 0); return true; } @@ -132,21 +147,21 @@ bool pop_first_token(UnicodeCharVector * unichars, bool feed_line(PhraseLargeTable2 * phrase_table, FacadePhraseIndex * phrase_index, UnicodeCharVector * unichars, - TokenInfoVector * tokens, - const char * line, + TokenInfoVector * tokeninfos, + const char * linebuf, FILE * output) { - TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, line); + TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); if (null_token == token) { /* empty the queue. 
*/
-        while (0 != tokens->len) {
-            merge_sequence(phrase_table, phrase_index, unichars, tokens);
-            pop_first_token(unichars, tokens, output);
+        while (0 != tokeninfos->len) {
+            merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+            pop_first_token(unichars, tokeninfos, output);
         }
 
         assert(0 == unichars->len);
-        assert(0 == tokens->len);
+        assert(0 == tokeninfos->len);
         return false;
     }
 
@@ -157,19 +172,127 @@ bool feed_line(PhraseLargeTable2 * phrase_table,
     TokenInfo info;
     info.m_token = token;
     info.m_token_len = len;
-    g_array_append_val(tokens, info);
+    g_array_append_val(tokeninfos, info);
 
     ucs4_t buffer[MAX_PHRASE_LENGTH];
     item.get_phrase_string(buffer);
     g_array_append_vals(unichars, buffer, len);
 
     /* probe merge sequence. */
-    gint len = calculate_sequence_length(tokens);
+    gint len = calculate_sequence_length(tokeninfos);
     while (len >= MAX_PHRASE_LENGTH) {
-        merge_sequence(phrase_table, phrase_index, unichars, tokens);
-        pop_first_token(unichars, tokens, output);
-        len = calculate_sequence_length(tokens);
+        merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+        pop_first_token(unichars, tokeninfos, output);
+        len = calculate_sequence_length(tokeninfos);
     }
 
     return true;
 }
+
+
+/* Entry point: parse the options, load the system tables, then pass
+ * every line of the input to feed_line().
+ *
+ * Input is stdin unless one file name is given as argument; output is
+ * stdout unless --outputfile is given.
+ * Returns 0 on success; exits with EINVAL on bad options or when a file
+ * cannot be opened, and with ENOENT when the tables cannot be loaded.
+ *
+ * NOTE(review): feed_line() only empties its queue when a line yields
+ * null_token, so tokens still queued at end of input look like they are
+ * never written -- confirm whether a final flush is needed here.
+ */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- shortest path segment");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    /* the context is only needed while parsing. */
+    g_option_context_free(context);
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    FacadePhraseTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    /* NOTE(review): nothing here checks whether this load succeeded. */
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+    GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo));
+
+    char * linebuf = NULL;
+    size_t size = 0;
+    ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        /* strip the trailing newline. Use the length getline() returned:
+         * for a line starting with a NUL byte strlen() is 0, and
+         * linebuf[strlen(linebuf) - 1] would index out of bounds. */
+        if ( read > 0 && '\n' == linebuf[read - 1] ) {
+            linebuf[read - 1] = '\0';
+        }
+
+        /* feed_line() takes the table and the index by pointer.
+         * NOTE(review): it declares PhraseLargeTable2 * while phrase_table
+         * is a FacadePhraseTable2, and pointers to the GArray * typedefs
+         * while these are plain GArray * -- the declarations may still
+         * need to be reconciled with this call. */
+        feed_line(&phrase_table, &phrase_index,
+                  unichars, tokeninfos,
+                  linebuf, output);
+    }
+
+    g_array_free(unichars, TRUE);
+    g_array_free(tokeninfos, TRUE);
+    free(linebuf);
+    fclose(input);
+    fclose(output);
+    return 0;
+}
-- cgit