diff options
author | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
commit | b78429d78df745dd327b6dada6b9bd71ea5df84e (patch) | |
tree | 82c4625db8674c66d69fd566fce8efc347e3cb3a /utils/segment/mergeseq.cpp | |
download | libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.gz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.xz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.zip |
import libpinyin code
Diffstat (limited to 'utils/segment/mergeseq.cpp')
-rw-r--r-- | utils/segment/mergeseq.cpp | 278 |
1 files changed, 278 insertions, 0 deletions
diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp new file mode 100644 index 0000000..1a26064 --- /dev/null +++ b/utils/segment/mergeseq.cpp @@ -0,0 +1,278 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2013 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdio.h> +#include <locale.h> +#include <string.h> +#include <glib.h> +#include "pinyin_internal.h" +#include "utils_helper.h" + + +void print_help(){ + printf("Usage: mergeseq [-o outputfile] [inputfile]\n"); +} + + +static gboolean gen_extra_enter = FALSE; +static gchar * outputfile = NULL; + +static GOptionEntry entries[] = +{ + {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"}, + {NULL} +}; + + +/* data structure definition. */ +typedef struct{ + phrase_token_t m_token; + gint m_token_len; +} TokenInfo; + + +/* GArray of ucs4 characters. */ +typedef GArray * UnicodeCharVector; +/* GArray of TokenInfo. */ +typedef GArray * TokenInfoVector; + +gint calculate_sequence_length(TokenInfoVector tokeninfos) { + gint len = 0; + + size_t i = 0; + for (i = 0; i < tokeninfos->len; ++i) { + TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, i); + len += token_info->m_token_len; + } + + return len; +} + +/* if merge sequence found, merge and output it, + * if not, just output the first token; + * pop the first token or sequence. + */ +bool merge_sequence(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + UnicodeCharVector unichars, + TokenInfoVector tokeninfos) { + assert(tokeninfos->len > 0); + + bool found = false; + TokenInfo * token_info = NULL; + phrase_token_t token = null_token; + + ucs4_t * ucs4_str = (ucs4_t *) unichars->data; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + + /* search the merge sequence. */ + size_t index = tokeninfos->len; + gint seq_len = calculate_sequence_length(tokeninfos); + while (seq_len > 0) { + /* do phrase table search. */ + int retval = phrase_table->search(seq_len, ucs4_str, tokens); + + if (retval & SEARCH_OK) { + int num = get_first_token(tokens, token); + found = true; + break; + } + + --index; + token_info = &g_array_index(tokeninfos, TokenInfo, index); + seq_len -= token_info->m_token_len; + } + + phrase_index->destroy_tokens(tokens); + + /* push the merged sequence back. */ + if (found) { + /* pop up the origin sequence. */ + g_array_remove_range(tokeninfos, 0, index); + + TokenInfo info; + info.m_token = token; + info.m_token_len = seq_len; + g_array_prepend_val(tokeninfos, info); + } + + return found; +} + +bool pop_first_token(UnicodeCharVector unichars, + TokenInfoVector tokeninfos, + FILE * output) { + ucs4_t * ucs4_str = (ucs4_t *) unichars->data; + + /* pop it. */ + TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, 0); + phrase_token_t token = token_info->m_token; + gint token_len = token_info->m_token_len; + + glong read = 0; + gchar * utf8_str = g_ucs4_to_utf8(ucs4_str, token_len, &read, NULL, NULL); + assert(read == token_len); + fprintf(output, "%d %s\n", token, utf8_str); + g_free(utf8_str); + + g_array_remove_range(unichars, 0, token_len); + g_array_remove_index(tokeninfos, 0); + + return true; +} + +bool feed_line(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, + UnicodeCharVector unichars, + TokenInfoVector tokeninfos, + const char * linebuf, + FILE * output) { + + TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); + + if (null_token == token) { + /* empty the queue. */ + while (0 != tokeninfos->len) { + merge_sequence(phrase_table, phrase_index, unichars, tokeninfos); + pop_first_token(unichars, tokeninfos, output); + } + + assert(0 == unichars->len); + assert(0 == tokeninfos->len); + + /* restore the null token line. */ + fprintf(output, "%s\n", linebuf); + + return false; + } + + PhraseItem item; + phrase_index->get_phrase_item(token, item); + gint len = item.get_phrase_length(); + + TokenInfo info; + info.m_token = token; + info.m_token_len = len; + g_array_append_val(tokeninfos, info); + + ucs4_t buffer[MAX_PHRASE_LENGTH]; + item.get_phrase_string(buffer); + g_array_append_vals(unichars, buffer, len); + + /* probe merge sequence. */ + len = calculate_sequence_length(tokeninfos); + while (len >= MAX_PHRASE_LENGTH) { + merge_sequence(phrase_table, phrase_index, unichars, tokeninfos); + pop_first_token(unichars, tokeninfos, output); + len = calculate_sequence_length(tokeninfos); + } + + return true; +} + + +int main(int argc, char * argv[]){ + FILE * input = stdin; + FILE * output = stdout; + + setlocale(LC_ALL, ""); + + GError * error = NULL; + GOptionContext * context; + + context = g_option_context_new("- merge word sequence"); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed:%s\n", error->message); + exit(EINVAL); + } + + if (outputfile) { + output = fopen(outputfile, "w"); + if (NULL == output) { + perror("open file failed"); + exit(EINVAL); + } + } + + if (argc > 2) { + fprintf(stderr, "too many arguments.\n"); + exit(EINVAL); + } + + if (2 == argc) { + input = fopen(argv[1], "r"); + if (NULL == input) { + perror("open file failed"); + exit(EINVAL); + } + } + + SystemTableInfo system_table_info; + + bool retval = system_table_info.load(SYSTEM_TABLE_INFO); + if (!retval) { + fprintf(stderr, "load table.conf failed.\n"); + exit(ENOENT); + } + + /* init phrase table */ + FacadePhraseTable2 phrase_table; + MemoryChunk * chunk = new MemoryChunk; + chunk->load(SYSTEM_PHRASE_INDEX); + phrase_table.load(chunk, NULL); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + + const pinyin_table_info_t * phrase_files = + system_table_info.get_table_info(); + + if (!load_phrase_index(phrase_files, &phrase_index)) + exit(ENOENT); + + GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); + GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo)); + + char * linebuf = NULL; size_t size = 0; ssize_t read; + while( (read = getline(&linebuf, &size, input)) != -1 ){ + if ( '\n' == linebuf[strlen(linebuf) - 1] ) { + linebuf[strlen(linebuf) - 1] = '\0'; + } + + if (0 == strlen(linebuf)) + continue; + + feed_line(&phrase_table, &phrase_index, + unichars, tokeninfos, + linebuf, output); + } + + g_array_free(unichars, TRUE); + g_array_free(tokeninfos, TRUE); + free(linebuf); + fclose(input); + fclose(output); + return 0; +} |