summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2013-04-17 15:29:13 +0800
committer Peng Wu <alexepico@gmail.com> 2013-04-17 15:29:13 +0800
commit 2eba4a047a2639150c9448b04dc0af568f9b7538 (patch)
tree 4391e3223dec6571da8a6de38ee07479eab7c4fe
parent fb15ae7c60ad7806d6eb3aade05d5d02ec435f0e (diff)
download libpinyin-2eba4a047a2639150c9448b04dc0af568f9b7538.tar.gz
libpinyin-2eba4a047a2639150c9448b04dc0af568f9b7538.tar.xz
libpinyin-2eba4a047a2639150c9448b04dc0af568f9b7538.zip
write main function
-rw-r--r-- utils/segment/mergeseq.cpp 148
1 files changed, 123 insertions, 25 deletions
diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp
index 9064b45..1f33ccf 100644
--- a/utils/segment/mergeseq.cpp
+++ b/utils/segment/mergeseq.cpp
@@ -19,7 +19,11 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
+#include <stdio.h>
+#include <locale.h>
+#include <glib.h>
#include "pinyin_internal.h"
+#include "utils_helper.h"
void print_help(){
@@ -27,6 +31,17 @@ void print_help(){
}
+static gboolean gen_extra_enter = FALSE; /* set by --generate-extra-enter; not read anywhere in the code visible in this diff. */
+static gchar * outputfile = NULL; /* set by -o/--outputfile; when non-NULL, main() opens it for writing instead of using stdout. */
+
+static GOptionEntry entries[] = /* command line option table registered with the GOptionContext in main(). */
+{
+ {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+ {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL}, /* NOTE(review): the description "generate " looks truncated -- confirm intended help text. */
+ {NULL} /* terminating entry of the option table. */
+};
+
+
/* data structure definition. */
typedef struct{
phrase_token_t m_token;
@@ -39,12 +54,12 @@ typedef GArray * UnicodeCharVector;
/* GArray of TokenInfo. */
typedef GArray * TokenInfoVector;
-gint calculate_sequence_length(TokenInfoVector * tokens) {
+gint calculate_sequence_length(TokenInfoVector * tokeninfos) {
gint len = 0;
size_t i = 0;
- for (i = 0; i < tokens->len; ++i) {
- TokenInfo * token_info = &g_array_index(tokens, TokenInfo, i);
+ for (i = 0; i < tokeninfos->len; ++i) {
+ TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, i);
len += token_info->len;
}
@@ -58,8 +73,8 @@ gint calculate_sequence_length(TokenInfoVector * tokens) {
bool merge_sequence(PhraseLargeTable2 * phrase_table,
FacadePhraseIndex * phrase_index,
UnicodeCharVector * unichars,
- TokenInfoVector * tokens) {
- assert(tokens->len > 0);
+ TokenInfoVector * tokeninfos) {
+ assert(tokeninfos->len > 0);
bool found = false;
TokenInfo * token_info = NULL;
@@ -73,8 +88,8 @@ bool merge_sequence(PhraseLargeTable2 * phrase_table,
phrase_index->prepare_tokens(tokens);
/* search the merge sequence. */
- size_t index = tokens->len;
- gint seq_len = calculate_sequence_length(tokens);
+ size_t index = tokeninfos->len;
+ gint seq_len = calculate_sequence_length(tokeninfos);
while (seq_len > 0) {
/* do phrase table search. */
phrase_index->clear_tokens(tokens);
@@ -87,7 +102,7 @@ bool merge_sequence(PhraseLargeTable2 * phrase_table,
}
--index;
- token_info = &g_array_index(tokens, TokenInfo, index);
+ token_info = &g_array_index(tokeninfos, TokenInfo, index);
seq_len -= token_info->m_token_len;
}
@@ -96,24 +111,24 @@ bool merge_sequence(PhraseLargeTable2 * phrase_table,
/* push the merged sequence back. */
if (found) {
/* pop up the origin sequence. */
- g_array_remove_range(tokens, 0, index);
+ g_array_remove_range(tokeninfos, 0, index);
TokenInfo info;
info.m_token = token;
info.m_token_len = seq_len;
- g_array_prepend_val(tokens, info);
+ g_array_prepend_val(tokeninfos, info);
}
return found;
}
bool pop_first_token(UnicodeCharVector * unichars,
- TokenInfoVector * tokens,
+ TokenInfoVector * tokeninfos,
FILE * output) {
const gunichar * ucs4_str = (const gunichar *)unichars->data;
/* pop it. */
- token_info = &g_array_index(tokens, TokenInfo, 0);
+ token_info = &g_array_index(tokeninfos, TokenInfo, 0);
token = token_info->m_token;
token_len = token_info->m_token_len;
@@ -124,7 +139,7 @@ bool pop_first_token(UnicodeCharVector * unichars,
g_free(utf8_str);
g_array_remove_range(unichars, 0, token_len);
- g_array_remove_index(tokens, 0);
+ g_array_remove_index(tokeninfos, 0);
return true;
}
@@ -132,21 +147,21 @@ bool pop_first_token(UnicodeCharVector * unichars,
bool feed_line(PhraseLargeTable2 * phrase_table,
FacadePhraseIndex * phrase_index,
UnicodeCharVector * unichars,
- TokenInfoVector * tokens,
- const char * line,
+ TokenInfoVector * tokeninfos,
+ const char * linebuf,
FILE * output) {
- TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, line);
+ TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf);
if (null_token == token) {
/* empty the queue. */
- while (0 != tokens->len) {
- merge_sequence(phrase_table, phrase_index, unichars, tokens);
- pop_first_token(unichars, tokens, output);
+ while (0 != tokeninfos->len) {
+ merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+ pop_first_token(unichars, tokeninfos, output);
}
assert(0 == unichars->len);
- assert(0 == tokens->len);
+ assert(0 == tokeninfos->len);
return false;
}
@@ -157,19 +172,102 @@ bool feed_line(PhraseLargeTable2 * phrase_table,
TokenInfo info;
info.m_token = token;
info.m_token_len = len;
- g_array_append_val(tokens, info);
+ g_array_append_val(tokeninfos, info);
ucs4_t buffer[MAX_PHRASE_LENGTH];
item.get_phrase_string(buffer);
g_array_append_vals(unichars, buffer, len);
/* probe merge sequence. */
- gint len = calculate_sequence_length(tokens);
+ gint len = calculate_sequence_length(tokeninfos);
while (len >= MAX_PHRASE_LENGTH) {
- merge_sequence(phrase_table, phrase_index, unichars, tokens);
- pop_first_token(unichars, tokens, output);
- len = calculate_sequence_length(tokens);
+ merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+ pop_first_token(unichars, tokeninfos, output);
+ len = calculate_sequence_length(tokeninfos);
}
return true;
}
+
+
+/**
+ * main:
+ * @argc: number of command line arguments.
+ * @argv: command line arguments; after option parsing at most one
+ *        positional argument (the input file) is accepted.
+ *
+ * Reads segmented lines from the input (stdin, or the file named on the
+ * command line), passes every line to feed_line(), and writes the result
+ * to the output (stdout, or the file given with -o/--outputfile).
+ *
+ * Returns: 0 on success; exits with EINVAL on bad options or files that
+ * cannot be opened, ENOENT when the system tables cannot be loaded, and
+ * returns EIO when closing the output stream fails.
+ */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- merge word sequence");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        g_error_free(error);
+        g_option_context_free(context);
+        exit(EINVAL);
+    }
+    /* the context is only needed while parsing the command line. */
+    g_option_context_free(context);
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    /* NOTE(review): feed_line() is declared with a PhraseLargeTable2 *
+       parameter while this object is a FacadePhraseTable2 -- one of the
+       two declarations has to change before this compiles. */
+    FacadePhraseTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    /* NOTE(review): the result of chunk->load() is ignored here --
+       confirm whether it reports failure and should abort like above. */
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+    GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo));
+
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        /* strip the trailing newline using the length getline() reported;
+           indexing with strlen() - 1 would underflow on a line that
+           starts with a NUL byte. */
+        if ( read > 0 && '\n' == linebuf[read - 1] ) {
+            linebuf[read - 1] = '\0';
+        }
+
+        /* feed_line() takes the table and index by pointer. */
+        feed_line(&phrase_table, &phrase_index,
+                  unichars, tokeninfos,
+                  linebuf, output);
+    }
+
+    /* NOTE(review): tokens still queued in tokeninfos at end of input are
+       dropped here; feed_line() only appears to flush the queue when a
+       line parses to null_token -- confirm whether a final flush is
+       needed. */
+
+    g_array_free(unichars, TRUE);
+    g_array_free(tokeninfos, TRUE);
+    free(linebuf);
+    fclose(input);
+    /* closing flushes buffered output, so a failure here means lost data. */
+    if (0 != fclose(output)) {
+        perror("close output failed");
+        return EIO;
+    }
+    return 0;
+}