summary refs log tree commit diff stats
path: root/utils
diff options
context:
space:
mode:
Diffstat (limited to 'utils')
-rw-r--r-- utils/segment/Makefile.am 6
-rw-r--r-- utils/segment/mergeseq.cpp 47
2 files changed, 29 insertions, 24 deletions
diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am
index 5315266..579d6e4 100644
--- a/utils/segment/Makefile.am
+++ b/utils/segment/Makefile.am
@@ -24,7 +24,7 @@ INCLUDES = -I$(top_srcdir)/src \
-I$(top_srcdir)/utils \
@GLIB2_CFLAGS@
-noinst_PROGRAMS = spseg ngseg
+noinst_PROGRAMS = spseg ngseg mergeseq
spseg_SOURCES = spseg.cpp
@@ -33,3 +33,7 @@ spseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
ngseg_SOURCES = ngseg.cpp
ngseg_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+mergeseq_SOURCES = mergeseq.cpp
+
+mergeseq_LDADD = ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp
index 1f33ccf..1c724f0 100644
--- a/utils/segment/mergeseq.cpp
+++ b/utils/segment/mergeseq.cpp
@@ -21,13 +21,14 @@
#include <stdio.h>
#include <locale.h>
+#include <string.h>
#include <glib.h>
#include "pinyin_internal.h"
#include "utils_helper.h"
void print_help(){
- printf("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n");
+ printf("Usage: mergeseq [-o outputfile] [inputfile]\n");
}
@@ -37,7 +38,6 @@ static gchar * outputfile = NULL;
static GOptionEntry entries[] =
{
{"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
- {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
{NULL}
};
@@ -54,13 +54,13 @@ typedef GArray * UnicodeCharVector;
/* GArray of TokenInfo. */
typedef GArray * TokenInfoVector;
-gint calculate_sequence_length(TokenInfoVector * tokeninfos) {
+gint calculate_sequence_length(TokenInfoVector tokeninfos) {
gint len = 0;
size_t i = 0;
for (i = 0; i < tokeninfos->len; ++i) {
TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, i);
- len += token_info->len;
+ len += token_info->m_token_len;
}
return len;
@@ -70,18 +70,17 @@ gint calculate_sequence_length(TokenInfoVector * tokeninfos) {
* if not, just output the first token;
* pop the first token or sequence.
*/
-bool merge_sequence(PhraseLargeTable2 * phrase_table,
+bool merge_sequence(FacadePhraseTable2 * phrase_table,
FacadePhraseIndex * phrase_index,
- UnicodeCharVector * unichars,
- TokenInfoVector * tokeninfos) {
+ UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos) {
assert(tokeninfos->len > 0);
bool found = false;
TokenInfo * token_info = NULL;
- gint token_len = 0;
phrase_token_t token = null_token;
- const gunichar * ucs4_str = (const gunichar *)unichars->data;
+ ucs4_t * ucs4_str = (ucs4_t *) unichars->data;
PhraseTokens tokens;
memset(tokens, 0, sizeof(PhraseTokens));
@@ -92,7 +91,6 @@ bool merge_sequence(PhraseLargeTable2 * phrase_table,
gint seq_len = calculate_sequence_length(tokeninfos);
while (seq_len > 0) {
/* do phrase table search. */
- phrase_index->clear_tokens(tokens);
int retval = phrase_table->search(seq_len, ucs4_str, tokens);
if (retval & SEARCH_OK) {
@@ -122,15 +120,15 @@ bool merge_sequence(PhraseLargeTable2 * phrase_table,
return found;
}
-bool pop_first_token(UnicodeCharVector * unichars,
- TokenInfoVector * tokeninfos,
+bool pop_first_token(UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos,
FILE * output) {
- const gunichar * ucs4_str = (const gunichar *)unichars->data;
+ ucs4_t * ucs4_str = (ucs4_t *) unichars->data;
/* pop it. */
- token_info = &g_array_index(tokeninfos, TokenInfo, 0);
- token = token_info->m_token;
- token_len = token_info->m_token_len;
+ TokenInfo * token_info = &g_array_index(tokeninfos, TokenInfo, 0);
+ phrase_token_t token = token_info->m_token;
+ gint token_len = token_info->m_token_len;
glong read = 0;
gchar * utf8_str = g_ucs4_to_utf8(ucs4_str, token_len, &read, NULL, NULL);
@@ -144,10 +142,10 @@ bool pop_first_token(UnicodeCharVector * unichars,
return true;
}
-bool feed_line(PhraseLargeTable2 * phrase_table,
+bool feed_line(FacadePhraseTable2 * phrase_table,
FacadePhraseIndex * phrase_index,
- UnicodeCharVector * unichars,
- TokenInfoVector * tokeninfos,
+ UnicodeCharVector unichars,
+ TokenInfoVector tokeninfos,
const char * linebuf,
FILE * output) {
@@ -167,7 +165,7 @@ bool feed_line(PhraseLargeTable2 * phrase_table,
PhraseItem item;
phrase_index->get_phrase_item(token, item);
- guint8 len = item.get_phrase_length();
+ gint len = item.get_phrase_length();
TokenInfo info;
info.m_token = token;
@@ -179,7 +177,7 @@ bool feed_line(PhraseLargeTable2 * phrase_table,
g_array_append_vals(unichars, buffer, len);
/* probe merge sequence. */
- gint len = calculate_sequence_length(tokeninfos);
+ len = calculate_sequence_length(tokeninfos);
while (len >= MAX_PHRASE_LENGTH) {
merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
pop_first_token(unichars, tokeninfos, output);
@@ -199,7 +197,7 @@ int main(int argc, char * argv[]){
GError * error = NULL;
GOptionContext * context;
- context = g_option_context_new("- shortest path segment");
+ context = g_option_context_new("- merge word sequence");
g_option_context_add_main_entries(context, entries, NULL);
if (!g_option_context_parse(context, &argc, &argv, &error)) {
g_print("option parsing failed:%s\n", error->message);
@@ -259,7 +257,10 @@ int main(int argc, char * argv[]){
linebuf[strlen(linebuf) - 1] = '\0';
}
- feed_line(phrase_table, phrase_index,
+ if (0 == strlen(linebuf))
+ continue;
+
+ feed_line(&phrase_table, &phrase_index,
unichars, tokeninfos,
linebuf, output);
}