5 files changed, 940 insertions, 0 deletions
diff --git a/utils/segment/CMakeLists.txt b/utils/segment/CMakeLists.txt
new file mode 100644
index 0000000..82e4deb
--- /dev/null
+++ b/utils/segment/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Corpus segmentation tools; keep the targets in sync with Makefile.am.
+add_executable(spseg spseg.cpp)
+target_link_libraries(
+    spseg
+    libpinyin
+)
+
+add_executable(ngseg ngseg.cpp)
+target_link_libraries(
+    ngseg
+    libpinyin
+)
+
+# mergeseq is built by Makefile.am as well, so build it here too.
+add_executable(mergeseq mergeseq.cpp)
+target_link_libraries(
+    mergeseq
+    libpinyin
+)
+\ No newline at end of file
diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am
new file mode 100644
index 0000000..579d6e4
--- /dev/null
+++ b/utils/segment/Makefile.am
@@ -0,0 +1,39 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2007 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+MAINTAINERCLEANFILES    = Makefile.in
+## AM_CPPFLAGS replaces INCLUDES, which automake deprecates for these flags.
+AM_CPPFLAGS		= -I$(top_srcdir)/src \
+			  -I$(top_srcdir)/src/include \
+			  -I$(top_srcdir)/src/storage \
+			  -I$(top_srcdir)/src/lookup \
+			  -I$(top_srcdir)/utils \
+			  @GLIB2_CFLAGS@
+
+noinst_PROGRAMS		= spseg ngseg mergeseq
+## Each tool is one source file linked against the internal library.
+spseg_SOURCES		= spseg.cpp
+
+spseg_LDADD		= ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+ngseg_SOURCES		= ngseg.cpp
+
+ngseg_LDADD		= ../../src/libpinyin_internal.la @GLIB2_LIBS@
+
+mergeseq_SOURCES	= mergeseq.cpp
+
+mergeseq_LDADD		= ../../src/libpinyin_internal.la @GLIB2_LIBS@
diff --git a/utils/segment/mergeseq.cpp b/utils/segment/mergeseq.cpp
new file mode 100644
index 0000000..1a26064
--- /dev/null
+++ b/utils/segment/mergeseq.cpp
@@ -0,0 +1,278 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2013 Peng Wu <alexepico@gmail.com>
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <string.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){   /* print the usage synopsis on stdout */
+    fputs("Usage: mergeseq [-o outputfile] [inputfile]\n", stdout);
+}
+
+
+/* Path given with -o/--outputfile; NULL keeps the default (stdout). */
+static gchar * outputfile = NULL;
+/* Options accepted by mergeseq; the table ends with a NULL entry. */
+static GOptionEntry entries[] =
+{
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+    {NULL}
+};
+
+
+/* One queued word: its phrase token plus its length in ucs4 characters. */
+struct TokenInfo {
+    phrase_token_t m_token;     /* token of the word */
+    gint m_token_len;           /* number of characters in the word */
+};
+
+/* Both queues below are plain GArray pointers. */
+/* Elements are ucs4 characters. */
+typedef GArray * UnicodeCharVector;
+/* Elements are TokenInfo records. */
+typedef GArray * TokenInfoVector;
+
+gint calculate_sequence_length(TokenInfoVector tokeninfos) {
+    /* Add up the character length of every queued TokenInfo. */
+    gint total = 0;
+    guint pos = 0;
+
+    while (pos < tokeninfos->len) {
+        total += g_array_index(tokeninfos, TokenInfo, pos).m_token_len;
+        ++pos;
+    }
+    return total;
+}
+
+/* Try to merge the tokens at the head of the queue into one phrase.
+ * The whole queued character sequence is searched in the phrase table;
+ * on a miss the last token is dropped from the candidate and the search
+ * is retried.  On a hit the matched leading tokens in tokeninfos are
+ * replaced by one TokenInfo holding the first token found.  Nothing is
+ * written out or popped here; unichars is only read.
+ * Returns true when a phrase was found, false otherwise.
+ */
+bool merge_sequence(FacadePhraseTable2 * phrase_table,
+                    FacadePhraseIndex * phrase_index,
+                    UnicodeCharVector unichars,
+                    TokenInfoVector tokeninfos) {
+    assert(tokeninfos->len > 0);
+
+    bool found = false;
+    phrase_token_t token = null_token;
+    ucs4_t * ucs4_str = (ucs4_t *) unichars->data;
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index->prepare_tokens(tokens);
+
+    /* index counts the leading tokens covered by the current candidate. */
+    size_t index = tokeninfos->len;
+    gint seq_len = calculate_sequence_length(tokeninfos);
+    while (seq_len > 0) {
+        int retval = phrase_table->search(seq_len, ucs4_str, tokens);
+        if (retval & SEARCH_OK) {
+            /* only the token is needed; the returned value is not used. */
+            get_first_token(tokens, token);
+            found = true;
+            break;
+        }
+
+        /* shorten the candidate by its last token and retry. */
+        --index;
+        seq_len -= g_array_index(tokeninfos, TokenInfo, index).m_token_len;
+    }
+
+    phrase_index->destroy_tokens(tokens);
+
+    if (found) {
+        /* swap the matched tokens for the single merged entry. */
+        g_array_remove_range(tokeninfos, 0, index);
+
+        TokenInfo info;
+        info.m_token = token;
+        info.m_token_len = seq_len;
+        g_array_prepend_val(tokeninfos, info);
+    }
+
+    return found;
+}
+
+bool pop_first_token(UnicodeCharVector unichars,
+                     TokenInfoVector tokeninfos,
+                     FILE * output) {
+    /* Emit the first queued token as a "<token> <utf8 text>" line, then
+     * drop it and its characters from both queues.  Always returns true. */
+    const TokenInfo head = g_array_index(tokeninfos, TokenInfo, 0);
+
+    /* the token's text is the first m_token_len queued characters. */
+    glong consumed = 0;
+    gchar * text = g_ucs4_to_utf8((ucs4_t *) unichars->data,
+                                  head.m_token_len, &consumed, NULL, NULL);
+    assert(consumed == head.m_token_len);
+
+    fprintf(output, "%d %s\n", head.m_token, text);
+    g_free(text);
+
+    g_array_remove_index(tokeninfos, 0);
+    g_array_remove_range(unichars, 0, head.m_token_len);
+
+    return true;
+}
+
+bool feed_line(FacadePhraseTable2 * phrase_table,
+               FacadePhraseIndex * phrase_index,
+               UnicodeCharVector unichars,
+               TokenInfoVector tokeninfos,
+               const char * linebuf,
+               FILE * output) {
+    /* Queue the token of one segmented line; returns false for a null-token line. */
+    TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf);
+    /* NOTE(review): `token` is presumably declared by the macro above -- confirm. */
+    if (null_token == token) {
+        /* a null-token line drains the queue: merge and write out everything. */
+        while (0 != tokeninfos->len) {
+            merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+            pop_first_token(unichars, tokeninfos, output);
+        }
+
+        assert(0 == unichars->len);
+        assert(0 == tokeninfos->len);
+
+        /* pass the null-token line through unchanged. */
+        fprintf(output, "%s\n", linebuf);
+
+        return false;
+    }
+
+    PhraseItem item;
+    phrase_index->get_phrase_item(token, item);
+    gint len = item.get_phrase_length();
+    /* remember the token together with its length ... */
+    TokenInfo info;
+    info.m_token = token;
+    info.m_token_len = len;
+    g_array_append_val(tokeninfos, info);
+    /* ... and append its characters to the character queue. */
+    ucs4_t buffer[MAX_PHRASE_LENGTH];
+    item.get_phrase_string(buffer);
+    g_array_append_vals(unichars, buffer, len);
+
+    /* flush from the front until fewer than MAX_PHRASE_LENGTH characters stay queued. */
+    len = calculate_sequence_length(tokeninfos);
+    while (len >= MAX_PHRASE_LENGTH) {
+        merge_sequence(phrase_table, phrase_index, unichars, tokeninfos);
+        pop_first_token(unichars, tokeninfos, output);
+        len = calculate_sequence_length(tokeninfos);
+    }
+
+    return true;
+}
+
+
+/* Entry point: merge adjacent words of a segmented corpus (file argument or stdin). */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context = g_option_context_new("- merge word sequence");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    g_option_context_free(context);   /* not needed once parsing is done */
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+    if (!system_table_info.load(SYSTEM_TABLE_INFO)) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    FacadePhraseTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    GArray * unichars = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+    GArray * tokeninfos = g_array_new(TRUE, TRUE, sizeof(TokenInfo));
+
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        /* strip the newline via getline's length: never indexes linebuf[-1]. */
+        if ( read > 0 && '\n' == linebuf[read - 1] )
+            linebuf[read - 1] = '\0';
+        if (0 == strlen(linebuf))
+            continue;
+        feed_line(&phrase_table, &phrase_index,
+                  unichars, tokeninfos,
+                  linebuf, output);
+    }
+
+    /* input without a final null-token line: flush what is still queued. */
+    while (0 != tokeninfos->len) {
+        merge_sequence(&phrase_table, &phrase_index, unichars, tokeninfos);
+        pop_first_token(unichars, tokeninfos, output);
+    }
+
+    g_array_free(unichars, TRUE);
+    g_array_free(tokeninfos, TRUE);
+    free(linebuf);
+    fclose(input);
+    fclose(output);
+    return 0;
+}
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
new file mode 100644
index 0000000..03fe5b4
--- /dev/null
+++ b/utils/segment/ngseg.cpp
@@ -0,0 +1,261 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){   /* print the usage synopsis on stdout */
+    fputs("Usage: ngseg [--generate-extra-enter]  [-o outputfile] [inputfile]\n", stdout);
+}
+
+
+/* Set by --generate-extra-enter: print a null-token line after each segmented line. */
+static gboolean gen_extra_enter = FALSE;
+/* Path given with -o/--outputfile; NULL keeps the default (stdout). */
+static gchar * outputfile = NULL;
+static GOptionEntry entries[] = {   /* option table, ends with a NULL entry */
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+    {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "print an extra null-token line after every segmented input line", NULL},
+    {NULL}
+};
+
+
+/* n-gram based sentence segment. */
+
+/* Note:
+ * Currently libpinyin only supports ucs4 characters.
+ * This is a pre-processing tool for raw corpora; characters that are
+ * not found in the phrase table are written out with the null token.
+ */
+
+/* TODO:
+ * Try to add punctuation mark and English support,
+ * such as ',', '.', '?', '!', <English>, and other punctuation.
+ */
+
+enum CONTEXT_STATE{          /* kind of character run being collected */
+    CONTEXT_INIT = 0,        /* nothing classified yet */
+    CONTEXT_SEGMENTABLE = 1, /* characters found in the phrase table */
+    CONTEXT_UNKNOWN = 2      /* characters missing from the phrase table */
+};
+
+/* Ask phrase_lookup for the best match of the run and print its UTF-8 form;
+ * returns false (after a diagnostic on stderr) when there is no result. */
+bool deal_with_segmentable(PhraseLookup * phrase_lookup,
+                           GArray * current_ucs4,
+                           FILE * output){
+    char * result_string = NULL;
+    MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+    phrase_lookup->get_best_match(current_ucs4->len,
+                                  (ucs4_t *) current_ucs4->data, results);
+    phrase_lookup->convert_to_utf8(results, result_string);
+    const bool segmented = (NULL != result_string);
+    if (segmented) {
+        fprintf(output, "%s\n", result_string);
+    } else {
+        char * tmp_string = g_ucs4_to_utf8
+            ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
+              NULL, NULL, NULL);
+        fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
+                tmp_string);
+        g_free(tmp_string);   /* this copy used to be leaked */
+    }
+    g_array_free(results, TRUE);
+    g_free(result_string);
+    return segmented;
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+    /* Write the run as one "<null_token> <utf8 text>" line; always true. */
+    ucs4_t * run = (ucs4_t *) current_ucs4->data;
+    char * utf8 = g_ucs4_to_utf8(run, current_ucs4->len, NULL, NULL, NULL);
+    fprintf(output, "%d %s\n", null_token, utf8);
+    g_free(utf8);
+    return true;
+}
+
+
+/* Entry point: n-gram segmentation of every input line (file argument or stdin). */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context = g_option_context_new("- n-gram segment");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    g_option_context_free(context);   /* not needed once parsing is done */
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    FacadePhraseTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    /* init bi-gram */
+    Bigram system_bigram;
+    system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
+    Bigram user_bigram;
+
+    gfloat lambda = system_table_info.get_lambda();
+
+    /* init phrase lookup */
+    PhraseLookup phrase_lookup(lambda,
+                               &phrase_table, &phrase_index,
+                               &system_bigram, &user_bigram);
+
+
+    CONTEXT_STATE state, next_state;
+    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index.prepare_tokens(tokens);
+
+    /* split the sentence */
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        /* strip the newline via getline's length: never indexes linebuf[-1]. */
+        if ( read > 0 && '\n' == linebuf[read - 1] )
+            linebuf[read - 1] = '\0';
+
+        /* check non-ucs4 characters */
+        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+        glong len = 0;
+        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+        if ( len != num_of_chars ) {
+            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+            fprintf(output, "%d \n", null_token);
+            continue;
+        }
+
+        /* only new-line persists. */
+        if ( 0  == num_of_chars ) {
+            fprintf(output, "%d \n", null_token);
+            g_free(sentence);   /* the empty conversion is still allocated */
+            continue;
+        }
+
+        int result = phrase_table.search( 1, sentence, tokens);
+        g_array_append_val( current_ucs4, sentence[0]);
+        if ( result & SEARCH_OK )
+            state = CONTEXT_SEGMENTABLE;
+        else
+            state = CONTEXT_UNKNOWN;
+
+        for ( int i = 1; i < num_of_chars; ++i) {
+            int result = phrase_table.search( 1, sentence + i, tokens);
+            if ( result & SEARCH_OK )
+                next_state = CONTEXT_SEGMENTABLE;
+            else
+                next_state = CONTEXT_UNKNOWN;
+
+            if ( state == next_state ){
+                g_array_append_val(current_ucs4, sentence[i]);
+                continue;
+            }
+
+            assert ( state != next_state );
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_lookup, current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+
+            /* save the current character */
+            g_array_set_size(current_ucs4, 0);
+            g_array_append_val(current_ucs4, sentence[i]);
+            state = next_state;
+        }
+
+        if ( current_ucs4->len ) {
+            /* this seems always true. */
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_lookup, current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+            g_array_set_size(current_ucs4, 0);
+        }
+
+        /* print extra enter */
+        if ( gen_extra_enter )
+            fprintf(output, "%d \n", null_token);
+
+        g_free(sentence);
+    }
+    phrase_index.destroy_tokens(tokens);
+
+    /* print enter at file tail */
+    fprintf(output, "%d \n", null_token);
+    g_array_free(current_ucs4, TRUE);
+    free(linebuf);
+    fclose(input);
+    fclose(output);
+    return 0;
+}
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
new file mode 100644
index 0000000..b543cc5
--- /dev/null
+++ b/utils/segment/spseg.cpp
@@ -0,0 +1,343 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010,2013 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+void print_help(){   /* print the usage synopsis on stdout */
+    fputs("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n", stdout);
+}
+
+/* Set by --generate-extra-enter: print a null-token line after each segmented line. */
+static gboolean gen_extra_enter = FALSE;
+/* Path given with -o/--outputfile; NULL keeps the default (stdout). */
+static gchar * outputfile = NULL;
+static GOptionEntry entries[] = {   /* option table, ends with a NULL entry */
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+    {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "print an extra null-token line after every segmented input line", NULL},
+    {NULL}
+};
+
+
+/* graph shortest path sentence segment. */
+
+/* Note:
+ * Currently libpinyin only supports ucs4 characters. As this is a
+ * pre-processor tool for raw corpus, it will skip all sentences
+ * which contain non-ucs4 characters.
+ */
+
+enum CONTEXT_STATE{          /* kind of character run being collected */
+    CONTEXT_INIT = 0,        /* nothing classified yet */
+    CONTEXT_SEGMENTABLE = 1, /* characters found in the phrase table */
+    CONTEXT_UNKNOWN = 2      /* characters missing from the phrase table */
+};
+
+/* One node of the shortest-path table, also used as one output segment. */
+struct SegmentStep{
+    phrase_token_t m_handle;   /* token of the phrase ending here, or null_token */
+    ucs4_t * m_phrase;         /* start of that phrase inside the caller's buffer */
+    size_t m_phrase_len;       /* phrase length in characters */
+    guint m_nword;             /* W = number of words; a zero handle is one word */
+    gint m_backward_nstep;     /* backtrace offset, -1 is one step backward */
+
+    /* A fresh node has no phrase and the largest possible word count. */
+    SegmentStep()
+        : m_handle(null_token),
+          m_phrase(NULL),
+          m_phrase_len(0),
+          m_nword(UINT_MAX),
+          m_backward_nstep(0)
+    {}
+};
+
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
+
+/* Shortest-path split of current_ucs4 into words; fills strings in reading order.
+ * Note: do not free phrase, as it is used by strings (array of segment). */
+bool segment(FacadePhraseTable2 * phrase_table,
+             FacadePhraseIndex * phrase_index,
+             GArray * current_ucs4,
+             GArray * strings /* Array of SegmentStep. */){
+    ucs4_t * phrase = (ucs4_t *)current_ucs4->data;
+    guint phrase_len = current_ucs4->len;
+
+    /* steps[k] holds the best way found so far to cover the first k characters. */
+    GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    SegmentStep step;
+    for ( glong i = 0; i < phrase_len + 1; ++i ){
+        g_array_append_val(steps, step);
+    }
+
+    SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0);
+    first_step->m_nword = 0;
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index->prepare_tokens(tokens);
+
+    for ( glong i = 0; i < phrase_len + 1; ++i ) {
+        SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+        /* any word starting at i costs one word more than i (was a running counter). */
+        const size_t nword = (size_t) step_begin->m_nword + 1;
+        for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
+            size_t len = k - i;
+            ucs4_t * cur_phrase = phrase + i;
+            /* NOTE(review): tokens is reused across searches -- confirm it needs no reset. */
+            phrase_token_t token = null_token;
+            int result = phrase_table->search(len, cur_phrase, tokens);
+            get_first_token(tokens, token);
+
+            if ( !(result & SEARCH_OK) ){
+                token = null_token;
+                if ( 1 != len )
+                    continue;
+            }
+            SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+            if ( nword < step_end->m_nword ) {
+                step_end->m_handle = token;
+                step_end->m_phrase = cur_phrase;
+                step_end->m_phrase_len = len;
+                step_end->m_nword = nword;
+                step_end->m_backward_nstep = i - k;
+            }
+            if ( !(result & SEARCH_CONTINUED) )
+                break;
+        }
+    }
+    phrase_index->destroy_tokens(tokens);
+
+    return backtrace(steps, phrase_len, strings);
+}
+
+/* Walk the step table backwards from phrase_len, collecting one entry per
+ * word into strings (in reading order).  Frees steps; always returns true. */
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
+    g_array_set_size(strings, 0);
+
+    /* follow the backward links; this yields the words last-to-first. */
+    for ( size_t cur_step = phrase_len; cur_step; ) {
+        SegmentStep word = g_array_index(steps, SegmentStep, cur_step);
+        cur_step = cur_step + word.m_backward_nstep;
+        /* clear bookkeeping in the returned copy (it used to be cleared in steps only). */
+        word.m_nword = 0; word.m_backward_nstep = 0;
+        g_array_append_val(strings, word);
+    }
+
+    /* the walk produced the words in reverse, so flip the array in place. */
+    for ( size_t head = 0, tail = strings->len; head + 1 < tail; ++head ) {
+        --tail;
+        SegmentStep tmp = g_array_index(strings, SegmentStep, head);
+        g_array_index(strings, SegmentStep, head) =
+            g_array_index(strings, SegmentStep, tail);
+        g_array_index(strings, SegmentStep, tail) = tmp;
+    }
+
+    g_array_free(steps, TRUE);
+    return true;
+}
+
+bool deal_with_segmentable(FacadePhraseTable2 * phrase_table,
+                           FacadePhraseIndex * phrase_index,
+                           GArray * current_ucs4,
+                           FILE * output){
+    /* Segment the run and print one "<token> <utf8 phrase>" line per word;
+     * always returns true. */
+    GArray * words = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    segment(phrase_table, phrase_index, current_ucs4, words);
+
+    const SegmentStep * word = (const SegmentStep *) words->data;
+    for ( guint left = words->len; left > 0; --left, ++word ) {
+        char * utf8 = g_ucs4_to_utf8( word->m_phrase, word->m_phrase_len,
+                                      NULL, NULL, NULL);
+        fprintf(output, "%d %s\n", word->m_handle, utf8);
+        g_free(utf8);
+    }
+
+    g_array_free(words, TRUE);
+    return true;
+}
+
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+    /* Write the run as one "<null_token> <utf8 text>" line; always true. */
+    ucs4_t * run = (ucs4_t *) current_ucs4->data;
+    char * utf8 = g_ucs4_to_utf8(run, current_ucs4->len, NULL, NULL, NULL);
+    fprintf(output, "%d %s\n", null_token, utf8);
+    g_free(utf8);
+    return true;
+}
+
+
+/* Entry point: shortest-path segmentation of every input line (file argument or stdin). */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context = g_option_context_new("- shortest path segment");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    g_option_context_free(context);   /* not needed once parsing is done */
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    FacadePhraseTable2 phrase_table;
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    CONTEXT_STATE state, next_state;
+    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index.prepare_tokens(tokens);
+
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        /* strip the newline via getline's length: never indexes linebuf[-1]. */
+        if ( read > 0 && '\n' == linebuf[read - 1] )
+            linebuf[read - 1] = '\0';
+
+        /* check non-ucs4 characters. */
+        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+        glong len = 0;
+        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+        if ( len != num_of_chars ) {
+            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+            fprintf(output, "%d \n", null_token);
+            continue;
+        }
+
+        /* only new-line persists. */
+        if ( 0  == num_of_chars ) {
+            fprintf(output, "%d \n", null_token);
+            g_free(sentence);   /* the empty conversion is still allocated */
+            continue;
+        }
+
+        int result = phrase_table.search( 1, sentence, tokens);
+        g_array_append_val( current_ucs4, sentence[0]);
+        if ( result & SEARCH_OK )
+            state = CONTEXT_SEGMENTABLE;
+        else
+            state = CONTEXT_UNKNOWN;
+
+        for ( int i = 1; i < num_of_chars; ++i) {
+            int result = phrase_table.search( 1, sentence + i, tokens);
+            if ( result & SEARCH_OK )
+                next_state = CONTEXT_SEGMENTABLE;
+            else
+                next_state = CONTEXT_UNKNOWN;
+
+            if ( state == next_state ){
+                g_array_append_val(current_ucs4, sentence[i]);
+                continue;
+            }
+
+            assert ( state != next_state );
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_table, &phrase_index,
+                                      current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+
+            /* save the current character */
+            g_array_set_size(current_ucs4, 0);
+            g_array_append_val(current_ucs4, sentence[i]);
+            state = next_state;
+        }
+
+        if ( current_ucs4->len ) {
+            /* this seems always true. */
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_table, &phrase_index,
+                                      current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+            g_array_set_size(current_ucs4, 0);
+        }
+
+        /* print extra enter */
+        if ( gen_extra_enter )
+            fprintf(output, "%d \n", null_token);
+
+        g_free(sentence);
+    }
+    phrase_index.destroy_tokens(tokens);
+
+    /* print enter at file tail */
+    fprintf(output, "%d \n", null_token);
+    g_array_free(current_ucs4, TRUE);
+    free(linebuf);
+    fclose(input);
+    fclose(output);
+    return 0;
+}