summaryrefslogtreecommitdiffstats
path: root/utils/segment/spseg.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'utils/segment/spseg.cpp')
-rw-r--r--utils/segment/spseg.cpp343
1 files changed, 343 insertions, 0 deletions
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
new file mode 100644
index 0000000..b543cc5
--- /dev/null
+++ b/utils/segment/spseg.cpp
@@ -0,0 +1,343 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010,2013 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+#include <glib.h>
+#include "pinyin_internal.h"
+#include "utils_helper.h"
+
+
+/* Write the one-line command synopsis to standard output. */
+void print_help(){
+    fputs("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n",
+          stdout);
+}
+
+/* Command-line state; filled in when main() parses the options below. */
+static gboolean gen_extra_enter = FALSE;
+static gchar * outputfile = NULL;
+
+/* Option table handed to g_option_context_add_main_entries() in main().
+ * The description of --generate-extra-enter used to be the truncated
+ * text "generate "; it now says what the flag does: main() prints one
+ * extra null-token line after every processed input line. */
+static GOptionEntry entries[] =
+{
+    {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
+    {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate an extra separator line after each input line", NULL},
+    {NULL}
+};
+
+
+/* graph shortest path sentence segment. */
+
+/* Note:
+ * Currently libpinyin only supports ucs4 characters. As this is a
+ * pre-processor tool for raw corpora, it skips every sentence
+ * that contains non-ucs4 characters.
+ */
+
+/* Classification of the run of characters main() is currently collecting.
+ * A run is flushed whenever the classification of the next character
+ * differs from the classification of the run. */
+enum CONTEXT_STATE{
+ /* assigned at the start of each line, before the first character is classified. */
+ CONTEXT_INIT,
+ /* the single-character table search reported SEARCH_OK; the run goes to deal_with_segmentable(). */
+ CONTEXT_SEGMENTABLE,
+ /* the single-character table search did not report SEARCH_OK; the run goes to deal_with_unknown(). */
+ CONTEXT_UNKNOWN
+};
+
+/* One cell of the shortest-path table built by segment(); the cells picked
+ * by backtrace() are also what callers receive as the resulting words. */
+struct SegmentStep{
+    /* token of the word ending at this cell, null_token when none was found. */
+    phrase_token_t m_handle;
+    /* start of the word; points into the caller's character buffer. */
+    ucs4_t * m_phrase;
+    /* length of the word in characters. */
+    size_t m_phrase_len;
+    //use formula W = number of words. Zero handle means one word.
+    guint m_nword;
+    //backtrace information, -1 one step backward.
+    gint m_backward_nstep;
+
+    /* A fresh cell is "not reached yet": no word, maximal cost, no back link. */
+    SegmentStep()
+        : m_handle(null_token),
+          m_phrase(NULL),
+          m_phrase_len(0),
+          m_nword(UINT_MAX),
+          m_backward_nstep(0)
+    {}
+};
+
+/* Forward declaration: segment() ends by calling backtrace(), which is
+ * defined after it. backtrace() fills strings and frees steps. */
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
+
+/**
+ * segment:
+ * @phrase_table: table searched for every candidate sub-phrase.
+ * @phrase_index: used here only to prepare and destroy the token buffer.
+ * @current_ucs4: array of ucs4_t holding the run of characters to split.
+ * @strings: output, array of SegmentStep, one entry per word in run order.
+ *
+ * Shortest-path segmentation: splits the run into as few words as possible
+ * with a dynamic program over the character boundaries 0..len.
+ *
+ * Note: do not free phrase, as it is used by strings (array of segment);
+ * every m_phrase written to @strings points into @current_ucs4's storage.
+ *
+ * Returns: whatever backtrace() returns.
+ */
+bool segment(FacadePhraseTable2 * phrase_table,
+             FacadePhraseIndex * phrase_index,
+             GArray * current_ucs4,
+             GArray * strings /* Array of SegmentStep. */){
+    ucs4_t * phrase = (ucs4_t *)current_ucs4->data;
+    guint phrase_len = current_ucs4->len;
+
+    /* Prepare for shortest path segment dynamic programming:
+     * steps[k] describes the best path found so far that ends a word
+     * right before character k. */
+    GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    SegmentStep step;
+    for ( glong i = 0; i < phrase_len + 1; ++i ){
+        g_array_append_val(steps, step);
+    }
+
+    /* boundary 0 is the start: reached with zero words. */
+    SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0);
+    first_step->m_nword = 0;
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index->prepare_tokens(tokens);
+
+    for ( glong i = 0; i < phrase_len + 1; ++i ) {
+        SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+
+        /* a boundary that was never reached cannot start a word. */
+        if ( UINT_MAX == step_begin->m_nword )
+            continue;
+
+        /* Any word starting at boundary i costs exactly one word more
+         * than the best path reaching i, whatever its length. The cost
+         * used to be incremented once per tried length, which charged a
+         * word by its length and defeated the "fewest words" criterion. */
+        const size_t nword = (size_t) step_begin->m_nword + 1;
+
+        for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
+            size_t len = k - i;
+            ucs4_t * cur_phrase = phrase + i;
+
+            phrase_token_t token = null_token;
+            int result = phrase_table->search(len, cur_phrase, tokens);
+            get_first_token(tokens, token);
+
+            if ( !(result & SEARCH_OK) ){
+                /* not in the table: a single character is still accepted
+                 * as a word without a token, longer candidates are not. */
+                token = null_token;
+                if ( 1 != len )
+                    continue;
+            }
+
+            SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+            if ( nword < step_end->m_nword ) {
+                step_end->m_handle = token;
+                step_end->m_phrase = cur_phrase;
+                step_end->m_phrase_len = len;
+                step_end->m_nword = nword;
+                /* negative offset back to the boundary the word starts at. */
+                step_end->m_backward_nstep = i - k;
+            }
+            /* NOTE(review): presumably a missing SEARCH_CONTINUED means no
+             * longer phrase starts with this prefix - confirm against the
+             * phrase table documentation. */
+            if ( !(result & SEARCH_CONTINUED) )
+                break;
+        }
+    }
+    phrase_index->destroy_tokens(tokens);
+
+    /* backtrace() also releases steps. */
+    return backtrace(steps, phrase_len, strings);
+}
+
+/* Follow the back links from boundary phrase_len down to boundary 0,
+ * storing the visited steps into strings in run order.
+ * steps is freed before returning; the function always returns true. */
+bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
+    g_array_set_size(strings, 0);
+
+    /* collect the chosen steps, last word first. */
+    for ( size_t pos = phrase_len; 0 != pos; ) {
+        SegmentStep * chosen = &g_array_index(steps, SegmentStep, pos);
+        g_array_append_val(strings, *chosen);
+        pos += chosen->m_backward_nstep;
+        /* intended to avoid leaking internal informations. */
+        chosen->m_nword = 0;
+        chosen->m_backward_nstep = 0;
+    }
+
+    /* the words were gathered back to front: swap them into run order. */
+    if ( strings->len > 1 ) {
+        for ( guint lo = 0, hi = strings->len - 1; lo < hi; ++lo, --hi ) {
+            SegmentStep * front = &g_array_index(strings, SegmentStep, lo);
+            SegmentStep * back = &g_array_index(strings, SegmentStep, hi);
+            SegmentStep saved = *front;
+            *front = *back;
+            *back = saved;
+        }
+    }
+
+    g_array_free(steps, TRUE);
+    return true;
+}
+
+/* Segment the collected run and write one "<token> <word>" line per word
+ * to output. Always returns true. */
+bool deal_with_segmentable(FacadePhraseTable2 * phrase_table,
+                           FacadePhraseIndex * phrase_index,
+                           GArray * current_ucs4,
+                           FILE * output){
+
+    /* split the run into words. */
+    GArray * words = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+    segment(phrase_table, phrase_index, current_ucs4, words);
+
+    /* emit every word together with its token. */
+    SegmentStep * word = (SegmentStep *) words->data;
+    SegmentStep * const words_end = word + words->len;
+    for ( ; word != words_end; ++word ) {
+        char * utf8 = g_ucs4_to_utf8
+            ( word->m_phrase, word->m_phrase_len, NULL, NULL, NULL);
+        fprintf(output, "%d %s\n", word->m_handle, utf8);
+        g_free(utf8);
+    }
+
+    g_array_free(words, TRUE);
+    return true;
+}
+
+/* Write the whole collected run as one line carrying null_token.
+ * Always returns true. */
+bool deal_with_unknown(GArray * current_ucs4, FILE * output){
+    ucs4_t * chars = (ucs4_t *) current_ucs4->data;
+    const glong nchars = current_ucs4->len;
+
+    char * utf8 = g_ucs4_to_utf8(chars, nchars, NULL, NULL, NULL);
+    fprintf(output, "%d %s\n", null_token, utf8);
+    g_free(utf8);
+
+    return true;
+}
+
+
+/*
+ * spseg entry point.
+ *
+ * Reads the input line by line (stdin, or the single positional file
+ * argument), cuts every line into runs of characters that are either all
+ * found or all not found by a single-character table search, and writes
+ * the runs to the output (stdout, or the file given with -o): runs of
+ * known characters are segmented, runs of unknown characters are written
+ * as they are. A null-token line is written for empty or rejected lines,
+ * after every line when --generate-extra-enter is set, and once at the
+ * end of the output.
+ *
+ * Exits with EINVAL on bad arguments or unopenable files and with ENOENT
+ * when the system tables cannot be loaded; returns 0 otherwise.
+ */
+int main(int argc, char * argv[]){
+    FILE * input = stdin;
+    FILE * output = stdout;
+
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- shortest path segment");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+    /* the context is only needed while parsing; release it. */
+    g_option_context_free(context);
+
+    if (outputfile) {
+        output = fopen(outputfile, "w");
+        if (NULL == output) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    if (argc > 2) {
+        fprintf(stderr, "too many arguments.\n");
+        exit(EINVAL);
+    }
+
+    if (2 == argc) {
+        input = fopen(argv[1], "r");
+        if (NULL == input) {
+            perror("open file failed");
+            exit(EINVAL);
+        }
+    }
+
+    SystemTableInfo system_table_info;
+
+    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+
+    /* init phrase table */
+    FacadePhraseTable2 phrase_table;
+    /* NOTE(review): chunk is never deleted here; presumably
+     * phrase_table.load() takes ownership - confirm. */
+    MemoryChunk * chunk = new MemoryChunk;
+    chunk->load(SYSTEM_PHRASE_INDEX);
+    phrase_table.load(chunk, NULL);
+
+    /* init phrase index */
+    FacadePhraseIndex phrase_index;
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_table_info();
+
+    if (!load_phrase_index(phrase_files, &phrase_index))
+        exit(ENOENT);
+
+    CONTEXT_STATE state, next_state;
+    /* characters of the run currently being collected. */
+    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(PhraseTokens));
+    phrase_index.prepare_tokens(tokens);
+
+    char * linebuf = NULL; size_t size = 0; ssize_t read;
+    while( (read = getline(&linebuf, &size, input)) != -1 ){
+        /* strip the trailing new-line. The length is checked first so
+         * that an empty buffer (a line starting with a NUL byte) is
+         * never indexed at -1. */
+        const size_t linelen = strlen(linebuf);
+        if ( linelen > 0 && '\n' == linebuf[linelen - 1] ) {
+            linebuf[linelen - 1] = '\0';
+        }
+
+        /* check non-ucs4 characters. */
+        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
+        glong len = 0;
+        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
+        if ( len != num_of_chars ) {
+            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
+            fprintf(output, "%d \n", null_token);
+            /* the converted buffer belongs to this iteration only. */
+            g_free(sentence);
+            continue;
+        }
+
+        /* only new-line persists. */
+        if ( 0 == num_of_chars ) {
+            fprintf(output, "%d \n", null_token);
+            g_free(sentence);
+            continue;
+        }
+
+        /* the first character opens the first run and fixes its state. */
+        state = CONTEXT_INIT;
+        int result = phrase_table.search( 1, sentence, tokens);
+        g_array_append_val( current_ucs4, sentence[0]);
+        if ( result & SEARCH_OK )
+            state = CONTEXT_SEGMENTABLE;
+        else
+            state = CONTEXT_UNKNOWN;
+
+        for ( int i = 1; i < num_of_chars; ++i) {
+            int result = phrase_table.search( 1, sentence + i, tokens);
+            if ( result & SEARCH_OK )
+                next_state = CONTEXT_SEGMENTABLE;
+            else
+                next_state = CONTEXT_UNKNOWN;
+
+            /* same kind of character: the run simply grows. */
+            if ( state == next_state ){
+                g_array_append_val(current_ucs4, sentence[i]);
+                continue;
+            }
+
+            /* the kind changed: flush the finished run. */
+            assert ( state != next_state );
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_table, &phrase_index,
+                                      current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+
+            /* save the current character */
+            g_array_set_size(current_ucs4, 0);
+            g_array_append_val(current_ucs4, sentence[i]);
+            state = next_state;
+        }
+
+        /* flush the last run of the line. */
+        if ( current_ucs4->len ) {
+            /* this seems always true. */
+            if ( state == CONTEXT_SEGMENTABLE )
+                deal_with_segmentable(&phrase_table, &phrase_index,
+                                      current_ucs4, output);
+
+            if ( state == CONTEXT_UNKNOWN )
+                deal_with_unknown(current_ucs4, output);
+            g_array_set_size(current_ucs4, 0);
+        }
+
+        /* print extra enter */
+        if ( gen_extra_enter )
+            fprintf(output, "%d \n", null_token);
+
+        g_free(sentence);
+    }
+    phrase_index.destroy_tokens(tokens);
+
+    /* print enter at file tail */
+    fprintf(output, "%d \n", null_token);
+    g_array_free(current_ucs4, TRUE);
+    free(linebuf);
+    fclose(input);
+    fclose(output);
+    return 0;
+}