summaryrefslogtreecommitdiffstats
path: root/src/segment/mmseg.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/segment/mmseg.cpp')
-rw-r--r--src/segment/mmseg.cpp212
1 files changed, 212 insertions, 0 deletions
diff --git a/src/segment/mmseg.cpp b/src/segment/mmseg.cpp
new file mode 100644
index 0000000..6a3d7f7
--- /dev/null
+++ b/src/segment/mmseg.cpp
@@ -0,0 +1,212 @@
+/*
+ * novel-pinyin,
+ * A Simplified Chinese Sentence-Based Pinyin Input Method Engine
+ * Based On Markov Model.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <limits.h>
+#include <locale.h>
+#include <glib.h>
+#include "novel_types.h"
+
+static GHashTable * g_phrases;
+
+struct SegmentStep{
+ phrase_token_t m_handle;
+ char * m_phrase;
+ //use formula W = No. of words. Zero handle means one word.
+ size_t m_nword;
+ //backtracing information, -1 one step backward.
+ gint8 m_backward_nstep;
+};
+
+//read gb_char.table and gbk_char.table
+bool init_phrases(FILE * infile){
+ char pinyin[256];
+ char phrase[256];
+ phrase_token_t token;
+ size_t freq;
+ while (!feof(infile)){
+ fscanf(infile, "%s", pinyin);
+ fscanf(infile, "%s", phrase);
+ fscanf(infile, "%d", &token);
+ fscanf(infile, "%ld", &freq);
+ if ( feof(infile) )
+ break;
+ g_hash_table_insert(g_phrases, g_strdup(phrase),
+ GUINT_TO_POINTER(token));
+ }
+ return true;
+}
+
+bool segment(GHashTable * phrases, // Lookup Phrases
+ const char * phrase,
+ GArray * strings /* Array of const char * */){
+ GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
+ GArray * offsets = g_array_new(TRUE, TRUE, sizeof(size_t));
+ //construct dynamic programming.
+ size_t phrase_length = g_utf8_strlen(phrase, -1);
+ const char * p = phrase;
+ size_t offset = p - phrase;
+ g_array_append_val(offsets, offset);
+ g_array_set_size(steps, phrase_length + 1);
+ for ( size_t i = 0 ; i < phrase_length; ++i){
+ p = g_utf8_next_char(p);
+ offset = p - phrase;
+ g_array_append_val(offsets, offset);
+ }
+ assert( *p == '\0' );
+
+ //initialize segment steps values.
+ for ( size_t i = 0; i < phrase_length + 1; ++i){
+ SegmentStep* step = &g_array_index(steps, SegmentStep, i);
+ step->m_nword = UINT_MAX;
+ }
+
+ for ( size_t i = 0 ; i < phrase_length + 1; ++i){
+ size_t* offset_begin = &g_array_index(offsets, size_t, i);
+ const char * phrase_begin = phrase + *offset_begin;
+ SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
+ size_t nword = step_begin->m_nword;
+ for ( size_t k = i + 1; k < phrase_length + 1; ++k){
+ size_t* offset_end = &g_array_index(offsets, size_t, k);
+ size_t len = *offset_end - *offset_begin;
+ char * cur_phrase = g_strndup(phrase_begin, len);
+ phrase_token_t token;
+ gpointer orig_key, value;
+ gboolean result = g_hash_table_lookup_extended
+ (phrases, cur_phrase, &orig_key, &value);
+ if ( result ){
+ token = GPOINTER_TO_UINT(value);
+ }else{
+ token = 0;
+ if ( 1 != k - i ){ //skip non-phrase
+ g_free(cur_phrase);
+ continue;
+ }
+ }
+ ++nword;
+ SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
+ if ( nword < step_end->m_nword){
+ if ( step_end->m_phrase ){
+ g_free(step_end->m_phrase);
+ step_end->m_phrase = NULL;
+ }
+ step_end->m_handle = token;
+ step_end->m_phrase = cur_phrase;
+ step_end->m_nword = nword;
+ step_end->m_backward_nstep = k - i;
+ }else{
+ g_free(cur_phrase);
+ }
+ }
+ }
+ //backtracing to get the result.
+ size_t cur_step = phrase_length;
+ g_array_set_size(strings, 0);
+ while ( cur_step ){
+ SegmentStep* step_end = &g_array_index(steps, SegmentStep, cur_step);
+ char * str_dup = g_strdup(step_end->m_phrase);
+ g_array_append_val(strings, str_dup);
+ cur_step = cur_step - step_end->m_backward_nstep;
+ }
+
+ for ( size_t i = 0; i < strings->len / 2; ++i){
+ char ** phrase_head = &g_array_index(strings, char * , i);
+ char ** phrase_tail = &g_array_index(strings, char * , strings->len -1 - i);
+ char * phrase_tmp;
+ phrase_tmp = * phrase_head;
+ * phrase_head = * phrase_tail;
+ * phrase_tail = phrase_tmp;
+ }
+
+ //free strndup memory
+ for ( size_t i = 0; i < steps->len; ++i){
+ SegmentStep* step = &g_array_index(steps, SegmentStep, i);
+ if ( step->m_phrase ){
+ g_free(step->m_phrase);
+ step->m_phrase = NULL;
+ }
+ }
+
+ g_array_free(offsets, TRUE);
+ g_array_free(steps, TRUE);
+ return true;
+}
+
+void print_help(){
+ printf("Usage: mmseg [--generate-extra-enter]\n");
+ exit(1);
+}
+
+int main(int argc, char * argv[]){
+ int i = 1;
+ bool gen_extra_enter = false;
+
+ setlocale(LC_ALL,"");
+ while ( i < argc ){
+ if ( strcmp("--help", argv[i] ) == 0) {
+ print_help();
+ }else if ( strcmp("--generate-extra-enter", argv[i]) == 0) {
+ gen_extra_enter = true;
+ }
+ ++i;
+ }
+
+ g_phrases = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+ //init phrase lookup
+ FILE * gb_file = fopen("../../data/gb_char.table", "r");
+ if ( gb_file == NULL ){
+ fprintf(stderr, "can't open gb_char.table!\n");
+ exit(1);
+ }
+ init_phrases(gb_file);
+ fclose(gb_file);
+ FILE * gbk_file = fopen("../../data/gbk_char.table", "r");
+ if ( gbk_file == NULL ){
+ fprintf(stderr, "can't open gbk_char.table!\n");
+ exit(1);
+ }
+ init_phrases(gbk_file);
+ fclose(gbk_file);
+
+ char* linebuf = (char *)malloc ( 1024 * sizeof (char) );
+ size_t size = 1024;
+ while( getline(&linebuf, &size, stdin) ){
+ if ( feof(stdin) )
+ break;
+ linebuf[strlen(linebuf)-1] = '\0';
+
+ GArray * phrases = g_array_new(TRUE, TRUE, sizeof( char *));
+ segment(g_phrases, linebuf, phrases);
+ for ( size_t i = 0; i < phrases->len; ++i){
+ char * phrase = g_array_index(phrases, char *, i);
+ printf("%s\n", phrase);
+ g_free(phrase);
+ }
+ if ( gen_extra_enter )
+ printf("\n");
+ g_array_free(phrases, TRUE);
+ }
+ free(linebuf);
+}