summaryrefslogtreecommitdiffstats
path: root/utils/segment/ngseg.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'utils/segment/ngseg.cpp')
-rw-r--r--utils/segment/ngseg.cpp261
1 files changed, 0 insertions, 261 deletions
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
deleted file mode 100644
index eb7a12d..0000000
--- a/utils/segment/ngseg.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * libzhuyin
- * Library to deal with zhuyin.
- *
- * Copyright (C) 2010 Peng Wu
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <locale.h>
-#include "zhuyin_internal.h"
-#include "utils_helper.h"
-
-
-void print_help(){
- printf("Usage: ngseg [--generate-extra-enter] [-o outputfile] [inputfile]\n");
-}
-
-
-static gboolean gen_extra_enter = FALSE;
-static gchar * outputfile = NULL;
-
-static GOptionEntry entries[] =
-{
- {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
- {"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
- {NULL}
-};
-
-
-/* n-gram based sentence segment. */
-
-/* Note:
- * Currently libpinyin supports ucs4 characters.
- * This is a pre-processor tool for raw corpus,
- * and skips non-Chinese characters.
- */
-
-/* TODO:
- * Try to add punctuation mark and english support,
- * such as ',', '.', '?', '!', <english>, and other punctuations.
- */
-
-enum CONTEXT_STATE{
- CONTEXT_INIT,
- CONTEXT_SEGMENTABLE,
- CONTEXT_UNKNOWN
-};
-
-bool deal_with_segmentable(PhraseLookup * phrase_lookup,
- GArray * current_ucs4,
- FILE * output){
- char * result_string = NULL;
- MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
- phrase_lookup->get_best_match(current_ucs4->len,
- (ucs4_t *) current_ucs4->data, results);
-
- phrase_lookup->convert_to_utf8(results, result_string);
-
- if (result_string) {
- fprintf(output, "%s\n", result_string);
- } else {
- char * tmp_string = g_ucs4_to_utf8
- ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
- NULL, NULL, NULL);
- fprintf(stderr, "Un-segmentable sentence encountered:%s\n",
- tmp_string);
- g_array_free(results, TRUE);
- return false;
- }
- g_array_free(results, TRUE);
- g_free(result_string);
- return true;
-}
-
-bool deal_with_unknown(GArray * current_ucs4, FILE * output){
- char * result_string = g_ucs4_to_utf8
- ( (ucs4_t *) current_ucs4->data, current_ucs4->len,
- NULL, NULL, NULL);
- fprintf(output, "%d %s\n", null_token, result_string);
- g_free(result_string);
- return true;
-}
-
-
-int main(int argc, char * argv[]){
- FILE * input = stdin;
- FILE * output = stdout;
-
- setlocale(LC_ALL, "");
-
- GError * error = NULL;
- GOptionContext * context;
-
- context = g_option_context_new("- n-gram segment");
- g_option_context_add_main_entries(context, entries, NULL);
- if (!g_option_context_parse(context, &argc, &argv, &error)) {
- g_print("option parsing failed:%s\n", error->message);
- exit(EINVAL);
- }
-
- if (outputfile) {
- output = fopen(outputfile, "w");
- if (NULL == output) {
- perror("open file failed");
- exit(EINVAL);
- }
- }
-
- if (argc > 2) {
- fprintf(stderr, "too many arguments.\n");
- exit(EINVAL);
- }
-
- if (2 == argc) {
- input = fopen(argv[1], "r");
- if (NULL == input) {
- perror("open file failed");
- exit(EINVAL);
- }
- }
-
- SystemTableInfo system_table_info;
-
- bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
- if (!retval) {
- fprintf(stderr, "load table.conf failed.\n");
- exit(ENOENT);
- }
-
- /* init phrase table */
- FacadePhraseTable2 phrase_table;
- MemoryChunk * chunk = new MemoryChunk;
- chunk->load(SYSTEM_PHRASE_INDEX);
- phrase_table.load(chunk, NULL);
-
- /* init phrase index */
- FacadePhraseIndex phrase_index;
-
- const pinyin_table_info_t * phrase_files =
- system_table_info.get_table_info();
-
- if (!load_phrase_index(phrase_files, &phrase_index))
- exit(ENOENT);
-
- /* init bi-gram */
- Bigram system_bigram;
- system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
- Bigram user_bigram;
-
- gfloat lambda = system_table_info.get_lambda();
-
- /* init phrase lookup */
- PhraseLookup phrase_lookup(lambda,
- &phrase_table, &phrase_index,
- &system_bigram, &user_bigram);
-
-
- CONTEXT_STATE state, next_state;
- GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
-
- PhraseTokens tokens;
- memset(tokens, 0, sizeof(PhraseTokens));
- phrase_index.prepare_tokens(tokens);
-
- /* split the sentence */
- char * linebuf = NULL; size_t size = 0; ssize_t read;
- while( (read = getline(&linebuf, &size, input)) != -1 ){
- if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
- linebuf[strlen(linebuf) - 1] = '\0';
- }
-
- /* check non-ucs4 characters */
- const glong num_of_chars = g_utf8_strlen(linebuf, -1);
- glong len = 0;
- ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
- if ( len != num_of_chars ) {
- fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
- fprintf(output, "%d \n", null_token);
- continue;
- }
-
- /* only new-line persists. */
- if ( 0 == num_of_chars ) {
- fprintf(output, "%d \n", null_token);
- continue;
- }
-
- state = CONTEXT_INIT;
- int result = phrase_table.search( 1, sentence, tokens);
- g_array_append_val( current_ucs4, sentence[0]);
- if ( result & SEARCH_OK )
- state = CONTEXT_SEGMENTABLE;
- else
- state = CONTEXT_UNKNOWN;
-
- for ( int i = 1; i < num_of_chars; ++i) {
- int result = phrase_table.search( 1, sentence + i, tokens);
- if ( result & SEARCH_OK )
- next_state = CONTEXT_SEGMENTABLE;
- else
- next_state = CONTEXT_UNKNOWN;
-
- if ( state == next_state ){
- g_array_append_val(current_ucs4, sentence[i]);
- continue;
- }
-
- assert ( state != next_state );
- if ( state == CONTEXT_SEGMENTABLE )
- deal_with_segmentable(&phrase_lookup, current_ucs4, output);
-
- if ( state == CONTEXT_UNKNOWN )
- deal_with_unknown(current_ucs4, output);
-
- /* save the current character */
- g_array_set_size(current_ucs4, 0);
- g_array_append_val(current_ucs4, sentence[i]);
- state = next_state;
- }
-
- if ( current_ucs4->len ) {
- /* this seems always true. */
- if ( state == CONTEXT_SEGMENTABLE )
- deal_with_segmentable(&phrase_lookup, current_ucs4, output);
-
- if ( state == CONTEXT_UNKNOWN )
- deal_with_unknown(current_ucs4, output);
- g_array_set_size(current_ucs4, 0);
- }
-
- /* print extra enter */
- if ( gen_extra_enter )
- fprintf(output, "%d \n", null_token);
-
- g_free(sentence);
- }
- phrase_index.destroy_tokens(tokens);
-
- /* print enter at file tail */
- fprintf(output, "%d \n", null_token);
- g_array_free(current_ucs4, TRUE);
- free(linebuf);
- fclose(input);
- fclose(output);
- return 0;
-}