From f18a652c8a03961ae1004daf051d28aedbae282f Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Fri, 20 May 2011 11:02:55 +0800 Subject: move tag utility to src/storage --- src/pinyin.h | 1 + src/storage/Makefile.am | 16 +- src/storage/tag_utility.cpp | 389 ++++++++++++++++++++++++++++++++++++++++++ src/storage/tag_utility.h | 68 ++++++++ utils/storage/Makefile.am | 12 +- utils/storage/tag_utility.cpp | 387 ----------------------------------------- utils/storage/tag_utility.h | 68 -------- utils/training/Makefile.am | 9 +- 8 files changed, 473 insertions(+), 477 deletions(-) create mode 100644 src/storage/tag_utility.cpp create mode 100644 src/storage/tag_utility.h delete mode 100644 utils/storage/tag_utility.cpp delete mode 100644 utils/storage/tag_utility.h diff --git a/src/pinyin.h b/src/pinyin.h index 3cf6e71..6e3c81e 100644 --- a/src/pinyin.h +++ b/src/pinyin.h @@ -10,6 +10,7 @@ #include "lookup.h" #include "pinyin_lookup.h" #include "phrase_lookup.h" +#include "tag_utility.h" /* training module */ #include "flexible_ngram.h" diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am index b2d5b1a..8c10cdf 100644 --- a/src/storage/Makefile.am +++ b/src/storage/Makefile.am @@ -21,20 +21,22 @@ INCLUDES = -I$(top_srcdir)/src/include \ noinst_HEADERS = pinyin_large_table.h \ pinyin_base.h \ - pinyin_phrase.h \ + pinyin_phrase.h \ phrase_index.h \ - pinyin_zhuyin_map_data.h \ + pinyin_zhuyin_map_data.h \ phrase_large_table.h \ ngram.h \ - flexible_ngram.h + flexible_ngram.h \ + tag_utility.h noinst_LTLIBRARIES = libstorage.la libstorage_la_LDFLAGS = -static libstorage_la_SOURCES = pinyin_base.cpp \ - pinyin_large_table.cpp \ - phrase_index.cpp \ - phrase_large_table.cpp \ - ngram.cpp + pinyin_large_table.cpp \ + phrase_index.cpp \ + phrase_large_table.cpp \ + ngram.cpp \ + tag_utility.cpp diff --git a/src/storage/tag_utility.cpp b/src/storage/tag_utility.cpp new file mode 100644 index 0000000..dc1f520 --- /dev/null +++ b/src/storage/tag_utility.cpp @@ -0,0 +1,389 @@ 
+/* glib (g_*), stdio (fprintf), string (strcmp), assert (assert). */
+#include <glib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "phrase_large_table.h"
+#include "tag_utility.h"
+
+/* internal taglib structure.
+ * One registered line format: a leading tag word, a fixed number of
+ * positional values, then "key value" pairs whose keys are listed as
+ * either required or ignored.  Every string is owned by the entry.
+ */
+struct tag_entry{
+    int m_line_type;
+    char * m_line_tag;
+    int m_num_of_values;
+    char ** m_required_tags;
+    /* char ** m_optional_tags; */
+    /* int m_optional_count = 0; */
+    char ** m_ignored_tags;
+};
+
+/* Build an entry holding deep copies of all the strings passed in.
+ * Release the result with tag_entry_reclaim.
+ */
+tag_entry tag_entry_copy(int line_type, const char * line_tag,
+                         int num_of_values,
+                         char * required_tags[],
+                         char * ignored_tags[]){
+    tag_entry entry;
+    entry.m_line_type = line_type;
+    entry.m_line_tag = g_strdup( line_tag );
+    entry.m_num_of_values = num_of_values;
+    entry.m_required_tags = g_strdupv( required_tags );
+    entry.m_ignored_tags = g_strdupv( ignored_tags );
+    return entry;
+}
+
+/* Deep copy of an existing entry. */
+tag_entry tag_entry_clone(tag_entry * entry){
+    return tag_entry_copy(entry->m_line_type, entry->m_line_tag,
+                          entry->m_num_of_values,
+                          entry->m_required_tags, entry->m_ignored_tags);
+}
+
+/* Free the strings owned by the entry (not the entry itself). */
+void tag_entry_reclaim(tag_entry * entry){
+    g_free( entry->m_line_tag );
+    g_strfreev( entry->m_required_tags );
+    g_strfreev( entry->m_ignored_tags );
+}
+
+/* Reclaim every entry of the array, then free the array. */
+static bool taglib_free_tag_array(GArray * tag_array){
+    for ( size_t i = 0; i < tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+        tag_entry_reclaim(entry);
+    }
+    g_array_free(tag_array, TRUE);
+    return true;
+}
+
+/* special unichar to be handled in split_line. */
+static gunichar backslash = 0;
+static gunichar quote = 0;
+
+static gboolean split_line_init(){
+    backslash = g_utf8_get_char("\\");
+    quote = g_utf8_get_char("\"");
+    return TRUE;
+}
+
+/* Pointer Array of Array of tag_entry.
+ * The last array is the currently active tag list.
+ */
+static GPtrArray * g_tagutils_stack = NULL;
+
+/* Create the stack with one empty tag list; must be called once
+ * before any other taglib function.
+ */
+bool taglib_init(){
+    assert( g_tagutils_stack == NULL);
+    g_tagutils_stack = g_ptr_array_new();
+    GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
+    g_ptr_array_add(g_tagutils_stack, tag_array);
+
+    /* init split_line. */
+    split_line_init();
+    return true;
+}
+
+/* Register a line format in the active tag list.
+ * required_tags and ignored_tags are lists separated by ',' or ':';
+ * a NULL list is treated as an empty one.
+ * Returns false when line_type or line_tag is already registered.
+ */
+bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
+                    const char * required_tags, const char * ignored_tags){
+    g_return_val_if_fail(line_tag != NULL, false);
+    g_return_val_if_fail(num_of_values >= 0, false);
+
+    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
+                                       g_tagutils_stack->len - 1);
+
+    /* some duplicate tagname or line_type check here. */
+    for ( size_t i = 0; i < tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+        if ( entry->m_line_type == line_type ||
+             strcmp( entry->m_line_tag, line_tag ) == 0 )
+            return false;
+    }
+
+    char ** required = g_strsplit_set(required_tags ? required_tags : "",
+                                      ",:", -1);
+    char ** ignored = g_strsplit_set(ignored_tags ? ignored_tags : "",
+                                     ",:", -1);
+
+    /* tag_entry_copy duplicates the vectors, so the temporaries are
+       released below. */
+    tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values,
+                                     required, ignored);
+    g_array_append_val(tag_array, entry);
+
+    g_strfreev(required);
+    g_strfreev(ignored);
+    return true;
+}
+
+static void ptr_array_entry_free(gpointer data, gpointer user_data){
+    g_free(data);
+}
+
+static gboolean hash_table_key_value_free(gpointer key, gpointer value,
+                                          gpointer user_data){
+    g_free(key);
+    g_free(value);
+    return TRUE;
+}
+
+/* Free a token array that split_line has not handed out yet. */
+static void split_line_discard(GArray * tokens){
+    for ( guint i = 0; i < tokens->len; ++i)
+        g_free( g_array_index(tokens, gchar *, i) );
+    g_array_free(tokens, TRUE);
+}
+
+/* split the line into tokens.
+ * Tokens are separated by white space; a token starting with '"' runs
+ * up to the next '"' that is not preceded by a backslash.
+ * Returns a NULL-terminated vector to be freed with g_strfreev,
+ * or NULL when the line ends right after a backslash.
+ */
+static gchar ** split_line(const gchar * line){
+    /* array for tokens. */
+    GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
+
+    for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
+        gunichar unichar = g_utf8_get_char(cur);
+        const gchar * begin = cur;
+        gchar * token = NULL;
+
+        if ( g_unichar_isspace (unichar) ) {
+            continue;
+        }else if ( unichar == quote ) {
+            /* handles "\"". */
+            /* skip the first '"'. */
+            begin = cur = g_utf8_next_char(cur);
+            while (*cur) {
+                unichar = g_utf8_get_char(cur);
+                if ( unichar == backslash ) {
+                    /* step over the escaped character. */
+                    cur = g_utf8_next_char(cur);
+                    if ( !*cur ) {
+                        g_warning("dangling backslash at end of line.");
+                        split_line_discard(tokens);
+                        return NULL;
+                    }
+                } else if ( unichar == quote ){
+                    break;
+                }
+                cur = g_utf8_next_char(cur);
+            }
+            /* TODO: switch to own strdup_escape implementation
+               for \"->" transforming. */
+            /* The text is copied verbatim: it is data, and must never
+               be used as a printf format string. */
+            token = g_strndup( begin, cur - begin );
+        } else {
+            /* handles other tokens. */
+            while(*cur) {
+                unichar = g_utf8_get_char(cur);
+                if ( g_unichar_isgraph(unichar) ) {
+                    /* next unichar */
+                    cur = g_utf8_next_char(cur);
+                } else {
+                    /* space and other characters handles. */
+                    break;
+                }
+            }
+            token = g_strndup( begin, cur - begin );
+        }
+
+        g_array_append_val(tokens, token);
+        /* the loop increment below skips the delimiter cur stopped on;
+           do not step past the terminating NUL. */
+        if ( !*cur )
+            break;
+    }
+
+    return (gchar **)g_array_free(tokens, FALSE);
+}
+
+/* Whether tag is one of the strings of the NULL-terminated vector. */
+static bool tag_list_contains(char ** tags, const char * tag){
+    for ( char ** cur = tags; cur && *cur; ++cur) {
+        if ( strcmp(tag, *cur) == 0 )
+            return true;
+    }
+    return false;
+}
+
+/* Match the tokens of one line against the active tag list and fill
+ * line_type, values and required.  The caller keeps ownership of tokens.
+ */
+static bool taglib_parse_tokens(char ** tokens, int num_of_tokens,
+                                int & line_type, GPtrArray * values,
+                                GHashTable * required){
+    /* an empty or blank line has no line tag. */
+    if ( num_of_tokens < 1 )
+        return false;
+
+    const char * line_tag = tokens[0];
+    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
+                                       g_tagutils_stack->len - 1);
+
+    tag_entry * cur_entry = NULL;
+    /* find line type. */
+    for ( size_t i = 0; i < tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+        if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
+            cur_entry = entry;
+            break;
+        }
+    }
+
+    if ( !cur_entry )
+        return false;
+
+    line_type = cur_entry->m_line_type;
+
+    /* tokens[1 .. m_num_of_values] are the positional values. */
+    const int first_tag = cur_entry->m_num_of_values + 1;
+    if ( num_of_tokens < first_tag ) {
+        g_warning("too few values for line tag: %s.", line_tag);
+        return false;
+    }
+
+    for ( int i = 1; i < first_tag; ++i)
+        g_ptr_array_add(values, g_strdup( tokens[i] ));
+
+    /* the remaining tokens are "tag value" pairs. */
+    for ( int i = first_tag; i < num_of_tokens; i += 2){
+        const char * tmp = tokens[i];
+
+        /* check ignored tags. */
+        if ( tag_list_contains(cur_entry->m_ignored_tags, tmp) )
+            continue;
+
+        /* warning on the un-expected tags. */
+        if ( !tag_list_contains(cur_entry->m_required_tags, tmp) ) {
+            g_warning("un-expected tags:%s.\n", tmp);
+            continue;
+        }
+
+        if ( i + 1 >= num_of_tokens ) {
+            g_warning("missing value for tag: %s.", tmp);
+            return false;
+        }
+
+        g_hash_table_insert(required, g_strdup(tmp), g_strdup(tokens[i + 1]));
+    }
+
+    /* check for all required tags. */
+    for ( char ** cur = cur_entry->m_required_tags; cur && *cur; ++cur) {
+        const char * required_tag_str = *cur;
+        gboolean result = g_hash_table_lookup_extended(required,
+                                  required_tag_str, NULL, NULL);
+        if ( !result ) {
+            g_warning("missed required tags: %s.\n", required_tag_str);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/* Parse one input line.
+ * values and required are emptied first (their strings are freed);
+ * on success line_type is set, values holds the positional values and
+ * required maps each required tag to its value, all as newly
+ * allocated strings.
+ * Returns false for an empty line, an unknown line tag, too few
+ * tokens or a missing required tag.
+ */
+bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
+                 GHashTable * required){
+    /* reset values and required. */
+    g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
+    g_ptr_array_set_size(values, 0);
+    g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
+
+    /* use own version of split_line
+       instead of g_strsplit_set for special token.*/
+    char ** tokens = split_line(input_line);
+    if ( !tokens )
+        return false;
+
+    /* single exit: tokens are released on every path. */
+    bool result = taglib_parse_tokens(tokens, g_strv_length(tokens),
+                                      line_type, values, required);
+    g_strfreev(tokens);
+    return result;
+}
+
+/* Remove the entry of type line_type from the active tag list.
+ * Returns false when no such entry exists.
+ */
+bool taglib_remove_tag(int line_type){
+    /* Note: duplicate entry check is in taglib_add_tag. */
+    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
+                                       g_tagutils_stack->len - 1);
+    for ( size_t i = 0; i < tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+        if (entry->m_line_type != line_type)
+            continue;
+        tag_entry_reclaim(entry);
+        g_array_remove_index(tag_array, i);
+        return true;
+    }
+    return false;
+}
+
+/* Push a deep copy of the active tag list; it becomes the active one. */
+bool taglib_push_state(){
+    assert(g_tagutils_stack->len >= 1);
+    GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
+    GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
+                                            g_tagutils_stack->len - 1);
+    for ( size_t i = 0; i < prev_tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i);
+        tag_entry new_entry = tag_entry_clone(entry);
+        g_array_append_val(next_tag_array, new_entry);
+    }
+    g_ptr_array_add(g_tagutils_stack, next_tag_array);
+    return true;
+}
+
+/* Drop the active tag list; the initial list can not be popped. */
+bool taglib_pop_state(){
+    assert(g_tagutils_stack->len > 1);
+    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
+                                       g_tagutils_stack->len - 1);
+    g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+    taglib_free_tag_array(tag_array);
+    return true;
+}
+
+/* Free every tag list and the stack itself. */
+bool taglib_fini(){
+    for ( size_t i = 0; i < g_tagutils_stack->len; ++i){
+        GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i);
+        taglib_free_tag_array(tag_array);
+    }
+    g_ptr_array_free(g_tagutils_stack, TRUE);
+    g_tagutils_stack = NULL;
+    return true;
+}
+
+/* Tokens which have a symbolic spelling instead of a phrase.
+ * The table is terminated by a NULL string.
+ * NOTE(review): special tokens are recognized by a leading '<' in
+ * taglib_string_to_token; "<start>" is presumed to be the spelling of
+ * sentence_start -- confirm.
+ */
+struct special_token_pair{
+    phrase_token_t token;
+    const char * string;
+};
+
+static const special_token_pair special_tokens [] = {
+    {sentence_start, "<start>"},
+    {0, NULL}
+};
+
+/* Returns the special token spelled by string, or 0 when unknown. */
+static phrase_token_t taglib_special_string_to_token(const char * string){
+    /* advance on every iteration, so an unknown string ends the loop. */
+    for ( const special_token_pair * pair = special_tokens;
+          pair->string; ++pair) {
+        if ( strcmp(string, pair->string ) == 0 )
+            return pair->token;
+    }
+
+    fprintf(stderr, "error: unknown token:%s.\n", string);
+    return 0;
+}
+
+/* Look up the token of an utf-8 phrase, or of a special "<...>" string.
+ * Returns 0 and prints a message to stderr when the lookup fails.
+ */
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){
+    phrase_token_t token = 0;
+    if ( string[0] == '<' ) {
+        return taglib_special_string_to_token(string);
+    }
+
+    /* phrase_len is the number of utf-16 units actually written. */
+    glong phrase_len = 0;
+    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, &phrase_len, NULL);
+    if ( !phrase ) {
+        fprintf(stderr, "error: invalid utf-8 token:%s.\n", string);
+        return token;
+    }
+
+    int result = phrases->search(phrase_len, phrase, token);
+    if ( !(result & SEARCH_OK) )
+        fprintf(stderr, "error: unknown token:%s.\n", string);
+
+    g_free(phrase);
+    return token;
+}
+
+/* Returns the spelling of a special token, or NULL when unknown.
+ * The string belongs to the table and must not be freed.
+ */
+static const char * taglib_special_token_to_string(phrase_token_t token){
+    /* advance on every iteration, so an unknown token ends the loop. */
+    for ( const special_token_pair * pair = special_tokens;
+          pair->string; ++pair) {
+        if ( token == pair->token )
+            return pair->string;
+    }
+
+    fprintf(stderr, "error: unknown token:%d.\n", token);
+    return NULL;
+}
+
+/* Returns a newly allocated utf-8 string for the token (free it with
+ * g_free), or NULL when the token is unknown.
+ */
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+                              phrase_token_t token) {
+    PhraseItem item;
+    utf16_t buffer[MAX_PHRASE_LENGTH];
+
+    gchar * phrase;
+    /* deal with the special phrase index, for "<start>..." */
+    if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
+        /* g_strdup(NULL) yields NULL for an unknown special token. */
+        return g_strdup(taglib_special_token_to_string(token));
+    }
+
+    int result = phrase_index->get_phrase_item(token, item);
+    if (result != ERROR_OK) {
+        fprintf(stderr, "error: unknown token:%d.\n", token);
+        return NULL;
+    }
+
+    item.get_phrase_string(buffer);
+    guint8 length = item.get_phrase_length();
+    phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
+    return phrase;
+}
diff --git a/src/storage/tag_utility.h b/src/storage/tag_utility.h
new file mode 100644
index 0000000..67d8946
--- /dev/null
+++ b/src/storage/tag_utility.h
@@ -0,0 +1,68 @@
+/* 
+ *  libpinyin
+ *  Library to deal with pinyin.
+ *  
+ *  Copyright (C) 2010 Peng Wu
+ *  
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * 
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef TAG_UTILITY_H
+#define TAG_UTILITY_H
+
+#include "novel_types.h"
+
+/* Note: the optional tag has been removed from the first implementation.
+ * Maybe the optional tag will be added back later.
+ */
+
+bool taglib_init();
+
+/* Note: most tags are separated by ',' or ':' . */
+bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags);
+
+/* most parameters are hash table of string (const char *). */
+bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, GHashTable * required);
+
+/* Note: taglib_write is omitted, as printf is more suitable for this. */
+
+/* Note the following function is only available when the optional tag exists.
+ * bool taglib_report_status(int line_type);
+ */
+
+/* remove the tag of type line_type. */
+bool taglib_remove_tag(int line_type);
+
+/* the following functions are used to save current known tag list in stack.
+ * Used when the parsing context is changed.
+ */
+bool taglib_push_state();
+bool taglib_pop_state();
+
+bool taglib_fini();
+
+namespace pinyin{
+    class PhraseLargeTable;
+};
+
+using namespace pinyin;
+
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases,
+                                      const char * string);
+
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+                              phrase_token_t token);
+
+#endif
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
index f314a1a..bc033f1 100644
--- a/utils/storage/Makefile.am
+++ b/utils/storage/Makefile.am
@@ -21,8 +21,6 @@ INCLUDES = -I$(top_srcdir)/src \
 			  -I$(top_srcdir)/src/lookup \
 			  @GLIB2_CPPFLAGS@
 
-noinst_HEADERS = tag_utility.h
-
 noinst_PROGRAMS = gen_pinyin_table gen_binary_files export_interpolation import_interpolation
 
 gen_pinyin_table_SOURCES = gen_pinyin_table.cpp
@@ -33,16 +31,10 @@ gen_binary_files_SOURCES = gen_binary_files.cpp
 
 gen_binary_files_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
 
-noinst_LTLIBRARIES = libtagutils.la
-
-libtagutils_la_LDFLAGS = -static
-
-libtagutils_la_SOURCES = tag_utility.cpp
-
 import_interpolation_SOURCES = import_interpolation.cpp
 
-import_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@
+import_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
 
 export_interpolation_SOURCES = export_interpolation.cpp
 
-export_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@
+export_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
diff --git a/utils/storage/tag_utility.cpp b/utils/storage/tag_utility.cpp deleted file mode 100644 index 5dcb35a..0000000 --- a/utils/storage/tag_utility.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include "pinyin.h" -#include -#include -#include -#include -#include "tag_utility.h" - -/* internal taglib structure */ -struct tag_entry{ - int m_line_type; - char * m_line_tag; - int m_num_of_values; - char ** m_required_tags; - /* char ** m_optional_tags; */ - /* int m_optional_count = 0; */ - char ** m_ignored_tags; -}; - -tag_entry tag_entry_copy(int line_type, const char * line_tag, - int num_of_values, - char * required_tags[], - char * ignored_tags[]){ - tag_entry entry; - entry.m_line_type = line_type; - entry.m_line_tag = g_strdup( line_tag ); - entry.m_num_of_values = num_of_values; - entry.m_required_tags = g_strdupv( required_tags ); - entry.m_ignored_tags = g_strdupv( ignored_tags ); - return entry; -} - -tag_entry tag_entry_clone(tag_entry * entry){ - return tag_entry_copy(entry->m_line_type, entry->m_line_tag, - entry->m_num_of_values, - entry->m_required_tags, entry->m_ignored_tags); -} - -void tag_entry_reclaim(tag_entry * entry){ - g_free( entry->m_line_tag ); - g_strfreev( entry->m_required_tags ); - g_strfreev(entry->m_ignored_tags); -} - -static bool taglib_free_tag_array(GArray * tag_array){ - for ( size_t i = 0; i < tag_array->len; ++i) { - tag_entry * entry = &g_array_index(tag_array, tag_entry, i); - tag_entry_reclaim(entry); - } - g_array_free(tag_array, TRUE); - return true; -} - -/* special unichar to be handled in split_line. 
*/ -static gunichar backslash = 0; -static gunichar quote = 0; - -static gboolean split_line_init(){ - backslash = g_utf8_get_char("\\"); - quote = g_utf8_get_char("\""); - return TRUE; -} - -/* Pointer Array of Array of tag_entry */ -static GPtrArray * g_tagutils_stack = NULL; - -bool taglib_init(){ - assert( g_tagutils_stack == NULL); - g_tagutils_stack = g_ptr_array_new(); - GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry)); - g_ptr_array_add(g_tagutils_stack, tag_array); - - /* init split_line. */ - split_line_init(); - return true; -} - -bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, - const char * required_tags, const char * ignored_tags){ - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, - g_tagutils_stack->len - 1); - - /* some duplicate tagname or line_type check here. */ - for ( size_t i = 0; i < tag_array->len; ++i) { - tag_entry * entry = &g_array_index(tag_array, tag_entry, i); - if ( entry->m_line_type == line_type || - strcmp( entry->m_line_tag, line_tag ) == 0 ) - return false; - } - - char ** required = g_strsplit_set(required_tags, ",:", -1); - char ** ignored = g_strsplit_set(ignored_tags, ",:", -1); - - tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values, - required, ignored); - g_array_append_val(tag_array, entry); - - g_strfreev(required); - g_strfreev(ignored); - return true; -} - -static void ptr_array_entry_free(gpointer data, gpointer user_data){ - g_free(data); -} - -static gboolean hash_table_key_value_free(gpointer key, gpointer value, - gpointer user_data){ - g_free(key); - g_free(value); - return TRUE; -} - -/* split the line into tokens. */ -static gchar ** split_line(const gchar * line){ - /* array for tokens. 
*/ - GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *)); - - for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){ - gunichar unichar = g_utf8_get_char(cur); - const gchar * begin = cur; - gchar * token = NULL; - - if ( g_unichar_isspace (unichar) ) { - continue; - }else if ( unichar == quote ) { - /* handles "\"". */ - /* skip the first '"'. */ - begin = cur = g_utf8_next_char(cur); - while (*cur) { - unichar = g_utf8_get_char(cur); - if ( unichar == backslash ) { - cur = g_utf8_next_char(cur); - g_return_val_if_fail(*cur, NULL); - } else if ( unichar == quote ){ - break; - } - cur = g_utf8_next_char(cur); - } - gchar * tmp = g_strndup( begin, cur - begin); - /* TODO: switch to own strdup_escape implementation - for \"->" transforming. */ - token = g_strdup_printf(tmp); - g_free(tmp); - } else { - /* handles other tokens. */ - while(*cur) { - unichar = g_utf8_get_char(cur); - if ( g_unichar_isgraph(unichar) ) { - /* next unichar */ - cur = g_utf8_next_char(cur); - } else { - /* space and other characters handles. */ - break; - } - } - token = g_strndup( begin, cur - begin ); - } - - g_array_append_val(tokens, token); - if ( !*cur ) - break; - } - - return (gchar **)g_array_free(tokens, FALSE); -} - -bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, - GHashTable * required){ - /* reset values and required. */ - g_ptr_array_foreach(values, ptr_array_entry_free, NULL); - g_ptr_array_set_size(values, 0); - g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL); - - /* use own version of split_line - instead of g_strsplit_set for special token.*/ - char ** tokens = split_line(input_line); - int num_of_tokens = g_strv_length(tokens); - - char * line_tag = tokens[0]; - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); - - tag_entry * cur_entry = NULL; - /* find line type. 
*/ - for ( size_t i = 0; i < tag_array->len; ++i) { - tag_entry * entry = &g_array_index(tag_array, tag_entry, i); - if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) { - cur_entry = entry; - break; - } - } - - if ( !cur_entry ) - return false; - - line_type = cur_entry->m_line_type; - - for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) { - g_return_val_if_fail(i < num_of_tokens, false); - char * value = g_strdup( tokens[i] ); - g_ptr_array_add(values, value); - } - - int ignored_len = g_strv_length( cur_entry->m_ignored_tags ); - int required_len = g_strv_length( cur_entry->m_required_tags); - - for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){ - g_return_val_if_fail(i < num_of_tokens, false); - const char * tmp = tokens[i]; - - /* check ignored tags. */ - bool tag_ignored = false; - for ( int m = 0; m < ignored_len; ++m) { - if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) { - tag_ignored = true; - break; - } - } - - if ( tag_ignored ) { - ++i; - continue; - } - - /* check required tags. */ - bool tag_required = false; - for ( int m = 0; m < required_len; ++m) { - if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) { - tag_required = true; - break; - } - } - - /* warning on the un-expected tags. */ - if ( !tag_required ) { - g_warning("un-expected tags:%s.\n", tmp); - ++i; - continue; - } - - char * key = g_strdup(tokens[i]); - ++i; - g_return_val_if_fail(i < num_of_tokens, false); - char * value = g_strdup(tokens[i]); - g_hash_table_insert(required, key, value); - } - - /* check for all required tags. 
*/ - for ( int i = 0; i < required_len; ++i) { - const char * required_tag_str = cur_entry->m_required_tags[i]; - gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL); - if ( !result ) { - g_warning("missed required tags: %s.\n", required_tag_str); - g_strfreev(tokens); - return false; - } - } - - g_strfreev(tokens); - return true; -} - -bool taglib_remove_tag(int line_type){ - /* Note: duplicate entry check is in taglib_add_tag. */ - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); - for ( size_t i = 0; i < tag_array->len; ++i) { - tag_entry * entry = &g_array_index(tag_array, tag_entry, i); - if (entry->m_line_type != line_type) - continue; - tag_entry_reclaim(entry); - g_array_remove_index(tag_array, i); - return true; - } - return false; -} - -bool taglib_push_state(){ - assert(g_tagutils_stack->len >= 1); - GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry)); - GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); - for ( size_t i = 0; i < prev_tag_array->len; ++i) { - tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i); - tag_entry new_entry = tag_entry_clone(entry); - g_array_append_val(next_tag_array, new_entry); - } - g_ptr_array_add(g_tagutils_stack, next_tag_array); - return true; -} - -bool taglib_pop_state(){ - assert(g_tagutils_stack->len > 1); - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); - g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1); - taglib_free_tag_array(tag_array); - return true; -} - -bool taglib_fini(){ - for ( size_t i = 0; i < g_tagutils_stack->len; ++i){ - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i); - taglib_free_tag_array(tag_array); - } - g_ptr_array_free(g_tagutils_stack, TRUE); - g_tagutils_stack = NULL; - return true; -} - -static phrase_token_t 
taglib_special_string_to_token(const char * string){ - struct token_pair{ - phrase_token_t token; - const char * string; - }; - - static const token_pair tokens [] = { - {sentence_start, ""}, - {0, NULL} - }; - - const token_pair * pair = tokens; - while (pair->string) { - if ( strcmp(string, pair->string ) == 0 ){ - return pair->token; - } - } - - fprintf(stderr, "error: unknown token:%s.\n", string); - return 0; -} - -phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){ - phrase_token_t token = 0; - if ( string[0] == '<' ) { - return taglib_special_string_to_token(string); - } - - glong phrase_len = g_utf8_strlen(string, -1); - utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL); - int result = phrases->search(phrase_len, phrase, token); - if ( !(result & SEARCH_OK) ) - fprintf(stderr, "error: unknown token:%s.\n", string); - - g_free(phrase); - return token; -} - -static const char * taglib_special_token_to_string(phrase_token_t token){ - struct token_pair{ - phrase_token_t token; - const char * string; - }; - - static const token_pair tokens [] = { - {sentence_start, ""}, - {0, NULL} - }; - - const token_pair * pair = tokens; - while (pair->token) { - if ( token == pair->token ) - return pair->string; - } - - fprintf(stderr, "error: unknown token:%d.\n", token); - return NULL; -} - -char * taglib_token_to_string(FacadePhraseIndex * phrase_index, - phrase_token_t token) { - PhraseItem item; - utf16_t buffer[MAX_PHRASE_LENGTH]; - - gchar * phrase; - /* deal with the special phrase index, for "..." 
*/ - if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) { - return g_strdup(taglib_special_token_to_string(token)); - } - - int result = phrase_index->get_phrase_item(token, item); - if (result != ERROR_OK) { - fprintf(stderr, "error: unknown token:%d.\n", token); - return NULL; - } - - item.get_phrase_string(buffer); - guint8 length = item.get_phrase_length(); - phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL); - return phrase; -} diff --git a/utils/storage/tag_utility.h b/utils/storage/tag_utility.h deleted file mode 100644 index 67d8946..0000000 --- a/utils/storage/tag_utility.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * libpinyin - * Library to deal with pinyin. - * - * Copyright (C) 2010 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef TAG_UTILITY_H -#define TAG_UTILITY_H - -#include "novel_types.h" - -/* Note: the optional tag has been removed from the first implementation. - * Maybe the optional tag will be added back later. - */ - -bool taglib_init(); - -/* Note: most tags are separated by ',' or ':' . */ -bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags); - -/* most parameters are hash table of string (const char *). 
*/ -bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, GHashTable * required); - -/* Note: taglib_write is omited, as printf is more suitable for this. */ - -/* Note the following function is only available when the optional tag exists. - * bool taglib_report_status(int line_type); - */ - -/* remove the tag of type line_type. */ -bool taglib_remove_tag(int line_type); - -/* the following functions are used to save current known tag list in stack. - * Used when the parsing context is changed. - */ -bool taglib_push_state(); -bool taglib_pop_state(); - -bool taglib_fini(); - -namespace pinyin{ - class PhraseLargeTable; -}; - -using namespace pinyin; - -phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, - const char * string); - -char * taglib_token_to_string(FacadePhraseIndex * phrase_index, - phrase_token_t token); - -#endif diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am index 7411e78..0915479 100644 --- a/utils/training/Makefile.am +++ b/utils/training/Makefile.am @@ -20,8 +20,7 @@ MAINTAINERCLEANFILES = Makefile.in INCLUDES = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ - -I$(top_srcdir)/src/lookup \ - -I$(top_srcdir)/utils/storage \ + -I$(top_srcdir)/src/lookup \ @GLIB2_CPPFLAGS@ noinst_HEADERS = k_mixture_model.h @@ -72,12 +71,12 @@ prune_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ import_k_mixture_model_SOURCES = import_k_mixture_model.cpp -import_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@ +import_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ export_k_mixture_model_SOURCES = export_k_mixture_model.cpp -export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@ +export_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ k_mixture_model_to_interpolation_SOURCES = k_mixture_model_to_interpolation.cpp 
-k_mixture_model_to_interpolation_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@ \ No newline at end of file +k_mixture_model_to_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ \ No newline at end of file -- cgit