diff options
author | Peng Wu <alexepico@gmail.com> | 2011-05-20 11:02:55 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-05-20 11:02:55 +0800 |
commit | f18a652c8a03961ae1004daf051d28aedbae282f (patch) | |
tree | 27e9008ec1abecd48ea868a3a303e375984500d8 /src | |
parent | 5150341809f92fb2179decdfdd6ec1477d988461 (diff) | |
download | libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.tar.gz libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.tar.xz libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.zip |
move tag utility to src/storage
Diffstat (limited to 'src')
-rw-r--r-- | src/pinyin.h | 1 | ||||
-rw-r--r-- | src/storage/Makefile.am | 16 | ||||
-rw-r--r-- | src/storage/tag_utility.cpp | 389 | ||||
-rw-r--r-- | src/storage/tag_utility.h | 68 |
4 files changed, 467 insertions, 7 deletions
diff --git a/src/pinyin.h b/src/pinyin.h index 3cf6e71..6e3c81e 100644 --- a/src/pinyin.h +++ b/src/pinyin.h @@ -10,6 +10,7 @@ #include "lookup.h" #include "pinyin_lookup.h" #include "phrase_lookup.h" +#include "tag_utility.h" /* training module */ #include "flexible_ngram.h" diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am index b2d5b1a..8c10cdf 100644 --- a/src/storage/Makefile.am +++ b/src/storage/Makefile.am @@ -21,20 +21,22 @@ INCLUDES = -I$(top_srcdir)/src/include \ noinst_HEADERS = pinyin_large_table.h \ pinyin_base.h \ - pinyin_phrase.h \ + pinyin_phrase.h \ phrase_index.h \ - pinyin_zhuyin_map_data.h \ + pinyin_zhuyin_map_data.h \ phrase_large_table.h \ ngram.h \ - flexible_ngram.h + flexible_ngram.h \ + tag_utility.h noinst_LTLIBRARIES = libstorage.la libstorage_la_LDFLAGS = -static libstorage_la_SOURCES = pinyin_base.cpp \ - pinyin_large_table.cpp \ - phrase_index.cpp \ - phrase_large_table.cpp \ - ngram.cpp + pinyin_large_table.cpp \ + phrase_index.cpp \ + phrase_large_table.cpp \ + ngram.cpp \ + tag_utility.cpp diff --git a/src/storage/tag_utility.cpp b/src/storage/tag_utility.cpp new file mode 100644 index 0000000..dc1f520 --- /dev/null +++ b/src/storage/tag_utility.cpp @@ -0,0 +1,389 @@ +#include <glib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "novel_types.h" +#include "phrase_index.h" +#include "phrase_large_table.h" +#include "tag_utility.h" + +/* internal taglib structure */ +struct tag_entry{ + int m_line_type; + char * m_line_tag; + int m_num_of_values; + char ** m_required_tags; + /* char ** m_optional_tags; */ + /* int m_optional_count = 0; */ + char ** m_ignored_tags; +}; + +tag_entry tag_entry_copy(int line_type, const char * line_tag, + int num_of_values, + char * required_tags[], + char * ignored_tags[]){ + tag_entry entry; + entry.m_line_type = line_type; + entry.m_line_tag = g_strdup( line_tag ); + entry.m_num_of_values = num_of_values; + entry.m_required_tags = g_strdupv( required_tags ); + entry.m_ignored_tags = g_strdupv( ignored_tags ); + return entry; +} + +tag_entry tag_entry_clone(tag_entry * entry){ + return tag_entry_copy(entry->m_line_type, entry->m_line_tag, + entry->m_num_of_values, + entry->m_required_tags, entry->m_ignored_tags); +} + +void tag_entry_reclaim(tag_entry * entry){ + g_free( entry->m_line_tag ); + g_strfreev( entry->m_required_tags ); + g_strfreev(entry->m_ignored_tags); +} + +static bool taglib_free_tag_array(GArray * tag_array){ + for ( size_t i = 0; i < tag_array->len; ++i) { + tag_entry * entry = &g_array_index(tag_array, tag_entry, i); + tag_entry_reclaim(entry); + } + g_array_free(tag_array, TRUE); + return true; +} + +/* special unichar to be handled in split_line. */ +static gunichar backslash = 0; +static gunichar quote = 0; + +static gboolean split_line_init(){ + backslash = g_utf8_get_char("\\"); + quote = g_utf8_get_char("\""); + return TRUE; +} + +/* Pointer Array of Array of tag_entry */ +static GPtrArray * g_tagutils_stack = NULL; + +bool taglib_init(){ + assert( g_tagutils_stack == NULL); + g_tagutils_stack = g_ptr_array_new(); + GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry)); + g_ptr_array_add(g_tagutils_stack, tag_array); + + /* init split_line. */ + split_line_init(); + return true; +} + +bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, + const char * required_tags, const char * ignored_tags){ + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, + g_tagutils_stack->len - 1); + + /* some duplicate tagname or line_type check here. */ + for ( size_t i = 0; i < tag_array->len; ++i) { + tag_entry * entry = &g_array_index(tag_array, tag_entry, i); + if ( entry->m_line_type == line_type || + strcmp( entry->m_line_tag, line_tag ) == 0 ) + return false; + } + + char ** required = g_strsplit_set(required_tags, ",:", -1); + char ** ignored = g_strsplit_set(ignored_tags, ",:", -1); + + tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values, + required, ignored); + g_array_append_val(tag_array, entry); + + g_strfreev(required); + g_strfreev(ignored); + return true; +} + +static void ptr_array_entry_free(gpointer data, gpointer user_data){ + g_free(data); +} + +static gboolean hash_table_key_value_free(gpointer key, gpointer value, + gpointer user_data){ + g_free(key); + g_free(value); + return TRUE; +} + +/* split the line into tokens. */ +static gchar ** split_line(const gchar * line){ + /* array for tokens. */ + GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *)); + + for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){ + gunichar unichar = g_utf8_get_char(cur); + const gchar * begin = cur; + gchar * token = NULL; + + if ( g_unichar_isspace (unichar) ) { + continue; + }else if ( unichar == quote ) { + /* handles "\"". */ + /* skip the first '"'. */ + begin = cur = g_utf8_next_char(cur); + while (*cur) { + unichar = g_utf8_get_char(cur); + if ( unichar == backslash ) { + cur = g_utf8_next_char(cur); + g_return_val_if_fail(*cur, NULL); + } else if ( unichar == quote ){ + break; + } + cur = g_utf8_next_char(cur); + } + gchar * tmp = g_strndup( begin, cur - begin); + /* TODO: switch to own strdup_escape implementation + for \"->" transforming. */ + token = g_strdup_printf(tmp); + g_free(tmp); + } else { + /* handles other tokens. */ + while(*cur) { + unichar = g_utf8_get_char(cur); + if ( g_unichar_isgraph(unichar) ) { + /* next unichar */ + cur = g_utf8_next_char(cur); + } else { + /* space and other characters handles. */ + break; + } + } + token = g_strndup( begin, cur - begin ); + } + + g_array_append_val(tokens, token); + if ( !*cur ) + break; + } + + return (gchar **)g_array_free(tokens, FALSE); +} + +bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, + GHashTable * required){ + /* reset values and required. */ + g_ptr_array_foreach(values, ptr_array_entry_free, NULL); + g_ptr_array_set_size(values, 0); + g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL); + + /* use own version of split_line + instead of g_strsplit_set for special token.*/ + char ** tokens = split_line(input_line); + int num_of_tokens = g_strv_length(tokens); + + char * line_tag = tokens[0]; + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); + + tag_entry * cur_entry = NULL; + /* find line type. */ + for ( size_t i = 0; i < tag_array->len; ++i) { + tag_entry * entry = &g_array_index(tag_array, tag_entry, i); + if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) { + cur_entry = entry; + break; + } + } + + if ( !cur_entry ) + return false; + + line_type = cur_entry->m_line_type; + + for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) { + g_return_val_if_fail(i < num_of_tokens, false); + char * value = g_strdup( tokens[i] ); + g_ptr_array_add(values, value); + } + + int ignored_len = g_strv_length( cur_entry->m_ignored_tags ); + int required_len = g_strv_length( cur_entry->m_required_tags); + + for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){ + g_return_val_if_fail(i < num_of_tokens, false); + const char * tmp = tokens[i]; + + /* check ignored tags. */ + bool tag_ignored = false; + for ( int m = 0; m < ignored_len; ++m) { + if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) { + tag_ignored = true; + break; + } + } + + if ( tag_ignored ) { + ++i; + continue; + } + + /* check required tags. */ + bool tag_required = false; + for ( int m = 0; m < required_len; ++m) { + if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) { + tag_required = true; + break; + } + } + + /* warning on the un-expected tags. */ + if ( !tag_required ) { + g_warning("un-expected tags:%s.\n", tmp); + ++i; + continue; + } + + char * key = g_strdup(tokens[i]); + ++i; + g_return_val_if_fail(i < num_of_tokens, false); + char * value = g_strdup(tokens[i]); + g_hash_table_insert(required, key, value); + } + + /* check for all required tags. */ + for ( int i = 0; i < required_len; ++i) { + const char * required_tag_str = cur_entry->m_required_tags[i]; + gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL); + if ( !result ) { + g_warning("missed required tags: %s.\n", required_tag_str); + g_strfreev(tokens); + return false; + } + } + + g_strfreev(tokens); + return true; +} + +bool taglib_remove_tag(int line_type){ + /* Note: duplicate entry check is in taglib_add_tag. */ + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); + for ( size_t i = 0; i < tag_array->len; ++i) { + tag_entry * entry = &g_array_index(tag_array, tag_entry, i); + if (entry->m_line_type != line_type) + continue; + tag_entry_reclaim(entry); + g_array_remove_index(tag_array, i); + return true; + } + return false; +} + +bool taglib_push_state(){ + assert(g_tagutils_stack->len >= 1); + GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry)); + GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); + for ( size_t i = 0; i < prev_tag_array->len; ++i) { + tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i); + tag_entry new_entry = tag_entry_clone(entry); + g_array_append_val(next_tag_array, new_entry); + } + g_ptr_array_add(g_tagutils_stack, next_tag_array); + return true; +} + +bool taglib_pop_state(){ + assert(g_tagutils_stack->len > 1); + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); + g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1); + taglib_free_tag_array(tag_array); + return true; +} + +bool taglib_fini(){ + for ( size_t i = 0; i < g_tagutils_stack->len; ++i){ + GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i); + taglib_free_tag_array(tag_array); + } + g_ptr_array_free(g_tagutils_stack, TRUE); + g_tagutils_stack = NULL; + return true; +} + +static phrase_token_t taglib_special_string_to_token(const char * string){ + struct token_pair{ + phrase_token_t token; + const char * string; + }; + + static const token_pair tokens [] = { + {sentence_start, "<start>"}, + {0, NULL} + }; + + const token_pair * pair = tokens; + while (pair->string) { + if ( strcmp(string, pair->string ) == 0 ){ + return pair->token; + } + } + + fprintf(stderr, "error: unknown token:%s.\n", string); + return 0; +} + +phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){ + phrase_token_t token = 0; + if ( string[0] == '<' ) { + return taglib_special_string_to_token(string); + } + + glong phrase_len = g_utf8_strlen(string, -1); + utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL); + int result = phrases->search(phrase_len, phrase, token); + if ( !(result & SEARCH_OK) ) + fprintf(stderr, "error: unknown token:%s.\n", string); + + g_free(phrase); + return token; +} + +static const char * taglib_special_token_to_string(phrase_token_t token){ + struct token_pair{ + phrase_token_t token; + const char * string; + }; + + static const token_pair tokens [] = { + {sentence_start, "<start>"}, + {0, NULL} + }; + + const token_pair * pair = tokens; + while (pair->token) { + if ( token == pair->token ) + return pair->string; + } + + fprintf(stderr, "error: unknown token:%d.\n", token); + return NULL; +} + +char * taglib_token_to_string(FacadePhraseIndex * phrase_index, + phrase_token_t token) { + PhraseItem item; + utf16_t buffer[MAX_PHRASE_LENGTH]; + + gchar * phrase; + /* deal with the special phrase index, for "<start>..." */ + if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) { + return g_strdup(taglib_special_token_to_string(token)); + } + + int result = phrase_index->get_phrase_item(token, item); + if (result != ERROR_OK) { + fprintf(stderr, "error: unknown token:%d.\n", token); + return NULL; + } + + item.get_phrase_string(buffer); + guint8 length = item.get_phrase_length(); + phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL); + return phrase; +} diff --git a/src/storage/tag_utility.h b/src/storage/tag_utility.h new file mode 100644 index 0000000..67d8946 --- /dev/null +++ b/src/storage/tag_utility.h @@ -0,0 +1,68 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2010 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef TAG_UTILITY_H +#define TAG_UTILITY_H + +#include "novel_types.h" + +/* Note: the optional tag has been removed from the first implementation. + * Maybe the optional tag will be added back later. + */ + +bool taglib_init(); + +/* Note: most tags are separated by ',' or ':' . */ +bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags); + +/* most parameters are hash table of string (const char *). */ +bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, GHashTable * required); + +/* Note: taglib_write is omited, as printf is more suitable for this. */ + +/* Note the following function is only available when the optional tag exists. + * bool taglib_report_status(int line_type); + */ + +/* remove the tag of type line_type. */ +bool taglib_remove_tag(int line_type); + +/* the following functions are used to save current known tag list in stack. + * Used when the parsing context is changed. + */ +bool taglib_push_state(); +bool taglib_pop_state(); + +bool taglib_fini(); + +namespace pinyin{ + class PhraseLargeTable; +}; + +using namespace pinyin; + +phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, + const char * string); + +char * taglib_token_to_string(FacadePhraseIndex * phrase_index, + phrase_token_t token); + +#endif |