diff options
author | Peng Wu <alexepico@gmail.com> | 2011-05-20 11:02:55 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2011-05-20 11:02:55 +0800 |
commit | f18a652c8a03961ae1004daf051d28aedbae282f (patch) | |
tree | 27e9008ec1abecd48ea868a3a303e375984500d8 /utils/storage | |
parent | 5150341809f92fb2179decdfdd6ec1477d988461 (diff) | |
download | libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.tar.gz libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.tar.xz libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.zip |
move tag utility to src/storage
Diffstat (limited to 'utils/storage')
-rw-r--r-- | utils/storage/Makefile.am | 12 | ||||
-rw-r--r-- | utils/storage/tag_utility.cpp | 387 | ||||
-rw-r--r-- | utils/storage/tag_utility.h | 68 |
3 files changed, 2 insertions, 465 deletions
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am index f314a1a..bc033f1 100644 --- a/utils/storage/Makefile.am +++ b/utils/storage/Makefile.am @@ -21,8 +21,6 @@ INCLUDES = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/lookup \ @GLIB2_CPPFLAGS@ -noinst_HEADERS = tag_utility.h - noinst_PROGRAMS = gen_pinyin_table gen_binary_files export_interpolation import_interpolation gen_pinyin_table_SOURCES = gen_pinyin_table.cpp @@ -33,16 +31,10 @@ gen_binary_files_SOURCES = gen_binary_files.cpp gen_binary_files_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ -noinst_LTLIBRARIES = libtagutils.la - -libtagutils_la_LDFLAGS = -static - -libtagutils_la_SOURCES = tag_utility.cpp - import_interpolation_SOURCES = import_interpolation.cpp -import_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@ +import_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ export_interpolation_SOURCES = export_interpolation.cpp -export_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@ +export_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ diff --git a/utils/storage/tag_utility.cpp b/utils/storage/tag_utility.cpp deleted file mode 100644 index 5dcb35a..0000000 --- a/utils/storage/tag_utility.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include "pinyin.h" -#include <glib.h> -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include "tag_utility.h" - -/* internal taglib structure */ -struct tag_entry{ - int m_line_type; - char * m_line_tag; - int m_num_of_values; - char ** m_required_tags; - /* char ** m_optional_tags; */ - /* int m_optional_count = 0; */ - char ** m_ignored_tags; -}; - -tag_entry tag_entry_copy(int line_type, const char * line_tag, - int num_of_values, - char * required_tags[], - char * ignored_tags[]){ - tag_entry entry; - entry.m_line_type = line_type; - entry.m_line_tag = g_strdup( line_tag ); - entry.m_num_of_values = num_of_values; - entry.m_required_tags = g_strdupv( required_tags ); - entry.m_ignored_tags = g_strdupv( ignored_tags ); - return entry; -} - -tag_entry tag_entry_clone(tag_entry * entry){ - return tag_entry_copy(entry->m_line_type, entry->m_line_tag, - entry->m_num_of_values, - entry->m_required_tags, entry->m_ignored_tags); -} - -void tag_entry_reclaim(tag_entry * entry){ - g_free( entry->m_line_tag ); - g_strfreev( entry->m_required_tags ); - g_strfreev(entry->m_ignored_tags); -} - -static bool taglib_free_tag_array(GArray * tag_array){ - for ( size_t i = 0; i < tag_array->len; ++i) { - tag_entry * entry = &g_array_index(tag_array, tag_entry, i); - tag_entry_reclaim(entry); - } - g_array_free(tag_array, TRUE); - return true; -} - -/* special unichar to be handled in split_line. */ -static gunichar backslash = 0; -static gunichar quote = 0; - -static gboolean split_line_init(){ - backslash = g_utf8_get_char("\\"); - quote = g_utf8_get_char("\""); - return TRUE; -} - -/* Pointer Array of Array of tag_entry */ -static GPtrArray * g_tagutils_stack = NULL; - -bool taglib_init(){ - assert( g_tagutils_stack == NULL); - g_tagutils_stack = g_ptr_array_new(); - GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry)); - g_ptr_array_add(g_tagutils_stack, tag_array); - - /* init split_line. */ - split_line_init(); - return true; -} - -bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, - const char * required_tags, const char * ignored_tags){ - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, - g_tagutils_stack->len - 1); - - /* some duplicate tagname or line_type check here. */ - for ( size_t i = 0; i < tag_array->len; ++i) { - tag_entry * entry = &g_array_index(tag_array, tag_entry, i); - if ( entry->m_line_type == line_type || - strcmp( entry->m_line_tag, line_tag ) == 0 ) - return false; - } - - char ** required = g_strsplit_set(required_tags, ",:", -1); - char ** ignored = g_strsplit_set(ignored_tags, ",:", -1); - - tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values, - required, ignored); - g_array_append_val(tag_array, entry); - - g_strfreev(required); - g_strfreev(ignored); - return true; -} - -static void ptr_array_entry_free(gpointer data, gpointer user_data){ - g_free(data); -} - -static gboolean hash_table_key_value_free(gpointer key, gpointer value, - gpointer user_data){ - g_free(key); - g_free(value); - return TRUE; -} - -/* split the line into tokens. */ -static gchar ** split_line(const gchar * line){ - /* array for tokens. */ - GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *)); - - for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){ - gunichar unichar = g_utf8_get_char(cur); - const gchar * begin = cur; - gchar * token = NULL; - - if ( g_unichar_isspace (unichar) ) { - continue; - }else if ( unichar == quote ) { - /* handles "\"". */ - /* skip the first '"'. */ - begin = cur = g_utf8_next_char(cur); - while (*cur) { - unichar = g_utf8_get_char(cur); - if ( unichar == backslash ) { - cur = g_utf8_next_char(cur); - g_return_val_if_fail(*cur, NULL); - } else if ( unichar == quote ){ - break; - } - cur = g_utf8_next_char(cur); - } - gchar * tmp = g_strndup( begin, cur - begin); - /* TODO: switch to own strdup_escape implementation - for \"->" transforming. */ - token = g_strdup_printf(tmp); - g_free(tmp); - } else { - /* handles other tokens. */ - while(*cur) { - unichar = g_utf8_get_char(cur); - if ( g_unichar_isgraph(unichar) ) { - /* next unichar */ - cur = g_utf8_next_char(cur); - } else { - /* space and other characters handles. */ - break; - } - } - token = g_strndup( begin, cur - begin ); - } - - g_array_append_val(tokens, token); - if ( !*cur ) - break; - } - - return (gchar **)g_array_free(tokens, FALSE); -} - -bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, - GHashTable * required){ - /* reset values and required. */ - g_ptr_array_foreach(values, ptr_array_entry_free, NULL); - g_ptr_array_set_size(values, 0); - g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL); - - /* use own version of split_line - instead of g_strsplit_set for special token.*/ - char ** tokens = split_line(input_line); - int num_of_tokens = g_strv_length(tokens); - - char * line_tag = tokens[0]; - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); - - tag_entry * cur_entry = NULL; - /* find line type. */ - for ( size_t i = 0; i < tag_array->len; ++i) { - tag_entry * entry = &g_array_index(tag_array, tag_entry, i); - if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) { - cur_entry = entry; - break; - } - } - - if ( !cur_entry ) - return false; - - line_type = cur_entry->m_line_type; - - for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) { - g_return_val_if_fail(i < num_of_tokens, false); - char * value = g_strdup( tokens[i] ); - g_ptr_array_add(values, value); - } - - int ignored_len = g_strv_length( cur_entry->m_ignored_tags ); - int required_len = g_strv_length( cur_entry->m_required_tags); - - for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){ - g_return_val_if_fail(i < num_of_tokens, false); - const char * tmp = tokens[i]; - - /* check ignored tags. */ - bool tag_ignored = false; - for ( int m = 0; m < ignored_len; ++m) { - if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) { - tag_ignored = true; - break; - } - } - - if ( tag_ignored ) { - ++i; - continue; - } - - /* check required tags. */ - bool tag_required = false; - for ( int m = 0; m < required_len; ++m) { - if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) { - tag_required = true; - break; - } - } - - /* warning on the un-expected tags. */ - if ( !tag_required ) { - g_warning("un-expected tags:%s.\n", tmp); - ++i; - continue; - } - - char * key = g_strdup(tokens[i]); - ++i; - g_return_val_if_fail(i < num_of_tokens, false); - char * value = g_strdup(tokens[i]); - g_hash_table_insert(required, key, value); - } - - /* check for all required tags. */ - for ( int i = 0; i < required_len; ++i) { - const char * required_tag_str = cur_entry->m_required_tags[i]; - gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL); - if ( !result ) { - g_warning("missed required tags: %s.\n", required_tag_str); - g_strfreev(tokens); - return false; - } - } - - g_strfreev(tokens); - return true; -} - -bool taglib_remove_tag(int line_type){ - /* Note: duplicate entry check is in taglib_add_tag. */ - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); - for ( size_t i = 0; i < tag_array->len; ++i) { - tag_entry * entry = &g_array_index(tag_array, tag_entry, i); - if (entry->m_line_type != line_type) - continue; - tag_entry_reclaim(entry); - g_array_remove_index(tag_array, i); - return true; - } - return false; -} - -bool taglib_push_state(){ - assert(g_tagutils_stack->len >= 1); - GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry)); - GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); - for ( size_t i = 0; i < prev_tag_array->len; ++i) { - tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i); - tag_entry new_entry = tag_entry_clone(entry); - g_array_append_val(next_tag_array, new_entry); - } - g_ptr_array_add(g_tagutils_stack, next_tag_array); - return true; -} - -bool taglib_pop_state(){ - assert(g_tagutils_stack->len > 1); - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1); - g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1); - taglib_free_tag_array(tag_array); - return true; -} - -bool taglib_fini(){ - for ( size_t i = 0; i < g_tagutils_stack->len; ++i){ - GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i); - taglib_free_tag_array(tag_array); - } - g_ptr_array_free(g_tagutils_stack, TRUE); - g_tagutils_stack = NULL; - return true; -} - -static phrase_token_t taglib_special_string_to_token(const char * string){ - struct token_pair{ - phrase_token_t token; - const char * string; - }; - - static const token_pair tokens [] = { - {sentence_start, "<start>"}, - {0, NULL} - }; - - const token_pair * pair = tokens; - while (pair->string) { - if ( strcmp(string, pair->string ) == 0 ){ - return pair->token; - } - } - - fprintf(stderr, "error: unknown token:%s.\n", string); - return 0; -} - -phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){ - phrase_token_t token = 0; - if ( string[0] == '<' ) { - return taglib_special_string_to_token(string); - } - - glong phrase_len = g_utf8_strlen(string, -1); - utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL); - int result = phrases->search(phrase_len, phrase, token); - if ( !(result & SEARCH_OK) ) - fprintf(stderr, "error: unknown token:%s.\n", string); - - g_free(phrase); - return token; -} - -static const char * taglib_special_token_to_string(phrase_token_t token){ - struct token_pair{ - phrase_token_t token; - const char * string; - }; - - static const token_pair tokens [] = { - {sentence_start, "<start>"}, - {0, NULL} - }; - - const token_pair * pair = tokens; - while (pair->token) { - if ( token == pair->token ) - return pair->string; - } - - fprintf(stderr, "error: unknown token:%d.\n", token); - return NULL; -} - -char * taglib_token_to_string(FacadePhraseIndex * phrase_index, - phrase_token_t token) { - PhraseItem item; - utf16_t buffer[MAX_PHRASE_LENGTH]; - - gchar * phrase; - /* deal with the special phrase index, for "<start>..." */ - if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) { - return g_strdup(taglib_special_token_to_string(token)); - } - - int result = phrase_index->get_phrase_item(token, item); - if (result != ERROR_OK) { - fprintf(stderr, "error: unknown token:%d.\n", token); - return NULL; - } - - item.get_phrase_string(buffer); - guint8 length = item.get_phrase_length(); - phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL); - return phrase; -} diff --git a/utils/storage/tag_utility.h b/utils/storage/tag_utility.h deleted file mode 100644 index 67d8946..0000000 --- a/utils/storage/tag_utility.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * libpinyin - * Library to deal with pinyin. - * - * Copyright (C) 2010 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef TAG_UTILITY_H -#define TAG_UTILITY_H - -#include "novel_types.h" - -/* Note: the optional tag has been removed from the first implementation. - * Maybe the optional tag will be added back later. - */ - -bool taglib_init(); - -/* Note: most tags are separated by ',' or ':' . */ -bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags); - -/* most parameters are hash table of string (const char *). */ -bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, GHashTable * required); - -/* Note: taglib_write is omited, as printf is more suitable for this. */ - -/* Note the following function is only available when the optional tag exists. - * bool taglib_report_status(int line_type); - */ - -/* remove the tag of type line_type. */ -bool taglib_remove_tag(int line_type); - -/* the following functions are used to save current known tag list in stack. - * Used when the parsing context is changed. - */ -bool taglib_push_state(); -bool taglib_pop_state(); - -bool taglib_fini(); - -namespace pinyin{ - class PhraseLargeTable; -}; - -using namespace pinyin; - -phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, - const char * string); - -char * taglib_token_to_string(FacadePhraseIndex * phrase_index, - phrase_token_t token); - -#endif |