summaryrefslogtreecommitdiffstats
path: root/utils/storage
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-05-20 11:02:55 +0800
committerPeng Wu <alexepico@gmail.com>2011-05-20 11:02:55 +0800
commitf18a652c8a03961ae1004daf051d28aedbae282f (patch)
tree27e9008ec1abecd48ea868a3a303e375984500d8 /utils/storage
parent5150341809f92fb2179decdfdd6ec1477d988461 (diff)
downloadlibpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.tar.gz
libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.tar.xz
libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.zip
move tag utility to src/storage
Diffstat (limited to 'utils/storage')
-rw-r--r--utils/storage/Makefile.am12
-rw-r--r--utils/storage/tag_utility.cpp387
-rw-r--r--utils/storage/tag_utility.h68
3 files changed, 2 insertions, 465 deletions
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
index f314a1a..bc033f1 100644
--- a/utils/storage/Makefile.am
+++ b/utils/storage/Makefile.am
@@ -21,8 +21,6 @@ INCLUDES = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/lookup \
@GLIB2_CPPFLAGS@
-noinst_HEADERS = tag_utility.h
-
noinst_PROGRAMS = gen_pinyin_table gen_binary_files export_interpolation import_interpolation
gen_pinyin_table_SOURCES = gen_pinyin_table.cpp
@@ -33,16 +31,10 @@ gen_binary_files_SOURCES = gen_binary_files.cpp
gen_binary_files_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
-noinst_LTLIBRARIES = libtagutils.la
-
-libtagutils_la_LDFLAGS = -static
-
-libtagutils_la_SOURCES = tag_utility.cpp
-
import_interpolation_SOURCES = import_interpolation.cpp
-import_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@
+import_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
export_interpolation_SOURCES = export_interpolation.cpp
-export_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@
+export_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
diff --git a/utils/storage/tag_utility.cpp b/utils/storage/tag_utility.cpp
deleted file mode 100644
index 5dcb35a..0000000
--- a/utils/storage/tag_utility.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-#include "pinyin.h"
-#include <glib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "tag_utility.h"
-
-/* internal taglib structure: describes one kind of tagged line.
- * tag_entry_copy() fills the string members with g_strdup()/g_strdupv()
- * copies and tag_entry_reclaim() releases them. */
-struct tag_entry{
- int m_line_type; /* type code that taglib_read() reports for a matching line */
- char * m_line_tag; /* first token of a line of this type */
- int m_num_of_values; /* number of tokens after the line tag that taglib_read() collects as values */
- char ** m_required_tags; /* NULL-terminated string vector; taglib_read() fails if one of them is absent */
- /* char ** m_optional_tags; */
- /* int m_optional_count = 0; */
- char ** m_ignored_tags; /* NULL-terminated string vector; taglib_read() skips these tags and their values */
-};
-
-/* Build a tag_entry that owns deep copies of its string arguments.
- *
- * line_tag is duplicated with g_strdup() and both tag vectors with
- * g_strdupv(), so the caller keeps ownership of what it passed in.
- * Release the returned entry with tag_entry_reclaim().
- */
-tag_entry tag_entry_copy(int line_type, const char * line_tag,
-                         int num_of_values,
-                         char * required_tags[],
-                         char * ignored_tags[]){
-    tag_entry result;
-
-    /* plain values. */
-    result.m_line_type = line_type;
-    result.m_num_of_values = num_of_values;
-
-    /* owned copies of the strings. */
-    result.m_line_tag = g_strdup(line_tag);
-    result.m_required_tags = g_strdupv(required_tags);
-    result.m_ignored_tags = g_strdupv(ignored_tags);
-
-    return result;
-}
-
-/* Deep-copy an existing entry through tag_entry_copy().
- * The result shares no memory with *entry and must be released with
- * tag_entry_reclaim(). */
-tag_entry tag_entry_clone(tag_entry * entry){
-    tag_entry duplicate = tag_entry_copy(entry->m_line_type,
-                                         entry->m_line_tag,
-                                         entry->m_num_of_values,
-                                         entry->m_required_tags,
-                                         entry->m_ignored_tags);
-    return duplicate;
-}
-
-/* Release the strings owned by *entry (the line tag and both tag vectors).
- * The entry structure itself is not freed and its pointers are left as-is,
- * so the entry must not be used again afterwards. */
-void tag_entry_reclaim(tag_entry * entry){
-    g_strfreev( entry->m_ignored_tags );
-    g_strfreev( entry->m_required_tags );
-    g_free( entry->m_line_tag );
-}
-
-/* Reclaim every tag_entry stored in tag_array, then free the array and its
- * element storage. Always returns true. */
-static bool taglib_free_tag_array(GArray * tag_array){
-    guint remaining = tag_array->len;
-
-    /* entries are independent of each other, so the order is irrelevant. */
-    while ( remaining-- > 0 )
-        tag_entry_reclaim(&g_array_index(tag_array, tag_entry, remaining));
-
-    g_array_free(tag_array, TRUE);
-    return true;
-}
-
-/* Code points that split_line() treats specially.
- * Both stay 0 until split_line_init() has run. */
-static gunichar backslash = 0;
-static gunichar quote = 0;
-
-/* Decode the two special characters once. Always returns TRUE. */
-static gboolean split_line_init(){
-    quote = g_utf8_get_char("\"");
-    backslash = g_utf8_get_char("\\");
-    return TRUE;
-}
-
-/* Stack of tag sets: a pointer array whose elements are GArrays of
- * tag_entry. The last element is the tag set currently in use. */
-static GPtrArray * g_tagutils_stack = NULL;
-
-/* Create the tag stack holding one empty tag set and prepare split_line().
- * Asserts that the stack does not exist yet. Always returns true. */
-bool taglib_init(){
-    assert( NULL == g_tagutils_stack );
-
-    /* zero-terminated, cleared array of tag_entry as the bottom tag set. */
-    GArray * bottom_tags = g_array_new(TRUE, TRUE, sizeof(tag_entry));
-
-    g_tagutils_stack = g_ptr_array_new();
-    g_ptr_array_add(g_tagutils_stack, bottom_tags);
-
-    /* init split_line. */
-    split_line_init();
-    return true;
-}
-
-/* Register a new line type in the active (top-most) tag set.
- *
- * required_tags and ignored_tags are lists whose items are separated by
- * ',' or ':'. Returns false, registering nothing, when the active set
- * already holds an entry with the same line_type or the same line_tag;
- * returns true otherwise.
- */
-bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
-                    const char * required_tags, const char * ignored_tags){
-    const guint top = g_tagutils_stack->len - 1;
-    GArray * active_tags = (GArray *) g_ptr_array_index(g_tagutils_stack, top);
-
-    /* refuse a duplicated line type or a duplicated tag name. */
-    for ( guint pos = 0; pos < active_tags->len; ++pos ) {
-        const tag_entry * known = &g_array_index(active_tags, tag_entry, pos);
-        if ( known->m_line_type == line_type )
-            return false;
-        if ( 0 == strcmp( known->m_line_tag, line_tag ) )
-            return false;
-    }
-
-    char ** required_list = g_strsplit_set(required_tags, ",:", -1);
-    char ** ignored_list = g_strsplit_set(ignored_tags, ",:", -1);
-
-    /* tag_entry_copy duplicates both lists, so the temporaries are
-       released again below. */
-    tag_entry new_entry = tag_entry_copy(line_type, line_tag, num_of_values,
-                                         required_list, ignored_list);
-    g_array_append_val(active_tags, new_entry);
-
-    g_strfreev(ignored_list);
-    g_strfreev(required_list);
-    return true;
-}
-
-/* g_ptr_array_foreach() callback: g_free() one element.
- * user_data is not used. */
-static void ptr_array_entry_free(gpointer data, gpointer user_data){
-    (void) user_data;
-    g_free(data);
-}
-
-/* g_hash_table_foreach_steal() callback: g_free() both the key and the
- * value, and return TRUE so that the pair is removed from the table.
- * user_data is not used. */
-static gboolean hash_table_key_value_free(gpointer key, gpointer value,
-                                          gpointer user_data){
-    (void) user_data;
-    g_free(value);
-    g_free(key);
-    return TRUE;
-}
-
-/* Split the line into tokens.
- *
- * Returns a NULL-terminated vector of newly allocated strings; release it
- * with g_strfreev().
- *
- *  - white space separates tokens;
- *  - a token starting with '"' runs up to the next '"' that is not preceded
- *    by a backslash (or to the end of the line). The surrounding quotes are
- *    dropped; backslashes are kept in the token text as they are;
- *  - any other token is a run of graphic characters.
- *
- * Returns NULL, after freeing the tokens collected so far, when a quoted
- * token ends in a dangling backslash.
- */
-static gchar ** split_line(const gchar * line){
-    /* array for tokens. */
-    GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
-
-    for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
-        gunichar unichar = g_utf8_get_char(cur);
-        const gchar * begin = cur;
-        gchar * token = NULL;
-
-        if ( g_unichar_isspace (unichar) )
-            continue;
-
-        if ( unichar == quote ) {
-            /* handles "\"". */
-            /* skip the first '"'. */
-            begin = cur = g_utf8_next_char(cur);
-            while (*cur) {
-                unichar = g_utf8_get_char(cur);
-                if ( unichar == backslash ) {
-                    /* the escaped character never ends the token. */
-                    cur = g_utf8_next_char(cur);
-                    if ( !*cur ) {
-                        g_warning("dangling backslash in line:%s.\n", line);
-                        /* the array is zero-terminated, so it can be
-                           released as a string vector. */
-                        g_strfreev((gchar **)g_array_free(tokens, FALSE));
-                        return NULL;
-                    }
-                } else if ( unichar == quote ){
-                    break;
-                }
-                cur = g_utf8_next_char(cur);
-            }
-            /* Copy the text verbatim: it comes from the input and must
-               never be used as a printf format string.
-               TODO: switch to own strdup_escape implementation
-               for \"->" transforming. */
-            token = g_strndup( begin, cur - begin );
-        } else {
-            /* handles other tokens: stop at the first character that is
-               not graphic (space and other characters). */
-            while ( *cur && g_unichar_isgraph(g_utf8_get_char(cur)) )
-                cur = g_utf8_next_char(cur);
-            token = g_strndup( begin, cur - begin );
-        }
-
-        g_array_append_val(tokens, token);
-        /* do not let the loop header step beyond the terminating NUL. */
-        if ( !*cur )
-            break;
-    }
-
-    return (gchar **)g_array_free(tokens, FALSE);
-}
-
-/* Return true when the NULL-terminated vector tags contains name.
- * A NULL vector contains nothing. */
-static bool taglib_strv_has(char ** tags, const char * name){
-    if ( NULL == tags )
-        return false;
-    for ( ; *tags; ++tags ) {
-        if ( 0 == strcmp(name, *tags) )
-            return true;
-    }
-    return false;
-}
-
-/* Match the already split tokens of one line against the active tag set.
- * Helper of taglib_read(): it never frees tokens, so every early return
- * here is leak-free. */
-static bool taglib_read_tokens(char ** tokens, int & line_type,
-                               GPtrArray * values, GHashTable * required){
-    const int num_of_tokens = g_strv_length(tokens);
-    /* an empty or blank line has no line tag to look up. */
-    if ( 0 == num_of_tokens )
-        return false;
-
-    const char * line_tag = tokens[0];
-    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
-
-    /* find line type. */
-    tag_entry * cur_entry = NULL;
-    for ( size_t i = 0; i < tag_array->len; ++i) {
-        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
-        if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
-            cur_entry = entry;
-            break;
-        }
-    }
-
-    if ( !cur_entry )
-        return false;
-
-    line_type = cur_entry->m_line_type;
-
-    /* index of the first "tag value" pair. */
-    const int first_tag = cur_entry->m_num_of_values + 1;
-    if ( num_of_tokens < first_tag ) {
-        g_warning("missed values for line tag:%s.\n", line_tag);
-        return false;
-    }
-
-    for ( int i = 1; i < first_tag; ++i)
-        g_ptr_array_add(values, g_strdup( tokens[i] ));
-
-    /* the remaining tokens come in pairs: tag, then its value. */
-    for ( int i = first_tag; i < num_of_tokens; i += 2 ){
-        const char * tag = tokens[i];
-
-        /* check ignored tags. */
-        if ( taglib_strv_has(cur_entry->m_ignored_tags, tag) )
-            continue;
-
-        /* warning on the un-expected tags. */
-        if ( !taglib_strv_has(cur_entry->m_required_tags, tag) ) {
-            g_warning("un-expected tags:%s.\n", tag);
-            continue;
-        }
-
-        /* check for the value before anything is allocated. */
-        if ( i + 1 >= num_of_tokens ) {
-            g_warning("missed value for tag:%s.\n", tag);
-            return false;
-        }
-
-        /* A repeated tag replaces the earlier pair. The old pair is
-           released the same way taglib_read() resets the table
-           (steal, then g_free), so nothing leaks. */
-        gpointer old_key = NULL, old_value = NULL;
-        if ( g_hash_table_lookup_extended(required, tag,
-                                          &old_key, &old_value) ) {
-            g_hash_table_steal(required, tag);
-            g_free(old_key);
-            g_free(old_value);
-        }
-
-        g_hash_table_insert(required, g_strdup(tag), g_strdup(tokens[i + 1]));
-    }
-
-    /* check for all required tags. */
-    if ( cur_entry->m_required_tags ) {
-        for ( char ** tag = cur_entry->m_required_tags; *tag; ++tag ) {
-            if ( !g_hash_table_lookup_extended(required, *tag, NULL, NULL) ) {
-                g_warning("missed required tags: %s.\n", *tag);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-/* Parse one input line against the active tag set.
- *
- * input_line: the text to parse.
- * line_type:  set to the type of the matching entry; left untouched when
- *             no entry matches the first token.
- * values:     emptied first (old elements are g_free'd), then filled with
- *             newly allocated copies of the value tokens.
- * required:   emptied first (old keys and values are g_free'd), then
- *             filled with newly allocated "tag" -> "value" strings.
- *
- * Returns true when the line tag is known, all values are present and every
- * required tag was found; false otherwise (including an empty line or a
- * line that split_line() rejects). On failure values and required may be
- * partially filled.
- */
-bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
-                 GHashTable * required){
-    /* reset values and required. */
-    g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
-    g_ptr_array_set_size(values, 0);
-    g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
-
-    /* use own version of split_line
-       instead of g_strsplit_set for special token.*/
-    char ** tokens = split_line(input_line);
-    if ( NULL == tokens )
-        return false;
-
-    /* single exit: tokens are released on every path. */
-    bool result = taglib_read_tokens(tokens, line_type, values, required);
-    g_strfreev(tokens);
-    return result;
-}
-
-/* Remove the entry of type line_type from the active (top-most) tag set.
- * Returns true when an entry was found and removed, false otherwise.
- * Note: duplicate entry check is in taglib_add_tag, so the first match is
- * the only one. */
-bool taglib_remove_tag(int line_type){
-    const guint top = g_tagutils_stack->len - 1;
-    GArray * active_tags = (GArray *) g_ptr_array_index(g_tagutils_stack, top);
-
-    for ( guint pos = 0; pos < active_tags->len; ++pos ) {
-        tag_entry * candidate = &g_array_index(active_tags, tag_entry, pos);
-        if ( candidate->m_line_type == line_type ) {
-            /* free the owned strings before the slot is dropped. */
-            tag_entry_reclaim(candidate);
-            g_array_remove_index(active_tags, pos);
-            return true;
-        }
-    }
-
-    return false;
-}
-
-/* Push a deep copy of the active tag set onto the stack; the copy becomes
- * the new active set. Asserts that the stack is not empty.
- * Always returns true. */
-bool taglib_push_state(){
-    assert(g_tagutils_stack->len >= 1);
-
-    GArray * current = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
-    GArray * copy = g_array_new(TRUE, TRUE, sizeof(tag_entry));
-
-    /* every entry is cloned, so both sets own their strings separately. */
-    for ( guint pos = 0; pos < current->len; ++pos ) {
-        tag_entry cloned = tag_entry_clone(&g_array_index(current, tag_entry, pos));
-        g_array_append_val(copy, cloned);
-    }
-
-    g_ptr_array_add(g_tagutils_stack, copy);
-    return true;
-}
-
-/* Drop the active tag set and free it; the set below becomes active again.
- * Asserts that at least one set remains on the stack afterwards.
- * Always returns true. */
-bool taglib_pop_state(){
-    assert(g_tagutils_stack->len > 1);
-
-    /* g_ptr_array_remove_index hands back the pointer it removed. */
-    const guint top = g_tagutils_stack->len - 1;
-    GArray * dropped = (GArray *) g_ptr_array_remove_index(g_tagutils_stack, top);
-
-    taglib_free_tag_array(dropped);
-    return true;
-}
-
-/* Free every tag set on the stack, then the stack itself, and reset the
- * stack pointer to NULL. Always returns true. */
-bool taglib_fini(){
-    guint level = g_tagutils_stack->len;
-
-    /* the tag sets are independent, so they can be freed top-down. */
-    while ( level-- > 0 ) {
-        GArray * tags = (GArray *) g_ptr_array_index(g_tagutils_stack, level);
-        taglib_free_tag_array(tags);
-    }
-
-    g_ptr_array_free(g_tagutils_stack, TRUE);
-    g_tagutils_stack = NULL;
-    return true;
-}
-
-/* Map a special token name such as "<start>" to its token value.
- * Returns the token on a match; otherwise reports the unknown name on
- * stderr and returns 0. */
-static phrase_token_t taglib_special_string_to_token(const char * string){
-    struct token_pair{
-        phrase_token_t token;
-        const char * string;
-    };
-
-    /* the table ends with an entry whose string is NULL. */
-    static const token_pair tokens [] = {
-        {sentence_start, "<start>"},
-        {0, NULL}
-    };
-
-    /* advance through the table: without the increment any name other
-       than the first entry would spin forever. */
-    for ( const token_pair * pair = tokens; pair->string; ++pair ) {
-        if ( strcmp(string, pair->string ) == 0 )
-            return pair->token;
-    }
-
-    fprintf(stderr, "error: unknown token:%s.\n", string);
-    return 0;
-}
-
-/* Convert a UTF-8 phrase string to its token.
- *
- * Strings starting with '<' are looked up as special tokens; every other
- * string is converted to UTF-16 and searched in phrases.
- * Returns the token found; when the string is not valid UTF-8 or the search
- * does not report SEARCH_OK, an error is printed on stderr and the token
- * is returned as left by the search (0 if it was never set).
- */
-phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){
-    phrase_token_t token = 0;
-    if ( string[0] == '<' ) {
-        return taglib_special_string_to_token(string);
-    }
-
-    /* Take the length from the conversion itself: it counts UTF-16 units,
-       which is what the converted buffer holds, whereas a character count
-       comes out shorter for characters outside the BMP. */
-    glong phrase_len = 0;
-    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, &phrase_len, NULL);
-    if ( NULL == phrase ) {
-        /* invalid UTF-8: there is nothing to search for. */
-        fprintf(stderr, "error: unknown token:%s.\n", string);
-        return token;
-    }
-
-    int result = phrases->search(phrase_len, phrase, token);
-    if ( !(result & SEARCH_OK) )
-        fprintf(stderr, "error: unknown token:%s.\n", string);
-
-    g_free(phrase);
-    return token;
-}
-
-/* Map a special token to its name, e.g. sentence_start to "<start>".
- * Returns a pointer to a static string on a match (the caller must not free
- * it); otherwise reports the unknown token on stderr and returns NULL. */
-static const char * taglib_special_token_to_string(phrase_token_t token){
-    struct token_pair{
-        phrase_token_t token;
-        const char * string;
-    };
-
-    /* the table ends with an entry whose string is NULL. */
-    static const token_pair tokens [] = {
-        {sentence_start, "<start>"},
-        {0, NULL}
-    };
-
-    /* advance through the table: without the increment any token other
-       than the first entry would spin forever. The NULL string is used as
-       the end marker, like in taglib_special_string_to_token. */
-    for ( const token_pair * pair = tokens; pair->string; ++pair ) {
-        if ( token == pair->token )
-            return pair->string;
-    }
-
-    fprintf(stderr, "error: unknown token:%d.\n", token);
-    return NULL;
-}
-
-/* Convert a token to a newly allocated UTF-8 string (release with g_free).
- *
- * Tokens whose library index is 0 are looked up as special tokens such as
- * "<start>"; all others are fetched from phrase_index.
- * Returns NULL when the token is unknown; a lookup failure in phrase_index
- * is also reported on stderr.
- */
-char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
-                              phrase_token_t token) {
-    PhraseItem item;
-    utf16_t buffer[MAX_PHRASE_LENGTH];
-
-    /* deal with the special phrase index, for "<start>..." */
-    if ( 0 == PHRASE_INDEX_LIBRARY_INDEX(token) )
-        return g_strdup(taglib_special_token_to_string(token));
-
-    if ( ERROR_OK != phrase_index->get_phrase_item(token, item) ) {
-        fprintf(stderr, "error: unknown token:%d.\n", token);
-        return NULL;
-    }
-
-    /* copy the UTF-16 phrase out of the item, then convert exactly
-       get_phrase_length() units. */
-    item.get_phrase_string(buffer);
-    guint8 length = item.get_phrase_length();
-    return g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
-}
diff --git a/utils/storage/tag_utility.h b/utils/storage/tag_utility.h
deleted file mode 100644
index 67d8946..0000000
--- a/utils/storage/tag_utility.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * libpinyin
- * Library to deal with pinyin.
- *
- * Copyright (C) 2010 Peng Wu
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef TAG_UTILITY_H
-#define TAG_UTILITY_H
-
-#include "novel_types.h"
-
-/* Note: the optional tag has been removed from the first implementation.
- * Maybe the optional tag will be added back later.
- */
-
-bool taglib_init(); /* create the tag stack with one empty tag set;
- * the other taglib_* functions below use that stack. */
-
-/* Register a line type in the current tag set.
- * Note: most tags are separated by ',' or ':' ; this applies to both
- * required_tags and ignored_tags.
- * Returns false when line_type or line_tag is already registered. */
-bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags);
-
-/* Parse input_line against the current tag set.
- * line_type receives the type of the matching tag, values receives copies
- * of the value tokens (char *), and required receives "tag" -> "value"
- * strings (const char *). Both containers are emptied first and their old
- * contents are released with g_free. */
-bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, GHashTable * required);
-
-/* Note: taglib_write is omitted, as printf is more suitable for this. */
-
-/* Note the following function is only available when the optional tag exists.
- * bool taglib_report_status(int line_type);
- */
-
-/* remove the tag of type line_type from the current tag set;
- * returns false when no such tag is registered. */
-bool taglib_remove_tag(int line_type);
-
-/* the following functions are used to save current known tag list in stack.
- * Used when the parsing context is changed.
- * taglib_push_state pushes a copy of the current tag set;
- * taglib_pop_state drops the current tag set again.
- */
-bool taglib_push_state();
-bool taglib_pop_state();
-
-bool taglib_fini(); /* free all tag sets and the stack itself. */
-
-namespace pinyin{
- class PhraseLargeTable;
-};
-
-using namespace pinyin; /* NOTE(review): this leaks into every file that includes the header. */
-
-/* Convert a UTF-8 phrase (or a special name such as "<start>") to its token. */
-phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases,
- const char * string);
-
-/* Convert a token to a newly allocated UTF-8 string; NULL when unknown.
- * NOTE(review): FacadePhraseIndex is not declared in this header, so it
- * presumably has to be declared before this header is included; confirm. */
-char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
- phrase_token_t token);
-
-#endif