summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-05-20 11:02:55 +0800
committerPeng Wu <alexepico@gmail.com>2011-05-20 11:02:55 +0800
commitf18a652c8a03961ae1004daf051d28aedbae282f (patch)
tree27e9008ec1abecd48ea868a3a303e375984500d8 /src
parent5150341809f92fb2179decdfdd6ec1477d988461 (diff)
downloadlibpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.tar.gz
libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.tar.xz
libpinyin-f18a652c8a03961ae1004daf051d28aedbae282f.zip
move tag utility to src/storage
Diffstat (limited to 'src')
-rw-r--r--src/pinyin.h1
-rw-r--r--src/storage/Makefile.am16
-rw-r--r--src/storage/tag_utility.cpp389
-rw-r--r--src/storage/tag_utility.h68
4 files changed, 467 insertions, 7 deletions
diff --git a/src/pinyin.h b/src/pinyin.h
index 3cf6e71..6e3c81e 100644
--- a/src/pinyin.h
+++ b/src/pinyin.h
@@ -10,6 +10,7 @@
#include "lookup.h"
#include "pinyin_lookup.h"
#include "phrase_lookup.h"
+#include "tag_utility.h"
/* training module */
#include "flexible_ngram.h"
diff --git a/src/storage/Makefile.am b/src/storage/Makefile.am
index b2d5b1a..8c10cdf 100644
--- a/src/storage/Makefile.am
+++ b/src/storage/Makefile.am
@@ -21,20 +21,22 @@ INCLUDES = -I$(top_srcdir)/src/include \
noinst_HEADERS = pinyin_large_table.h \
pinyin_base.h \
- pinyin_phrase.h \
+ pinyin_phrase.h \
phrase_index.h \
- pinyin_zhuyin_map_data.h \
+ pinyin_zhuyin_map_data.h \
phrase_large_table.h \
ngram.h \
- flexible_ngram.h
+ flexible_ngram.h \
+ tag_utility.h
noinst_LTLIBRARIES = libstorage.la
libstorage_la_LDFLAGS = -static
libstorage_la_SOURCES = pinyin_base.cpp \
- pinyin_large_table.cpp \
- phrase_index.cpp \
- phrase_large_table.cpp \
- ngram.cpp
+ pinyin_large_table.cpp \
+ phrase_index.cpp \
+ phrase_large_table.cpp \
+ ngram.cpp \
+ tag_utility.cpp
diff --git a/src/storage/tag_utility.cpp b/src/storage/tag_utility.cpp
new file mode 100644
index 0000000..dc1f520
--- /dev/null
+++ b/src/storage/tag_utility.cpp
@@ -0,0 +1,389 @@
+#include <glib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "novel_types.h"
+#include "phrase_index.h"
+#include "phrase_large_table.h"
+#include "tag_utility.h"
+
+/* internal taglib structure: one registered line tag and the tags allowed after it. */
+struct tag_entry{
+ int m_line_type; /* id handed back through taglib_read's line_type for matching lines */
+ char * m_line_tag; /* first token of a matching line; copied with g_strdup in tag_entry_copy */
+ int m_num_of_values; /* count of positional values that follow the line tag */
+ char ** m_required_tags; /* NULL-terminated vector (g_strdupv copy); each must appear with a value */
+ /* char ** m_optional_tags; */
+ /* int m_optional_count = 0; */
+ char ** m_ignored_tags; /* NULL-terminated vector (g_strdupv copy); skipped together with the token after them */
+};
+
+/* Build a tag_entry holding deep copies of line_tag and of both
+ * NULL-terminated tag vectors. The returned entry owns that memory;
+ * release it with tag_entry_reclaim. */
+tag_entry tag_entry_copy(int line_type, const char * line_tag,
+                         int num_of_values,
+                         char * required_tags[],
+                         char * ignored_tags[]){
+    tag_entry result;
+
+    /* plain values. */
+    result.m_line_type = line_type;
+    result.m_num_of_values = num_of_values;
+
+    /* heap-allocated duplicates. */
+    result.m_line_tag = g_strdup(line_tag);
+    result.m_required_tags = g_strdupv(required_tags);
+    result.m_ignored_tags = g_strdupv(ignored_tags);
+
+    return result;
+}
+
+/* Deep-copy *entry; the result shares no memory with the original. */
+tag_entry tag_entry_clone(tag_entry * entry){
+    tag_entry duplicate = tag_entry_copy(entry->m_line_type,
+                                         entry->m_line_tag,
+                                         entry->m_num_of_values,
+                                         entry->m_required_tags,
+                                         entry->m_ignored_tags);
+    return duplicate;
+}
+
+/* Free the strings owned by *entry. The struct storage itself is not
+ * released and its pointers are left dangling. */
+void tag_entry_reclaim(tag_entry * entry){
+    g_strfreev(entry->m_ignored_tags);
+    g_strfreev(entry->m_required_tags);
+    g_free(entry->m_line_tag);
+}
+
+/* Reclaim every tag_entry stored in tag_array, then free the array
+ * together with its element storage. Always returns true. */
+static bool taglib_free_tag_array(GArray * tag_array){
+    guint remaining = tag_array->len;
+    guint pos = 0;
+
+    while ( pos < remaining ) {
+        tag_entry_reclaim(&g_array_index(tag_array, tag_entry, pos));
+        ++pos;
+    }
+
+    g_array_free(tag_array, TRUE);
+    return true;
+}
+
+/* Code points that split_line treats specially; both stay 0 until
+ * split_line_init has been called. */
+static gunichar backslash = 0;
+static gunichar quote = 0;
+
+/* Decode the two special characters into the globals above.
+ * Always returns TRUE. */
+static gboolean split_line_init(){
+    const gchar * const backslash_utf8 = "\\";
+    const gchar * const quote_utf8 = "\"";
+
+    backslash = g_utf8_get_char(backslash_utf8);
+    quote = g_utf8_get_char(quote_utf8);
+    return TRUE;
+}
+
+/* Stack of tag states: a GPtrArray whose elements are GArrays of
+ * tag_entry. The last element is the state currently in effect. */
+static GPtrArray * g_tagutils_stack = NULL;
+
+/* Create the state stack holding one empty tag array and prepare the
+ * line splitter. Asserts that the stack does not exist yet.
+ * Always returns true. */
+bool taglib_init(){
+    assert( NULL == g_tagutils_stack );
+
+    GArray * initial_tags = g_array_new(TRUE, TRUE, sizeof(tag_entry));
+    g_tagutils_stack = g_ptr_array_new();
+    g_ptr_array_add(g_tagutils_stack, initial_tags);
+
+    /* set up the special characters used by split_line. */
+    split_line_init();
+    return true;
+}
+
+/* Register a line tag in the current (top of stack) tag state.
+ *
+ * required_tags and ignored_tags are lists whose items are separated
+ * by ',' or ':'. Returns false, registering nothing, when either
+ * line_type or line_tag is already present in the current state;
+ * returns true otherwise. */
+bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
+                    const char * required_tags, const char * ignored_tags){
+    guint top = g_tagutils_stack->len - 1;
+    GArray * current = (GArray *) g_ptr_array_index(g_tagutils_stack, top);
+
+    /* reject a duplicated line type or tag name. */
+    for ( guint k = 0; k < current->len; ++k ) {
+        const tag_entry * known = &g_array_index(current, tag_entry, k);
+        if ( known->m_line_type == line_type )
+            return false;
+        if ( 0 == strcmp(known->m_line_tag, line_tag) )
+            return false;
+    }
+
+    char ** required_vec = g_strsplit_set(required_tags, ",:", -1);
+    char ** ignored_vec = g_strsplit_set(ignored_tags, ",:", -1);
+
+    /* the entry keeps its own copies, so the vectors are freed below. */
+    tag_entry fresh = tag_entry_copy(line_type, line_tag, num_of_values,
+                                     required_vec, ignored_vec);
+    g_array_append_val(current, fresh);
+
+    g_strfreev(required_vec);
+    g_strfreev(ignored_vec);
+    return true;
+}
+
+/* GFunc callback: g_free one element of a pointer array.
+ * user_data is not used. */
+static void ptr_array_entry_free(gpointer data, gpointer user_data){
+    (void) user_data;
+    g_free(data);
+}
+
+/* GHRFunc callback: g_free both the key and the value of one pair and
+ * return TRUE so that the pair is dropped from the table.
+ * user_data is not used. */
+static gboolean hash_table_key_value_free(gpointer key, gpointer value,
+                                          gpointer user_data){
+    (void) user_data;
+    g_free(value);
+    g_free(key);
+    return TRUE;
+}
+
+/* Split a UTF-8 line into tokens.
+ *
+ * Tokens are separated by Unicode white space. Two kinds exist:
+ *  - a token starting with '"' runs up to the next '"' that is not
+ *    preceded by a backslash; the surrounding quotes are dropped and
+ *    the content is copied verbatim (escapes are not translated yet);
+ *  - any other token is a run of g_unichar_isgraph characters.
+ *
+ * split_line_init must have been called, otherwise quote and
+ * backslash are still 0.
+ *
+ * Returns a newly allocated NULL-terminated vector to be released with
+ * g_strfreev, or NULL when the line ends right after a backslash inside
+ * a quoted token. */
+static gchar ** split_line(const gchar * line){
+    /* zero-terminated array, so the final segment is a valid strv. */
+    GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
+
+    for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
+        gunichar unichar = g_utf8_get_char(cur);
+        const gchar * begin = cur;
+        gchar * token = NULL;
+
+        if ( g_unichar_isspace(unichar) )
+            continue;
+
+        if ( unichar == quote ) {
+            /* skip the opening '"'. */
+            begin = cur = g_utf8_next_char(cur);
+            while ( *cur ) {
+                unichar = g_utf8_get_char(cur);
+                if ( unichar == backslash ) {
+                    /* the escaped character is consumed blindly. */
+                    cur = g_utf8_next_char(cur);
+                    if ( !*cur ) {
+                        /* Explicit check instead of g_return_val_if_fail:
+                         * it must survive G_DISABLE_CHECKS (else the scan
+                         * runs past the terminator) and must not leak the
+                         * tokens collected so far. */
+                        g_critical("split_line: line ends after a backslash.");
+                        g_strfreev((gchar **) g_array_free(tokens, FALSE));
+                        return NULL;
+                    }
+                } else if ( unichar == quote ) {
+                    break;
+                }
+                cur = g_utf8_next_char(cur);
+            }
+            /* Copy the raw text. It must never be used as a printf
+             * format: a '%' in the input would be undefined behavior.
+             * TODO: switch to own strdup_escape implementation
+             * for \"->" transforming. */
+            token = g_strndup(begin, cur - begin);
+        } else {
+            /* plain token: extend while the characters are printable. */
+            while ( *cur && g_unichar_isgraph(g_utf8_get_char(cur)) )
+                cur = g_utf8_next_char(cur);
+            token = g_strndup(begin, cur - begin);
+        }
+
+        g_array_append_val(tokens, token);
+        /* do not step over the terminating NUL in the loop increment. */
+        if ( !*cur )
+            break;
+    }
+
+    return (gchar **) g_array_free(tokens, FALSE);
+}
+
+/* Helper of taglib_read: interpret an already split line.
+ * Never frees tokens; the caller keeps ownership of them. */
+static bool taglib_parse_tokens(char ** tokens, int num_of_tokens,
+                                int & line_type, GPtrArray * values,
+                                GHashTable * required){
+    /* an empty or blank line carries no line tag at all. */
+    if ( num_of_tokens < 1 )
+        return false;
+
+    const char * line_tag = tokens[0];
+    GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+
+    /* find line type. */
+    tag_entry * cur_entry = NULL;
+    for ( size_t i = 0; i < tag_array->len; ++i) {
+        tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
+        if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
+            cur_entry = entry;
+            break;
+        }
+    }
+
+    if ( !cur_entry )
+        return false;
+
+    line_type = cur_entry->m_line_type;
+
+    /* positional values directly follow the line tag. */
+    if ( num_of_tokens < cur_entry->m_num_of_values + 1 ) {
+        g_warning("too few values for line tag: %s.\n", line_tag);
+        return false;
+    }
+    for ( int i = 1; i <= cur_entry->m_num_of_values; ++i)
+        g_ptr_array_add(values, g_strdup( tokens[i] ));
+
+    int ignored_len = g_strv_length( cur_entry->m_ignored_tags );
+    int required_len = g_strv_length( cur_entry->m_required_tags );
+
+    /* the rest of the line is a sequence of "tag value" pairs. */
+    for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){
+        const char * tag = tokens[i];
+
+        /* check ignored tags. */
+        bool tag_ignored = false;
+        for ( int m = 0; m < ignored_len; ++m) {
+            if ( strcmp(tag, cur_entry->m_ignored_tags[m]) == 0) {
+                tag_ignored = true;
+                break;
+            }
+        }
+
+        if ( tag_ignored ) {
+            ++i; /* also skip the value of the ignored tag. */
+            continue;
+        }
+
+        /* check required tags. */
+        bool tag_required = false;
+        for ( int m = 0; m < required_len; ++m) {
+            if ( strcmp(tag, cur_entry->m_required_tags[m]) == 0) {
+                tag_required = true;
+                break;
+            }
+        }
+
+        /* warning on the un-expected tags. */
+        if ( !tag_required ) {
+            g_warning("un-expected tags:%s.\n", tag);
+            ++i; /* also skip its value. */
+            continue;
+        }
+
+        ++i;
+        if ( i >= num_of_tokens ) {
+            g_warning("missing value for tag: %s.\n", tag);
+            return false;
+        }
+
+        /* A repeated tag replaces the earlier pair. The old strings are
+         * released by hand, the same way taglib_read resets the table,
+         * so nothing leaks when the table has no destroy functions. */
+        gpointer old_key = NULL;
+        gpointer old_value = NULL;
+        if ( g_hash_table_lookup_extended(required, tag, &old_key, &old_value) ) {
+            g_hash_table_steal(required, old_key);
+            g_free(old_key);
+            g_free(old_value);
+        }
+        g_hash_table_insert(required, g_strdup(tag), g_strdup(tokens[i]));
+    }
+
+    /* check for all required tags. */
+    for ( int i = 0; i < required_len; ++i) {
+        const char * required_tag_str = cur_entry->m_required_tags[i];
+        if ( !g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL) ) {
+            g_warning("missed required tags: %s.\n", required_tag_str);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/* Parse one input line against the tags of the current state.
+ *
+ * input_line: UTF-8 text of the line.
+ * line_type:  set to the type of the matching tag once it is found.
+ * values:     emptied (elements g_free'd), then filled with newly
+ *             allocated copies of the positional values.
+ * required:   emptied (keys and values g_free'd), then filled with
+ *             newly allocated "tag" -> "value" strings.
+ *
+ * Returns true when the line tag is known, all positional values are
+ * present and every required tag has a value; false otherwise, in
+ * which case values/required may be partially filled. */
+bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
+                 GHashTable * required){
+    /* reset values and required. */
+    g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
+    g_ptr_array_set_size(values, 0);
+    g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
+
+    /* use own version of split_line
+       instead of g_strsplit_set for special token.*/
+    char ** tokens = split_line(input_line);
+    if ( !tokens )
+        return false;
+
+    /* single exit point, so the tokens are released on every path. */
+    bool result = taglib_parse_tokens(tokens, (int) g_strv_length(tokens),
+                                      line_type, values, required);
+    g_strfreev(tokens);
+    return result;
+}
+
+/* Remove the tag registered under line_type from the current state.
+ * Returns true when an entry was found and removed, false otherwise. */
+bool taglib_remove_tag(int line_type){
+    /* Note: duplicate entry check is in taglib_add_tag. */
+    guint top = g_tagutils_stack->len - 1;
+    GArray * current = (GArray *) g_ptr_array_index(g_tagutils_stack, top);
+
+    /* locate the first entry of the requested type. */
+    guint pos = 0;
+    while ( pos < current->len &&
+            g_array_index(current, tag_entry, pos).m_line_type != line_type )
+        ++pos;
+
+    if ( pos == current->len )
+        return false;
+
+    /* free the owned strings before the slot is dropped. */
+    tag_entry_reclaim(&g_array_index(current, tag_entry, pos));
+    g_array_remove_index(current, pos);
+    return true;
+}
+
+/* Push a deep copy of the current tag state onto the stack, so later
+ * additions and removals only affect the copy. Always returns true. */
+bool taglib_push_state(){
+    assert(g_tagutils_stack->len >= 1);
+
+    GArray * source = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
+    GArray * copy = g_array_new(TRUE, TRUE, sizeof(tag_entry));
+
+    guint pos = 0;
+    while ( pos < source->len ) {
+        tag_entry cloned = tag_entry_clone(&g_array_index(source, tag_entry, pos));
+        g_array_append_val(copy, cloned);
+        ++pos;
+    }
+
+    g_ptr_array_add(g_tagutils_stack, copy);
+    return true;
+}
+
+/* Drop the current tag state and return to the one pushed before it.
+ * Asserts that the base state is not the one being popped.
+ * Always returns true. */
+bool taglib_pop_state(){
+    assert(g_tagutils_stack->len > 1);
+
+    /* removing the last index hands back the pointer stored there. */
+    guint top = g_tagutils_stack->len - 1;
+    GArray * dropped = (GArray *) g_ptr_array_remove_index(g_tagutils_stack, top);
+
+    taglib_free_tag_array(dropped);
+    return true;
+}
+
+/* Free every tag state and the stack itself, then reset the stack
+ * pointer to NULL. Always returns true. */
+bool taglib_fini(){
+    guint depth = g_tagutils_stack->len;
+    guint level = 0;
+
+    while ( level < depth ) {
+        taglib_free_tag_array((GArray *) g_ptr_array_index(g_tagutils_stack, level));
+        ++level;
+    }
+
+    g_ptr_array_free(g_tagutils_stack, TRUE);
+    g_tagutils_stack = NULL;
+    return true;
+}
+
+/* Map a special "<...>" string to its reserved token.
+ * Returns the token on a match; otherwise prints an error to stderr
+ * and returns 0. */
+static phrase_token_t taglib_special_string_to_token(const char * string){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+
+    /* table terminated by an entry whose string is NULL. */
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    /* The cursor must advance on every iteration: without ++pair any
+     * string other than the first table entry loops forever. */
+    for ( const token_pair * pair = tokens; pair->string; ++pair ) {
+        if ( strcmp(string, pair->string ) == 0 )
+            return pair->token;
+    }
+
+    fprintf(stderr, "error: unknown token:%s.\n", string);
+    return 0;
+}
+
+/* Translate a UTF-8 phrase, or a special "<...>" string, to its token.
+ *
+ * Strings starting with '<' are resolved through the special token
+ * table; everything else is searched in phrases. On failure an error
+ * is printed to stderr; 0 is returned when the string cannot be
+ * converted, otherwise whatever search left in the token. */
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){
+    phrase_token_t token = 0;
+    if ( string[0] == '<' ) {
+        return taglib_special_string_to_token(string);
+    }
+
+    glong phrase_len = g_utf8_strlen(string, -1);
+    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
+    if ( NULL == phrase ) {
+        /* invalid UTF-8: never hand a NULL buffer to search. */
+        fprintf(stderr, "error: unknown token:%s.\n", string);
+        return token;
+    }
+
+    int result = phrases->search(phrase_len, phrase, token);
+    if ( !(result & SEARCH_OK) )
+        fprintf(stderr, "error: unknown token:%s.\n", string);
+
+    g_free(phrase);
+    return token;
+}
+
+/* Map a reserved token to its special "<...>" string.
+ * Returns a pointer to a static string on a match (do not free it);
+ * otherwise prints an error to stderr and returns NULL. */
+static const char * taglib_special_token_to_string(phrase_token_t token){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+
+    /* table terminated by an entry whose token is 0. */
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    /* The cursor must advance on every iteration: without ++pair any
+     * token other than the first table entry loops forever. */
+    for ( const token_pair * pair = tokens; pair->token; ++pair ) {
+        if ( token == pair->token )
+            return pair->string;
+    }
+
+    fprintf(stderr, "error: unknown token:%d.\n", token);
+    return NULL;
+}
+
+/* Translate a token back to a UTF-8 string.
+ *
+ * Tokens whose library index is 0 are resolved through the special
+ * token table ("<start>" ...); all others are looked up in
+ * phrase_index. Returns a newly allocated string, or NULL when the
+ * token is unknown (an error is printed to stderr). */
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+                              phrase_token_t token) {
+    PhraseItem item;
+    utf16_t buffer[MAX_PHRASE_LENGTH];
+
+    /* deal with the special phrase index, for "<start>..." */
+    if ( 0 == PHRASE_INDEX_LIBRARY_INDEX(token) )
+        return g_strdup(taglib_special_token_to_string(token));
+
+    if ( ERROR_OK != phrase_index->get_phrase_item(token, item) ) {
+        fprintf(stderr, "error: unknown token:%d.\n", token);
+        return NULL;
+    }
+
+    /* fetch the UTF-16 phrase and convert exactly its length. */
+    item.get_phrase_string(buffer);
+    guint8 length = item.get_phrase_length();
+    return g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
+}
diff --git a/src/storage/tag_utility.h b/src/storage/tag_utility.h
new file mode 100644
index 0000000..67d8946
--- /dev/null
+++ b/src/storage/tag_utility.h
@@ -0,0 +1,68 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2010 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef TAG_UTILITY_H
+#define TAG_UTILITY_H
+
+#include "novel_types.h"
+
+/* Note: the optional tag has been removed from the first implementation.
+ * Maybe the optional tag will be added back later.
+ */
+
+bool taglib_init(); /* creates the tag state stack; asserts it does not exist yet */
+
+/* Note: most tags are separated by ',' or ':' . Returns false when line_type or line_tag is already registered. */
+bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values, const char * required_tags, const char * ignored_tags);
+
+/* values is a pointer array of strings (char *); required is a hash table mapping tag strings to value strings. Both are emptied first. */
+bool taglib_read(const char * input_line, int & line_type, GPtrArray * values, GHashTable * required);
+
+/* Note: taglib_write is omitted, as printf is more suitable for this. */
+
+/* Note the following function is only available when the optional tag exists.
+ * bool taglib_report_status(int line_type);
+ */
+
+/* remove the tag of type line_type; returns false when no such tag is registered. */
+bool taglib_remove_tag(int line_type);
+
+/* the following functions are used to save the currently known tag list on a stack.
+ * Used when the parsing context is changed.
+ */
+bool taglib_push_state();
+bool taglib_pop_state();
+
+bool taglib_fini(); /* frees every saved tag state and the stack itself */
+
+namespace pinyin{
+ class PhraseLargeTable;
+};
+
+using namespace pinyin;
+
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases,
+ const char * string); /* strings starting with '<' are special tokens such as "<start>" */
+
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+ phrase_token_t token); /* returns a newly allocated string, or NULL for an unknown token */
+
+#endif