From 5394772a45e214206652ad565f506ed3d3a149af Mon Sep 17 00:00:00 2001
From: Peng Wu <alexepico@gmail.com>
Date: Mon, 16 May 2011 14:11:05 +0800
Subject: move token string conversion function to taglib

---
 utils/storage/Makefile.am              | 10 ++--
 utils/storage/export_interpolation.cpp | 55 ++-------------------
 utils/storage/import_interpolation.cpp | 47 ++----------------
 utils/storage/tag_utility.cpp          | 87 ++++++++++++++++++++++++++++++++--
 utils/storage/tag_utility.h            | 14 ++++++
 5 files changed, 110 insertions(+), 103 deletions(-)

(limited to 'utils')
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
index 3e0fab4..f314a1a 100644
--- a/utils/storage/Makefile.am
+++ b/utils/storage/Makefile.am
@@ -33,10 +33,6 @@ gen_binary_files_SOURCES    = gen_binary_files.cpp
 
 gen_binary_files_LDADD      = ../../src/libpinyin.la @GLIB2_LDFLAGS@
 
-export_interpolation_SOURCES = export_interpolation.cpp
-
-export_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
-
 noinst_LTLIBRARIES        = libtagutils.la
 
 libtagutils_la_LDFLAGS    = -static
@@ -45,4 +41,9 @@ libtagutils_la_SOURCES    = tag_utility.cpp
 
 import_interpolation_SOURCES = import_interpolation.cpp
 
-import_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ ./libtagutils.la
\ No newline at end of file
+# reference the libtool archive rather than .libs/ so make has a rule to build it before linking
+import_interpolation_LDADD = libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
+
+export_interpolation_SOURCES = export_interpolation.cpp
+
+export_interpolation_LDADD = libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
index 333470e..bcc90b8 100644
--- a/utils/storage/export_interpolation.cpp
+++ b/utils/storage/export_interpolation.cpp
@@ -19,19 +19,17 @@
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include "pinyin.h"
 #include <stdio.h>
 #include <assert.h>
 #include <glib.h>
-#include "pinyin.h"
+#include "tag_utility.h"
 
 /* export interpolation model as textual format */
 
 void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
 void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
 
-/* consider moving the following function to utils/storage/utility.h */
-char * token_to_string(FacadePhraseIndex * phrase_index, phrase_token_t token);
-
 void begin_data(FILE * file){
     fprintf(file, "\\data model interpolation\n");
 }
@@ -86,7 +84,7 @@ void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
             assert( result == ERROR_OK);
 
             size_t freq = item.get_unigram_frequency();
-            char * phrase = token_to_string(phrase_index, j);
+            char * phrase = taglib_token_to_string(phrase_index, j);
             if ( phrase )
                 fprintf(output, "\\item %s count %ld\n", phrase, freq);
 
@@ -115,8 +113,8 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram
         for(size_t j = 0; j < array->len; j++) {
             BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
 
-            char * word1 = token_to_string(phrase_index, token);
-            char * word2 = token_to_string(phrase_index, item->m_token);
+            char * word1 = taglib_token_to_string(phrase_index, token);
+            char * word2 = taglib_token_to_string(phrase_index, item->m_token);
             guint32 freq = item->m_count;
 
             if ( word1 && word2)
@@ -130,46 +128,3 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram
 
     g_array_free(items, TRUE);
 }
-
-static const char * special_token_to_string(phrase_token_t token){
-    struct token_pair{
-        phrase_token_t token;
-        const char * string;
-    };
-
-    static const token_pair tokens [] = {
-        {sentence_start, "<start>"},
-        {0, NULL}
-    };
-
-    const token_pair * pair = tokens;
-    while (pair->token) {
-        if ( token == pair->token )
-            return pair->string;
-    }
-
-    fprintf(stderr, "error: unknown token:%d.\n", token);
-    return NULL;
-}
-
-char * token_to_string(FacadePhraseIndex * phrase_index, phrase_token_t token) {
-    PhraseItem item;
-    utf16_t buffer[MAX_PHRASE_LENGTH];
-
-    gchar * phrase;
-    /* deal with the special phrase index, for "<start>..." */
-    if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
-        return g_strdup(special_token_to_string(token));
-    }
-
-    int result = phrase_index->get_phrase_item(token, item);
-    if (result != ERROR_OK) {
-        fprintf(stderr, "error: unknown token:%d.\n", token);
-        return NULL;
-    }
-
-    item.get_phrase_string(buffer);
-    guint8 length = item.get_phrase_length();
-    phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
-    return phrase;
-}
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 57c87c1..6c97109 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -40,9 +40,6 @@ static GHashTable * required = NULL;
 static char * linebuf = NULL;
 static size_t len = 0;
 
-phrase_token_t string_to_token(PhraseLargeTable * phrases,
-                               const char * string);
-
 bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
                    FacadePhraseIndex * phrase_index);
 
@@ -104,7 +101,7 @@ bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
         case GRAM_1_ITEM_LINE:{
             /* handle \item in \1-gram */
             const char * string = (const char *) g_ptr_array_index(values, 0);
-            phrase_token_t token = string_to_token(phrases, string);
+            phrase_token_t token = taglib_string_to_token(phrases, string);
             char * value = NULL;
             assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
             glong count = atol(value);
@@ -140,9 +137,9 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
             /* handle \item in \2-gram */
             /* two tokens */
             const char * string = (const char *) g_ptr_array_index(values, 0);
-            phrase_token_t token1 = string_to_token(phrases, string);
+            phrase_token_t token1 = taglib_string_to_token(phrases, string);
             string = (const char *) g_ptr_array_index(values, 1);
-            phrase_token_t token2 = string_to_token(phrases, string);
+            phrase_token_t token2 = taglib_string_to_token(phrases, string);
 
             /* tag: count */
             char * value = NULL;
@@ -262,41 +259,3 @@ int main(int argc, char * argv[]){
 
     return 0;
 }
-
-static phrase_token_t special_string_to_token(const char * string){
-    struct token_pair{
-        phrase_token_t token;
-        const char * string;
-    };
-
-    static const token_pair tokens [] = {
-        {sentence_start, "<start>"},
-        {0, NULL}
-    };
-
-    const token_pair * pair = tokens;
-    while (pair->string) {
-        if ( strcmp(string, pair->string ) == 0 ){
-            return pair->token;
-        }
-    }
-
-    fprintf(stderr, "error: unknown token:%s.\n", string);
-    return 0;
-}
-
-phrase_token_t string_to_token(PhraseLargeTable * phrases, const char * string){
-    phrase_token_t token = 0;
-    if ( string[0] == '<' ) {
-        return special_string_to_token(string);
-    }
-
-    glong phrase_len = g_utf8_strlen(string, -1);
-    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
-    int result = phrases->search(phrase_len, phrase, token);
-    if ( !(result & SEARCH_OK) )
-        fprintf(stderr, "error: unknown token:%s.\n", string);
-
-    g_free(phrase);
-    return token;
-}
diff --git a/utils/storage/tag_utility.cpp b/utils/storage/tag_utility.cpp
index 3176056..5dcb35a 100644
--- a/utils/storage/tag_utility.cpp
+++ b/utils/storage/tag_utility.cpp
@@ -1,9 +1,10 @@
+#include "pinyin.h"
+#include <glib.h>
+#include <stdio.h>
 #include <string.h>
 #include <assert.h>
-#include <glib.h>
 #include "tag_utility.h"
 
-
 /* internal taglib structure */
 struct tag_entry{
     int m_line_type;
@@ -303,6 +304,113 @@ bool taglib_fini(){
     return true;
 }
 
-void test(){
-    assert(taglib_add_tag(2, "\\data", 1, "data", ""));
+/* Look up the token for a special marker string such as "<start>".
+ *
+ * Returns the matching token from the table below, or 0 after printing
+ * an error to stderr when the string is not listed.
+ */
+static phrase_token_t taglib_special_string_to_token(const char * string){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+
+    /* the {0, NULL} entry terminates the table. */
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    /* step through every entry so that an unknown string reaches the
+     * terminator and falls through to the error report. */
+    for (const token_pair * pair = tokens; pair->string; ++pair) {
+        if ( strcmp(string, pair->string ) == 0 ){
+            return pair->token;
+        }
+    }
+
+    fprintf(stderr, "error: unknown token:%s.\n", string);
+    return 0;
+}
+
+/* Convert a phrase given as UTF-8 text into its token.
+ *
+ * Strings starting with '<' are resolved through the special token table;
+ * everything else is converted to UTF-16 and searched for in phrases.
+ * A failed search is reported on stderr and the function still returns
+ * the token variable, which is initialized to 0 before the search.
+ */
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){
+    phrase_token_t token = 0;
+    if ( string[0] == '<' ) {
+        return taglib_special_string_to_token(string);
+    }
+
+    glong phrase_len = g_utf8_strlen(string, -1);
+    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
+    int result = phrases->search(phrase_len, phrase, token);
+    if ( !(result & SEARCH_OK) )
+        fprintf(stderr, "error: unknown token:%s.\n", string);
+
+    g_free(phrase);
+    return token;
+}
+
+/* Look up the marker string for a special token such as sentence_start.
+ *
+ * Returns a pointer into the static table below (not to be freed), or
+ * NULL after printing an error to stderr when the token is not listed.
+ */
+static const char * taglib_special_token_to_string(phrase_token_t token){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+
+    /* the {0, NULL} entry terminates the table. */
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    /* step through every entry so that an unknown token reaches the
+     * terminator and falls through to the error report. */
+    for (const token_pair * pair = tokens; pair->token; ++pair) {
+        if ( token == pair->token )
+            return pair->string;
+    }
+
+    fprintf(stderr, "error: unknown token:%d.\n", token);
+    return NULL;
+}
+
+/* Convert a token into a newly allocated UTF-8 string.
+ *
+ * Tokens whose library index is 0 are resolved through the special token
+ * table; all other tokens are looked up in phrase_index.  Returns NULL
+ * when the token is unknown (an error is printed to stderr).  The result
+ * comes from g_strdup()/g_utf16_to_utf8(), so release it with g_free().
+ */
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+                              phrase_token_t token) {
+    PhraseItem item;
+    utf16_t buffer[MAX_PHRASE_LENGTH];
+
+    gchar * phrase;
+    /* deal with the special phrase index, for "<start>..." */
+    if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
+        /* g_strdup() passes a NULL lookup result straight through. */
+        return g_strdup(taglib_special_token_to_string(token));
+    }
+
+    int result = phrase_index->get_phrase_item(token, item);
+    if (result != ERROR_OK) {
+        fprintf(stderr, "error: unknown token:%d.\n", token);
+        return NULL;
+    }
+
+    item.get_phrase_string(buffer);
+    guint8 length = item.get_phrase_length();
+    phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
+    return phrase;
 }
diff --git a/utils/storage/tag_utility.h b/utils/storage/tag_utility.h
index 22a7dd4..67d8946 100644
--- a/utils/storage/tag_utility.h
+++ b/utils/storage/tag_utility.h
@@ -22,6 +22,8 @@
 #ifndef TAG_UTILITY_H
 #define TAG_UTILITY_H
 
+#include "novel_types.h"
+
 /* Note: the optional tag has been removed from the first implementation.
  * Maybe the optional tag will be added back later.
  */
@@ -51,4 +53,24 @@ bool taglib_pop_state();
 
 bool taglib_fini();
 
+/* forward declarations: only pointers to these classes are used here. */
+namespace pinyin{
+    class PhraseLargeTable;
+    class FacadePhraseIndex;
+}
+
+using namespace pinyin;
+
+/* Convert a phrase given as UTF-8 text into its token; strings starting
+ * with '<' (e.g. "<start>") are treated as special tokens.
+ */
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases,
+                                      const char * string);
+
+/* Convert a token into a newly allocated UTF-8 string, or NULL when the
+ * token is unknown.  Release the result with g_free().
+ */
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+                              phrase_token_t token);
+
 #endif
-- 
cgit