summary refs log tree commit diff stats
path: root/utils
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-16 14:11:05 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-16 14:13:07 +0800
commit 5394772a45e214206652ad565f506ed3d3a149af (patch)
tree 9b93171891767ace95b27aa6b1ae5861c2977990 /utils
parent 82c49d8b7337dca828c142c902682bb991382df0 (diff)
download libpinyin-5394772a45e214206652ad565f506ed3d3a149af.tar.gz
libpinyin-5394772a45e214206652ad565f506ed3d3a149af.tar.xz
libpinyin-5394772a45e214206652ad565f506ed3d3a149af.zip
move token string conversion function to taglib
Diffstat (limited to 'utils')
-rw-r--r-- utils/storage/Makefile.am 10
-rw-r--r-- utils/storage/export_interpolation.cpp 55
-rw-r--r-- utils/storage/import_interpolation.cpp 47
-rw-r--r-- utils/storage/tag_utility.cpp 87
-rw-r--r-- utils/storage/tag_utility.h 14
5 files changed, 110 insertions, 103 deletions
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
index 3e0fab4..f314a1a 100644
--- a/utils/storage/Makefile.am
+++ b/utils/storage/Makefile.am
@@ -33,10 +33,6 @@ gen_binary_files_SOURCES = gen_binary_files.cpp
gen_binary_files_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
-export_interpolation_SOURCES = export_interpolation.cpp
-
-export_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
-
noinst_LTLIBRARIES = libtagutils.la
libtagutils_la_LDFLAGS = -static
@@ -45,4 +41,8 @@ libtagutils_la_SOURCES = tag_utility.cpp
import_interpolation_SOURCES = import_interpolation.cpp
-import_interpolation_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@ ./libtagutils.la
\ No newline at end of file
+import_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@
+
+export_interpolation_SOURCES = export_interpolation.cpp
+
+export_interpolation_LDADD = ./.libs/libtagutils.a ../../src/libpinyin.la @GLIB2_LDFLAGS@
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
index 333470e..bcc90b8 100644
--- a/utils/storage/export_interpolation.cpp
+++ b/utils/storage/export_interpolation.cpp
@@ -19,19 +19,17 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include "pinyin.h"
#include <stdio.h>
#include <assert.h>
#include <glib.h>
-#include "pinyin.h"
+#include "tag_utility.h"
/* export interpolation model as textual format */
void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index);
void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram);
-/* consider moving the following function to utils/storage/utility.h */
-char * token_to_string(FacadePhraseIndex * phrase_index, phrase_token_t token);
-
void begin_data(FILE * file){
fprintf(file, "\\data model interpolation\n");
}
@@ -86,7 +84,7 @@ void gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
assert( result == ERROR_OK);
size_t freq = item.get_unigram_frequency();
- char * phrase = token_to_string(phrase_index, j);
+ char * phrase = taglib_token_to_string(phrase_index, j);
if ( phrase )
fprintf(output, "\\item %s count %ld\n", phrase, freq);
@@ -115,8 +113,8 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram
for(size_t j = 0; j < array->len; j++) {
BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);
- char * word1 = token_to_string(phrase_index, token);
- char * word2 = token_to_string(phrase_index, item->m_token);
+ char * word1 = taglib_token_to_string(phrase_index, token);
+ char * word2 = taglib_token_to_string(phrase_index, item->m_token);
guint32 freq = item->m_count;
if ( word1 && word2)
@@ -130,46 +128,3 @@ void gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram
g_array_free(items, TRUE);
}
-
-static const char * special_token_to_string(phrase_token_t token){
- struct token_pair{
- phrase_token_t token;
- const char * string;
- };
-
- static const token_pair tokens [] = {
- {sentence_start, "<start>"},
- {0, NULL}
- };
-
- const token_pair * pair = tokens;
- while (pair->token) {
- if ( token == pair->token )
- return pair->string;
- }
-
- fprintf(stderr, "error: unknown token:%d.\n", token);
- return NULL;
-}
-
-char * token_to_string(FacadePhraseIndex * phrase_index, phrase_token_t token) {
- PhraseItem item;
- utf16_t buffer[MAX_PHRASE_LENGTH];
-
- gchar * phrase;
- /* deal with the special phrase index, for "<start>..." */
- if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
- return g_strdup(special_token_to_string(token));
- }
-
- int result = phrase_index->get_phrase_item(token, item);
- if (result != ERROR_OK) {
- fprintf(stderr, "error: unknown token:%d.\n", token);
- return NULL;
- }
-
- item.get_phrase_string(buffer);
- guint8 length = item.get_phrase_length();
- phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
- return phrase;
-}
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 57c87c1..6c97109 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -40,9 +40,6 @@ static GHashTable * required = NULL;
static char * linebuf = NULL;
static size_t len = 0;
-phrase_token_t string_to_token(PhraseLargeTable * phrases,
- const char * string);
-
bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
FacadePhraseIndex * phrase_index);
@@ -104,7 +101,7 @@ bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
case GRAM_1_ITEM_LINE:{
/* handle \item in \1-gram */
const char * string = (const char *) g_ptr_array_index(values, 0);
- phrase_token_t token = string_to_token(phrases, string);
+ phrase_token_t token = taglib_string_to_token(phrases, string);
char * value = NULL;
assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
glong count = atol(value);
@@ -140,9 +137,9 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
/* handle \item in \2-gram */
/* two tokens */
const char * string = (const char *) g_ptr_array_index(values, 0);
- phrase_token_t token1 = string_to_token(phrases, string);
+ phrase_token_t token1 = taglib_string_to_token(phrases, string);
string = (const char *) g_ptr_array_index(values, 1);
- phrase_token_t token2 = string_to_token(phrases, string);
+ phrase_token_t token2 = taglib_string_to_token(phrases, string);
/* tag: count */
char * value = NULL;
@@ -262,41 +259,3 @@ int main(int argc, char * argv[]){
return 0;
}
-
-static phrase_token_t special_string_to_token(const char * string){
- struct token_pair{
- phrase_token_t token;
- const char * string;
- };
-
- static const token_pair tokens [] = {
- {sentence_start, "<start>"},
- {0, NULL}
- };
-
- const token_pair * pair = tokens;
- while (pair->string) {
- if ( strcmp(string, pair->string ) == 0 ){
- return pair->token;
- }
- }
-
- fprintf(stderr, "error: unknown token:%s.\n", string);
- return 0;
-}
-
-phrase_token_t string_to_token(PhraseLargeTable * phrases, const char * string){
- phrase_token_t token = 0;
- if ( string[0] == '<' ) {
- return special_string_to_token(string);
- }
-
- glong phrase_len = g_utf8_strlen(string, -1);
- utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
- int result = phrases->search(phrase_len, phrase, token);
- if ( !(result & SEARCH_OK) )
- fprintf(stderr, "error: unknown token:%s.\n", string);
-
- g_free(phrase);
- return token;
-}
diff --git a/utils/storage/tag_utility.cpp b/utils/storage/tag_utility.cpp
index 3176056..5dcb35a 100644
--- a/utils/storage/tag_utility.cpp
+++ b/utils/storage/tag_utility.cpp
@@ -1,9 +1,10 @@
+#include "pinyin.h"
+#include <glib.h>
+#include <stdio.h>
#include <string.h>
#include <assert.h>
-#include <glib.h>
#include "tag_utility.h"
-
/* internal taglib structure */
struct tag_entry{
int m_line_type;
@@ -303,6 +304,84 @@ bool taglib_fini(){
return true;
}
-void test(){
- assert(taglib_add_tag(2, "\\data", 1, "data", ""));
+static phrase_token_t taglib_special_string_to_token(const char * string){
+    /* Map a special tag such as "<start>" to its token; returns 0 if unknown. */
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+    /* Advance through the table; the {0, NULL} sentinel ends the scan.
+     * Without the increment an unmatched string would loop forever. */
+    for (const token_pair * pair = tokens; pair->string; ++pair) {
+        if ( strcmp(string, pair->string ) == 0 ){
+            return pair->token;
+        }
+    }
+
+    fprintf(stderr, "error: unknown token:%s.\n", string);
+    return 0;
+}
+
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){
+ phrase_token_t token = 0;
+ if ( string[0] == '<' ) {
+ return taglib_special_string_to_token(string);
+ }
+
+ glong phrase_len = g_utf8_strlen(string, -1);
+ utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
+ int result = phrases->search(phrase_len, phrase, token);
+ if ( !(result & SEARCH_OK) )
+ fprintf(stderr, "error: unknown token:%s.\n", string);
+
+ g_free(phrase);
+ return token;
+}
+
+static const char * taglib_special_token_to_string(phrase_token_t token){
+    /* Map a special token to its tag text (e.g. "<start>"); NULL if unknown. */
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+    /* Step through the table until the {0, NULL} sentinel is reached.
+     * Without the increment an unmatched token would loop forever. */
+    for (const token_pair * pair = tokens; pair->token; ++pair) {
+        if ( token == pair->token )
+            return pair->string;
+    }
+
+    fprintf(stderr, "error: unknown token:%d.\n", token);
+    return NULL;
+}
+
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+ phrase_token_t token) {
+ PhraseItem item;
+ utf16_t buffer[MAX_PHRASE_LENGTH];
+
+ gchar * phrase;
+ /* deal with the special phrase index, for "<start>..." */
+ if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
+ return g_strdup(taglib_special_token_to_string(token));
+ }
+
+ int result = phrase_index->get_phrase_item(token, item);
+ if (result != ERROR_OK) {
+ fprintf(stderr, "error: unknown token:%d.\n", token);
+ return NULL;
+ }
+
+ item.get_phrase_string(buffer);
+ guint8 length = item.get_phrase_length();
+ phrase = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL);
+ return phrase;
}
diff --git a/utils/storage/tag_utility.h b/utils/storage/tag_utility.h
index 22a7dd4..67d8946 100644
--- a/utils/storage/tag_utility.h
+++ b/utils/storage/tag_utility.h
@@ -22,6 +22,8 @@
#ifndef TAG_UTILITY_H
#define TAG_UTILITY_H
+#include "novel_types.h"
+
/* Note: the optional tag has been removed from the first implementation.
* Maybe the optional tag will be added back later.
*/
@@ -51,4 +53,16 @@ bool taglib_pop_state();
bool taglib_fini();
+namespace pinyin{
+ class PhraseLargeTable;
+};
+
+using namespace pinyin;
+
+phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases,
+ const char * string);
+
+char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
+ phrase_token_t token);
+
#endif