summary refs log tree commit diff stats
path: root/utils/storage
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2010-10-12 14:49:54 +0800
committer Peng Wu <alexepico@gmail.com> 2010-10-12 14:49:54 +0800
commit 6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140 (patch)
tree f58fa3b625dce056962fbdf0003623600d388255 /utils/storage
parent e2e105eee7426cb75568f4954373e11c77f09d04 (diff)
download libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.tar.gz
libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.tar.xz
libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.zip
write parse unigram for import tool
Diffstat (limited to 'utils/storage')
-rw-r--r-- utils/storage/import_interpolation.cpp 114
1 files changed, 104 insertions, 10 deletions
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 6e88fdb..fb5a18a 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -44,6 +44,9 @@ static GHashTable * required = NULL;
static char * linebuf = NULL;
static size_t len = 0;
+/* Forward declaration: maps the text of a phrase to a phrase_token_t.
+ * The definition comes after main(); it is declared here so that
+ * parse_unigram() can call it. */
+phrase_token_t string_to_token(PhraseLargeTable * phrases,
+ const char * string);
+
+/* Forward declaration: handles the lines that follow a "\1-gram" tag.
+ * Called from parse_body(); the definition is further down in this file. */
bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
FacadePhraseIndex * phrase_index);
@@ -51,6 +54,15 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
FacadePhraseIndex * phrase_index,
Bigram * bigram);
+/* Reads the next line of input into the shared linebuf/len buffer and
+ * removes its trailing newline, if there is one.
+ *
+ * Returns the value of getline(): -1 on end of file or error, otherwise
+ * the number of bytes read (the count still includes the stripped
+ * newline). */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    if ( result == -1 )
+        return result;
+
+    /* Only chop a real '\n': the last line of a file may lack one, and
+     * blindly overwriting the final byte would eat a character of data.
+     * Using the getline() length also avoids strlen() - 1 underflowing
+     * when the line starts with a NUL byte. */
+    if ( result > 0 && linebuf[result - 1] == '\n' )
+        linebuf[result - 1] = '\0';
+    return result;
+}
+
bool parse_body(FILE * input, PhraseLargeTable * phrases,
FacadePhraseIndex * phrase_index,
Bigram * bigram){
@@ -60,25 +72,24 @@ bool parse_body(FILE * input, PhraseLargeTable * phrases,
assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
- do{
+ do {
retry:
assert(taglib_read(linebuf, line_type, values, required));
switch(line_type) {
case END_LINE:
goto end;
- break;
case GRAM_1_LINE:
+ my_getline(input);
parse_unigram(input, phrases, phrase_index);
goto retry;
- break;
case GRAM_2_LINE:
+ my_getline(input);
parse_bigram(input, phrases, phrase_index, bigram);
goto retry;
- break;
default:
assert(false);
}
- } while (getline(&linebuf, &len, input) != -1) ;
+ } while (my_getline(input) != -1) ;
end:
taglib_pop_state();
@@ -87,14 +98,46 @@ bool parse_body(FILE * input, PhraseLargeTable * phrases,
bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
FacadePhraseIndex * phrase_index){
+ taglib_push_state();
+
+ assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 1, "count", ""));
+
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case GRAM_1_ITEM_LINE:{
+ /* handle \item in \1-gram */
+ assert(values->len == 1);
+ const char * string = (const char *)g_ptr_array_index(values, 0);
+ phrase_token_t token = string_to_token(phrases, string);
+ char * value = NULL;
+ assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
+ glong count = atol(value);
+ phrase_index->add_unigram_frequency(token, count);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1) ;
+ end:
+ taglib_pop_state();
return true;
}
bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
FacadePhraseIndex * phrase_index,
Bigram * bigram){
+ taglib_push_state();
+ assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2, "count", ""));
+
+ taglib_pop_state();
return true;
}
@@ -128,21 +171,72 @@ int main(int argc, char * argv[]){
values = g_ptr_array_new();
required = g_hash_table_new(g_str_hash, g_str_equal);
+ //enter "\data" line
assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
- getline(&linebuf, &len, input);
- linebuf[strlen(linebuf) - 1] = '\0';
+ my_getline(input);
+ //read "\data" line
assert(taglib_read(linebuf, line_type, values, required));
assert(line_type == BEGIN_LINE);
char * value = NULL;
assert(g_hash_table_lookup_extended(required, "model", NULL, (gpointer *)&value));
- assert(strcmp("interpolation", value) == 0);
+ if ( !( strcmp("interpolation", value) == 0 ) ) {
+ fprintf(stderr, "error: interpolation model expected.\n");
+ exit(1);
+ }
- getline(&linebuf, &len, input);
- linebuf[strlen(linebuf) - 1] = '\0';
+ my_getline(input);
parse_body(input, &phrases, &phrase_index, &bigram);
taglib_fini();
+ chunk = new MemoryChunk;
+ phrase_index.store(1, chunk);
+ chunk->save("../../data/gb_char.bin");
+ phrase_index.load(1, chunk);
+
+ chunk = new MemoryChunk;
+ phrase_index.store(2, chunk);
+ chunk->save("../../data/gbk_char.bin");
+ phrase_index.load(2, chunk);
+
return 0;
}
+
+/* Looks up a special marker string (for example "<start>") in a small
+ * static table.
+ *
+ * Returns the token paired with the string, or 0 after printing an
+ * error to stderr when the string is not in the table. */
+static phrase_token_t special_string_to_token(const char * string){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+
+    /* the entry with a NULL string terminates the table */
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    /* Step through the table entry by entry.  The cursor has to move on
+     * every iteration, otherwise an unknown string would spin forever
+     * on the first entry and never reach the error below. */
+    for ( const token_pair * pair = tokens; pair->string; ++pair ) {
+        if ( strcmp(string, pair->string ) == 0 ){
+            return pair->token;
+        }
+    }
+
+    fprintf(stderr, "error: unknown token:%s.\n", string);
+    return 0;
+}
+
+/* Maps a phrase given as UTF-8 text to its phrase token.
+ *
+ * Strings starting with '<' are resolved by special_string_to_token().
+ * Any other string is converted to UTF-16 and looked up with
+ * phrases->search().
+ *
+ * phrases: table that is searched for the phrase.
+ * string:  NUL-terminated text of the phrase.
+ *
+ * Returns the token stored by the search; when the string cannot be
+ * converted or the search does not report SEARCH_OK, an error is
+ * printed to stderr and the token is returned as is (it starts out
+ * as 0). */
+phrase_token_t string_to_token(PhraseLargeTable * phrases, const char * string){
+    phrase_token_t token = 0;
+    if ( string[0] == '<' ) {
+        return special_string_to_token(string);
+    }
+
+    glong phrase_len = g_utf8_strlen(string, -1);
+    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, NULL, NULL);
+    /* g_utf8_to_utf16() yields NULL for input that is not valid UTF-8;
+     * do not hand a NULL buffer to the search. */
+    if ( phrase == NULL ) {
+        fprintf(stderr, "error: invalid utf8 string:%s.\n", string);
+        return token;
+    }
+
+    int result = phrases->search(phrase_len, phrase, token);
+    if ( !(result & SEARCH_OK) )
+        fprintf(stderr, "error: unknown token:%s.\n", string);
+
+    g_free(phrase);
+    return token;
+}