write parse unigram for import tool

author: Peng Wu <alexepico@gmail.com> 2010-10-12 14:49:54 +0800
committer: Peng Wu <alexepico@gmail.com> 2010-10-12 14:49:54 +0800
commit: 6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140 (patch)
tree: f58fa3b625dce056962fbdf0003623600d388255 /utils/storage
parent: e2e105eee7426cb75568f4954373e11c77f09d04 (diff)
download: libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.tar.gz
libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.tar.xz
libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.zip
1 files changed, 104 insertions, 10 deletions
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 6e88fdb..fb5a18a 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -44,6 +44,9 @@ static GHashTable * required = NULL;
 static char * linebuf = NULL;
 static size_t len = 0;
 
+phrase_token_t string_to_token(PhraseLargeTable * phrases,
+                               const char * string); /* defined after main(); maps a phrase string or "<...>" name to its token */
+
 bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
                    FacadePhraseIndex * phrase_index);
 
@@ -51,6 +54,15 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
                   FacadePhraseIndex * phrase_index,
                   Bigram * bigram);
 
+/* Read one line into the shared linebuf and drop its trailing '\n'; returns getline()'s result. */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    /* Only a real newline is cut: -1 (EOF/error) and a last line without '\n' leave linebuf alone. */
+    if ( result > 0 && linebuf[result - 1] == '\n' )
+        linebuf[result - 1] = '\0';
+    return result;
+}
+
 bool parse_body(FILE * input, PhraseLargeTable * phrases,
                 FacadePhraseIndex * phrase_index,
                 Bigram * bigram){
@@ -60,25 +72,24 @@ bool parse_body(FILE * input, PhraseLargeTable * phrases,
     assert(taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", ""));
     assert(taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", ""));
 
-    do{
+    do {
     retry:
         assert(taglib_read(linebuf, line_type, values, required));
         switch(line_type) {
         case END_LINE:
             goto end;
-            break;
         case GRAM_1_LINE:
+            my_getline(input);
             parse_unigram(input, phrases, phrase_index);
             goto retry;
-            break;
         case GRAM_2_LINE:
+            my_getline(input);
             parse_bigram(input, phrases, phrase_index, bigram);
             goto retry;
-            break;
         default:
             assert(false);
         }
-    } while (getline(&linebuf, &len, input) != -1) ;
+    } while (my_getline(input) != -1) ;
 
  end:
     taglib_pop_state();
@@ -87,14 +98,46 @@ bool parse_body(FILE * input, PhraseLargeTable * phrases,
 
 bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
                    FacadePhraseIndex * phrase_index){
+    taglib_push_state();
+    /* Reads the "\item" lines of a \1-gram section and adds each count to phrase_index. */
+    assert(taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 1, "count", ""));
+    /* NOTE(review): the taglib/glib calls here sit inside assert(); a build with NDEBUG would skip them. */
+    do { /* the first pass parses the line that is already in linebuf */
+        assert(taglib_read(linebuf, line_type, values, required));
+        switch(line_type) {
+        case GRAM_1_ITEM_LINE:{
+            /* handle \item in \1-gram: values[0] is the phrase text, "count" its frequency */
+            assert(values->len == 1);
+            const char * string = (const char *)g_ptr_array_index(values, 0);
+            phrase_token_t token = string_to_token(phrases, string); /* NOTE(review): the token is added below even when the lookup reported an error */
+            char * value = NULL;
+            assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
+            glong count = atol(value); /* atol() does no validation of the text */
+            phrase_index->add_unigram_frequency(token, count);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end; /* a non-item tag ends the section; linebuf still holds that line for the caller */
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1) ;
 
+ end:
+    taglib_pop_state(); /* pairs with the taglib_push_state() at the top */
     return true;
 }
 
 bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
                   FacadePhraseIndex * phrase_index,
                   Bigram * bigram){
+    taglib_push_state();
 
+    assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2, "count", ""));
+    /* Placeholder: the "\item" tag is registered, but no \2-gram line is read or stored here yet. */
+    taglib_pop_state();
     return true;
 }
 
@@ -128,21 +171,72 @@ int main(int argc, char * argv[]){
     values = g_ptr_array_new();
     required = g_hash_table_new(g_str_hash, g_str_equal);
 
+    //enter "\data" line
     assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
-    getline(&linebuf, &len, input);
-    linebuf[strlen(linebuf) - 1] = '\0';
+    my_getline(input);
 
+    //read "\data" line
     assert(taglib_read(linebuf, line_type, values, required));
     assert(line_type == BEGIN_LINE);
     char * value = NULL;
     assert(g_hash_table_lookup_extended(required, "model", NULL, (gpointer *)&value));
-    assert(strcmp("interpolation", value) == 0);
+    if ( !( strcmp("interpolation", value) == 0 ) ) {
+        fprintf(stderr, "error: interpolation model expected.\n");
+        exit(1);
+    }
 
-    getline(&linebuf, &len, input);
-    linebuf[strlen(linebuf) - 1] = '\0';
+    my_getline(input);
     parse_body(input, &phrases, &phrase_index, &bigram);
 
     taglib_fini();
 
+    chunk = new MemoryChunk;
+    phrase_index.store(1, chunk);
+    chunk->save("../../data/gb_char.bin");
+    phrase_index.load(1, chunk);
+
+    chunk = new MemoryChunk;
+    phrase_index.store(2, chunk);
+    chunk->save("../../data/gbk_char.bin");
+    phrase_index.load(2, chunk);
+
     return 0;
 }
+
+/* Look up a reserved "<...>" token name; logs to stderr and returns 0 if it is not in the table. */
+static phrase_token_t special_string_to_token(const char * string){
+    struct token_pair{
+        phrase_token_t token;
+        const char * string;
+    };
+    static const token_pair tokens [] = {
+        {sentence_start, "<start>"},
+        {0, NULL}
+    };
+
+    /* The table ends at the NULL string; ++pair lets a miss reach it instead of spinning forever. */
+    for (const token_pair * pair = tokens; pair->string; ++pair) {
+        if ( strcmp(string, pair->string ) == 0 ){
+            return pair->token;
+        }
+    }
+
+    fprintf(stderr, "error: unknown token:%s.\n", string);
+    return 0;
+}
+
+/* Resolve a token string: "<...>" names use the special table, anything else is searched in phrases. */
+phrase_token_t string_to_token(PhraseLargeTable * phrases, const char * string){
+    phrase_token_t token = 0;
+    if ( string[0] == '<' )
+        return special_string_to_token(string);
+    /* Length comes from the conversion itself, so it counts the UTF-16 units actually stored in phrase. */
+    glong phrase_len = 0;
+    utf16_t * phrase = g_utf8_to_utf16(string, -1, NULL, &phrase_len, NULL);
+    /* phrase is NULL for invalid UTF-8; treat that as "not found" rather than searching a NULL buffer. */
+    int result = phrase ? phrases->search(phrase_len, phrase, token) : 0;
+    if ( !(result & SEARCH_OK) )
+        fprintf(stderr, "error: unknown token:%s.\n", string);
+    g_free(phrase);
+    return token;
+}
author	Peng Wu <alexepico@gmail.com>	2010-10-12 14:49:54 +0800
committer	Peng Wu <alexepico@gmail.com>	2010-10-12 14:49:54 +0800
commit	6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140 (patch)
tree	f58fa3b625dce056962fbdf0003623600d388255 /utils/storage
parent	e2e105eee7426cb75568f4954373e11c77f09d04 (diff)
download	libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.tar.gz libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.tar.xz libpinyin-6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140.zip