summaryrefslogtreecommitdiffstats
path: root/utils/storage
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2010-10-12 16:02:12 +0800
committerPeng Wu <alexepico@gmail.com>2010-10-12 16:02:12 +0800
commitebef7e93515b40c25dbb1236e92383e8e1bb2ff4 (patch)
tree375c5f0f4a4337fc1bf57533c5187c3c810b727c /utils/storage
parent6d18d7757a6bfd1edc4595861e4dbdd3a1eb1140 (diff)
downloadlibpinyin-ebef7e93515b40c25dbb1236e92383e8e1bb2ff4.tar.gz
libpinyin-ebef7e93515b40c25dbb1236e92383e8e1bb2ff4.tar.xz
libpinyin-ebef7e93515b40c25dbb1236e92383e8e1bb2ff4.zip
write parse bigram
Diffstat (limited to 'utils/storage')
-rw-r--r--utils/storage/import_interpolation.cpp59
1 files changed, 57 insertions, 2 deletions
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index fb5a18a..f003d12 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -107,8 +107,7 @@ bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
switch(line_type) {
case GRAM_1_ITEM_LINE:{
/* handle \item in \1-gram */
- assert(values->len == 1);
- const char * string = (const char *)g_ptr_array_index(values, 0);
+ const char * string = (const char *) g_ptr_array_index(values, 0);
phrase_token_t token = string_to_token(phrases, string);
char * value = NULL;
assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
@@ -137,6 +136,62 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
assert(taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2, "count", ""));
+ phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL;
+ do {
+ assert(taglib_read(linebuf, line_type, values, required));
+ switch(line_type) {
+ case GRAM_2_ITEM_LINE:{
+ /* handle \item in \2-gram */
+ /* two tokens */
+ const char * string = (const char *) g_ptr_array_index(values, 0);
+ phrase_token_t token1 = string_to_token(phrases, string);
+ string = (const char *) g_ptr_array_index(values, 1);
+ phrase_token_t token2 = string_to_token(phrases, string);
+
+ /* tag: count */
+ char * value = NULL;
+ assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
+ glong count = atol(value);
+
+ if ( last_token != token1 ) {
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+ //safe guard
+ last_token = 0;
+ last_single_gram = NULL;
+ }
+ SingleGram * system_gram = NULL, * user_gram = NULL;
+ bigram->load(token1, system_gram, user_gram);
+ assert(system_gram == NULL);
+
+ //create the new single gram
+ if ( user_gram == NULL )
+ user_gram = new SingleGram;
+ last_token = token1;
+ last_single_gram = user_gram;
+ }
+ last_single_gram->set_freq(token2, count);
+ break;
+ }
+ case END_LINE:
+ case GRAM_1_LINE:
+ case GRAM_2_LINE:
+ goto end;
+ default:
+ assert(false);
+ }
+ } while (my_getline(input) != -1);
+
+ end:
+ if ( last_token && last_single_gram ) {
+ bigram->store(last_token, last_single_gram);
+ delete last_single_gram;
+ //safe guard
+ last_token = 0;
+ last_single_gram = NULL;
+ }
+
taglib_pop_state();
return true;
}