summaryrefslogtreecommitdiffstats
path: root/utils
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-17 15:57:44 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-17 15:59:24 +0800
commit faa30bc5e1b5c2f98959936334340f84d30e82bb (patch)
tree 51cae9c77d52b26ae19cd54f28edf897efa74a74 /utils
parent 00099bfaac3911f13c38c593d01f8c2c3bd35523 (diff)
download libpinyin-faa30bc5e1b5c2f98959936334340f84d30e82bb.tar.gz
libpinyin-faa30bc5e1b5c2f98959936334340f84d30e82bb.tar.xz
libpinyin-faa30bc5e1b5c2f98959936334340f84d30e82bb.zip
begin to write import k mixture model
Diffstat (limited to 'utils')
-rw-r--r-- utils/storage/import_interpolation.cpp 20
-rw-r--r-- utils/training/Makefile.am 5
-rw-r--r-- utils/training/import_k_mixture_model.cpp 214
3 files changed, 229 insertions, 10 deletions
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 6c97109..bc2da68 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -97,14 +97,14 @@ bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
do {
assert(taglib_read(linebuf, line_type, values, required));
- switch(line_type) {
+ switch (line_type) {
case GRAM_1_ITEM_LINE:{
/* handle \item in \1-gram */
const char * string = (const char *) g_ptr_array_index(values, 0);
phrase_token_t token = taglib_string_to_token(phrases, string);
- char * value = NULL;
- assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
- glong count = atol(value);
+ gpointer value = NULL;
+ assert(g_hash_table_lookup_extended(required, "count", NULL, &value));
+ glong count = atol((const char *)value);
phrase_index->add_unigram_frequency(token, count);
break;
}
@@ -115,7 +115,7 @@ bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
default:
assert(false);
}
- } while (my_getline(input) != -1) ;
+ } while (my_getline(input) != -1);
end:
taglib_pop_state();
@@ -132,7 +132,7 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
phrase_token_t last_token = 0; SingleGram * last_single_gram = NULL;
do {
assert(taglib_read(linebuf, line_type, values, required));
- switch(line_type) {
+ switch (line_type) {
case GRAM_2_ITEM_LINE:{
/* handle \item in \2-gram */
/* two tokens */
@@ -141,10 +141,10 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
string = (const char *) g_ptr_array_index(values, 1);
phrase_token_t token2 = taglib_string_to_token(phrases, string);
+ gpointer value = NULL;
/* tag: count */
- char * value = NULL;
- assert(g_hash_table_lookup_extended(required, "count", NULL, (gpointer *)&value));
- glong count = atol(value);
+ assert(g_hash_table_lookup_extended(required, "count", NULL, &value));
+ glong count = atol((char *)value);
if ( last_token != token1 ) {
if ( last_token && last_single_gram ) {
@@ -166,7 +166,7 @@ bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
//save the freq
guint32 total_freq = 0;
assert(last_single_gram->get_total_freq(total_freq));
- last_single_gram->insert_freq(token2, count);
+ assert(last_single_gram->insert_freq(token2, count));
total_freq += count;
assert(last_single_gram->set_total_freq(total_freq));
break;
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
index adc0c30..3a3d6b9 100644
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -34,6 +34,7 @@ noinst_PROGRAMS = gen_ngram \
estimate_k_mixture_model \
merge_k_mixture_model \
prune_k_mixture_model \
+ import_k_mixture_model \
export_k_mixture_model
gen_ngram_SOURCES = gen_ngram.cpp
@@ -68,6 +69,10 @@ prune_k_mixture_model_SOURCES = prune_k_mixture_model.cpp
prune_k_mixture_model_LDADD = ../../src/libpinyin.la @GLIB2_LDFLAGS@
+import_k_mixture_model_SOURCES = import_k_mixture_model.cpp
+
+import_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
+
export_k_mixture_model_SOURCES = export_k_mixture_model.cpp
export_k_mixture_model_LDADD = ../storage/libtagutils.la ../../src/libpinyin.la @GLIB2_LDFLAGS@
\ No newline at end of file
diff --git a/utils/training/import_k_mixture_model.cpp b/utils/training/import_k_mixture_model.cpp
new file mode 100644
index 0000000..f669170
--- /dev/null
+++ b/utils/training/import_k_mixture_model.cpp
@@ -0,0 +1,214 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include "pinyin.h"
+#include "tag_utility.h"
+#include "k_mixture_model.h"
+
+enum LINE_TYPE{ /* line kinds passed to taglib_add_tag() and matched against line_type */
+ BEGIN_LINE = 1, /* declared but not referenced elsewhere in this file */
+ END_LINE, /* registered for the "\\end" tag in parse_body() */
+ GRAM_1_LINE, /* registered for the "\\1-gram" tag in parse_body() */
+ GRAM_2_LINE, /* registered for the "\\2-gram" tag in parse_body() */
+ GRAM_1_ITEM_LINE, /* "\\item" tag with one value, registered in parse_unigram() */
+ GRAM_2_ITEM_LINE /* "\\item" tag with two values, registered in parse_bigram() */
+};
+
+static int line_type = 0; /* passed to taglib_read(), then switched on as a LINE_TYPE */
+static GPtrArray * values = NULL; /* passed to taglib_read(); item parsers read phrase strings from it by index */
+static GHashTable * required = NULL; /* passed to taglib_read(); queried by tag name such as "count" */
+/* variables for line buffer. */
+static char * linebuf = NULL; /* buffer handed to getline() by my_getline() */
+static size_t len = 0; /* size argument paired with linebuf for getline() */
+
+bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
+ KMixtureModelBigram * bigram); /* forward declaration: parse_body() calls it before its definition */
+
+bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
+ KMixtureModelBigram * bigram); /* forward declaration: parse_body() calls it before its definition */
+
+/**
+ * my_getline:
+ * @input: the stream to read the next line from.
+ *
+ * Reads one line into the file-level linebuf/len buffer with getline()
+ * and removes its trailing newline, if there is one.
+ *
+ * Returns: the value returned by getline(); -1 means no line was read
+ * and linebuf was not modified by this function.
+ */
+static ssize_t my_getline(FILE * input){
+    ssize_t result = getline(&linebuf, &len, input);
+    if ( result == -1 )
+        return result;
+
+    /* Strip only a newline that is really there: the last line of a
+     * file may lack one, and an empty string must not be indexed at -1. */
+    size_t length = strlen(linebuf);
+    if ( length > 0 && linebuf[length - 1] == '\n' )
+        linebuf[length - 1] = '\0';
+    return result;
+}
+
+/**
+ * parse_body:
+ * @input: the stream the model text is read from.
+ * @phrases: phrase table forwarded to the section parsers.
+ * @bigram: k mixture model forwarded to the section parsers.
+ *
+ * Dispatches the line currently held in linebuf and the lines that
+ * follow it: "\\1-gram" hands over to parse_unigram(), "\\2-gram" to
+ * parse_bigram(), and "\\end" stops the loop. The tags are registered
+ * between taglib_push_state() and taglib_pop_state().
+ *
+ * Returns: always true.
+ */
+bool parse_body(FILE * input, PhraseLargeTable * phrases,
+                KMixtureModelBigram * bigram){
+    taglib_push_state();
+
+    /* Calls with side effects are kept out of assert() so that they
+     * still run when assert() is compiled out with NDEBUG. */
+    bool ok = taglib_add_tag(END_LINE, "\\end", 0, "", "");
+    assert(ok);
+    ok = taglib_add_tag(GRAM_1_LINE, "\\1-gram", 0, "", "");
+    assert(ok);
+    ok = taglib_add_tag(GRAM_2_LINE, "\\2-gram", 0, "", "");
+    assert(ok);
+
+    do {
+    retry:
+        ok = taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        switch (line_type) {
+        case END_LINE:
+            goto end;
+        case GRAM_1_LINE:
+            /* No line follows the section header: stop here, because
+             * linebuf holds no fresh line to hand to the section parser
+             * or to dispatch again at retry. */
+            if ( my_getline(input) == -1 )
+                goto end;
+            parse_unigram(input, phrases, bigram);
+            /* the section parser leaves its terminating line in linebuf */
+            goto retry;
+        case GRAM_2_LINE:
+            if ( my_getline(input) == -1 )
+                goto end;
+            parse_bigram(input, phrases, bigram);
+            goto retry;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    (void) ok; /* only read by assert(); avoids an unused warning under NDEBUG */
+    return true;
+}
+
+/**
+ * parse_unigram:
+ * @input: the stream the following lines are read from.
+ * @phrases: phrase table used to turn the phrase string into a token.
+ * @bigram: k mixture model that receives one array header per item.
+ *
+ * Handles the "\\item" lines of a "\\1-gram" section, starting with the
+ * line currently held in linebuf. For each item the "count" tag is
+ * converted with atol() and stored as m_WC of a zeroed
+ * KMixtureModelArrayHeader via bigram->set_array_header(). Parsing stops
+ * at an "\\end", "\\1-gram" or "\\2-gram" line, which is left in linebuf,
+ * or when my_getline() reports that no further line was read.
+ *
+ * Returns: always true.
+ */
+bool parse_unigram(FILE * input, PhraseLargeTable * phrases,
+                   KMixtureModelBigram * bigram){
+    taglib_push_state();
+
+    /* Calls with side effects are kept out of assert() so that they
+     * still run when assert() is compiled out with NDEBUG. */
+    bool ok = taglib_add_tag(GRAM_1_ITEM_LINE, "\\item", 1, "count", "");
+    assert(ok);
+
+    do {
+        ok = taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        switch (line_type) {
+        case GRAM_1_ITEM_LINE:{
+            /* handle \item in \1-gram */
+            const char * string = (const char *) g_ptr_array_index(values, 0);
+            phrase_token_t token = taglib_string_to_token(phrases, string);
+            gpointer value = NULL;
+            ok = g_hash_table_lookup_extended(required, "count",
+                                              NULL, &value);
+            assert(ok);
+            glong count = atol((const char *)value);
+            KMixtureModelArrayHeader array_header;
+            memset(&array_header, 0, sizeof(KMixtureModelArrayHeader));
+            array_header.m_WC = count;
+            bigram->set_array_header(token, array_header);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    taglib_pop_state();
+    (void) ok; /* only read by assert(); avoids an unused warning under NDEBUG */
+    return true;
+}
+
+/**
+ * parse_bigram:
+ * @input: the stream the following lines are read from.
+ * @phrases: phrase table used to turn the phrase strings into tokens.
+ * @bigram: k mixture model the single grams are loaded from and stored to.
+ *
+ * Handles the "\\item" lines of a "\\2-gram" section, starting with the
+ * line currently held in linebuf. Each item carries two phrase strings
+ * and the tags count, T, N_n_0, n_1 and Mr; count and T must be equal.
+ * Items that follow each other with the same first token are inserted
+ * into one KMixtureModelSingleGram, which is stored with bigram->store()
+ * and deleted when the first token changes and again at the end.
+ * Parsing stops at an "\\end", "\\1-gram" or "\\2-gram" line, which is
+ * left in linebuf, or when my_getline() reports that no further line
+ * was read.
+ *
+ * Returns: always true.
+ */
+bool parse_bigram(FILE * input, PhraseLargeTable * phrases,
+                  KMixtureModelBigram * bigram){
+    taglib_push_state();
+
+    /* Calls with side effects are kept out of assert() so that they
+     * still run when assert() is compiled out with NDEBUG. */
+    bool ok = taglib_add_tag(GRAM_2_ITEM_LINE, "\\item", 2,
+                             "count:T:N_n_0:n_1:Mr", "");
+    assert(ok);
+
+    phrase_token_t last_token = 0;
+    KMixtureModelSingleGram * last_single_gram = NULL;
+    do {
+        ok = taglib_read(linebuf, line_type, values, required);
+        assert(ok);
+        switch (line_type) {
+        case GRAM_2_ITEM_LINE:{
+            /* handle \item in \2-gram */
+            /* two tokens */
+            const char * string = (const char *) g_ptr_array_index(values, 0);
+            phrase_token_t token1 = taglib_string_to_token(phrases, string);
+            string = (const char *) g_ptr_array_index(values, 1);
+            phrase_token_t token2 = taglib_string_to_token(phrases, string);
+
+            gpointer value = NULL;
+            /* tag: count */
+            ok = g_hash_table_lookup_extended(required, "count", NULL, &value);
+            assert(ok);
+            glong count = atol((char *)value);
+            /* tag: T */
+            ok = g_hash_table_lookup_extended(required, "T", NULL, &value);
+            assert(ok);
+            glong T = atol((char *)value);
+            assert(count == T);
+            /* tag: N_n_0 */
+            ok = g_hash_table_lookup_extended(required, "N_n_0", NULL, &value);
+            assert(ok);
+            glong N_n_0 = atol((char *)value);
+            /* tag: n_1 */
+            ok = g_hash_table_lookup_extended(required, "n_1", NULL, &value);
+            assert(ok);
+            glong n_1 = atol((char *)value);
+            /* tag: Mr */
+            ok = g_hash_table_lookup_extended(required, "Mr", NULL, &value);
+            assert(ok);
+            glong Mr = atol((char *)value);
+
+            KMixtureModelArrayItem array_item;
+            memset(&array_item, 0, sizeof(KMixtureModelArrayItem));
+            array_item.m_WC = count; array_item.m_N_n_0 = N_n_0;
+            array_item.m_n_1 = n_1; array_item.m_Mr = Mr;
+
+            if ( last_token != token1 ) {
+                /* first token changed: flush the gram collected so far */
+                if ( last_token && last_single_gram ) {
+                    bigram->store(last_token, last_single_gram);
+                    delete last_single_gram;
+                    //safe guard
+                    last_token = 0;
+                    last_single_gram = NULL;
+                }
+                KMixtureModelSingleGram * single_gram = NULL;
+                bigram->load(token1, single_gram);
+
+                //create the new single gram
+                if ( single_gram == NULL )
+                    single_gram = new KMixtureModelSingleGram;
+                last_token = token1;
+                last_single_gram = single_gram;
+            }
+            /* NOTE(review): last_single_gram is still NULL here if the
+             * very first item has token1 == 0 -- confirm that
+             * taglib_string_to_token() never yields 0 for a valid item. */
+            ok = last_single_gram->insert_array_item(token2, array_item);
+            assert(ok);
+            break;
+        }
+        case END_LINE:
+        case GRAM_1_LINE:
+        case GRAM_2_LINE:
+            goto end;
+        default:
+            assert(false);
+        }
+    } while (my_getline(input) != -1);
+
+ end:
+    /* flush the gram of the last first-token seen */
+    if ( last_token && last_single_gram ) {
+        bigram->store(last_token, last_single_gram);
+        delete last_single_gram;
+        //safe guard
+        last_token = 0;
+        last_single_gram = NULL;
+    }
+
+    taglib_pop_state();
+    (void) ok; /* only read by assert(); avoids an unused warning under NDEBUG */
+    return true;
+}
+
+int main(int argc, char * argv[]){ /* entry point; argc and argv are not used */
+ return 0; /* nothing is imported yet: none of the parse_* functions is called from here */
+}