From 6cca2a97d0660e5a51f65389fbfacc0f5cdaaf5d Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Fri, 22 May 2015 14:44:07 +0800
Subject: update gen_unigram

---
NOTE(review): line breaks were reconstructed from a whitespace-collapsed copy
and agree with the hunk headers and diffstat below; leading indentation
(4 spaces assumed) could not be recovered -- confirm against the repository.

 utils/training/gen_unigram.cpp | 68 ++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
index f4c51af..bb9b6b3 100644
--- a/utils/training/gen_unigram.cpp
+++ b/utils/training/gen_unigram.cpp
@@ -33,35 +33,9 @@ static GOptionEntry entries[] =
 };
 
 /* increase all unigram frequency by a constant. */
-
-int main(int argc, char * argv[]){
-    setlocale(LC_ALL, "");
-
-    GError * error = NULL;
-    GOptionContext * context;
-
-    context = g_option_context_new("- increase uni-gram");
-    g_option_context_add_main_entries(context, entries, NULL);
-    if (!g_option_context_parse(context, &argc, &argv, &error)) {
-        g_print("option parsing failed:%s\n", error->message);
-        exit(EINVAL);
-    }
-
-    SystemTableInfo system_table_info;
-
-    gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
-    bool retval = system_table_info.load(filename);
-    if (!retval) {
-        fprintf(stderr, "load table.conf failed.\n");
-        exit(ENOENT);
-    }
-    g_free(filename);
-
+bool generate_unigram(const pinyin_table_info_t * phrase_files) {
     FacadePhraseIndex phrase_index;
 
-    const pinyin_table_info_t * phrase_files =
-        system_table_info.get_table_info();
-
     /* Note: please increase the value when corpus size becomes larger.
      * To avoid zero value when computing unigram frequency in float format.
      */
@@ -74,11 +48,6 @@ int main(int argc, char * argv[]){
             continue;
 
         guint32 freq = 1;
-#if 0
-        /* skip GBK_DICTIONARY. */
-        if (GBK_DICTIONARY == table_info->m_dict_index)
-            freq = 1;
-#endif
 
         const char * binfile = table_info->m_system_filename;
 
@@ -107,5 +76,40 @@ int main(int argc, char * argv[]){
     if (!save_dictionary(phrase_files, &phrase_index))
         exit(ENOENT);
 
+    return true;
+}
+
+int main(int argc, char * argv[]){
+    setlocale(LC_ALL, "");
+
+    GError * error = NULL;
+    GOptionContext * context;
+
+    context = g_option_context_new("- increase uni-gram");
+    g_option_context_add_main_entries(context, entries, NULL);
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_print("option parsing failed:%s\n", error->message);
+        exit(EINVAL);
+    }
+
+    SystemTableInfo2 system_table_info;
+
+    gchar * filename = g_build_filename(table_dir, SYSTEM_TABLE_INFO, NULL);
+    bool retval = system_table_info.load(filename);
+    if (!retval) {
+        fprintf(stderr, "load table.conf failed.\n");
+        exit(ENOENT);
+    }
+    g_free(filename);
+
+    const pinyin_table_info_t * phrase_files =
+        system_table_info.get_default_tables();
+
+    generate_unigram(phrase_files);
+
+    phrase_files = system_table_info.get_addon_tables();
+
+    generate_unigram(phrase_files);
+
     return 0;
 }
-- 
cgit