diff options
author | Peng Wu <alexepico@gmail.com> | 2012-05-21 15:39:41 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-05-21 15:39:41 +0800 |
commit | 1b3140958408a80d029d59835d612f1b76f592ab (patch) | |
tree | c7ce156183812bc28c9e400e1043fdfc5e849f8a /utils | |
parent | c34f87b345d762882c265a638d306f10b21d4cf8 (diff) | |
download | libpinyin-1b3140958408a80d029d59835d612f1b76f592ab.tar.gz libpinyin-1b3140958408a80d029d59835d612f1b76f592ab.tar.xz libpinyin-1b3140958408a80d029d59835d612f1b76f592ab.zip |
update gen_unigram.cpp
Diffstat (limited to 'utils')
-rw-r--r-- | utils/training/gen_unigram.cpp | 26 |
1 file changed, 12 insertions, 14 deletions
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp index f94c214..72bb2a8 100644 --- a/utils/training/gen_unigram.cpp +++ b/utils/training/gen_unigram.cpp @@ -35,23 +35,21 @@ int main(int argc, char * argv[]){ /* Note: please increase the value when corpus size becomes larger. * To avoid zero value when computing unigram frequency in float format. */ - guint32 freq = 1; PhraseIndexRange range; - int result = phrase_index.get_range(1, range); - if ( result == ERROR_OK ) { - for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) { - phrase_index.add_unigram_frequency(i, freq); + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * binfile = pinyin_phrase_files[i]; + if (NULL == binfile) + continue; + + guint32 freq = 1; PhraseIndexRange range; + int result = phrase_index.get_range(i, range); + if ( result == ERROR_OK ) { + for (size_t token = range.m_range_begin; + token <= range.m_range_end; ++token) { + phrase_index.add_unigram_frequency(token, freq); + } } } -#if 1 - result = phrase_index.get_range(2, range); - if ( result == ERROR_OK ) { - for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) { - phrase_index.add_unigram_frequency(i, freq); - } - } -#endif - if (!save_phrase_index(&phrase_index)) exit(ENOENT); |