summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2012-05-21 15:39:41 +0800
committer Peng Wu <alexepico@gmail.com> 2012-05-21 15:39:41 +0800
commit 1b3140958408a80d029d59835d612f1b76f592ab (patch)
tree c7ce156183812bc28c9e400e1043fdfc5e849f8a
parent c34f87b345d762882c265a638d306f10b21d4cf8 (diff)
download libpinyin-1b3140958408a80d029d59835d612f1b76f592ab.tar.gz
libpinyin-1b3140958408a80d029d59835d612f1b76f592ab.tar.xz
libpinyin-1b3140958408a80d029d59835d612f1b76f592ab.zip
update gen_unigram.cpp
-rw-r--r-- utils/training/gen_unigram.cpp 26
1 files changed, 12 insertions, 14 deletions
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
index f94c214..72bb2a8 100644
--- a/utils/training/gen_unigram.cpp
+++ b/utils/training/gen_unigram.cpp
@@ -35,23 +35,21 @@ int main(int argc, char * argv[]){
/* Note: please increase the value when corpus size becomes larger.
* To avoid zero value when computing unigram frequency in float format.
*/
- guint32 freq = 1; PhraseIndexRange range;
- int result = phrase_index.get_range(1, range);
- if ( result == ERROR_OK ) {
- for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) {
- phrase_index.add_unigram_frequency(i, freq);
+ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+ const char * binfile = pinyin_phrase_files[i];
+ if (NULL == binfile)
+ continue;
+
+ guint32 freq = 1; PhraseIndexRange range;
+ int result = phrase_index.get_range(i, range);
+ if ( result == ERROR_OK ) {
+ for (size_t token = range.m_range_begin;
+ token <= range.m_range_end; ++token) {
+ phrase_index.add_unigram_frequency(token, freq);
+ }
}
}
-#if 1
- result = phrase_index.get_range(2, range);
- if ( result == ERROR_OK ) {
- for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) {
- phrase_index.add_unigram_frequency(i, freq);
- }
- }
-#endif
-
if (!save_phrase_index(&phrase_index))
exit(ENOENT);