summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-08-15 11:18:51 +0800
committerPeng Wu <alexepico@gmail.com>2011-08-15 12:59:44 +0800
commitaa0c5a5e74ddb8f3523d09dc67008afed304f2f2 (patch)
tree2c55a735790ca89516cf9b3fcf1178b4cc46e717
parent299dc559ff9b0747a4cf214b70773e61b06752c7 (diff)
downloadlibpinyin-aa0c5a5e74ddb8f3523d09dc67008afed304f2f2.tar.gz
libpinyin-aa0c5a5e74ddb8f3523d09dc67008afed304f2f2.tar.xz
libpinyin-aa0c5a5e74ddb8f3523d09dc67008afed304f2f2.zip
comments gen unigram
-rw-r--r--utils/training/gen_unigram.cpp24
1 files changed, 14 insertions, 10 deletions
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
index c2e91a7..ae18146 100644
--- a/utils/training/gen_unigram.cpp
+++ b/utils/training/gen_unigram.cpp
@@ -22,35 +22,39 @@
#include <stdio.h>
#include "pinyin.h"
-//increase all unigram frequency by one.
+
+/* increase all unigram frequency by a constant. */
int main(int argc, char * argv[]){
FacadePhraseIndex phrase_index;
- //gb_char binary file
+ /* gb_char binary file */
MemoryChunk * chunk = new MemoryChunk;
chunk->load("gb_char.bin");
phrase_index.load(1, chunk);
- //gbk_char binary file
+ /* gbk_char binary file */
chunk = new MemoryChunk;
chunk->load("gbk_char.bin");
phrase_index.load(2, chunk);
- PhraseIndexRange range;
+ /* Note: please increase this value as the corpus size grows, so that the
+ * unigram frequency does not become zero when computed in float format.
+ */
+ guint32 freq = 1; PhraseIndexRange range;
int result = phrase_index.get_range(1, range);
if ( result == ERROR_OK ) {
- for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i){
- phrase_index.add_unigram_frequency(i, 1);
+ for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) {
+ phrase_index.add_unigram_frequency(i, freq);
}
}
-#if 0
- int result = phrase_index.get_range(2, range);
+#if 1
+ result = phrase_index.get_range(2, range);
if ( result == ERROR_OK ) {
- for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i){
- phrase_index.add_unigram_frequency(i, 1);
+ for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) {
+ phrase_index.add_unigram_frequency(i, freq);
}
}
#endif