summary | refs | log | tree | commit | diff | stats
path: root/utils
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-05-31 10:58:06 +0800
committer Peng Wu <alexepico@gmail.com> 2011-05-31 10:58:06 +0800
commit ca16b0628857bd3cef581414a850108c56f782b1 (patch)
tree 3cd587d46dc2aadb2e415e8196f0c402ad56db35 /utils
parent 5f91468f3fb6a26b15a3ba417c4b1978646a17c4 (diff)
download libpinyin-ca16b0628857bd3cef581414a850108c56f782b1.tar.gz
libpinyin-ca16b0628857bd3cef581414a850108c56f782b1.tar.xz
libpinyin-ca16b0628857bd3cef581414a850108c56f782b1.zip
refine gen ngram and gen deleted ngram
Diffstat (limited to 'utils')
-rw-r--r-- utils/training/gen_deleted_ngram.cpp 58
-rw-r--r-- utils/training/gen_ngram.cpp 65
2 files changed, 65 insertions, 58 deletions
diff --git a/utils/training/gen_deleted_ngram.cpp b/utils/training/gen_deleted_ngram.cpp
index 1db0cf1..93986d6 100644
--- a/utils/training/gen_deleted_ngram.cpp
+++ b/utils/training/gen_deleted_ngram.cpp
@@ -88,33 +88,37 @@ int main(int argc, char * argv[]){
last_token = cur_token;
cur_token = token;
- if ( cur_token ){
- SingleGram * single_gram = NULL;
- if ( 0 == last_token ){
- if (train_pi_gram)
- bigram.load(sentence_start, single_gram);
- } else
- bigram.load(last_token, single_gram);
-
- if ( NULL == single_gram ){
- single_gram = new SingleGram;
- }
- guint32 freq, total_freq;
- //increase freq
- if (single_gram->get_freq(cur_token, freq))
- assert(single_gram->set_freq(cur_token, freq + 1));
- else
- assert(single_gram->insert_freq(cur_token, 1));
- //increase total freq
- single_gram->get_total_freq(total_freq);
- single_gram->set_total_freq(total_freq + 1);
- if ( 0 == last_token ){
- if ( train_pi_gram )
- bigram.store(sentence_start, single_gram);
- }else
- bigram.store(last_token, single_gram);
- delete single_gram;
- }
+
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !train_pi_gram )
+ continue;
+ last_token = sentence_start;
+ }
+
+ /* train bi-gram */
+ SingleGram * single_gram = NULL;
+ bigram.load(last_token, single_gram);
+
+ if ( NULL == single_gram ){
+ single_gram = new SingleGram;
+ }
+ guint32 freq, total_freq;
+ //increase freq
+ if (single_gram->get_freq(cur_token, freq))
+ assert(single_gram->set_freq(cur_token, freq + 1));
+ else
+ assert(single_gram->insert_freq(cur_token, 1));
+ //increase total freq
+ single_gram->get_total_freq(total_freq);
+ single_gram->set_total_freq(total_freq + 1);
+
+ bigram.store(last_token, single_gram);
+ delete single_gram;
}
free(linebuf);
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 071f8ab..f6827dc 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -101,37 +101,40 @@ int main(int argc, char * argv[]){
last_token = cur_token;
cur_token = token;
- if ( cur_token ){
- //training uni-gram
- phrase_index.add_unigram_frequency(cur_token, 1);
- }
- if ( cur_token ){
- SingleGram * single_gram = NULL;
- if ( 0 == last_token ){
- if (train_pi_gram)
- bigram.load(sentence_start, single_gram);
- } else
- bigram.load(last_token, single_gram);
-
- if ( NULL == single_gram ){
- single_gram = new SingleGram;
- }
- guint32 freq, total_freq;
- //increase freq
- if (single_gram->get_freq(cur_token, freq))
- assert(single_gram->set_freq(cur_token, freq + 1));
- else
- assert(single_gram->insert_freq(cur_token, 1));
- //increase total freq
- single_gram->get_total_freq(total_freq);
- single_gram->set_total_freq(total_freq + 1);
- if ( 0 == last_token ){
- if ( train_pi_gram )
- bigram.store(sentence_start, single_gram);
- }else
- bigram.store(last_token, single_gram);
- delete single_gram;
- }
+
+ /* skip null_token in second word. */
+ if ( null_token == cur_token )
+ continue;
+
+ /* skip pi-gram training. */
+ if ( null_token == last_token ){
+ if ( !train_pi_gram )
+ continue;
+ last_token = sentence_start;
+ }
+
+ //training uni-gram
+ phrase_index.add_unigram_frequency(cur_token, 1);
+
+ //train bi-gram
+ SingleGram * single_gram = NULL;
+ bigram.load(last_token, single_gram);
+
+ if ( NULL == single_gram ){
+ single_gram = new SingleGram;
+ }
+ guint32 freq, total_freq;
+ //increase freq
+ if (single_gram->get_freq(cur_token, freq))
+ assert(single_gram->set_freq(cur_token, freq + 1));
+ else
+ assert(single_gram->insert_freq(cur_token, 1));
+ //increase total freq
+ single_gram->get_total_freq(total_freq);
+ single_gram->set_total_freq(total_freq + 1);
+
+ bigram.store(last_token, single_gram);
+ delete single_gram;
}
free(linebuf);