summary | refs | log | tree | commit | diff | stats
path: root/utils
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2012-09-03 16:54:15 +0800
committer: Peng Wu <alexepico@gmail.com> 2012-09-03 16:54:15 +0800
commit: 3a0be6c617ee70d6b2195882d105221a365f5193 (patch)
tree: 484b72a3b542d867c568c37dcd0e13b2bc9c184b /utils
parent: 5e42e9600b54b6db72df2f87196d4a7a6ba37fd0 (diff)
download: libpinyin-3a0be6c617ee70d6b2195882d105221a365f5193.tar.gz
libpinyin-3a0be6c617ee70d6b2195882d105221a365f5193.tar.xz
libpinyin-3a0be6c617ee70d6b2195882d105221a365f5193.zip
update gen_ngram.cpp
Diffstat (limited to 'utils')
-rw-r--r-- utils/training/gen_ngram.cpp 24
1 files changed, 15 insertions, 9 deletions
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 20b160c..af2311f 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -57,7 +57,7 @@ int main(int argc, char * argv[]){
++i;
}
- PhraseLargeTable phrases;
+ PhraseLargeTable2 phrases;
/* init phrase table */
MemoryChunk * chunk = new MemoryChunk;
chunk->load("phrase_index.bin");
@@ -69,6 +69,10 @@ int main(int argc, char * argv[]){
Bigram bigram;
bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
+
+ PhraseTokens tokens;
+ memset(tokens, 0, sizeof(PhraseTokens));
+ phrase_index.prepare_tokens(tokens);
char* linebuf = NULL;
size_t size = 0;
@@ -81,11 +85,12 @@ int main(int argc, char * argv[]){
glong phrase_len = 0;
ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);
- phrase_token_t token = 0;
+ phrase_token_t token = null_token;
if ( 0 != phrase_len ) {
- int result = phrases.search( phrase_len, phrase, token);
- if ( ! (result & SEARCH_OK) )
- token = 0;
+ int result = phrases.search( phrase_len, phrase, tokens);
+ int num = get_first_token(tokens, token);
+ if ( !(result & SEARCH_OK) )
+ token = null_token;
g_free(phrase);
phrase = NULL;
}
@@ -97,7 +102,7 @@ int main(int argc, char * argv[]){
if ( null_token == cur_token )
continue;
- //training uni-gram
+ /* training uni-gram */
phrase_index.add_unigram_frequency(cur_token, 1);
/* skip pi-gram training. */
@@ -107,7 +112,7 @@ int main(int argc, char * argv[]){
last_token = sentence_start;
}
- //train bi-gram
+ /* train bi-gram */
SingleGram * single_gram = NULL;
bigram.load(last_token, single_gram);
@@ -115,12 +120,12 @@ int main(int argc, char * argv[]){
single_gram = new SingleGram;
}
guint32 freq, total_freq;
- //increase freq
+ /* increase freq */
if (single_gram->get_freq(cur_token, freq))
assert(single_gram->set_freq(cur_token, freq + 1));
else
assert(single_gram->insert_freq(cur_token, 1));
- //increase total freq
+ /* increase total freq */
single_gram->get_total_freq(total_freq);
single_gram->set_total_freq(total_freq + 1);
@@ -128,6 +133,7 @@ int main(int argc, char * argv[]){
delete single_gram;
}
+ phrase_index.destroy_tokens(tokens);
free(linebuf);
if (!save_phrase_index(&phrase_index))