From b1bc974e89adc1ab13ea148686c7e2c2faf64e7a Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Mon, 3 Sep 2012 10:54:48 +0800 Subject: update segment to use phrase table ver2 --- utils/segment/ngseg.cpp | 12 ++++++++---- utils/segment/spseg.cpp | 34 +++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp index 994b54c..dd6df30 100644 --- a/utils/segment/ngseg.cpp +++ b/utils/segment/ngseg.cpp @@ -103,7 +103,7 @@ int main(int argc, char * argv[]){ } /* init phrase table */ - FacadePhraseTable phrase_table; + FacadePhraseTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk, NULL); @@ -125,7 +125,10 @@ int main(int argc, char * argv[]){ CONTEXT_STATE state, next_state; GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); - phrase_token_t token = null_token; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index.prepare_tokens(tokens); /* split the sentence */ char * linebuf = NULL; size_t size = 0; ssize_t read; @@ -151,7 +154,7 @@ int main(int argc, char * argv[]){ } state = CONTEXT_INIT; - bool result = phrase_table.search( 1, sentence, token); + bool result = phrase_table.search( 1, sentence, tokens); g_array_append_val( current_ucs4, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; @@ -159,7 +162,7 @@ int main(int argc, char * argv[]){ state = CONTEXT_UNKNOWN; for ( int i = 1; i < num_of_chars; ++i) { - bool result = phrase_table.search( 1, sentence + i, token); + bool result = phrase_table.search( 1, sentence + i, tokens); if ( result & SEARCH_OK ) next_state = CONTEXT_SEGMENTABLE; else @@ -197,6 +200,7 @@ int main(int argc, char * argv[]){ if ( gen_extra_enter ) printf("\n"); } + phrase_index.destroy_tokens(tokens); /* print enter at file tail */ printf("\n"); diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp index 4a03287..073a3d8 100644 --- a/utils/segment/spseg.cpp +++ b/utils/segment/spseg.cpp @@ -24,6 +24,7 @@ #include #include #include "pinyin_internal.h" +#include "utils_helper.h" /* graph shortest path sentence segment. */ @@ -43,7 +44,7 @@ struct SegmentStep{ gint m_backward_nstep; public: SegmentStep(){ - m_handle = 0; + m_handle = null_token; m_phrase = NULL; m_phrase_len = 0; m_nword = UINT_MAX; @@ -54,7 +55,8 @@ public: bool backtrace(GArray * steps, glong phrase_len, GArray * strings); //Note: do not free phrase, as it is used by strings (array of segment). -bool segment(PhraseLargeTable * phrases, //Lookup Phrase +bool segment(FacadePhraseTable2 * phrase_table, + FacadePhraseIndex * phrase_index, ucs4_t * phrase, glong phrase_len, GArray * strings /* Array of Segment *. */){ @@ -68,6 +70,10 @@ bool segment(PhraseLargeTable * phrases, //Lookup Phrase SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0); first_step->m_nword = 0; + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + for ( glong i = 0; i < phrase_len + 1; ++i ) { SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i); size_t nword = step_begin->m_nword; @@ -75,14 +81,17 @@ bool segment(PhraseLargeTable * phrases, //Lookup Phrase size_t len = k - i; ucs4_t * cur_phrase = phrase + i; - phrase_token_t token = 0; - int result = phrases->search(len, cur_phrase, token); + phrase_token_t token = null_token; + int result = phrase_table->search(len, cur_phrase, tokens); + int num = get_first_token(tokens, token); + if ( !(result & SEARCH_OK) ){ - token = 0; + token = null_token; if ( 1 != len ) continue; } ++nword; + SegmentStep * step_end = &g_array_index(steps, SegmentStep, k); if ( nword < step_end->m_nword ) { step_end->m_handle = token; @@ -95,6 +104,8 @@ bool segment(PhraseLargeTable * phrases, //Lookup Phrase break; } } + phrase_index->destroy_tokens(tokens); + return backtrace(steps, phrase_len, strings); } @@ -148,11 +159,16 @@ int main(int argc, char * argv[]){ ++i; } - //init phrase table - PhraseLargeTable phrase_table; + /* init phrase table */ + FacadePhraseTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); - phrase_table.load(chunk); + phrase_table.load(chunk, NULL); + + /* init phrase index */ + FacadePhraseIndex phrase_index; + if (!load_phrase_index(&phrase_index)) + exit(ENOENT); char * linebuf = NULL; size_t size = 0; @@ -174,7 +190,7 @@ int main(int argc, char * argv[]){ //do segment stuff GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); - segment(&phrase_table, sentence, len, strings); + segment(&phrase_table, &phrase_index, sentence, len, strings); //print out the split phrase for ( glong i = 0; i < strings->len; ++i ) { -- cgit