diff options
| author | Peng Wu <alexepico@gmail.com> | 2010-09-10 14:39:40 +0800 |
|---|---|---|
| committer | Peng Wu <alexepico@gmail.com> | 2010-09-10 14:39:40 +0800 |
| commit | 598490883a327c0ab0c414e5851d8e749403492e (patch) | |
| tree | 182654ca1729fe761115c51222c6bf716aebd3f3 /src/segment/spseg.cpp | |
| parent | c8fbacc76996e693d734d0d5c94f63a7d6fe198a (diff) | |
| download | libpinyin-598490883a327c0ab0c414e5851d8e749403492e.tar.gz libpinyin-598490883a327c0ab0c414e5851d8e749403492e.tar.xz libpinyin-598490883a327c0ab0c414e5851d8e749403492e.zip | |
write the shortest path segment
Diffstat (limited to 'src/segment/spseg.cpp')
| -rw-r--r-- | src/segment/spseg.cpp | 110 |
1 files changed, 108 insertions, 2 deletions
diff --git a/src/segment/spseg.cpp b/src/segment/spseg.cpp index 6522b2e..238c732 100644 --- a/src/segment/spseg.cpp +++ b/src/segment/spseg.cpp @@ -26,6 +26,8 @@ #include "novel_types.h" #include "phrase_large_table.h" +/* graph shortest path sentence segment. */ + /* Note: * Currently libpinyin only supports ucs2 characters, as this is a * pre-processor tool for raw corpus, it will skip all sentences @@ -41,9 +43,87 @@ struct SegmentStep{ //use formula W = number of words. Zero handle means one word. size_t m_nword; //backtrace information, -1 one step backward. - gint m_backword_nstep; + gint m_backward_nstep; +public: + SegmentStep(){ + m_handle = 0; + m_phrase = NULL; + m_phrase_len = 0; + m_nword = UINT_MAX; + m_backward_nstep = -0; + } }; +bool backtrace(GArray * steps, glong phrase_len, GArray * strings); + +//Note: do not free phrase, as it is used by strings (array of segment). +bool segment(PhraseLargeTable * phrases, //Lookup Phrase + utf16_t * phrase, + glong phrase_len, + GArray * strings /* Array of Segment *. */){ + /* Prepare for shortest path segment dynamic programming. */ + GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); + SegmentStep step; + for ( glong i = 0; i < phrase_len + 1; ++i ){ + g_array_append_val(steps, step); + } + + for ( glong i = 0; i < phrase_len + 1; ++i ) { + SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i); + size_t nword = step_begin->m_nword; + for ( glong k = i + 1; k < phrase_len + 1; ++k ) { + size_t len = k - i; + utf16_t * cur_phrase = phrase + i; + + phrase_token_t token = 0; + int result = g_phrases->search(len, cur_phrase, token); + if ( !(result & SEARCH_OK) ){ + token = 0; + if ( 1 != len ) + continue; + } + ++nword; + SegmentStep * step_end = &g_array_index(steps, SegmentStep, k); + if ( nword < step_end->m_nword ) { + step_end->m_handle = token; + step_end->m_phrase = cur_phrase; + step_end->m_phrase_len = len; + step_end->m_nword = nword; + step_end->m_backward_nstep = i - k; + } + if ( !(result & SEARCH_CONTINUED) ) + break; + } + } + return backtrace(steps, phrase_len, strings); +} + +bool backtrace(GArray * steps, glong phrase_len, GArray * strings){ + //backtracing to get the result. + size_t cur_step = phrase_len; + g_array_set_size(strings, 0); + while ( cur_step ){ + SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step); + step->m_nword = 0; step->m_backward_nstep = 0; + g_array_append_val(strings, *step); + cur_step = cur_step + step->m_backward_nstep; + } + + //reverse the strings + for ( size_t i = 0; i < strings->len / 2; ++i ) { + SegmentStep * head, * tail; + head = &g_array_index(strings, SegmentStep, i); + tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i ); + SegmentStep tmp; + tmp = *head; + *head = *tail; + *tail = tmp; + } + + g_array_free(steps, TRUE); + return true; +} + void print_help(){ printf("Usage: mmseg [--generate-extra-enter]\n"); exit(1); @@ -93,7 +173,33 @@ int main(int argc, char * argv[]){ break; linebuf[strlen(linebuf) - 1] = '\0'; - + //check non-ucs2 characters + const glong num_of_chars = g_utf8_strlen(linebuf, -1); + glong len = 0; + utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL); + if ( len != num_of_chars ) { + fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf); + continue; + } + + //do segment stuff + GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); + segment(g_phrases, sentence, len, strings); + + //print out the split phrase + for ( glong i = 0; i < strings->len; ++i ) { + SegmentStep * step = &g_array_index(strings, SegmentStep, i); + char * string = g_utf16_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL); + printf("%s\n", string); + g_free(string); + } + + //print extra enter + if ( gen_extra_enter ) + printf("\n"); + + g_array_free(strings, TRUE); + g_free(sentence); } return 0; } |
