From 05f2dbbd28982192545fcfb2f38479d560987a90 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 18 Aug 2011 15:15:57 +0800 Subject: write pinyin apis --- src/pinyin.cpp | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/src/pinyin.cpp b/src/pinyin.cpp index 381bd1f..2711240 100644 --- a/src/pinyin.cpp +++ b/src/pinyin.cpp @@ -29,6 +29,7 @@ struct _pinyin_context_t{ char * m_user_dir; }; + pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ pinyin_context_t * context = new pinyin_context_t; @@ -133,7 +134,7 @@ bool pinyin_set_pinyin_keys(pinyin_context_t * context, g_array_set_size(context->m_constraints, context->m_pinyin_keys->len); for (size_t i = key_len; i < context->m_pinyin_keys->len; ++i ) { - lookup_constraint_t * constraint = + lookup_constraint_t * constraint = &g_array_index(context->m_constraints, lookup_constraint_t, i); constraint->m_type = NO_CONSTRAINT; } @@ -206,8 +207,82 @@ bool pinyin_parse_more_doubles(pinyin_context_t * context, return pinyin_len == parse_len; } +static gint compare_token( gconstpointer lhs, gconstpointer rhs){ + phrase_token_t token_lhs = *((phrase_token_t *)lhs); + phrase_token_t token_rhs = *((phrase_token_t *)rhs); + return token_lhs - token_rhs; +} + +static gint compare_token_with_unigram_freq(gconstpointer lhs, + gconstpointer rhs, + gpointer user_data){ + phrase_token_t token_lhs = *((phrase_token_t *)lhs); + phrase_token_t token_rhs = *((phrase_token_t *)rhs); + FacadePhraseIndex * phrase_index = + (FacadePhraseIndex *)user_data; + + PhraseItem item; + phrase_index->get_phrase_item(token_lhs, item); + guint32 freq_lhs = item.get_unigram_frequency(); + phrase_index->get_phrase_item(token_rhs, item); + guint32 freq_rhs = item.get_unigram_frequency(); + return freq_lhs - freq_rhs; +} + bool pinyin_get_candidates(pinyin_context_t * context, - size_t offset, TokenVector tokens); + size_t offset, TokenVector candidates){ + g_array_set_size(candidates, 0); + + PinyinKey * pinyin_keys = &g_array_index + (context->m_pinyin_keys, PinyinKey, offset); + size_t pinyin_len = context->m_pinyin_keys->len - offset; + + PhraseIndexRanges ranges; + size_t min_index = 1, max_index = 2; + memset(ranges, 0, sizeof(ranges)); + + for (size_t m = min_index; m <= max_index; ++m) { + ranges[m] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange)); + } + + GArray * tokens = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange)); + + for (ssize_t i = pinyin_len; i >= 1; --i) { + g_array_set_size(tokens, 0); + /* do pinyin search. */ + int retval = context->m_pinyin_table->search + (i, pinyin_keys, ranges); + /* reduce to a single GArray. */ + for (size_t m = min_index; m <= max_index; ++m) { + g_array_append_vals(tokens, ranges[m]->data, ranges[m]->len); + } + + g_array_sort(tokens, compare_token); + /* remove the duplicated items. */ + phrase_token_t last_token = null_token; + for ( size_t n = 0; n < tokens->len; ++n) { + phrase_token_t token = g_array_index(tokens, phrase_token_t, n); + if ( last_token == token ){ + g_array_remove_index(tokens, n); + } + last_token = token; + } + + /* sort the candidates of the same length by uni-gram freqs. */ + g_array_sort_with_data(tokens, compare_token_with_unigram_freq, + context->m_phrase_index); + + /* copy out candidates. */ + g_array_append_vals(candidates, tokens->data, tokens->len); + } + + g_array_free(tokens, TRUE); + for (size_t m = min_index; m <= max_index; ++m) { + g_array_free(ranges[m], TRUE); + } + + return true; +} bool pinyin_choose_candidate(pinyin_context_t * context, size_t offset, phrase_token_t token){ @@ -280,7 +355,9 @@ bool pinyin_reset(pinyin_context_t * context){ return true; } -/** TODO: to be implemented. +/** + * TODO: to be implemented. + * Note: prefix is the text before the pre-edit string. * bool pinyin_get_guessed_sentence_with_prefix(...); * bool pinyin_get_candidates_with_prefix(...); * For context-dependent order of the candidates list. -- cgit