diff options
author | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-07-22 11:37:11 +0800 |
commit | b78429d78df745dd327b6dada6b9bd71ea5df84e (patch) | |
tree | 82c4625db8674c66d69fd566fce8efc347e3cb3a /src/pinyin.cpp | |
download | libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.gz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.tar.xz libzhuyin-b78429d78df745dd327b6dada6b9bd71ea5df84e.zip |
import libpinyin code
Diffstat (limited to 'src/pinyin.cpp')
-rw-r--r-- | src/pinyin.cpp | 2096 |
1 files changed, 2096 insertions, 0 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp new file mode 100644 index 0000000..95215ae --- /dev/null +++ b/src/pinyin.cpp @@ -0,0 +1,2096 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "pinyin.h" +#include <stdio.h> +#include <unistd.h> +#include <glib/gstdio.h> +#include "pinyin_internal.h" + + +using namespace pinyin; + +/* a glue layer for input method integration. */ + +typedef GArray * CandidateVector; /* GArray of lookup_candidate_t */ + +struct _pinyin_context_t{ + pinyin_option_t m_options; + + FullPinyinParser2 * m_full_pinyin_parser; + DoublePinyinParser2 * m_double_pinyin_parser; + ChewingParser2 * m_chewing_parser; + + FacadeChewingTable * m_pinyin_table; + FacadePhraseTable2 * m_phrase_table; + FacadePhraseIndex * m_phrase_index; + Bigram * m_system_bigram; + Bigram * m_user_bigram; + + PinyinLookup2 * m_pinyin_lookup; + PhraseLookup * m_phrase_lookup; + + char * m_system_dir; + char * m_user_dir; + bool m_modified; + + SystemTableInfo m_system_table_info; +}; + +struct _pinyin_instance_t{ + pinyin_context_t * m_context; + gchar * m_raw_full_pinyin; + TokenVector m_prefixes; + ChewingKeyVector m_pinyin_keys; + ChewingKeyRestVector m_pinyin_key_rests; + CandidateConstraints m_constraints; + MatchResults m_match_results; + CandidateVector m_candidates; +}; + +struct _lookup_candidate_t{ + lookup_candidate_type_t m_candidate_type; + gchar * m_phrase_string; + phrase_token_t m_token; + ChewingKeyRest m_orig_rest; + gchar * m_new_pinyins; + guint32 m_freq; /* the amplifed gfloat numerical value. */ +public: + _lookup_candidate_t() { + m_candidate_type = NORMAL_CANDIDATE; + m_phrase_string = NULL; + m_token = null_token; + m_new_pinyins = NULL; + m_freq = 0; + } +}; + +struct _import_iterator_t{ + pinyin_context_t * m_context; + guint8 m_phrase_index; +}; + + +static bool check_format(pinyin_context_t * context){ + const char * userdir = context->m_user_dir; + + UserTableInfo user_table_info; + gchar * filename = g_build_filename + (userdir, USER_TABLE_INFO, NULL); + user_table_info.load(filename); + g_free(filename); + + bool exists = user_table_info.is_conform + (&context->m_system_table_info); + + if (exists) + return exists; + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + /* clean up files, if version mis-matches. */ + for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (NOT_USED == table_info->m_file_type) + continue; + + if (NULL == table_info->m_user_filename) + continue; + + const char * userfilename = table_info->m_user_filename; + + /* remove dbin file. */ + filename = g_build_filename(userdir, userfilename, NULL); + unlink(filename); + g_free(filename); + } + + filename = g_build_filename + (userdir, USER_PINYIN_INDEX, NULL); + unlink(filename); + g_free(filename); + + filename = g_build_filename + (userdir, USER_PHRASE_INDEX, NULL); + unlink(filename); + g_free(filename); + + filename = g_build_filename + (userdir, USER_BIGRAM, NULL); + unlink(filename); + g_free(filename); + + return exists; +} + +static bool mark_version(pinyin_context_t * context){ + const char * userdir = context->m_user_dir; + + UserTableInfo user_table_info; + user_table_info.make_conform(&context->m_system_table_info); + + gchar * filename = g_build_filename + (userdir, USER_TABLE_INFO, NULL); + bool retval = user_table_info.save(filename); + g_free(filename); + + return retval; +} + +pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ + pinyin_context_t * context = new pinyin_context_t; + + context->m_options = USE_TONE; + + context->m_system_dir = g_strdup(systemdir); + context->m_user_dir = g_strdup(userdir); + context->m_modified = false; + + gchar * filename = g_build_filename + (context->m_system_dir, SYSTEM_TABLE_INFO, NULL); + if (!context->m_system_table_info.load(filename)) { + fprintf(stderr, "load %s failed!\n", filename); + return NULL; + } + g_free(filename); + + + check_format(context); + + context->m_full_pinyin_parser = new FullPinyinParser2; + context->m_double_pinyin_parser = new DoublePinyinParser2; + context->m_chewing_parser = new ChewingParser2; + + /* load chewing table. */ + context->m_pinyin_table = new FacadeChewingTable; + + /* load system chewing table. */ + MemoryChunk * chunk = new MemoryChunk; + filename = g_build_filename + (context->m_system_dir, SYSTEM_PINYIN_INDEX, NULL); + if (!chunk->load(filename)) { + fprintf(stderr, "open %s failed!\n", filename); + return NULL; + } + g_free(filename); + + /* load user chewing table */ + MemoryChunk * userchunk = new MemoryChunk; + filename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX, NULL); + if (!userchunk->load(filename)) { + /* hack here: use local Chewing Table to create empty memory chunk. */ + ChewingLargeTable table(context->m_options); + table.store(userchunk); + } + g_free(filename); + + context->m_pinyin_table->load(context->m_options, chunk, userchunk); + + /* load phrase table */ + context->m_phrase_table = new FacadePhraseTable2; + + /* load system phrase table */ + chunk = new MemoryChunk; + filename = g_build_filename + (context->m_system_dir, SYSTEM_PHRASE_INDEX, NULL); + if (!chunk->load(filename)) { + fprintf(stderr, "open %s failed!\n", filename); + return NULL; + } + g_free(filename); + + /* load user phrase table */ + userchunk = new MemoryChunk; + filename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX, NULL); + if (!userchunk->load(filename)) { + /* hack here: use local Phrase Table to create empty memory chunk. */ + PhraseLargeTable2 table; + table.store(userchunk); + } + g_free(filename); + + context->m_phrase_table->load(chunk, userchunk); + + context->m_phrase_index = new FacadePhraseIndex; + + /* hack here: directly call load phrase library. */ + pinyin_load_phrase_library(context, GB_DICTIONARY); + pinyin_load_phrase_library(context, MERGED_DICTIONARY); + + context->m_system_bigram = new Bigram; + filename = g_build_filename(context->m_system_dir, SYSTEM_BIGRAM, NULL); + context->m_system_bigram->attach(filename, ATTACH_READONLY); + g_free(filename); + + context->m_user_bigram = new Bigram; + filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); + context->m_user_bigram->load_db(filename); + g_free(filename); + + gfloat lambda = context->m_system_table_info.get_lambda(); + + context->m_pinyin_lookup = new PinyinLookup2 + ( lambda, context->m_options, + context->m_pinyin_table, context->m_phrase_index, + context->m_system_bigram, context->m_user_bigram); + + context->m_phrase_lookup = new PhraseLookup + (lambda, + context->m_phrase_table, context->m_phrase_index, + context->m_system_bigram, context->m_user_bigram); + + return context; +} + +bool pinyin_load_phrase_library(pinyin_context_t * context, + guint8 index){ + if (!(index < PHRASE_INDEX_LIBRARY_COUNT)) + return false; + + /* check whether the sub phrase index is already loaded. */ + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(index, range); + if (ERROR_OK == retval) + return false; + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + const pinyin_table_info_t * table_info = phrase_files + index; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + /* system phrase library */ + MemoryChunk * chunk = new MemoryChunk; + + const char * systemfilename = table_info->m_system_filename; + /* check bin file in system dir. */ + gchar * chunkfilename = g_build_filename(context->m_system_dir, + systemfilename, NULL); + chunk->load(chunkfilename); + g_free(chunkfilename); + + context->m_phrase_index->load(index, chunk); + + const char * userfilename = table_info->m_user_filename; + + chunkfilename = g_build_filename(context->m_user_dir, + userfilename, NULL); + + MemoryChunk * log = new MemoryChunk; + log->load(chunkfilename); + g_free(chunkfilename); + + /* merge the chunk log. */ + context->m_phrase_index->merge(index, log); + return true; + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + MemoryChunk * chunk = new MemoryChunk; + const char * userfilename = table_info->m_user_filename; + + gchar * chunkfilename = g_build_filename(context->m_user_dir, + userfilename, NULL); + + /* check bin file exists. if not, create a new one. */ + if (chunk->load(chunkfilename)) { + context->m_phrase_index->load(index, chunk); + } else { + delete chunk; + context->m_phrase_index->create_sub_phrase(index); + } + + g_free(chunkfilename); + return true; + } + + return false; +} + +bool pinyin_unload_phrase_library(pinyin_context_t * context, + guint8 index){ + /* gb_char.bin and merged.bin can't be unloaded. */ + if (GB_DICTIONARY == index || MERGED_DICTIONARY == index) + return false; + + assert(index < PHRASE_INDEX_LIBRARY_COUNT); + + context->m_phrase_index->unload(index); + return true; +} + +import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, + guint8 index){ + import_iterator_t * iter = new import_iterator_t; + iter->m_context = context; + iter->m_phrase_index = index; + return iter; +} + +bool pinyin_iterator_add_phrase(import_iterator_t * iter, + const char * phrase, + const char * pinyin, + gint count){ + /* if -1 == count, use the default value. */ + const gint default_count = 5; + const guint32 unigram_factor = 3; + if (-1 == count) + count = default_count; + + pinyin_context_t * & context = iter->m_context; + FacadePhraseTable2 * & phrase_table = context->m_phrase_table; + FacadeChewingTable * & pinyin_table = context->m_pinyin_table; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + bool result = false; + + if (NULL == phrase || NULL == pinyin) + return result; + + /* check whether the phrase exists in phrase table */ + glong len_phrase = 0; + ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL); + + pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; + FullPinyinParser2 parser; + ChewingKeyVector keys = + g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + /* parse the pinyin. */ + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + + if (len_phrase != keys->len) + return result; + + if (0 == len_phrase || len_phrase >= MAX_PHRASE_LENGTH) + return result; + + phrase_token_t token = null_token; + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + /* do phrase table search. */ + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + int retval = phrase_table->search(len_phrase, ucs4_phrase, tokens); + int num = reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + /* find the best token candidate. */ + for (size_t i = 0; i < tokenarray->len; ++i) { + phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i); + if (null_token == token) { + token = candidate; + continue; + } + + if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == iter->m_phrase_index) { + /* only one phrase string per sub phrase index. */ + assert(PHRASE_INDEX_LIBRARY_INDEX(token) != iter->m_phrase_index); + token = candidate; + continue; + } + } + g_array_free(tokenarray, TRUE); + + PhraseItem item; + /* check whether it exists in the same sub phrase index; */ + if (null_token != token && + PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) { + /* if so, remove the phrase, add the pinyin for the phrase item, + then add it back;*/ + phrase_index->get_phrase_item(token, item); + assert(len_phrase == item.get_phrase_length()); + ucs4_t tmp_phrase[MAX_PHRASE_LENGTH]; + item.get_phrase_string(tmp_phrase); + assert(0 == memcmp + (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase)); + + PhraseItem * removed_item = NULL; + retval = phrase_index->remove_phrase_item(token, removed_item); + if (ERROR_OK == retval) { + /* maybe check whether there are duplicated pronunciations here. */ + removed_item->add_pronunciation((ChewingKey *)keys->data, + count); + phrase_index->add_phrase_item(token, removed_item); + delete removed_item; + result = true; + } + } else { + /* if not exists in the same sub phrase index, + get the maximum token, + then add it directly with maximum token + 1; */ + PhraseIndexRange range; + retval = phrase_index->get_range(iter->m_phrase_index, range); + + if (ERROR_OK == retval) { + token = range.m_range_end; + if (0x00000000 == (token & PHRASE_MASK)) + token++; + + if (len_phrase == keys->len) { /* valid pinyin */ + phrase_table->add_index(len_phrase, ucs4_phrase, token); + pinyin_table->add_index + (keys->len, (ChewingKey *)(keys->data), token); + + item.set_phrase_string(len_phrase, ucs4_phrase); + item.add_pronunciation((ChewingKey *)(keys->data), count); + phrase_index->add_phrase_item(token, &item); + phrase_index->add_unigram_frequency(token, + count * unigram_factor); + result = true; + } + } + } + + g_array_free(key_rests, TRUE); + g_array_free(keys, TRUE); + g_free(ucs4_phrase); + return result; +} + +void pinyin_end_add_phrases(import_iterator_t * iter){ + /* compact the content memory chunk of phrase index. */ + iter->m_context->m_phrase_index->compact(); + iter->m_context->m_modified = true; + delete iter; +} + +bool pinyin_save(pinyin_context_t * context){ + if (!context->m_user_dir) + return false; + + if (!context->m_modified) + return false; + + context->m_phrase_index->compact(); + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + /* skip the reserved zero phrase library. */ + for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(i, range); + + if (ERROR_NO_SUB_PHRASE_INDEX == retval) + continue; + + const pinyin_table_info_t * table_info = phrase_files + i; + + if (NOT_USED == table_info->m_file_type) + continue; + + const char * userfilename = table_info->m_user_filename; + + if (NULL == userfilename) + continue; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + /* system phrase library */ + MemoryChunk * chunk = new MemoryChunk; + MemoryChunk * log = new MemoryChunk; + const char * systemfilename = table_info->m_system_filename; + + /* check bin file in system dir. */ + gchar * chunkfilename = g_build_filename(context->m_system_dir, + systemfilename, NULL); + chunk->load(chunkfilename); + g_free(chunkfilename); + context->m_phrase_index->diff(i, chunk, log); + + const char * userfilename = table_info->m_user_filename; + gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); + + gchar * tmppathname = g_build_filename(context->m_user_dir, + tmpfilename, NULL); + g_free(tmpfilename); + + gchar * chunkpathname = g_build_filename(context->m_user_dir, + userfilename, NULL); + log->save(tmppathname); + + int result = rename(tmppathname, chunkpathname); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmppathname, chunkpathname); + + g_free(chunkpathname); + g_free(tmppathname); + delete log; + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + MemoryChunk * chunk = new MemoryChunk; + context->m_phrase_index->store(i, chunk); + + const char * userfilename = table_info->m_user_filename; + gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); + gchar * tmppathname = g_build_filename(context->m_user_dir, + tmpfilename, NULL); + g_free(tmpfilename); + + gchar * chunkpathname = g_build_filename(context->m_user_dir, + userfilename, NULL); + + chunk->save(tmppathname); + + int result = rename(tmppathname, chunkpathname); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmppathname, chunkpathname); + + g_free(chunkpathname); + g_free(tmppathname); + delete chunk; + } + } + + /* save user pinyin table */ + gchar * tmpfilename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL); + unlink(tmpfilename); + gchar * filename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX, NULL); + + MemoryChunk * chunk = new MemoryChunk; + context->m_pinyin_table->store(chunk); + chunk->save(tmpfilename); + delete chunk; + + int result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + /* save user phrase table */ + tmpfilename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL); + unlink(tmpfilename); + filename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX, NULL); + + chunk = new MemoryChunk; + context->m_phrase_table->store(chunk); + chunk->save(tmpfilename); + delete chunk; + + result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + /* save user bi-gram */ + tmpfilename = g_build_filename + (context->m_user_dir, USER_BIGRAM ".tmp", NULL); + unlink(tmpfilename); + filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); + context->m_user_bigram->save_db(tmpfilename); + + result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + mark_version(context); + + context->m_modified = false; + return true; +} + +bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context, + DoublePinyinScheme scheme){ + context->m_double_pinyin_parser->set_scheme(scheme); + return true; +} + +bool pinyin_set_chewing_scheme(pinyin_context_t * context, + ChewingScheme scheme){ + context->m_chewing_parser->set_scheme(scheme); + return true; +} + +void pinyin_fini(pinyin_context_t * context){ + delete context->m_full_pinyin_parser; + delete context->m_double_pinyin_parser; + delete context->m_chewing_parser; + delete context->m_pinyin_table; + delete context->m_phrase_table; + delete context->m_phrase_index; + delete context->m_system_bigram; + delete context->m_user_bigram; + delete context->m_pinyin_lookup; + delete context->m_phrase_lookup; + + g_free(context->m_system_dir); + g_free(context->m_user_dir); + context->m_modified = false; + + delete context; +} + +bool pinyin_mask_out(pinyin_context_t * context, + phrase_token_t mask, + phrase_token_t value) { + + context->m_pinyin_table->mask_out(mask, value); + context->m_phrase_table->mask_out(mask, value); + context->m_user_bigram->mask_out(mask, value); + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + /* mask out the phrase index. */ + for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) { + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(index, range); + + if (ERROR_NO_SUB_PHRASE_INDEX == retval) + continue; + + const pinyin_table_info_t * table_info = phrase_files + index; + + if (NOT_USED == table_info->m_file_type) + continue; + + const char * userfilename = table_info->m_user_filename; + + if (NULL == userfilename) + continue; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + /* system phrase library */ + MemoryChunk * chunk = new MemoryChunk; + + const char * systemfilename = table_info->m_system_filename; + /* check bin file in system dir. */ + gchar * chunkfilename = g_build_filename(context->m_system_dir, + systemfilename, NULL); + chunk->load(chunkfilename); + g_free(chunkfilename); + + context->m_phrase_index->load(index, chunk); + + const char * userfilename = table_info->m_user_filename; + + chunkfilename = g_build_filename(context->m_user_dir, + userfilename, NULL); + + MemoryChunk * log = new MemoryChunk; + log->load(chunkfilename); + g_free(chunkfilename); + + /* merge the chunk log with mask. */ + context->m_phrase_index->merge_with_mask(index, log, mask, value); + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + context->m_phrase_index->mask_out(index, mask, value); + } + } + + context->m_phrase_index->compact(); + return true; +} + +/* copy from options to context->m_options. */ +bool pinyin_set_options(pinyin_context_t * context, + pinyin_option_t options){ + context->m_options = options; + context->m_pinyin_table->set_options(context->m_options); + context->m_pinyin_lookup->set_options(context->m_options); + return true; +} + + +pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){ + pinyin_instance_t * instance = new pinyin_instance_t; + instance->m_context = context; + + instance->m_raw_full_pinyin = NULL; + + instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + instance->m_pinyin_key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + instance->m_constraints = g_array_new + (TRUE, FALSE, sizeof(lookup_constraint_t)); + instance->m_match_results = + g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + instance->m_candidates = + g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); + + return instance; +} + +void pinyin_free_instance(pinyin_instance_t * instance){ + g_free(instance->m_raw_full_pinyin); + g_array_free(instance->m_prefixes, TRUE); + g_array_free(instance->m_pinyin_keys, TRUE); + g_array_free(instance->m_pinyin_key_rests, TRUE); + g_array_free(instance->m_constraints, TRUE); + g_array_free(instance->m_match_results, TRUE); + g_array_free(instance->m_candidates, TRUE); + + delete instance; +} + + +static bool pinyin_update_constraints(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + CandidateConstraints & constraints = instance->m_constraints; + + size_t key_len = constraints->len; + g_array_set_size(constraints, pinyin_keys->len); + for (size_t i = key_len; i < pinyin_keys->len; ++i ) { + lookup_constraint_t * constraint = + &g_array_index(constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + context->m_pinyin_lookup->validate_constraint + (constraints, pinyin_keys); + + return true; +} + + +bool pinyin_guess_sentence(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; + + g_array_set_size(instance->m_prefixes, 0); + g_array_append_val(instance->m_prefixes, sentence_start); + + pinyin_update_constraints(instance); + bool retval = context->m_pinyin_lookup->get_best_match + (instance->m_prefixes, + instance->m_pinyin_keys, + instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, + const char * prefix){ + pinyin_context_t * & context = instance->m_context; + + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + g_array_set_size(instance->m_prefixes, 0); + g_array_append_val(instance->m_prefixes, sentence_start); + + glong len_str = 0; + ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL); + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + if (ucs4_str && len_str) { + /* add prefixes. */ + for (ssize_t i = 1; i <= len_str; ++i) { + if (i > MAX_PHRASE_LENGTH) + break; + + ucs4_t * start = ucs4_str + len_str - i; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(tokens)); + phrase_index->prepare_tokens(tokens); + int result = context->m_phrase_table->search(i, start, tokens); + int num = reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + if (result & SEARCH_OK) + g_array_append_vals(instance->m_prefixes, + tokenarray->data, tokenarray->len); + } + } + g_array_free(tokenarray, TRUE); + g_free(ucs4_str); + + pinyin_update_constraints(instance); + bool retval = context->m_pinyin_lookup->get_best_match + (instance->m_prefixes, + instance->m_pinyin_keys, + instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_phrase_segment(pinyin_instance_t * instance, + const char * sentence){ + pinyin_context_t * & context = instance->m_context; + + const glong num_of_chars = g_utf8_strlen(sentence, -1); + glong ucs4_len = 0; + ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL); + + g_return_val_if_fail(num_of_chars == ucs4_len, FALSE); + + bool retval = context->m_phrase_lookup->get_best_match + (ucs4_len, ucs4_str, instance->m_match_results); + + g_free(ucs4_str); + return retval; +} + +/* the returned sentence should be freed by g_free(). */ +bool pinyin_get_sentence(pinyin_instance_t * instance, + char ** sentence){ + pinyin_context_t * & context = instance->m_context; + + bool retval = pinyin::convert_to_utf8 + (context->m_phrase_index, instance->m_match_results, + NULL, false, *sentence); + + return retval; +} + +bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, + const char * onepinyin, + ChewingKey * onekey){ + pinyin_context_t * & context = instance->m_context; + + int pinyin_len = strlen(onepinyin); + bool retval = context->m_full_pinyin_parser->parse_one_key + ( context->m_options, *onekey, onepinyin, pinyin_len); + return retval; +} + +size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, + const char * pinyins){ + pinyin_context_t * & context = instance->m_context; + + g_free(instance->m_raw_full_pinyin); + instance->m_raw_full_pinyin = g_strdup(pinyins); + int pinyin_len = strlen(pinyins); + + int parse_len = context->m_full_pinyin_parser->parse + ( context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, pinyins, pinyin_len); + + return parse_len; +} + +bool pinyin_parse_double_pinyin(pinyin_instance_t * instance, + const char * onepinyin, + ChewingKey * onekey){ + pinyin_context_t * & context = instance->m_context; + + int pinyin_len = strlen(onepinyin); + bool retval = context->m_double_pinyin_parser->parse_one_key + ( context->m_options, *onekey, onepinyin, pinyin_len); + return retval; +} + +size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance, + const char * pinyins){ + pinyin_context_t * & context = instance->m_context; + int pinyin_len = strlen(pinyins); + + int parse_len = context->m_double_pinyin_parser->parse + ( context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, pinyins, pinyin_len); + + return parse_len; +} + +bool pinyin_parse_chewing(pinyin_instance_t * instance, + const char * onechewing, + ChewingKey * onekey){ + pinyin_context_t * & context = instance->m_context; + + int chewing_len = strlen(onechewing); + bool retval = context->m_chewing_parser->parse_one_key + ( context->m_options, *onekey, onechewing, chewing_len ); + return retval; +} + +size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, + const char * chewings){ + pinyin_context_t * & context = instance->m_context; + int chewing_len = strlen(chewings); + + int parse_len = context->m_chewing_parser->parse + ( context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, chewings, chewing_len); + + return parse_len; +} + +bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, + const char key, const char ** symbol) { + pinyin_context_t * & context = instance->m_context; + return context->m_chewing_parser->in_chewing_scheme + (context->m_options, key, symbol); +} + +#if 0 +static gint compare_item_with_token(gconstpointer lhs, + gconstpointer rhs) { + lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; + lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; + + phrase_token_t token_lhs = item_lhs->m_token; + phrase_token_t token_rhs = item_rhs->m_token; + + return (token_lhs - token_rhs); +} +#endif + +static gint compare_item_with_frequency(gconstpointer lhs, + gconstpointer rhs) { + lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; + lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; + + guint32 freq_lhs = item_lhs->m_freq; + guint32 freq_rhs = item_rhs->m_freq; + + return -(freq_lhs - freq_rhs); /* in descendant order */ +} + +static phrase_token_t _get_previous_token(pinyin_instance_t * instance, + size_t offset) { + phrase_token_t prev_token = null_token; + ssize_t i; + + if (0 == offset) { + /* get previous token from prefixes. */ + prev_token = sentence_start; + size_t prev_token_len = 0; + + pinyin_context_t * context = instance->m_context; + TokenVector prefixes = instance->m_prefixes; + PhraseItem item; + + for (size_t i = 0; i < prefixes->len; ++i) { + phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); + if (sentence_start == token) + continue; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK == retval) { + size_t token_len = item.get_phrase_length(); + if (token_len > prev_token_len) { + /* found longer match, and save it. */ + prev_token = token; + prev_token_len = token_len; + } + } + } + } else { + /* get previous token from match results. */ + assert (0 < offset); + + phrase_token_t cur_token = g_array_index + (instance->m_match_results, phrase_token_t, offset); + if (null_token != cur_token) { + for (i = offset - 1; i >= 0; --i) { + cur_token = g_array_index + (instance->m_match_results, phrase_token_t, i); + if (null_token != cur_token) { + prev_token = cur_token; + break; + } + } + } + } + + return prev_token; +} + +static void _append_items(pinyin_context_t * context, + PhraseIndexRanges ranges, + lookup_candidate_t * template_item, + CandidateVector items) { + /* reduce and append to a single GArray. */ + for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) { + if (NULL == ranges[m]) + continue; + + for (size_t n = 0; n < ranges[m]->len; ++n) { + PhraseIndexRange * range = + &g_array_index(ranges[m], PhraseIndexRange, n); + for (size_t k = range->m_range_begin; + k < range->m_range_end; ++k) { + lookup_candidate_t item; + item.m_candidate_type = template_item->m_candidate_type; + item.m_token = k; + item.m_orig_rest = template_item->m_orig_rest; + item.m_new_pinyins = g_strdup(template_item->m_new_pinyins); + item.m_freq = template_item->m_freq; + g_array_append_val(items, item); + } + } + } +} + +#if 0 +static void _remove_duplicated_items(CandidateVector items) { + /* remove the duplicated items. */ + phrase_token_t last_token = null_token, saved_token; + for (size_t n = 0; n < items->len; ++n) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, n); + + saved_token = item->m_token; + if (last_token == saved_token) { + g_array_remove_index(items, n); + n--; + } + last_token = saved_token; + } +} +#endif + +static void _compute_frequency_of_items(pinyin_context_t * context, + phrase_token_t prev_token, + SingleGram * merged_gram, + CandidateVector items) { + pinyin_option_t & options = context->m_options; + ssize_t i; + + PhraseItem cached_item; + /* compute all freqs. */ + for (i = 0; i < items->len; ++i) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, i); + phrase_token_t & token = item->m_token; + + gfloat bigram_poss = 0; guint32 total_freq = 0; + if (options & DYNAMIC_ADJUST) { + if (null_token != prev_token) { + guint32 bigram_freq = 0; + merged_gram->get_total_freq(total_freq); + merged_gram->get_freq(token, bigram_freq); + if (0 != total_freq) + bigram_poss = bigram_freq / (gfloat)total_freq; + } + } + + /* compute the m_freq. */ + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + phrase_index->get_phrase_item(token, cached_item); + total_freq = phrase_index->get_phrase_index_total_freq(); + assert (0 < total_freq); + + gfloat lambda = context->m_system_table_info.get_lambda(); + + /* Note: possibility value <= 1.0. */ + guint32 freq = (lambda * bigram_poss + + (1 - lambda) * + cached_item.get_unigram_frequency() / + (gfloat) total_freq) * 256 * 256 * 256; + item->m_freq = freq; + } +} + +static bool _prepend_sentence_candidate(pinyin_instance_t * instance, + CandidateVector candidates) { + /* check whether the best match candidate exists. */ + gchar * sentence = NULL; + pinyin_get_sentence(instance, &sentence); + if (NULL == sentence) + return false; + g_free(sentence); + + /* prepend best match candidate to candidates. */ + lookup_candidate_t candidate; + candidate.m_candidate_type = BEST_MATCH_CANDIDATE; + g_array_prepend_val(candidates, candidate); + + return true; +} + +static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance, + size_t offset, + CandidateVector candidates) { + /* populate m_phrase_string in lookup_candidate_t. */ + + for(size_t i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + + switch(candidate->m_candidate_type) { + case BEST_MATCH_CANDIDATE: { + gchar * sentence = NULL; + pinyin_get_sentence(instance, &sentence); + candidate->m_phrase_string = g_strdup + (g_utf8_offset_to_pointer(sentence, offset)); + g_free(sentence); + break; + } + case NORMAL_CANDIDATE: + case DIVIDED_CANDIDATE: + case RESPLIT_CANDIDATE: + pinyin_token_get_phrase + (instance, candidate->m_token, NULL, + &(candidate->m_phrase_string)); + break; + case ZOMBIE_CANDIDATE: + break; + } + } + + return true; +} + +static gint compare_indexed_item_with_phrase_string(gconstpointer lhs, + gconstpointer rhs, + gpointer userdata) { + size_t index_lhs = *((size_t *) lhs); + size_t index_rhs = *((size_t *) rhs); + CandidateVector candidates = (CandidateVector) userdata; + + lookup_candidate_t * candidate_lhs = + &g_array_index(candidates, lookup_candidate_t, index_lhs); + lookup_candidate_t * candidate_rhs = + &g_array_index(candidates, lookup_candidate_t, index_rhs); + + return -strcmp(candidate_lhs->m_phrase_string, + candidate_rhs->m_phrase_string); /* in descendant order */ +} + + +static bool _remove_duplicated_items_by_phrase_string +(pinyin_instance_t * instance, + CandidateVector candidates) { + size_t i; + /* create the GArray of indexed item */ + GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t)); + for (i = 0; i < candidates->len; ++i) + g_array_append_val(indices, i); + + /* sort the indices array by phrase array */ + g_array_sort_with_data + (indices, compare_indexed_item_with_phrase_string, candidates); + + /* mark duplicated items as zombie candidate */ + lookup_candidate_t * cur_item, * saved_item = NULL; + for (i = 0; i < indices->len; ++i) { + size_t cur_index = g_array_index(indices, size_t, i); + cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index); + + /* handle the first candidate */ + if (NULL == saved_item) { + saved_item = cur_item; + continue; + } + + if (0 == strcmp(saved_item->m_phrase_string, + cur_item->m_phrase_string)) { + /* found duplicated candidates */ + + /* keep best match candidate */ + if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + continue; + } + + if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + continue; + } + + /* keep the higher possiblity one + to quickly move the word forward in the candidate list */ + if (cur_item->m_freq > saved_item->m_freq) { + /* find better candidate */ + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + continue; + } else { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + continue; + } + } else { + /* keep the current candidate */ + saved_item = cur_item; + } + } + + g_array_free(indices, TRUE); + + /* remove zombie candidate from the returned candidates */ + for (i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + + if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) { + g_free(candidate->m_phrase_string); + g_free(candidate->m_new_pinyins); + g_array_remove_index(candidates, i); + i--; + } + } + + return true; +} + +static bool _free_candidates(CandidateVector candidates) { + /* free candidates */ + for (size_t i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + g_free(candidate->m_phrase_string); + g_free(candidate->m_new_pinyins); + } + g_array_set_size(candidates, 0); + + return true; +} + +bool pinyin_guess_candidates(pinyin_instance_t * instance, + size_t offset) { + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + + _free_candidates(instance->m_candidates); + + size_t pinyin_len = pinyin_keys->len - offset; + ssize_t i; + + /* lookup the previous token here. */ + phrase_token_t prev_token = null_token; + + if (options & DYNAMIC_ADJUST) { + prev_token = _get_previous_token(instance, offset); + } + + SingleGram merged_gram; + SingleGram * system_gram = NULL, * user_gram = NULL; + + if (options & DYNAMIC_ADJUST) { + if (null_token != prev_token) { + context->m_system_bigram->load(prev_token, system_gram); + context->m_user_bigram->load(prev_token, user_gram); + merge_single_gram(&merged_gram, system_gram, user_gram); + } + } + + PhraseIndexRanges ranges; + memset(ranges, 0, sizeof(ranges)); + context->m_phrase_index->prepare_ranges(ranges); + + GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); + + for (i = pinyin_len; i >= 1; --i) { + g_array_set_size(items, 0); + + ChewingKey * keys = &g_array_index + (pinyin_keys, ChewingKey, offset); + + /* do pinyin search. */ + int retval = context->m_pinyin_table->search + (i, keys, ranges); + + if ( !(retval & SEARCH_OK) ) + continue; + + lookup_candidate_t template_item; + _append_items(context, ranges, &template_item, items); + +#if 0 + g_array_sort(items, compare_item_with_token); + + _remove_duplicated_items(items); +#endif + + _compute_frequency_of_items(context, prev_token, &merged_gram, items); + + /* sort the candidates of the same length by frequency. */ + g_array_sort(items, compare_item_with_frequency); + + /* transfer back items to tokens, and save it into candidates */ + for (size_t k = 0; k < items->len; ++k) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, k); + g_array_append_val(instance->m_candidates, *item); + } + +#if 0 + if (!(retval & SEARCH_CONTINUED)) + break; +#endif + } + + g_array_free(items, TRUE); + context->m_phrase_index->destroy_ranges(ranges); + if (system_gram) + delete system_gram; + if (user_gram) + delete user_gram; + + /* post process to remove duplicated candidates */ + + _prepend_sentence_candidate(instance, instance->m_candidates); + + _compute_phrase_strings_of_items(instance, offset, instance->m_candidates); + + _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); + + return true; +} + + +static bool _try_divided_table(pinyin_instance_t * instance, + PhraseIndexRanges ranges, + size_t offset, + CandidateVector items){ + bool found = false; + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; + + assert(pinyin_keys->len == pinyin_key_rests->len); + guint num_keys = pinyin_keys->len; + assert(offset < num_keys); + + /* handle "^xian$" -> "xi'an" here */ + ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset); + ChewingKeyRest * rest = &g_array_index(pinyin_key_rests, + ChewingKeyRest, offset); + ChewingKeyRest orig_rest = *rest; + guint16 tone = CHEWING_ZERO_TONE; + + const divided_table_item_t * item = NULL; + + /* back up tone */ + if (options & USE_TONE) { + tone = key->m_tone; + if (CHEWING_ZERO_TONE != tone) { + key->m_tone = CHEWING_ZERO_TONE; + rest->m_raw_end --; + } + } + + item = context->m_full_pinyin_parser->retrieve_divided_item + (options, key, rest, instance->m_raw_full_pinyin, + strlen(instance->m_raw_full_pinyin)); + + if (item) { + /* no ops */ + assert(item->m_new_freq > 0); + + ChewingKey divided_keys[2]; + const char * pinyin = item->m_new_keys[0]; + assert(context->m_full_pinyin_parser-> + parse_one_key(options, divided_keys[0], + pinyin, strlen(pinyin))); + pinyin = item->m_new_keys[1]; + assert(context->m_full_pinyin_parser-> + parse_one_key(options, divided_keys[1], + pinyin, strlen(pinyin))); + + gchar * new_pinyins = g_strdup_printf + ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]); + + /* propagate the tone */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != tone) { + assert(0 < tone && tone <= 5); + divided_keys[1].m_tone = tone; + + gchar * tmp_str = g_strdup_printf + ("%s%d", new_pinyins, tone); + g_free(new_pinyins); + new_pinyins = tmp_str; + } + } + + /* do pinyin search. */ + int retval = context->m_pinyin_table->search + (2, divided_keys, ranges); + + if (retval & SEARCH_OK) { + lookup_candidate_t template_item; + template_item.m_candidate_type = DIVIDED_CANDIDATE; + template_item.m_orig_rest = orig_rest; + template_item.m_new_pinyins = new_pinyins; + + _append_items(context, ranges, &template_item, items); + found = true; + } + g_free(new_pinyins); + } + + /* restore tones */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != tone) { + key->m_tone = tone; + rest->m_raw_end ++; + } + } + + return found; +} + +static bool _try_resplit_table(pinyin_instance_t * instance, + PhraseIndexRanges ranges, + size_t offset, + CandidateVector items){ + bool found = false; + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; + + assert(pinyin_keys->len == pinyin_key_rests->len); + guint num_keys = pinyin_keys->len; + assert(offset + 1 < num_keys); + + guint16 next_tone = CHEWING_ZERO_TONE; + + /* handle "^fa'nan$" -> "fan'an" here */ + ChewingKeyRest * cur_rest = &g_array_index(pinyin_key_rests, + ChewingKeyRest, offset); + ChewingKeyRest * next_rest = &g_array_index(pinyin_key_rests, + ChewingKeyRest, offset + 1); + /* some "'" here */ + if (cur_rest->m_raw_end != next_rest->m_raw_begin) + return found; + + ChewingKey * cur_key = &g_array_index(pinyin_keys, ChewingKey, offset); + ChewingKey * next_key = &g_array_index(pinyin_keys, ChewingKey, + offset + 1); + + /* some tone here */ + if (CHEWING_ZERO_TONE != cur_key->m_tone) + return found; + + ChewingKeyRest orig_rest; + orig_rest.m_raw_begin = cur_rest->m_raw_begin; + orig_rest.m_raw_end = next_rest->m_raw_end; + + /* backup tone */ + if (options & USE_TONE) { + next_tone = next_key->m_tone; + if (CHEWING_ZERO_TONE != next_tone) { + next_key->m_tone = CHEWING_ZERO_TONE; + next_rest->m_raw_end --; + } + } + + /* lookup re-split table */ + const char * str = instance->m_raw_full_pinyin; + const resplit_table_item_t * item_by_orig = + context->m_full_pinyin_parser-> + retrieve_resplit_item_by_original_pinyins + (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str)); + + const resplit_table_item_t * item_by_new = + context->m_full_pinyin_parser-> + retrieve_resplit_item_by_resplit_pinyins + (options, cur_key, cur_rest, next_key, next_rest, str, strlen(str)); + + /* there are no same couple of pinyins in re-split table. */ + assert(!(item_by_orig && item_by_new)); + + ChewingKey resplit_keys[2]; + const char * pinyins[2]; + + bool tosearch = false; + if (item_by_orig && item_by_orig->m_new_freq) { + pinyins[0] = item_by_orig->m_new_keys[0]; + pinyins[1] = item_by_orig->m_new_keys[1]; + + assert(context->m_full_pinyin_parser-> + parse_one_key(options, resplit_keys[0], + pinyins[0], strlen(pinyins[0]))); + + assert(context->m_full_pinyin_parser-> + parse_one_key(options, resplit_keys[1], + pinyins[1], strlen(pinyins[1]))); + tosearch = true; + } + + if (item_by_new && item_by_new->m_orig_freq) { + pinyins[0] = item_by_new->m_orig_keys[0]; + pinyins[1] = item_by_new->m_orig_keys[1]; + + assert(context->m_full_pinyin_parser-> + parse_one_key(options, resplit_keys[0], + pinyins[0], strlen(pinyins[0]))); + + assert(context->m_full_pinyin_parser-> + parse_one_key(options, resplit_keys[1], + pinyins[1], strlen(pinyins[1]))); + tosearch = true; + } + + if (tosearch) { + gchar * new_pinyins = g_strdup_printf + ("%s'%s", pinyins[0], pinyins[1]); + + /* propagate the tone */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != next_tone) { + assert(0 < next_tone && next_tone <= 5); + resplit_keys[1].m_tone = next_tone; + + gchar * tmp_str = g_strdup_printf + ("%s%d", new_pinyins, next_tone); + g_free(new_pinyins); + new_pinyins = tmp_str; + } + } + + /* do pinyin search. */ + int retval = context->m_pinyin_table->search + (2, resplit_keys, ranges); + + if (retval & SEARCH_OK) { + lookup_candidate_t template_item; + template_item.m_candidate_type = RESPLIT_CANDIDATE; + template_item.m_orig_rest = orig_rest; + template_item.m_new_pinyins = new_pinyins; + + _append_items(context, ranges, &template_item, items); + found = true; + } + g_free(new_pinyins); + } + + /* restore tones */ + if (options & USE_TONE) { + if (CHEWING_ZERO_TONE != next_tone) { + next_key->m_tone = next_tone; + next_rest->m_raw_end ++; + } + } + + return found; +} + +bool pinyin_guess_full_pinyin_candidates(pinyin_instance_t * instance, + size_t offset){ + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + + _free_candidates(instance->m_candidates); + + size_t pinyin_len = pinyin_keys->len - offset; + pinyin_len = std_lite::min((size_t)MAX_PHRASE_LENGTH, pinyin_len); + ssize_t i; + + /* lookup the previous token here. */ + phrase_token_t prev_token = null_token; + + if (options & DYNAMIC_ADJUST) { + prev_token = _get_previous_token(instance, offset); + } + + SingleGram merged_gram; + SingleGram * system_gram = NULL, * user_gram = NULL; + + if (options & DYNAMIC_ADJUST) { + if (null_token != prev_token) { + context->m_system_bigram->load(prev_token, system_gram); + context->m_user_bigram->load(prev_token, user_gram); + merge_single_gram(&merged_gram, system_gram, user_gram); + } + } + + PhraseIndexRanges ranges; + memset(ranges, 0, sizeof(ranges)); + context->m_phrase_index->prepare_ranges(ranges); + + GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); + + if (1 == pinyin_len) { + /* because there is only one pinyin left, + * the following for-loop will not produce 2 character candidates. + * the if-branch will fill the candidate list with + * 2 character candidates. + */ + + if (options & USE_DIVIDED_TABLE) { + g_array_set_size(items, 0); + + if (_try_divided_table(instance, ranges, offset, items)) { + +#if 0 + g_array_sort(items, compare_item_with_token); + + _remove_duplicated_items(items); +#endif + + _compute_frequency_of_items(context, prev_token, + &merged_gram, items); + + /* sort the candidates of the same length by frequency. */ + g_array_sort(items, compare_item_with_frequency); + + /* transfer back items to tokens, and save it into candidates */ + for (i = 0; i < items->len; ++i) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, i); + g_array_append_val(instance->m_candidates, *item); + } + } + } + } + + for (i = pinyin_len; i >= 1; --i) { + bool found = false; + g_array_set_size(items, 0); + + if (2 == i) { + /* handle fuzzy pinyin segment here. */ + if (options & USE_DIVIDED_TABLE) { + found = _try_divided_table(instance, ranges, offset, items) || + found; + } + if (options & USE_RESPLIT_TABLE) { + found = _try_resplit_table(instance, ranges, offset, items) || + found; + } + } + + ChewingKey * keys = &g_array_index + (pinyin_keys, ChewingKey, offset); + + /* do pinyin search. */ + int retval = context->m_pinyin_table->search + (i, keys, ranges); + + found = (retval & SEARCH_OK) || found; + + if ( !found ) + continue; + + lookup_candidate_t template_item; + _append_items(context, ranges, &template_item, items); + +#if 0 + g_array_sort(items, compare_item_with_token); + + _remove_duplicated_items(items); +#endif + + _compute_frequency_of_items(context, prev_token, &merged_gram, items); + + g_array_sort(items, compare_item_with_frequency); + + for (size_t k = 0; k < items->len; ++k) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, k); + g_array_append_val(instance->m_candidates, *item); + } + +#if 0 + if (!(retval & SEARCH_CONTINUED)) + break; +#endif + } + + g_array_free(items, TRUE); + context->m_phrase_index->destroy_ranges(ranges); + if (system_gram) + delete system_gram; + if (user_gram) + delete user_gram; + + /* post process to remove duplicated candidates */ + + _prepend_sentence_candidate(instance, instance->m_candidates); + + _compute_phrase_strings_of_items(instance, offset, instance->m_candidates); + + _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); + + return true; +} + + +int pinyin_choose_candidate(pinyin_instance_t * instance, + size_t offset, + lookup_candidate_t * candidate){ + pinyin_context_t * & context = instance->m_context; + + if (DIVIDED_CANDIDATE == candidate->m_candidate_type || + RESPLIT_CANDIDATE == candidate->m_candidate_type) { + /* update full pinyin. */ + gchar * oldpinyins = instance->m_raw_full_pinyin; + const ChewingKeyRest rest = candidate->m_orig_rest; + oldpinyins[rest.m_raw_begin] = '\0'; + const gchar * left_part = oldpinyins; + const gchar * right_part = oldpinyins + rest.m_raw_end; + gchar * newpinyins = g_strconcat(left_part, candidate->m_new_pinyins, + right_part, NULL); + g_free(oldpinyins); + instance->m_raw_full_pinyin = newpinyins; + + /* re-parse the full pinyin. */ + const gchar * pinyins = instance->m_raw_full_pinyin; + int pinyin_len = strlen(pinyins); + int parse_len = context->m_full_pinyin_parser->parse + (context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, pinyins, pinyin_len); + + /* Note: there may be some un-parsable input here. */ + } + + /* sync m_constraints to the length of m_pinyin_keys. */ + bool retval = context->m_pinyin_lookup->validate_constraint + (instance->m_constraints, instance->m_pinyin_keys); + + phrase_token_t token = candidate->m_token; + guint8 len = context->m_pinyin_lookup->add_constraint + (instance->m_constraints, offset, token); + + /* safe guard: validate the m_constraints again. */ + retval = context->m_pinyin_lookup->validate_constraint + (instance->m_constraints, instance->m_pinyin_keys) && len; + + return offset + len; +} + +bool pinyin_clear_constraint(pinyin_instance_t * instance, + size_t offset){ + pinyin_context_t * & context = instance->m_context; + + bool retval = context->m_pinyin_lookup->clear_constraint + (instance->m_constraints, offset); + + return retval; +} + +bool pinyin_lookup_tokens(pinyin_instance_t * instance, + const char * phrase, GArray * tokenarray){ + pinyin_context_t * & context = instance->m_context; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + glong ucs4_len = 0; + ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens); + int num = reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + return SEARCH_OK & retval; +} + +bool pinyin_train(pinyin_instance_t * instance){ + if (!instance->m_context->m_user_dir) + return false; + + pinyin_context_t * & context = instance->m_context; + context->m_modified = true; + + bool retval = context->m_pinyin_lookup->train_result2 + (instance->m_pinyin_keys, instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_reset(pinyin_instance_t * instance){ + g_free(instance->m_raw_full_pinyin); + instance->m_raw_full_pinyin = NULL; + + g_array_set_size(instance->m_prefixes, 0); + g_array_set_size(instance->m_pinyin_keys, 0); + g_array_set_size(instance->m_pinyin_key_rests, 0); + g_array_set_size(instance->m_constraints, 0); + g_array_set_size(instance->m_match_results, 0); + _free_candidates(instance->m_candidates); + + return true; +} + +bool pinyin_get_chewing_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str) { + *utf8_str = NULL; + if (0 == key->get_table_index()) + return false; + + *utf8_str = key->get_chewing_string(); + return true; +} + +bool pinyin_get_pinyin_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str) { + *utf8_str = NULL; + if (0 == key->get_table_index()) + return false; + + *utf8_str = key->get_pinyin_string(); + return true; +} + +bool pinyin_get_pinyin_strings(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** shengmu, + gchar ** yunmu) { + if (0 == key->get_table_index()) + return false; + + if (shengmu) + *shengmu = key->get_shengmu_string(); + if (yunmu) + *yunmu = key->get_yunmu_string(); + return true; +} + +bool pinyin_token_get_phrase(pinyin_instance_t * instance, + phrase_token_t token, + guint * len, + gchar ** utf8_str) { + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + ucs4_t buffer[MAX_PHRASE_LENGTH]; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + item.get_phrase_string(buffer); + guint length = item.get_phrase_length(); + if (len) + *len = length; + if (utf8_str) + *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + return true; +} + +bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint * num){ + *num = 0; + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + *num = item.get_n_pronunciation(); + return true; +} + +bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint nth, + ChewingKeyVector keys){ + g_array_set_size(keys, 0); + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + ChewingKey buffer[MAX_PHRASE_LENGTH]; + guint32 freq = 0; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + item.get_nth_pronunciation(nth, buffer, freq); + guint8 len = item.get_phrase_length(); + g_array_append_vals(keys, buffer, len); + return true; +} + +bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint * freq) { + *freq = 0; + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + *freq = item.get_unigram_frequency(); + return true; +} + +bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint delta){ + pinyin_context_t * & context = instance->m_context; + int retval = context->m_phrase_index->add_unigram_frequency + (token, delta); + return ERROR_OK == retval; +} + +bool pinyin_get_n_candidate(pinyin_instance_t * instance, + guint * num) { + *num = instance->m_candidates->len; + return true; +} + +bool pinyin_get_candidate(pinyin_instance_t * instance, + guint index, + lookup_candidate_t ** candidate) { + CandidateVector & candidates = instance->m_candidates; + + *candidate = NULL; + + if (index >= candidates->len) + return false; + + *candidate = &g_array_index(candidates, lookup_candidate_t, index); + + return true; +} + +bool pinyin_get_candidate_type(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + lookup_candidate_type_t * type) { + *type = candidate->m_candidate_type; + return true; +} + +bool pinyin_get_candidate_string(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + const gchar ** utf8_str) { + *utf8_str = candidate->m_phrase_string; + return true; +} + +bool pinyin_get_n_pinyin(pinyin_instance_t * instance, + guint * num) { + *num = 0; + + if (instance->m_pinyin_keys->len != + instance->m_pinyin_key_rests->len) + return false; + + *num = instance->m_pinyin_keys->len; + return true; +} + +bool pinyin_get_pinyin_key(pinyin_instance_t * instance, + guint index, + ChewingKey ** key) { + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + + *key = NULL; + + if (index >= pinyin_keys->len) + return false; + + *key = &g_array_index(pinyin_keys, ChewingKey, index); + + return true; +} + +bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance, + guint index, + ChewingKeyRest ** key_rest) { + ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; + + *key_rest = NULL; + + if (index >= pinyin_key_rests->len) + return false; + + *key_rest = &g_array_index(pinyin_key_rests, ChewingKeyRest, index); + + return true; +} + +bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * begin, guint16 * end) { + if (begin) + *begin = key_rest->m_raw_begin; + + if (end) + *end = key_rest->m_raw_end; + + return true; +} + +bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * length) { + *length = key_rest->length(); + return true; +} + +bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance, + const gchar ** utf8_str) { + *utf8_str = instance->m_raw_full_pinyin; + return true; +} + +bool pinyin_get_n_phrase(pinyin_instance_t * instance, + guint * num) { + *num = instance->m_match_results->len; + return true; +} + +bool pinyin_get_phrase_token(pinyin_instance_t * instance, + guint index, + phrase_token_t * token){ + MatchResults & match_results = instance->m_match_results; + + *token = null_token; + + if (index >= match_results->len) + return false; + + *token = g_array_index(match_results, phrase_token_t, index); + + return true; +} + + +/** + * Note: prefix is the text before the pre-edit string. + */ |