From a27716de40c35554b12fa1d5f94670a843e988bb Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Mon, 21 Oct 2013 11:14:53 +0800 Subject: rename files --- src/libpinyin.ver | 54 -- src/libzhuyin.ver | 54 ++ src/pinyin.cpp | 1681 ----------------------------------------------------- src/pinyin.h | 663 --------------------- src/zhuyin.cpp | 1681 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/zhuyin.h | 663 +++++++++++++++++++++ 6 files changed, 2398 insertions(+), 2398 deletions(-) delete mode 100644 src/libpinyin.ver create mode 100644 src/libzhuyin.ver delete mode 100644 src/pinyin.cpp delete mode 100644 src/pinyin.h create mode 100644 src/zhuyin.cpp create mode 100644 src/zhuyin.h diff --git a/src/libpinyin.ver b/src/libpinyin.ver deleted file mode 100644 index b7287f7..0000000 --- a/src/libpinyin.ver +++ /dev/null @@ -1,54 +0,0 @@ -LIBPINYIN { - global: - pinyin_init; - pinyin_save; - pinyin_set_chewing_scheme; - pinyin_set_full_pinyin_scheme; - pinyin_load_phrase_library; - pinyin_unload_phrase_library; - pinyin_begin_add_phrases; - pinyin_iterator_add_phrase; - pinyin_end_add_phrases; - pinyin_fini; - pinyin_mask_out; - pinyin_set_options; - pinyin_alloc_instance; - pinyin_free_instance; - pinyin_guess_sentence; - pinyin_guess_sentence_with_prefix; - pinyin_phrase_segment; - pinyin_get_sentence; - pinyin_parse_full_pinyin; - pinyin_parse_more_full_pinyins; - pinyin_parse_chewing; - pinyin_parse_more_chewings; - pinyin_in_chewing_keyboard; - pinyin_guess_candidates; - pinyin_choose_candidate; - pinyin_clear_constraint; - pinyin_lookup_tokens; - pinyin_train; - pinyin_reset; - pinyin_get_bopomofo_string; - pinyin_get_pinyin_string; - pinyin_token_get_phrase; - pinyin_token_get_n_pronunciation; - pinyin_token_get_nth_pronunciation; - pinyin_token_get_unigram_frequency; - pinyin_token_add_unigram_frequency; - pinyin_get_n_candidate; - pinyin_get_candidate; - pinyin_get_candidate_type; - pinyin_get_candidate_string; - pinyin_get_n_pinyin; - 
pinyin_get_pinyin_key; - pinyin_get_pinyin_key_rest; - pinyin_get_pinyin_key_rest_positions; - pinyin_get_pinyin_key_rest_length; - pinyin_get_raw_full_pinyin; - pinyin_get_n_phrase; - pinyin_get_phrase_token; - - local: - *; -}; diff --git a/src/libzhuyin.ver b/src/libzhuyin.ver new file mode 100644 index 0000000..b7287f7 --- /dev/null +++ b/src/libzhuyin.ver @@ -0,0 +1,54 @@ +LIBPINYIN { + global: + pinyin_init; + pinyin_save; + pinyin_set_chewing_scheme; + pinyin_set_full_pinyin_scheme; + pinyin_load_phrase_library; + pinyin_unload_phrase_library; + pinyin_begin_add_phrases; + pinyin_iterator_add_phrase; + pinyin_end_add_phrases; + pinyin_fini; + pinyin_mask_out; + pinyin_set_options; + pinyin_alloc_instance; + pinyin_free_instance; + pinyin_guess_sentence; + pinyin_guess_sentence_with_prefix; + pinyin_phrase_segment; + pinyin_get_sentence; + pinyin_parse_full_pinyin; + pinyin_parse_more_full_pinyins; + pinyin_parse_chewing; + pinyin_parse_more_chewings; + pinyin_in_chewing_keyboard; + pinyin_guess_candidates; + pinyin_choose_candidate; + pinyin_clear_constraint; + pinyin_lookup_tokens; + pinyin_train; + pinyin_reset; + pinyin_get_bopomofo_string; + pinyin_get_pinyin_string; + pinyin_token_get_phrase; + pinyin_token_get_n_pronunciation; + pinyin_token_get_nth_pronunciation; + pinyin_token_get_unigram_frequency; + pinyin_token_add_unigram_frequency; + pinyin_get_n_candidate; + pinyin_get_candidate; + pinyin_get_candidate_type; + pinyin_get_candidate_string; + pinyin_get_n_pinyin; + pinyin_get_pinyin_key; + pinyin_get_pinyin_key_rest; + pinyin_get_pinyin_key_rest_positions; + pinyin_get_pinyin_key_rest_length; + pinyin_get_raw_full_pinyin; + pinyin_get_n_phrase; + pinyin_get_phrase_token; + + local: + *; +}; diff --git a/src/pinyin.cpp b/src/pinyin.cpp deleted file mode 100644 index f1a8aff..0000000 --- a/src/pinyin.cpp +++ /dev/null @@ -1,1681 +0,0 @@ -/* - * libzhuyin - * Library to deal with zhuyin. 
- * - * Copyright (C) 2011 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - - -#include "pinyin.h" -#include -#include -#include -#include "pinyin_internal.h" - - -using namespace zhuyin; - -/* a glue layer for input method integration. */ - -typedef GArray * CandidateVector; /* GArray of lookup_candidate_t */ - -struct _pinyin_context_t{ - pinyin_option_t m_options; - - FullPinyinScheme m_full_pinyin_scheme; - FullPinyinParser2 * m_full_pinyin_parser; - ChewingParser2 * m_chewing_parser; - - FacadeChewingTable * m_pinyin_table; - FacadePhraseTable2 * m_phrase_table; - FacadePhraseIndex * m_phrase_index; - Bigram * m_system_bigram; - Bigram * m_user_bigram; - - PinyinLookup2 * m_pinyin_lookup; - PhraseLookup * m_phrase_lookup; - - char * m_system_dir; - char * m_user_dir; - bool m_modified; - - SystemTableInfo m_system_table_info; -}; - -struct _pinyin_instance_t{ - pinyin_context_t * m_context; - gchar * m_raw_full_pinyin; - TokenVector m_prefixes; - ChewingKeyVector m_pinyin_keys; - ChewingKeyRestVector m_pinyin_key_rests; - CandidateConstraints m_constraints; - MatchResults m_match_results; - CandidateVector m_candidates; -}; - -struct _lookup_candidate_t{ - lookup_candidate_type_t m_candidate_type; - gchar * m_phrase_string; - phrase_token_t m_token; - ChewingKeyRest 
m_orig_rest; - gchar * m_new_pinyins; - guint32 m_freq; /* the amplifed gfloat numerical value. */ -public: - _lookup_candidate_t() { - m_candidate_type = NORMAL_CANDIDATE; - m_phrase_string = NULL; - m_token = null_token; - m_new_pinyins = NULL; - m_freq = 0; - } -}; - -struct _import_iterator_t{ - pinyin_context_t * m_context; - guint8 m_phrase_index; -}; - - -static bool check_format(pinyin_context_t * context){ - const char * userdir = context->m_user_dir; - - UserTableInfo user_table_info; - gchar * filename = g_build_filename - (userdir, USER_TABLE_INFO, NULL); - user_table_info.load(filename); - g_free(filename); - - bool exists = user_table_info.is_conform - (&context->m_system_table_info); - - if (exists) - return exists; - - const pinyin_table_info_t * phrase_files = - context->m_system_table_info.get_table_info(); - - /* clean up files, if version mis-matches. */ - for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const pinyin_table_info_t * table_info = phrase_files + i; - - if (NOT_USED == table_info->m_file_type) - continue; - - if (NULL == table_info->m_user_filename) - continue; - - const char * userfilename = table_info->m_user_filename; - - /* remove dbin file. 
*/ - filename = g_build_filename(userdir, userfilename, NULL); - unlink(filename); - g_free(filename); - } - - filename = g_build_filename - (userdir, USER_PINYIN_INDEX, NULL); - unlink(filename); - g_free(filename); - - filename = g_build_filename - (userdir, USER_PHRASE_INDEX, NULL); - unlink(filename); - g_free(filename); - - filename = g_build_filename - (userdir, USER_BIGRAM, NULL); - unlink(filename); - g_free(filename); - - return exists; -} - -static bool mark_version(pinyin_context_t * context){ - const char * userdir = context->m_user_dir; - - UserTableInfo user_table_info; - user_table_info.make_conform(&context->m_system_table_info); - - gchar * filename = g_build_filename - (userdir, USER_TABLE_INFO, NULL); - bool retval = user_table_info.save(filename); - g_free(filename); - - return retval; -} - -pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ - pinyin_context_t * context = new pinyin_context_t; - - context->m_options = USE_TONE; - - context->m_system_dir = g_strdup(systemdir); - context->m_user_dir = g_strdup(userdir); - context->m_modified = false; - - gchar * filename = g_build_filename - (context->m_system_dir, SYSTEM_TABLE_INFO, NULL); - if (!context->m_system_table_info.load(filename)) { - fprintf(stderr, "load %s failed!\n", filename); - return NULL; - } - g_free(filename); - - - check_format(context); - - context->m_full_pinyin_scheme = FULL_PINYIN_DEFAULT; - context->m_full_pinyin_parser = new FullPinyinParser2; - context->m_chewing_parser = new ChewingDiscreteParser2; - - /* load chewing table. */ - context->m_pinyin_table = new FacadeChewingTable; - - /* load system chewing table. 
*/ - MemoryChunk * chunk = new MemoryChunk; - filename = g_build_filename - (context->m_system_dir, SYSTEM_PINYIN_INDEX, NULL); - if (!chunk->load(filename)) { - fprintf(stderr, "open %s failed!\n", filename); - return NULL; - } - g_free(filename); - - /* load user chewing table */ - MemoryChunk * userchunk = new MemoryChunk; - filename = g_build_filename - (context->m_user_dir, USER_PINYIN_INDEX, NULL); - if (!userchunk->load(filename)) { - /* hack here: use local Chewing Table to create empty memory chunk. */ - ChewingLargeTable table(context->m_options); - table.store(userchunk); - } - g_free(filename); - - context->m_pinyin_table->load(context->m_options, chunk, userchunk); - - /* load phrase table */ - context->m_phrase_table = new FacadePhraseTable2; - - /* load system phrase table */ - chunk = new MemoryChunk; - filename = g_build_filename - (context->m_system_dir, SYSTEM_PHRASE_INDEX, NULL); - if (!chunk->load(filename)) { - fprintf(stderr, "open %s failed!\n", filename); - return NULL; - } - g_free(filename); - - /* load user phrase table */ - userchunk = new MemoryChunk; - filename = g_build_filename - (context->m_user_dir, USER_PHRASE_INDEX, NULL); - if (!userchunk->load(filename)) { - /* hack here: use local Phrase Table to create empty memory chunk. */ - PhraseLargeTable2 table; - table.store(userchunk); - } - g_free(filename); - - context->m_phrase_table->load(chunk, userchunk); - - context->m_phrase_index = new FacadePhraseIndex; - - /* hack here: directly call load phrase library. 
*/ - pinyin_load_phrase_library(context, GB_DICTIONARY); - pinyin_load_phrase_library(context, MERGED_DICTIONARY); - - context->m_system_bigram = new Bigram; - filename = g_build_filename(context->m_system_dir, SYSTEM_BIGRAM, NULL); - context->m_system_bigram->attach(filename, ATTACH_READONLY); - g_free(filename); - - context->m_user_bigram = new Bigram; - filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); - context->m_user_bigram->load_db(filename); - g_free(filename); - - gfloat lambda = context->m_system_table_info.get_lambda(); - - context->m_pinyin_lookup = new PinyinLookup2 - ( lambda, context->m_options, - context->m_pinyin_table, context->m_phrase_index, - context->m_system_bigram, context->m_user_bigram); - - context->m_phrase_lookup = new PhraseLookup - (lambda, - context->m_phrase_table, context->m_phrase_index, - context->m_system_bigram, context->m_user_bigram); - - return context; -} - -bool pinyin_load_phrase_library(pinyin_context_t * context, - guint8 index){ - if (!(index < PHRASE_INDEX_LIBRARY_COUNT)) - return false; - - /* check whether the sub phrase index is already loaded. */ - PhraseIndexRange range; - int retval = context->m_phrase_index->get_range(index, range); - if (ERROR_OK == retval) - return false; - - const pinyin_table_info_t * phrase_files = - context->m_system_table_info.get_table_info(); - - const pinyin_table_info_t * table_info = phrase_files + index; - - if (SYSTEM_FILE == table_info->m_file_type || - DICTIONARY == table_info->m_file_type) { - /* system phrase library */ - MemoryChunk * chunk = new MemoryChunk; - - const char * systemfilename = table_info->m_system_filename; - /* check bin file in system dir. 
*/ - gchar * chunkfilename = g_build_filename(context->m_system_dir, - systemfilename, NULL); - chunk->load(chunkfilename); - g_free(chunkfilename); - - context->m_phrase_index->load(index, chunk); - - const char * userfilename = table_info->m_user_filename; - - chunkfilename = g_build_filename(context->m_user_dir, - userfilename, NULL); - - MemoryChunk * log = new MemoryChunk; - log->load(chunkfilename); - g_free(chunkfilename); - - /* merge the chunk log. */ - context->m_phrase_index->merge(index, log); - return true; - } - - if (USER_FILE == table_info->m_file_type) { - /* user phrase library */ - MemoryChunk * chunk = new MemoryChunk; - const char * userfilename = table_info->m_user_filename; - - gchar * chunkfilename = g_build_filename(context->m_user_dir, - userfilename, NULL); - - /* check bin file exists. if not, create a new one. */ - if (chunk->load(chunkfilename)) { - context->m_phrase_index->load(index, chunk); - } else { - delete chunk; - context->m_phrase_index->create_sub_phrase(index); - } - - g_free(chunkfilename); - return true; - } - - return false; -} - -bool pinyin_unload_phrase_library(pinyin_context_t * context, - guint8 index){ - /* gb_char.bin and merged.bin can't be unloaded. */ - if (GB_DICTIONARY == index || MERGED_DICTIONARY == index) - return false; - - assert(index < PHRASE_INDEX_LIBRARY_COUNT); - - context->m_phrase_index->unload(index); - return true; -} - -import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, - guint8 index){ - import_iterator_t * iter = new import_iterator_t; - iter->m_context = context; - iter->m_phrase_index = index; - return iter; -} - -bool pinyin_iterator_add_phrase(import_iterator_t * iter, - const char * phrase, - const char * pinyin, - gint count){ - /* if -1 == count, use the default value. 
*/ - const gint default_count = 5; - const guint32 unigram_factor = 3; - if (-1 == count) - count = default_count; - - pinyin_context_t * & context = iter->m_context; - FacadePhraseTable2 * & phrase_table = context->m_phrase_table; - FacadeChewingTable * & pinyin_table = context->m_pinyin_table; - FacadePhraseIndex * & phrase_index = context->m_phrase_index; - - bool result = false; - - if (NULL == phrase || NULL == pinyin) - return result; - - /* check whether the phrase exists in phrase table */ - glong len_phrase = 0; - ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL); - - /* pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; */ - pinyin_option_t options = USE_TONE; - FullPinyinParser2 parser; - ChewingKeyVector keys = - g_array_new(FALSE, FALSE, sizeof(ChewingKey)); - ChewingKeyRestVector key_rests = - g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); - - /* parse the pinyin. */ - parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); - - if (len_phrase != keys->len) - return result; - - if (0 == len_phrase || len_phrase >= MAX_PHRASE_LENGTH) - return result; - - phrase_token_t token = null_token; - GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - - /* do phrase table search. */ - PhraseTokens tokens; - memset(tokens, 0, sizeof(PhraseTokens)); - phrase_index->prepare_tokens(tokens); - int retval = phrase_table->search(len_phrase, ucs4_phrase, tokens); - int num = reduce_tokens(tokens, tokenarray); - phrase_index->destroy_tokens(tokens); - - /* find the best token candidate. */ - for (size_t i = 0; i < tokenarray->len; ++i) { - phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i); - if (null_token == token) { - token = candidate; - continue; - } - - if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == iter->m_phrase_index) { - /* only one phrase string per sub phrase index. 
*/ - assert(PHRASE_INDEX_LIBRARY_INDEX(token) != iter->m_phrase_index); - token = candidate; - continue; - } - } - g_array_free(tokenarray, TRUE); - - PhraseItem item; - /* check whether it exists in the same sub phrase index; */ - if (null_token != token && - PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) { - /* if so, remove the phrase, add the pinyin for the phrase item, - then add it back;*/ - phrase_index->get_phrase_item(token, item); - assert(len_phrase == item.get_phrase_length()); - ucs4_t tmp_phrase[MAX_PHRASE_LENGTH]; - item.get_phrase_string(tmp_phrase); - assert(0 == memcmp - (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase)); - - PhraseItem * removed_item = NULL; - retval = phrase_index->remove_phrase_item(token, removed_item); - if (ERROR_OK == retval) { - /* maybe check whether there are duplicated pronunciations here. */ - removed_item->add_pronunciation((ChewingKey *)keys->data, - count); - phrase_index->add_phrase_item(token, removed_item); - delete removed_item; - result = true; - } - } else { - /* if not exists in the same sub phrase index, - get the maximum token, - then add it directly with maximum token + 1; */ - PhraseIndexRange range; - retval = phrase_index->get_range(iter->m_phrase_index, range); - - if (ERROR_OK == retval) { - token = range.m_range_end; - if (0x00000000 == (token & PHRASE_MASK)) - token++; - - if (len_phrase == keys->len) { /* valid pinyin */ - phrase_table->add_index(len_phrase, ucs4_phrase, token); - pinyin_table->add_index - (keys->len, (ChewingKey *)(keys->data), token); - - item.set_phrase_string(len_phrase, ucs4_phrase); - item.add_pronunciation((ChewingKey *)(keys->data), count); - phrase_index->add_phrase_item(token, &item); - phrase_index->add_unigram_frequency(token, - count * unigram_factor); - result = true; - } - } - } - - g_array_free(key_rests, TRUE); - g_array_free(keys, TRUE); - g_free(ucs4_phrase); - return result; -} - -void pinyin_end_add_phrases(import_iterator_t * iter){ - /* 
compact the content memory chunk of phrase index. */ - iter->m_context->m_phrase_index->compact(); - iter->m_context->m_modified = true; - delete iter; -} - -bool pinyin_save(pinyin_context_t * context){ - if (!context->m_user_dir) - return false; - - if (!context->m_modified) - return false; - - context->m_phrase_index->compact(); - - const pinyin_table_info_t * phrase_files = - context->m_system_table_info.get_table_info(); - - /* skip the reserved zero phrase library. */ - for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - PhraseIndexRange range; - int retval = context->m_phrase_index->get_range(i, range); - - if (ERROR_NO_SUB_PHRASE_INDEX == retval) - continue; - - const pinyin_table_info_t * table_info = phrase_files + i; - - if (NOT_USED == table_info->m_file_type) - continue; - - const char * userfilename = table_info->m_user_filename; - - if (NULL == userfilename) - continue; - - if (SYSTEM_FILE == table_info->m_file_type || - DICTIONARY == table_info->m_file_type) { - /* system phrase library */ - MemoryChunk * chunk = new MemoryChunk; - MemoryChunk * log = new MemoryChunk; - const char * systemfilename = table_info->m_system_filename; - - /* check bin file in system dir. 
*/ - gchar * chunkfilename = g_build_filename(context->m_system_dir, - systemfilename, NULL); - chunk->load(chunkfilename); - g_free(chunkfilename); - context->m_phrase_index->diff(i, chunk, log); - - const char * userfilename = table_info->m_user_filename; - gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); - - gchar * tmppathname = g_build_filename(context->m_user_dir, - tmpfilename, NULL); - g_free(tmpfilename); - - gchar * chunkpathname = g_build_filename(context->m_user_dir, - userfilename, NULL); - log->save(tmppathname); - - int result = rename(tmppathname, chunkpathname); - if (0 != result) - fprintf(stderr, "rename %s to %s failed.\n", - tmppathname, chunkpathname); - - g_free(chunkpathname); - g_free(tmppathname); - delete log; - } - - if (USER_FILE == table_info->m_file_type) { - /* user phrase library */ - MemoryChunk * chunk = new MemoryChunk; - context->m_phrase_index->store(i, chunk); - - const char * userfilename = table_info->m_user_filename; - gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); - gchar * tmppathname = g_build_filename(context->m_user_dir, - tmpfilename, NULL); - g_free(tmpfilename); - - gchar * chunkpathname = g_build_filename(context->m_user_dir, - userfilename, NULL); - - chunk->save(tmppathname); - - int result = rename(tmppathname, chunkpathname); - if (0 != result) - fprintf(stderr, "rename %s to %s failed.\n", - tmppathname, chunkpathname); - - g_free(chunkpathname); - g_free(tmppathname); - delete chunk; - } - } - - /* save user pinyin table */ - gchar * tmpfilename = g_build_filename - (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL); - unlink(tmpfilename); - gchar * filename = g_build_filename - (context->m_user_dir, USER_PINYIN_INDEX, NULL); - - MemoryChunk * chunk = new MemoryChunk; - context->m_pinyin_table->store(chunk); - chunk->save(tmpfilename); - delete chunk; - - int result = rename(tmpfilename, filename); - if (0 != result) - fprintf(stderr, "rename %s to %s failed.\n", - 
tmpfilename, filename); - - g_free(tmpfilename); - g_free(filename); - - /* save user phrase table */ - tmpfilename = g_build_filename - (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL); - unlink(tmpfilename); - filename = g_build_filename - (context->m_user_dir, USER_PHRASE_INDEX, NULL); - - chunk = new MemoryChunk; - context->m_phrase_table->store(chunk); - chunk->save(tmpfilename); - delete chunk; - - result = rename(tmpfilename, filename); - if (0 != result) - fprintf(stderr, "rename %s to %s failed.\n", - tmpfilename, filename); - - g_free(tmpfilename); - g_free(filename); - - /* save user bi-gram */ - tmpfilename = g_build_filename - (context->m_user_dir, USER_BIGRAM ".tmp", NULL); - unlink(tmpfilename); - filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); - context->m_user_bigram->save_db(tmpfilename); - - result = rename(tmpfilename, filename); - if (0 != result) - fprintf(stderr, "rename %s to %s failed.\n", - tmpfilename, filename); - - g_free(tmpfilename); - g_free(filename); - - mark_version(context); - - context->m_modified = false; - return true; -} - -bool pinyin_set_chewing_scheme(pinyin_context_t * context, - ChewingScheme scheme){ - delete context->m_chewing_parser; - context->m_chewing_parser = NULL; - - switch(scheme) { - case CHEWING_STANDARD: - case CHEWING_HSU: - case CHEWING_IBM: - case CHEWING_GINYIEH: - case CHEWING_ETEN: - case CHEWING_ETEN26: - case CHEWING_STANDARD_DVORAK: - case CHEWING_HSU_DVORAK: { - ChewingDiscreteParser2 * parser = new ChewingDiscreteParser2(); - parser->set_scheme(scheme); - context->m_chewing_parser = parser; - break; - } - case CHEWING_DACHEN_CP26: - context->m_chewing_parser = new ChewingDaChenCP26Parser2(); - break; - } - return true; -} - -bool pinyin_set_full_pinyin_scheme(pinyin_context_t * context, - FullPinyinScheme scheme){ - context->m_full_pinyin_scheme = scheme; - context->m_full_pinyin_parser->set_scheme(scheme); - return true; -} - -void pinyin_fini(pinyin_context_t * 
context){ - delete context->m_full_pinyin_parser; - delete context->m_chewing_parser; - delete context->m_pinyin_table; - delete context->m_phrase_table; - delete context->m_phrase_index; - delete context->m_system_bigram; - delete context->m_user_bigram; - delete context->m_pinyin_lookup; - delete context->m_phrase_lookup; - - g_free(context->m_system_dir); - g_free(context->m_user_dir); - context->m_modified = false; - - delete context; -} - -bool pinyin_mask_out(pinyin_context_t * context, - phrase_token_t mask, - phrase_token_t value) { - - context->m_pinyin_table->mask_out(mask, value); - context->m_phrase_table->mask_out(mask, value); - context->m_user_bigram->mask_out(mask, value); - - const pinyin_table_info_t * phrase_files = - context->m_system_table_info.get_table_info(); - - /* mask out the phrase index. */ - for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) { - PhraseIndexRange range; - int retval = context->m_phrase_index->get_range(index, range); - - if (ERROR_NO_SUB_PHRASE_INDEX == retval) - continue; - - const pinyin_table_info_t * table_info = phrase_files + index; - - if (NOT_USED == table_info->m_file_type) - continue; - - const char * userfilename = table_info->m_user_filename; - - if (NULL == userfilename) - continue; - - if (SYSTEM_FILE == table_info->m_file_type || - DICTIONARY == table_info->m_file_type) { - /* system phrase library */ - MemoryChunk * chunk = new MemoryChunk; - - const char * systemfilename = table_info->m_system_filename; - /* check bin file in system dir. 
*/ - gchar * chunkfilename = g_build_filename(context->m_system_dir, - systemfilename, NULL); - chunk->load(chunkfilename); - g_free(chunkfilename); - - context->m_phrase_index->load(index, chunk); - - const char * userfilename = table_info->m_user_filename; - - chunkfilename = g_build_filename(context->m_user_dir, - userfilename, NULL); - - MemoryChunk * log = new MemoryChunk; - log->load(chunkfilename); - g_free(chunkfilename); - - /* merge the chunk log with mask. */ - context->m_phrase_index->merge_with_mask(index, log, mask, value); - } - - if (USER_FILE == table_info->m_file_type) { - /* user phrase library */ - context->m_phrase_index->mask_out(index, mask, value); - } - } - - context->m_phrase_index->compact(); - return true; -} - -/* copy from options to context->m_options. */ -bool pinyin_set_options(pinyin_context_t * context, - pinyin_option_t options){ - context->m_options = options; - context->m_pinyin_table->set_options(context->m_options); - context->m_pinyin_lookup->set_options(context->m_options); - return true; -} - - -pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){ - pinyin_instance_t * instance = new pinyin_instance_t; - instance->m_context = context; - - instance->m_raw_full_pinyin = NULL; - - instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); - instance->m_pinyin_key_rests = - g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); - instance->m_constraints = g_array_new - (TRUE, FALSE, sizeof(lookup_constraint_t)); - instance->m_match_results = - g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - instance->m_candidates = - g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); - - return instance; -} - -void pinyin_free_instance(pinyin_instance_t * instance){ - g_free(instance->m_raw_full_pinyin); - g_array_free(instance->m_prefixes, TRUE); - g_array_free(instance->m_pinyin_keys, TRUE); - 
g_array_free(instance->m_pinyin_key_rests, TRUE); - g_array_free(instance->m_constraints, TRUE); - g_array_free(instance->m_match_results, TRUE); - g_array_free(instance->m_candidates, TRUE); - - delete instance; -} - - -static bool pinyin_update_constraints(pinyin_instance_t * instance){ - pinyin_context_t * & context = instance->m_context; - ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; - CandidateConstraints & constraints = instance->m_constraints; - - size_t key_len = constraints->len; - g_array_set_size(constraints, pinyin_keys->len); - for (size_t i = key_len; i < pinyin_keys->len; ++i ) { - lookup_constraint_t * constraint = - &g_array_index(constraints, lookup_constraint_t, i); - constraint->m_type = NO_CONSTRAINT; - } - - context->m_pinyin_lookup->validate_constraint - (constraints, pinyin_keys); - - return true; -} - - -bool pinyin_guess_sentence(pinyin_instance_t * instance){ - pinyin_context_t * & context = instance->m_context; - - g_array_set_size(instance->m_prefixes, 0); - g_array_append_val(instance->m_prefixes, sentence_start); - - pinyin_update_constraints(instance); - bool retval = context->m_pinyin_lookup->get_best_match - (instance->m_prefixes, - instance->m_pinyin_keys, - instance->m_constraints, - instance->m_match_results); - - return retval; -} - -bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, - const char * prefix){ - pinyin_context_t * & context = instance->m_context; - - FacadePhraseIndex * & phrase_index = context->m_phrase_index; - - g_array_set_size(instance->m_prefixes, 0); - g_array_append_val(instance->m_prefixes, sentence_start); - - glong len_str = 0; - ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL); - GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); - - if (ucs4_str && len_str) { - /* add prefixes. 
*/ - for (ssize_t i = 1; i <= len_str; ++i) { - if (i > MAX_PHRASE_LENGTH) - break; - - ucs4_t * start = ucs4_str + len_str - i; - - PhraseTokens tokens; - memset(tokens, 0, sizeof(tokens)); - phrase_index->prepare_tokens(tokens); - int result = context->m_phrase_table->search(i, start, tokens); - int num = reduce_tokens(tokens, tokenarray); - phrase_index->destroy_tokens(tokens); - - if (result & SEARCH_OK) - g_array_append_vals(instance->m_prefixes, - tokenarray->data, tokenarray->len); - } - } - g_array_free(tokenarray, TRUE); - g_free(ucs4_str); - - pinyin_update_constraints(instance); - bool retval = context->m_pinyin_lookup->get_best_match - (instance->m_prefixes, - instance->m_pinyin_keys, - instance->m_constraints, - instance->m_match_results); - - return retval; -} - -bool pinyin_phrase_segment(pinyin_instance_t * instance, - const char * sentence){ - pinyin_context_t * & context = instance->m_context; - - const glong num_of_chars = g_utf8_strlen(sentence, -1); - glong ucs4_len = 0; - ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL); - - g_return_val_if_fail(num_of_chars == ucs4_len, FALSE); - - bool retval = context->m_phrase_lookup->get_best_match - (ucs4_len, ucs4_str, instance->m_match_results); - - g_free(ucs4_str); - return retval; -} - -/* the returned sentence should be freed by g_free(). 
*/ -bool pinyin_get_sentence(pinyin_instance_t * instance, - char ** sentence){ - pinyin_context_t * & context = instance->m_context; - - bool retval = zhuyin::convert_to_utf8 - (context->m_phrase_index, instance->m_match_results, - NULL, false, *sentence); - - return retval; -} - -bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, - const char * onepinyin, - ChewingKey * onekey){ - pinyin_context_t * & context = instance->m_context; - - int pinyin_len = strlen(onepinyin); - bool retval = context->m_full_pinyin_parser->parse_one_key - ( context->m_options, *onekey, onepinyin, pinyin_len); - return retval; -} - -size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, - const char * pinyins){ - pinyin_context_t * & context = instance->m_context; - - g_free(instance->m_raw_full_pinyin); - instance->m_raw_full_pinyin = g_strdup(pinyins); - int pinyin_len = strlen(pinyins); - - int parse_len = context->m_full_pinyin_parser->parse - ( context->m_options, instance->m_pinyin_keys, - instance->m_pinyin_key_rests, pinyins, pinyin_len); - - return parse_len; -} - -bool pinyin_parse_chewing(pinyin_instance_t * instance, - const char * onechewing, - ChewingKey * onekey){ - pinyin_context_t * & context = instance->m_context; - - int chewing_len = strlen(onechewing); - bool retval = context->m_chewing_parser->parse_one_key - ( context->m_options, *onekey, onechewing, chewing_len ); - return retval; -} - -size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, - const char * chewings){ - pinyin_context_t * & context = instance->m_context; - int chewing_len = strlen(chewings); - - int parse_len = context->m_chewing_parser->parse - ( context->m_options, instance->m_pinyin_keys, - instance->m_pinyin_key_rests, chewings, chewing_len); - - return parse_len; -} - -bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, - const char key, const char ** symbol) { - pinyin_context_t * & context = instance->m_context; - return 
context->m_chewing_parser->in_chewing_scheme - (context->m_options, key, symbol); -} - -#if 0 -static gint compare_item_with_token(gconstpointer lhs, - gconstpointer rhs) { - lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; - lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; - - phrase_token_t token_lhs = item_lhs->m_token; - phrase_token_t token_rhs = item_rhs->m_token; - - return (token_lhs - token_rhs); -} -#endif - -static gint compare_item_with_frequency(gconstpointer lhs, - gconstpointer rhs) { - lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; - lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; - - guint32 freq_lhs = item_lhs->m_freq; - guint32 freq_rhs = item_rhs->m_freq; - - return -(freq_lhs - freq_rhs); /* in descendant order */ -} - -static phrase_token_t _get_previous_token(pinyin_instance_t * instance, - size_t offset) { - phrase_token_t prev_token = null_token; - ssize_t i; - - if (0 == offset) { - /* get previous token from prefixes. */ - prev_token = sentence_start; - size_t prev_token_len = 0; - - pinyin_context_t * context = instance->m_context; - TokenVector prefixes = instance->m_prefixes; - PhraseItem item; - - for (size_t i = 0; i < prefixes->len; ++i) { - phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); - if (sentence_start == token) - continue; - - int retval = context->m_phrase_index->get_phrase_item(token, item); - if (ERROR_OK == retval) { - size_t token_len = item.get_phrase_length(); - if (token_len > prev_token_len) { - /* found longer match, and save it. */ - prev_token = token; - prev_token_len = token_len; - } - } - } - } else { - /* get previous token from match results. 
*/ - assert (0 < offset); - - phrase_token_t cur_token = g_array_index - (instance->m_match_results, phrase_token_t, offset); - if (null_token != cur_token) { - for (i = offset - 1; i >= 0; --i) { - cur_token = g_array_index - (instance->m_match_results, phrase_token_t, i); - if (null_token != cur_token) { - prev_token = cur_token; - break; - } - } - } - } - - return prev_token; -} - -static void _append_items(pinyin_context_t * context, - PhraseIndexRanges ranges, - lookup_candidate_t * template_item, - CandidateVector items) { - /* reduce and append to a single GArray. */ - for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) { - if (NULL == ranges[m]) - continue; - - for (size_t n = 0; n < ranges[m]->len; ++n) { - PhraseIndexRange * range = - &g_array_index(ranges[m], PhraseIndexRange, n); - for (size_t k = range->m_range_begin; - k < range->m_range_end; ++k) { - lookup_candidate_t item; - item.m_candidate_type = template_item->m_candidate_type; - item.m_token = k; - item.m_orig_rest = template_item->m_orig_rest; - item.m_new_pinyins = g_strdup(template_item->m_new_pinyins); - item.m_freq = template_item->m_freq; - g_array_append_val(items, item); - } - } - } -} - -#if 0 -static void _remove_duplicated_items(CandidateVector items) { - /* remove the duplicated items. */ - phrase_token_t last_token = null_token, saved_token; - for (size_t n = 0; n < items->len; ++n) { - lookup_candidate_t * item = &g_array_index - (items, lookup_candidate_t, n); - - saved_token = item->m_token; - if (last_token == saved_token) { - g_array_remove_index(items, n); - n--; - } - last_token = saved_token; - } -} -#endif - -static void _compute_frequency_of_items(pinyin_context_t * context, - phrase_token_t prev_token, - SingleGram * merged_gram, - CandidateVector items) { - pinyin_option_t & options = context->m_options; - ssize_t i; - - PhraseItem cached_item; - /* compute all freqs. 
*/ - for (i = 0; i < items->len; ++i) { - lookup_candidate_t * item = &g_array_index - (items, lookup_candidate_t, i); - phrase_token_t & token = item->m_token; - - gfloat bigram_poss = 0; guint32 total_freq = 0; - if (options & DYNAMIC_ADJUST) { - if (null_token != prev_token) { - guint32 bigram_freq = 0; - merged_gram->get_total_freq(total_freq); - merged_gram->get_freq(token, bigram_freq); - if (0 != total_freq) - bigram_poss = bigram_freq / (gfloat)total_freq; - } - } - - /* compute the m_freq. */ - FacadePhraseIndex * & phrase_index = context->m_phrase_index; - phrase_index->get_phrase_item(token, cached_item); - total_freq = phrase_index->get_phrase_index_total_freq(); - assert (0 < total_freq); - - gfloat lambda = context->m_system_table_info.get_lambda(); - - /* Note: possibility value <= 1.0. */ - guint32 freq = (lambda * bigram_poss + - (1 - lambda) * - cached_item.get_unigram_frequency() / - (gfloat) total_freq) * 256 * 256 * 256; - item->m_freq = freq; - } -} - -static bool _prepend_sentence_candidate(pinyin_instance_t * instance, - CandidateVector candidates) { - /* check whether the best match candidate exists. */ - gchar * sentence = NULL; - pinyin_get_sentence(instance, &sentence); - if (NULL == sentence) - return false; - g_free(sentence); - - /* prepend best match candidate to candidates. */ - lookup_candidate_t candidate; - candidate.m_candidate_type = BEST_MATCH_CANDIDATE; - g_array_prepend_val(candidates, candidate); - - return true; -} - -static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance, - size_t offset, - CandidateVector candidates) { - /* populate m_phrase_string in lookup_candidate_t. 
*/ - - for(size_t i = 0; i < candidates->len; ++i) { - lookup_candidate_t * candidate = &g_array_index - (candidates, lookup_candidate_t, i); - - switch(candidate->m_candidate_type) { - case BEST_MATCH_CANDIDATE: { - gchar * sentence = NULL; - pinyin_get_sentence(instance, &sentence); - candidate->m_phrase_string = g_strdup - (g_utf8_offset_to_pointer(sentence, offset)); - g_free(sentence); - break; - } - case NORMAL_CANDIDATE: - pinyin_token_get_phrase - (instance, candidate->m_token, NULL, - &(candidate->m_phrase_string)); - break; - case ZOMBIE_CANDIDATE: - break; - } - } - - return true; -} - -static gint compare_indexed_item_with_phrase_string(gconstpointer lhs, - gconstpointer rhs, - gpointer userdata) { - size_t index_lhs = *((size_t *) lhs); - size_t index_rhs = *((size_t *) rhs); - CandidateVector candidates = (CandidateVector) userdata; - - lookup_candidate_t * candidate_lhs = - &g_array_index(candidates, lookup_candidate_t, index_lhs); - lookup_candidate_t * candidate_rhs = - &g_array_index(candidates, lookup_candidate_t, index_rhs); - - return -strcmp(candidate_lhs->m_phrase_string, - candidate_rhs->m_phrase_string); /* in descendant order */ -} - - -static bool _remove_duplicated_items_by_phrase_string -(pinyin_instance_t * instance, - CandidateVector candidates) { - size_t i; - /* create the GArray of indexed item */ - GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t)); - for (i = 0; i < candidates->len; ++i) - g_array_append_val(indices, i); - - /* sort the indices array by phrase array */ - g_array_sort_with_data - (indices, compare_indexed_item_with_phrase_string, candidates); - - /* mark duplicated items as zombie candidate */ - lookup_candidate_t * cur_item, * saved_item = NULL; - for (i = 0; i < indices->len; ++i) { - size_t cur_index = g_array_index(indices, size_t, i); - cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index); - - /* handle the first candidate */ - if (NULL == saved_item) { - saved_item = cur_item; - 
continue; - } - - if (0 == strcmp(saved_item->m_phrase_string, - cur_item->m_phrase_string)) { - /* found duplicated candidates */ - - /* keep best match candidate */ - if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) { - cur_item->m_candidate_type = ZOMBIE_CANDIDATE; - continue; - } - - if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { - saved_item->m_candidate_type = ZOMBIE_CANDIDATE; - saved_item = cur_item; - continue; - } - - /* keep the higher possiblity one - to quickly move the word forward in the candidate list */ - if (cur_item->m_freq > saved_item->m_freq) { - /* find better candidate */ - saved_item->m_candidate_type = ZOMBIE_CANDIDATE; - saved_item = cur_item; - continue; - } else { - cur_item->m_candidate_type = ZOMBIE_CANDIDATE; - continue; - } - } else { - /* keep the current candidate */ - saved_item = cur_item; - } - } - - g_array_free(indices, TRUE); - - /* remove zombie candidate from the returned candidates */ - for (i = 0; i < candidates->len; ++i) { - lookup_candidate_t * candidate = &g_array_index - (candidates, lookup_candidate_t, i); - - if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) { - g_free(candidate->m_phrase_string); - g_free(candidate->m_new_pinyins); - g_array_remove_index(candidates, i); - i--; - } - } - - return true; -} - -static bool _free_candidates(CandidateVector candidates) { - /* free candidates */ - for (size_t i = 0; i < candidates->len; ++i) { - lookup_candidate_t * candidate = &g_array_index - (candidates, lookup_candidate_t, i); - g_free(candidate->m_phrase_string); - g_free(candidate->m_new_pinyins); - } - g_array_set_size(candidates, 0); - - return true; -} - -bool pinyin_guess_candidates(pinyin_instance_t * instance, - size_t offset) { - - pinyin_context_t * & context = instance->m_context; - pinyin_option_t & options = context->m_options; - ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; - - _free_candidates(instance->m_candidates); - - size_t pinyin_len = pinyin_keys->len - 
offset; - ssize_t i; - - /* lookup the previous token here. */ - phrase_token_t prev_token = null_token; - - if (options & DYNAMIC_ADJUST) { - prev_token = _get_previous_token(instance, offset); - } - - SingleGram merged_gram; - SingleGram * system_gram = NULL, * user_gram = NULL; - - if (options & DYNAMIC_ADJUST) { - if (null_token != prev_token) { - context->m_system_bigram->load(prev_token, system_gram); - context->m_user_bigram->load(prev_token, user_gram); - merge_single_gram(&merged_gram, system_gram, user_gram); - } - } - - PhraseIndexRanges ranges; - memset(ranges, 0, sizeof(ranges)); - context->m_phrase_index->prepare_ranges(ranges); - - GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); - - for (i = pinyin_len; i >= 1; --i) { - g_array_set_size(items, 0); - - ChewingKey * keys = &g_array_index - (pinyin_keys, ChewingKey, offset); - - /* do pinyin search. */ - int retval = context->m_pinyin_table->search - (i, keys, ranges); - - if ( !(retval & SEARCH_OK) ) - continue; - - lookup_candidate_t template_item; - _append_items(context, ranges, &template_item, items); - -#if 0 - g_array_sort(items, compare_item_with_token); - - _remove_duplicated_items(items); -#endif - - _compute_frequency_of_items(context, prev_token, &merged_gram, items); - - /* sort the candidates of the same length by frequency. 
*/ - g_array_sort(items, compare_item_with_frequency); - - /* transfer back items to tokens, and save it into candidates */ - for (size_t k = 0; k < items->len; ++k) { - lookup_candidate_t * item = &g_array_index - (items, lookup_candidate_t, k); - g_array_append_val(instance->m_candidates, *item); - } - -#if 0 - if (!(retval & SEARCH_CONTINUED)) - break; -#endif - } - - g_array_free(items, TRUE); - context->m_phrase_index->destroy_ranges(ranges); - if (system_gram) - delete system_gram; - if (user_gram) - delete user_gram; - - /* post process to remove duplicated candidates */ - - _prepend_sentence_candidate(instance, instance->m_candidates); - - _compute_phrase_strings_of_items(instance, offset, instance->m_candidates); - - _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); - - return true; -} - -int pinyin_choose_candidate(pinyin_instance_t * instance, - size_t offset, - lookup_candidate_t * candidate){ - pinyin_context_t * & context = instance->m_context; - - /* sync m_constraints to the length of m_pinyin_keys. */ - bool retval = context->m_pinyin_lookup->validate_constraint - (instance->m_constraints, instance->m_pinyin_keys); - - phrase_token_t token = candidate->m_token; - guint8 len = context->m_pinyin_lookup->add_constraint - (instance->m_constraints, offset, token); - - /* safe guard: validate the m_constraints again. 
*/ - retval = context->m_pinyin_lookup->validate_constraint - (instance->m_constraints, instance->m_pinyin_keys) && len; - - return offset + len; -} - -bool pinyin_clear_constraint(pinyin_instance_t * instance, - size_t offset){ - pinyin_context_t * & context = instance->m_context; - - bool retval = context->m_pinyin_lookup->clear_constraint - (instance->m_constraints, offset); - - return retval; -} - -bool pinyin_lookup_tokens(pinyin_instance_t * instance, - const char * phrase, GArray * tokenarray){ - pinyin_context_t * & context = instance->m_context; - FacadePhraseIndex * & phrase_index = context->m_phrase_index; - - glong ucs4_len = 0; - ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL); - - PhraseTokens tokens; - memset(tokens, 0, sizeof(PhraseTokens)); - phrase_index->prepare_tokens(tokens); - int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens); - int num = reduce_tokens(tokens, tokenarray); - phrase_index->destroy_tokens(tokens); - - return SEARCH_OK & retval; -} - -bool pinyin_train(pinyin_instance_t * instance){ - if (!instance->m_context->m_user_dir) - return false; - - pinyin_context_t * & context = instance->m_context; - context->m_modified = true; - - bool retval = context->m_pinyin_lookup->train_result2 - (instance->m_pinyin_keys, instance->m_constraints, - instance->m_match_results); - - return retval; -} - -bool pinyin_reset(pinyin_instance_t * instance){ - g_free(instance->m_raw_full_pinyin); - instance->m_raw_full_pinyin = NULL; - - g_array_set_size(instance->m_prefixes, 0); - g_array_set_size(instance->m_pinyin_keys, 0); - g_array_set_size(instance->m_pinyin_key_rests, 0); - g_array_set_size(instance->m_constraints, 0); - g_array_set_size(instance->m_match_results, 0); - _free_candidates(instance->m_candidates); - - return true; -} - -bool pinyin_get_bopomofo_string(pinyin_instance_t * instance, - ChewingKey * key, - gchar ** utf8_str) { - *utf8_str = NULL; - if (0 == key->get_table_index()) - 
return false; - - *utf8_str = key->get_bopomofo_string(); - return true; -} - -bool pinyin_get_pinyin_string(pinyin_instance_t * instance, - ChewingKey * key, - gchar ** utf8_str) { - pinyin_context_t * context = instance->m_context; - - *utf8_str = NULL; - if (0 == key->get_table_index()) - return false; - - *utf8_str = key->get_pinyin_string(context->m_full_pinyin_scheme); - return true; -} - -bool pinyin_token_get_phrase(pinyin_instance_t * instance, - phrase_token_t token, - guint * len, - gchar ** utf8_str) { - pinyin_context_t * & context = instance->m_context; - PhraseItem item; - ucs4_t buffer[MAX_PHRASE_LENGTH]; - - int retval = context->m_phrase_index->get_phrase_item(token, item); - if (ERROR_OK != retval) - return false; - - item.get_phrase_string(buffer); - guint length = item.get_phrase_length(); - if (len) - *len = length; - if (utf8_str) - *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); - return true; -} - -bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, - phrase_token_t token, - guint * num){ - *num = 0; - pinyin_context_t * & context = instance->m_context; - PhraseItem item; - - int retval = context->m_phrase_index->get_phrase_item(token, item); - if (ERROR_OK != retval) - return false; - - *num = item.get_n_pronunciation(); - return true; -} - -bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance, - phrase_token_t token, - guint nth, - ChewingKeyVector keys){ - g_array_set_size(keys, 0); - pinyin_context_t * & context = instance->m_context; - PhraseItem item; - ChewingKey buffer[MAX_PHRASE_LENGTH]; - guint32 freq = 0; - - int retval = context->m_phrase_index->get_phrase_item(token, item); - if (ERROR_OK != retval) - return false; - - item.get_nth_pronunciation(nth, buffer, freq); - guint8 len = item.get_phrase_length(); - g_array_append_vals(keys, buffer, len); - return true; -} - -bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance, - phrase_token_t token, - guint * freq) { - 
*freq = 0; - pinyin_context_t * & context = instance->m_context; - PhraseItem item; - - int retval = context->m_phrase_index->get_phrase_item(token, item); - if (ERROR_OK != retval) - return false; - - *freq = item.get_unigram_frequency(); - return true; -} - -bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance, - phrase_token_t token, - guint delta){ - pinyin_context_t * & context = instance->m_context; - int retval = context->m_phrase_index->add_unigram_frequency - (token, delta); - return ERROR_OK == retval; -} - -bool pinyin_get_n_candidate(pinyin_instance_t * instance, - guint * num) { - *num = instance->m_candidates->len; - return true; -} - -bool pinyin_get_candidate(pinyin_instance_t * instance, - guint index, - lookup_candidate_t ** candidate) { - CandidateVector & candidates = instance->m_candidates; - - *candidate = NULL; - - if (index >= candidates->len) - return false; - - *candidate = &g_array_index(candidates, lookup_candidate_t, index); - - return true; -} - -bool pinyin_get_candidate_type(pinyin_instance_t * instance, - lookup_candidate_t * candidate, - lookup_candidate_type_t * type) { - *type = candidate->m_candidate_type; - return true; -} - -bool pinyin_get_candidate_string(pinyin_instance_t * instance, - lookup_candidate_t * candidate, - const gchar ** utf8_str) { - *utf8_str = candidate->m_phrase_string; - return true; -} - -bool pinyin_get_n_pinyin(pinyin_instance_t * instance, - guint * num) { - *num = 0; - - if (instance->m_pinyin_keys->len != - instance->m_pinyin_key_rests->len) - return false; - - *num = instance->m_pinyin_keys->len; - return true; -} - -bool pinyin_get_pinyin_key(pinyin_instance_t * instance, - guint index, - ChewingKey ** key) { - ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; - - *key = NULL; - - if (index >= pinyin_keys->len) - return false; - - *key = &g_array_index(pinyin_keys, ChewingKey, index); - - return true; -} - -bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance, - 
guint index, - ChewingKeyRest ** key_rest) { - ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; - - *key_rest = NULL; - - if (index >= pinyin_key_rests->len) - return false; - - *key_rest = &g_array_index(pinyin_key_rests, ChewingKeyRest, index); - - return true; -} - -bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance, - ChewingKeyRest * key_rest, - guint16 * begin, guint16 * end) { - if (begin) - *begin = key_rest->m_raw_begin; - - if (end) - *end = key_rest->m_raw_end; - - return true; -} - -bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance, - ChewingKeyRest * key_rest, - guint16 * length) { - *length = key_rest->length(); - return true; -} - -bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance, - const gchar ** utf8_str) { - *utf8_str = instance->m_raw_full_pinyin; - return true; -} - -bool pinyin_get_n_phrase(pinyin_instance_t * instance, - guint * num) { - *num = instance->m_match_results->len; - return true; -} - -bool pinyin_get_phrase_token(pinyin_instance_t * instance, - guint index, - phrase_token_t * token){ - MatchResults & match_results = instance->m_match_results; - - *token = null_token; - - if (index >= match_results->len) - return false; - - *token = g_array_index(match_results, phrase_token_t, index); - - return true; -} - - -/** - * Note: prefix is the text before the pre-edit string. - */ diff --git a/src/pinyin.h b/src/pinyin.h deleted file mode 100644 index a9601a0..0000000 --- a/src/pinyin.h +++ /dev/null @@ -1,663 +0,0 @@ -/* - * libzhuyin - * Library to deal with zhuyin. - * - * Copyright (C) 2011 Peng Wu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - - -#ifndef PINYIN_H -#define PINYIN_H - - -#include "novel_types.h" -#include "zhuyin_custom2.h" - - -G_BEGIN_DECLS - -typedef struct _ChewingKey ChewingKey; -typedef struct _ChewingKeyRest ChewingKeyRest; - -typedef struct _pinyin_context_t pinyin_context_t; -typedef struct _pinyin_instance_t pinyin_instance_t; -typedef struct _lookup_candidate_t lookup_candidate_t; - -typedef struct _import_iterator_t import_iterator_t; - -typedef enum _lookup_candidate_type_t{ - BEST_MATCH_CANDIDATE = 1, - NORMAL_CANDIDATE, - ZOMBIE_CANDIDATE -} lookup_candidate_type_t; - -/** - * pinyin_init: - * @systemdir: the system wide language model data directory. - * @userdir: the user's language model data directory. - * @returns: the newly created pinyin context, NULL if failed. - * - * Create a new pinyin context. - * - */ -pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir); - -/** - * pinyin_load_phrase_library: - * @context: the pinyin context. - * @index: the phrase index to be loaded. - * @returns: whether the load succeeded. - * - * Load the sub phrase library of the index. - * - */ -bool pinyin_load_phrase_library(pinyin_context_t * context, - guint8 index); - -/** - * pinyin_unload_phrase_library: - * @context: the pinyin context. - * @index: the phrase index to be unloaded. - * @returns: whether the unload succeeded. - * - * Unload the sub phrase library of the index. 
- * - */ -bool pinyin_unload_phrase_library(pinyin_context_t * context, - guint8 index); - -/** - * pinyin_begin_add_phrases: - * @context: the pinyin context. - * @index: the phrase index to be imported. - * @returns: the import iterator. - * - * Begin to add phrases. - * - */ -import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, - guint8 index); - -/** - * pinyin_iterator_add_phrase: - * @iter: the import iterator. - * @phrase: the phrase string. - * @pinyin: the pinyin string. - * @count: the count of the phrase/pinyin pair, -1 to use the default value. - * @returns: whether the add operation succeeded. - * - * Add a pair of phrase and pinyin with count. - * - */ -bool pinyin_iterator_add_phrase(import_iterator_t * iter, - const char * phrase, - const char * pinyin, - gint count); - -/** - * pinyin_end_add_phrases: - * @iter: the import iterator. - * - * End adding phrases. - * - */ -void pinyin_end_add_phrases(import_iterator_t * iter); - -/** - * pinyin_save: - * @context: the pinyin context to be saved into user directory. - * @returns: whether the save succeeded. - * - * Save the user's self-learning information of the pinyin context. - * - */ -bool pinyin_save(pinyin_context_t * context); - -/** - * pinyin_set_chewing_scheme: - * @context: the pinyin context. - * @scheme: the chewing scheme. - * @returns: whether the set chewing scheme succeeded. - * - * Change the chewing scheme of the pinyin context. - * - */ -bool pinyin_set_chewing_scheme(pinyin_context_t * context, - ChewingScheme scheme); - -/** - * pinyin_set_full_pinyin_scheme: - * @context: the pinyin context. - * @scheme: the full pinyin scheme. - * @returns: whether the set full pinyin scheme succeeded. - * - * Change the full pinyin scheme of the pinyin context. - * - */ -bool pinyin_set_full_pinyin_scheme(pinyin_context_t * context, - FullPinyinScheme scheme); - -/** - * pinyin_fini: - * @context: the pinyin context. - * - * Finalize the pinyin context. 
- * - */ -void pinyin_fini(pinyin_context_t * context); - - -/** - * pinyin_mask_out: - * @context: the pinyin context. - * @mask: the mask. - * @value: the value. - * @returns: whether the mask out operation is successful. - * - * Mask out the matched phrase tokens. - * - */ -bool pinyin_mask_out(pinyin_context_t * context, - phrase_token_t mask, - phrase_token_t value); - - -/** - * pinyin_set_options: - * @context: the pinyin context. - * @options: the pinyin options of the pinyin context. - * @returns: whether the set options scheme succeeded. - * - * Set the options of the pinyin context. - * - */ -bool pinyin_set_options(pinyin_context_t * context, - pinyin_option_t options); - -/** - * pinyin_alloc_instance: - * @context: the pinyin context. - * @returns: the newly allocated pinyin instance, NULL if failed. - * - * Allocate a new pinyin instance from the context. - * - */ -pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context); - -/** - * pinyin_free_instance: - * @instance: the pinyin instance. - * - * Free the pinyin instance. - * - */ -void pinyin_free_instance(pinyin_instance_t * instance); - - -/** - * pinyin_guess_sentence: - * @instance: the pinyin instance. - * @returns: whether the sentence are guessed successfully. - * - * Guess a sentence from the saved pinyin keys in the instance. - * - */ -bool pinyin_guess_sentence(pinyin_instance_t * instance); - -/** - * pinyin_guess_sentence_with_prefix: - * @instance: the pinyin instance. - * @prefix: the prefix before the sentence. - * @returns: whether the sentence are guessed successfully. - * - * Guess a sentence from the saved pinyin keys with a prefix. - * - */ -bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, - const char * prefix); - -/** - * pinyin_phrase_segment: - * @instance: the pinyin instance. - * @sentence: the utf-8 sentence to be segmented. - * @returns: whether the sentence are segmented successfully. 
- * - * Segment a sentence and saved the result in the instance. - * - */ -bool pinyin_phrase_segment(pinyin_instance_t * instance, - const char * sentence); - -/** - * pinyin_get_sentence: - * @instance: the pinyin instance. - * @sentence: the saved sentence in the instance. - * @returns: whether the sentence is already saved in the instance. - * - * Get the sentence from the instance. - * - * Note: the returned sentence should be freed by g_free(). - * - */ -bool pinyin_get_sentence(pinyin_instance_t * instance, - char ** sentence); - -/** - * pinyin_parse_full_pinyin: - * @instance: the pinyin instance. - * @onepinyin: a single full pinyin to be parsed. - * @onekey: the parsed key. - * @returns: whether the parse is successfully. - * - * Parse a single full pinyin. - * - */ -bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, - const char * onepinyin, - ChewingKey * onekey); - -/** - * pinyin_parse_more_full_pinyins: - * @instance: the pinyin instance. - * @pinyins: the full pinyins to be parsed. - * @returns: the parsed length of the full pinyins. - * - * Parse multiple full pinyins and save it in the instance. - * - */ -size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, - const char * pinyins); - -/** - * pinyin_parse_chewing: - * @instance: the pinyin instance. - * @onechewing: the single chewing to be parsed. - * @onekey: the parsed key. - * @returns: whether the parse is successfully. - * - * Parse a single chewing. - * - */ -bool pinyin_parse_chewing(pinyin_instance_t * instance, - const char * onechewing, - ChewingKey * onekey); - -/** - * pinyin_parse_more_chewings: - * @instance: the pinyin instance. - * @chewings: the chewings to be parsed. - * @returns: the parsed length of the chewings. - * - * Parse multiple chewings and save it in the instance. - * - */ -size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, - const char * chewings); - -/** - * pinyin_in_chewing_keyboard: - * @instance: the pinyin instance. 
- * @key: the input key. - * @symbol: the chewing symbol. - * @returns: whether the key is in current chewing scheme. - * - * Check whether the input key is in current chewing scheme. - * - */ -bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, - const char key, const char ** symbol); -/** - * pinyin_guess_candidates: - * @instance: the pinyin instance. - * @offset: the offset in the pinyin keys. - * @returns: whether a list of tokens are gotten. - * - * Guess the candidates at the offset. - * - */ -bool pinyin_guess_candidates(pinyin_instance_t * instance, - size_t offset); - -/** - * pinyin_choose_candidate: - * @instance: the pinyin instance. - * @offset: the offset in the pinyin keys. - * @candidate: the selected candidate. - * @returns: the cursor after the chosen candidate. - * - * Choose a full pinyin candidate at the offset. - * - */ -int pinyin_choose_candidate(pinyin_instance_t * instance, - size_t offset, - lookup_candidate_t * candidate); - -/** -* pinyin_clear_constraint: -* @instance: the pinyin instance. -* @offset: the offset in the pinyin keys. -* @returns: whether the constraint is cleared. -* -* Clear the previous chosen candidate. -* -*/ -bool pinyin_clear_constraint(pinyin_instance_t * instance, - size_t offset); - -/** - * pinyin_lookup_tokens: - * @instance: the pinyin instance. - * @phrase: the phrase to be looked up. - * @tokenarray: the returned GArray of tokens. - * @returns: whether the lookup operation is successful. - * - * Lookup the tokens for the phrase utf8 string. - * - */ -bool pinyin_lookup_tokens(pinyin_instance_t * instance, - const char * phrase, GArray * tokenarray); - -/** - * pinyin_train: - * @instance: the pinyin instance. - * @returns: whether the sentence is trained. - * - * Train the current user input sentence. - * - */ -bool pinyin_train(pinyin_instance_t * instance); - -/** - * pinyin_reset: - * @instance: the pinyin instance. - * @returns: whether the pinyin instance is resetted. 
- * - * Reset the pinyin instance. - * - */ -bool pinyin_reset(pinyin_instance_t * instance); - -/** - * pinyin_get_bopomofo_string: - * @instance: the pinyin instance. - * @key: the chewing key. - * @utf8_str: the chewing string. - * @returns: whether the get operation is successful. - * - * Get the chewing string of the key. - * - */ -bool pinyin_get_bopomofo_string(pinyin_instance_t * instance, - ChewingKey * key, - gchar ** utf8_str); - -/** - * pinyin_get_pinyin_string: - * @instance: the pinyin instance. - * @key: the pinyin key. - * @utf8_str: the pinyin string. - * @returns: whether the get operation is successful. - * - * Get the pinyin string of the key. - * - */ -bool pinyin_get_pinyin_string(pinyin_instance_t * instance, - ChewingKey * key, - gchar ** utf8_str); - -/** - * pinyin_token_get_phrase: - * @instance: the pinyin instance. - * @token: the phrase token. - * @len: the phrase length. - * @utf8_str: the phrase string. - * @returns: whether the get operation is successful. - * - * Get the phrase length and utf8 string. - * - */ -bool pinyin_token_get_phrase(pinyin_instance_t * instance, - phrase_token_t token, - guint * len, - gchar ** utf8_str); - -/** - * pinyin_token_get_n_pronunciation: - * @instance: the pinyin instance. - * @token: the phrase token. - * @num: the number of pinyins. - * @returns: whether the get operation is successful. - * - * Get the number of the pinyins. - * - */ -bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, - phrase_token_t token, - guint * num); - -/** - * pinyin_token_get_nth_pronunciation: - * @instance: the pinyin instance. - * @token: the phrase token. - * @nth: the index of the pinyin. - * @keys: the GArray of chewing key. - * @returns: whether the get operation is successful. - * - * Get the nth pinyin from the phrase. 
- * - */ -bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance, - phrase_token_t token, - guint nth, - ChewingKeyVector keys); - -/** - * pinyin_token_get_unigram_frequency: - * @instance: the pinyin instance. - * @token: the phrase token. - * @freq: the unigram frequency of the phrase. - * @returns: whether the get operation is successful. - * - * Get the unigram frequency of the phrase. - * - */ -bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance, - phrase_token_t token, - guint * freq); - -/** - * pinyin_token_add_unigram_frequency: - * @instance: the pinyin instance. - * @token: the phrase token. - * @delta: the delta of the unigram frequency. - * @returns: whether the add operation is successful. - * - * Add delta to the unigram frequency of the phrase token. - * - */ -bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance, - phrase_token_t token, - guint delta); - -/** - * pinyin_get_n_candidate: - * @instance: the pinyin instance. - * @num: the number of the candidates. - * @returns: whether the get operation is successful. - * - * Get the number of the candidates. - * - */ -bool pinyin_get_n_candidate(pinyin_instance_t * instance, - guint * num); - -/** - * pinyin_get_candidate: - * @instance: the pinyin instance. - * @index: the index of the candidate. - * @candidate: the retrieved candidate. - * - * Get the candidate of the index from the candidates. - * - */ -bool pinyin_get_candidate(pinyin_instance_t * instance, - guint index, - lookup_candidate_t ** candidate); - -/** - * pinyin_get_candidate_type: - * @instance: the pinyin instance. - * @candidate: the lookup candidate. - * @type: the type of the candidate. - * @returns: whether the get operation is successful. - * - * Get the type of the lookup candidate. 
- * - */ -bool pinyin_get_candidate_type(pinyin_instance_t * instance, - lookup_candidate_t * candidate, - lookup_candidate_type_t * type); - -/** - * pinyin_get_candidate_string: - * @instance: the pinyin instance. - * @candidate: the lookup candidate. - * @utf8_str: the string of the candidate. - * @returns: whether the get operation is successful. - * - * Get the string of the candidate. - * - */ -bool pinyin_get_candidate_string(pinyin_instance_t * instance, - lookup_candidate_t * candidate, - const gchar ** utf8_str); - -/** - * pinyin_get_n_pinyin: - * @instance: the pinyin instance. - * @num: the number of the pinyins. - * @returns: whether the get operation is successful. - * - * Get the number of the pinyins. - * - */ -bool pinyin_get_n_pinyin(pinyin_instance_t * instance, - guint * num); - -/** - * pinyin_get_pinyin_key: - * @instance: the pinyin instance. - * @index: the index of the pinyin key. - * @key: the retrieved pinyin key. - * @returns: whether the get operation is successful. - * - * Get the pinyin key of the index from the pinyin keys. - * - */ -bool pinyin_get_pinyin_key(pinyin_instance_t * instance, - guint index, - ChewingKey ** key); - -/** - * pinyin_get_pinyin_key_rest: - * @instance: the pinyin index. - * @index: the index of the pinyin key rest. - * @key_rest: the retrieved pinyin key rest. - * @returns: whether the get operation is successful. - * - * Get the pinyin key rest of the index from the pinyin key rests. - * - */ -bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance, - guint index, - ChewingKeyRest ** key_rest); - -/** - * pinyin_get_pinyin_key_rest_positions: - * @instance: the pinyin instance. - * @key_rest: the pinyin key rest. - * @begin: the begin position of the corresponding pinyin key. - * @end: the end position of the corresponding pinyin key. - * @returns: whether the get operation is successful. - * - * Get the positions of the pinyin key rest. 
- * - */ -bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance, - ChewingKeyRest * key_rest, - guint16 * begin, guint16 * end); - -/** - * pinyin_get_pinyin_key_rest_length: - * @instance: the pinyin instance. - * @key_rest: the pinyin key rest. - * @length: the length of the corresponding pinyin key. - * @returns: whether the get operation is successful. - * - * Get the length of the corresponding pinyin key. - * - */ -bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance, - ChewingKeyRest * key_rest, - guint16 * length); - -/** - * pinyin_get_raw_full_pinyin: - * @instance: the pinyin instance. - * @utf8_str: the modified raw full pinyin after choose candidate. - * @returns: whether the get operation is successful. - * - * Get the modified raw full pinyin after choose candidate. - * - */ -bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance, - const gchar ** utf8_str); - -/** - * pinyin_get_n_phrase: - * @instance: the pinyin instance. - * @num: the number of the phrase tokens. - * @returns: whether the get operation is successful. - * - * Get the number of the phrase tokens. - * - */ -bool pinyin_get_n_phrase(pinyin_instance_t * instance, - guint * num); - -/** - * pinyin_get_phrase_token: - * @instance: the pinyin instance. - * @index: the index of the phrase token. - * @token: the retrieved phrase token. - * @returns: whether the get operation is successful. - * - * Get the phrase token of the index from the phrase tokens. - * - */ -bool pinyin_get_phrase_token(pinyin_instance_t * instance, - guint index, - phrase_token_t * token); - -/* hack here. */ -typedef ChewingKey PinyinKey; -typedef ChewingKeyRest PinyinKeyPos; - - -G_END_DECLS - -#endif diff --git a/src/zhuyin.cpp b/src/zhuyin.cpp new file mode 100644 index 0000000..f1a8aff --- /dev/null +++ b/src/zhuyin.cpp @@ -0,0 +1,1681 @@ +/* + * libzhuyin + * Library to deal with zhuyin. 
+ * + * Copyright (C) 2011 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#include "pinyin.h" +#include +#include +#include +#include "pinyin_internal.h" + + +using namespace zhuyin; + +/* a glue layer for input method integration. */ + +typedef GArray * CandidateVector; /* GArray of lookup_candidate_t */ + +struct _pinyin_context_t{ + pinyin_option_t m_options; + + FullPinyinScheme m_full_pinyin_scheme; + FullPinyinParser2 * m_full_pinyin_parser; + ChewingParser2 * m_chewing_parser; + + FacadeChewingTable * m_pinyin_table; + FacadePhraseTable2 * m_phrase_table; + FacadePhraseIndex * m_phrase_index; + Bigram * m_system_bigram; + Bigram * m_user_bigram; + + PinyinLookup2 * m_pinyin_lookup; + PhraseLookup * m_phrase_lookup; + + char * m_system_dir; + char * m_user_dir; + bool m_modified; + + SystemTableInfo m_system_table_info; +}; + +struct _pinyin_instance_t{ + pinyin_context_t * m_context; + gchar * m_raw_full_pinyin; + TokenVector m_prefixes; + ChewingKeyVector m_pinyin_keys; + ChewingKeyRestVector m_pinyin_key_rests; + CandidateConstraints m_constraints; + MatchResults m_match_results; + CandidateVector m_candidates; +}; + +struct _lookup_candidate_t{ + lookup_candidate_type_t m_candidate_type; + gchar * m_phrase_string; + phrase_token_t m_token; + ChewingKeyRest 
m_orig_rest; + gchar * m_new_pinyins; + guint32 m_freq; /* the amplifed gfloat numerical value. */ +public: + _lookup_candidate_t() { + m_candidate_type = NORMAL_CANDIDATE; + m_phrase_string = NULL; + m_token = null_token; + m_new_pinyins = NULL; + m_freq = 0; + } +}; + +struct _import_iterator_t{ + pinyin_context_t * m_context; + guint8 m_phrase_index; +}; + + +static bool check_format(pinyin_context_t * context){ + const char * userdir = context->m_user_dir; + + UserTableInfo user_table_info; + gchar * filename = g_build_filename + (userdir, USER_TABLE_INFO, NULL); + user_table_info.load(filename); + g_free(filename); + + bool exists = user_table_info.is_conform + (&context->m_system_table_info); + + if (exists) + return exists; + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + /* clean up files, if version mis-matches. */ + for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const pinyin_table_info_t * table_info = phrase_files + i; + + if (NOT_USED == table_info->m_file_type) + continue; + + if (NULL == table_info->m_user_filename) + continue; + + const char * userfilename = table_info->m_user_filename; + + /* remove dbin file. 
*/ + filename = g_build_filename(userdir, userfilename, NULL); + unlink(filename); + g_free(filename); + } + + filename = g_build_filename + (userdir, USER_PINYIN_INDEX, NULL); + unlink(filename); + g_free(filename); + + filename = g_build_filename + (userdir, USER_PHRASE_INDEX, NULL); + unlink(filename); + g_free(filename); + + filename = g_build_filename + (userdir, USER_BIGRAM, NULL); + unlink(filename); + g_free(filename); + + return exists; +} + +static bool mark_version(pinyin_context_t * context){ + const char * userdir = context->m_user_dir; + + UserTableInfo user_table_info; + user_table_info.make_conform(&context->m_system_table_info); + + gchar * filename = g_build_filename + (userdir, USER_TABLE_INFO, NULL); + bool retval = user_table_info.save(filename); + g_free(filename); + + return retval; +} + +pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ + pinyin_context_t * context = new pinyin_context_t; + + context->m_options = USE_TONE; + + context->m_system_dir = g_strdup(systemdir); + context->m_user_dir = g_strdup(userdir); + context->m_modified = false; + + gchar * filename = g_build_filename + (context->m_system_dir, SYSTEM_TABLE_INFO, NULL); + if (!context->m_system_table_info.load(filename)) { + fprintf(stderr, "load %s failed!\n", filename); + return NULL; + } + g_free(filename); + + + check_format(context); + + context->m_full_pinyin_scheme = FULL_PINYIN_DEFAULT; + context->m_full_pinyin_parser = new FullPinyinParser2; + context->m_chewing_parser = new ChewingDiscreteParser2; + + /* load chewing table. */ + context->m_pinyin_table = new FacadeChewingTable; + + /* load system chewing table. 
*/ + MemoryChunk * chunk = new MemoryChunk; + filename = g_build_filename + (context->m_system_dir, SYSTEM_PINYIN_INDEX, NULL); + if (!chunk->load(filename)) { + fprintf(stderr, "open %s failed!\n", filename); + return NULL; + } + g_free(filename); + + /* load user chewing table */ + MemoryChunk * userchunk = new MemoryChunk; + filename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX, NULL); + if (!userchunk->load(filename)) { + /* hack here: use local Chewing Table to create empty memory chunk. */ + ChewingLargeTable table(context->m_options); + table.store(userchunk); + } + g_free(filename); + + context->m_pinyin_table->load(context->m_options, chunk, userchunk); + + /* load phrase table */ + context->m_phrase_table = new FacadePhraseTable2; + + /* load system phrase table */ + chunk = new MemoryChunk; + filename = g_build_filename + (context->m_system_dir, SYSTEM_PHRASE_INDEX, NULL); + if (!chunk->load(filename)) { + fprintf(stderr, "open %s failed!\n", filename); + return NULL; + } + g_free(filename); + + /* load user phrase table */ + userchunk = new MemoryChunk; + filename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX, NULL); + if (!userchunk->load(filename)) { + /* hack here: use local Phrase Table to create empty memory chunk. */ + PhraseLargeTable2 table; + table.store(userchunk); + } + g_free(filename); + + context->m_phrase_table->load(chunk, userchunk); + + context->m_phrase_index = new FacadePhraseIndex; + + /* hack here: directly call load phrase library. 
*/ + pinyin_load_phrase_library(context, GB_DICTIONARY); + pinyin_load_phrase_library(context, MERGED_DICTIONARY); + + context->m_system_bigram = new Bigram; + filename = g_build_filename(context->m_system_dir, SYSTEM_BIGRAM, NULL); + context->m_system_bigram->attach(filename, ATTACH_READONLY); + g_free(filename); + + context->m_user_bigram = new Bigram; + filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); + context->m_user_bigram->load_db(filename); + g_free(filename); + + gfloat lambda = context->m_system_table_info.get_lambda(); + + context->m_pinyin_lookup = new PinyinLookup2 + ( lambda, context->m_options, + context->m_pinyin_table, context->m_phrase_index, + context->m_system_bigram, context->m_user_bigram); + + context->m_phrase_lookup = new PhraseLookup + (lambda, + context->m_phrase_table, context->m_phrase_index, + context->m_system_bigram, context->m_user_bigram); + + return context; +} + +bool pinyin_load_phrase_library(pinyin_context_t * context, + guint8 index){ + if (!(index < PHRASE_INDEX_LIBRARY_COUNT)) + return false; + + /* check whether the sub phrase index is already loaded. */ + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(index, range); + if (ERROR_OK == retval) + return false; + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + const pinyin_table_info_t * table_info = phrase_files + index; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + /* system phrase library */ + MemoryChunk * chunk = new MemoryChunk; + + const char * systemfilename = table_info->m_system_filename; + /* check bin file in system dir. 
*/ + gchar * chunkfilename = g_build_filename(context->m_system_dir, + systemfilename, NULL); + chunk->load(chunkfilename); + g_free(chunkfilename); + + context->m_phrase_index->load(index, chunk); + + const char * userfilename = table_info->m_user_filename; + + chunkfilename = g_build_filename(context->m_user_dir, + userfilename, NULL); + + MemoryChunk * log = new MemoryChunk; + log->load(chunkfilename); + g_free(chunkfilename); + + /* merge the chunk log. */ + context->m_phrase_index->merge(index, log); + return true; + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + MemoryChunk * chunk = new MemoryChunk; + const char * userfilename = table_info->m_user_filename; + + gchar * chunkfilename = g_build_filename(context->m_user_dir, + userfilename, NULL); + + /* check bin file exists. if not, create a new one. */ + if (chunk->load(chunkfilename)) { + context->m_phrase_index->load(index, chunk); + } else { + delete chunk; + context->m_phrase_index->create_sub_phrase(index); + } + + g_free(chunkfilename); + return true; + } + + return false; +} + +bool pinyin_unload_phrase_library(pinyin_context_t * context, + guint8 index){ + /* gb_char.bin and merged.bin can't be unloaded. */ + if (GB_DICTIONARY == index || MERGED_DICTIONARY == index) + return false; + + assert(index < PHRASE_INDEX_LIBRARY_COUNT); + + context->m_phrase_index->unload(index); + return true; +} + +import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, + guint8 index){ + import_iterator_t * iter = new import_iterator_t; + iter->m_context = context; + iter->m_phrase_index = index; + return iter; +} + +bool pinyin_iterator_add_phrase(import_iterator_t * iter, + const char * phrase, + const char * pinyin, + gint count){ + /* if -1 == count, use the default value. 
*/ + const gint default_count = 5; + const guint32 unigram_factor = 3; + if (-1 == count) + count = default_count; + + pinyin_context_t * & context = iter->m_context; + FacadePhraseTable2 * & phrase_table = context->m_phrase_table; + FacadeChewingTable * & pinyin_table = context->m_pinyin_table; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + bool result = false; + + if (NULL == phrase || NULL == pinyin) + return result; + + /* check whether the phrase exists in phrase table */ + glong len_phrase = 0; + ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &len_phrase, NULL); + + /* pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; */ + pinyin_option_t options = USE_TONE; + FullPinyinParser2 parser; + ChewingKeyVector keys = + g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + ChewingKeyRestVector key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + /* parse the pinyin. */ + parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); + + if (len_phrase != keys->len) + return result; + + if (0 == len_phrase || len_phrase >= MAX_PHRASE_LENGTH) + return result; + + phrase_token_t token = null_token; + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + /* do phrase table search. */ + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + int retval = phrase_table->search(len_phrase, ucs4_phrase, tokens); + int num = reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + /* find the best token candidate. */ + for (size_t i = 0; i < tokenarray->len; ++i) { + phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i); + if (null_token == token) { + token = candidate; + continue; + } + + if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == iter->m_phrase_index) { + /* only one phrase string per sub phrase index. 
*/ + assert(PHRASE_INDEX_LIBRARY_INDEX(token) != iter->m_phrase_index); + token = candidate; + continue; + } + } + g_array_free(tokenarray, TRUE); + + PhraseItem item; + /* check whether it exists in the same sub phrase index; */ + if (null_token != token && + PHRASE_INDEX_LIBRARY_INDEX(token) == iter->m_phrase_index) { + /* if so, remove the phrase, add the pinyin for the phrase item, + then add it back;*/ + phrase_index->get_phrase_item(token, item); + assert(len_phrase == item.get_phrase_length()); + ucs4_t tmp_phrase[MAX_PHRASE_LENGTH]; + item.get_phrase_string(tmp_phrase); + assert(0 == memcmp + (ucs4_phrase, tmp_phrase, sizeof(ucs4_t) * len_phrase)); + + PhraseItem * removed_item = NULL; + retval = phrase_index->remove_phrase_item(token, removed_item); + if (ERROR_OK == retval) { + /* maybe check whether there are duplicated pronunciations here. */ + removed_item->add_pronunciation((ChewingKey *)keys->data, + count); + phrase_index->add_phrase_item(token, removed_item); + delete removed_item; + result = true; + } + } else { + /* if not exists in the same sub phrase index, + get the maximum token, + then add it directly with maximum token + 1; */ + PhraseIndexRange range; + retval = phrase_index->get_range(iter->m_phrase_index, range); + + if (ERROR_OK == retval) { + token = range.m_range_end; + if (0x00000000 == (token & PHRASE_MASK)) + token++; + + if (len_phrase == keys->len) { /* valid pinyin */ + phrase_table->add_index(len_phrase, ucs4_phrase, token); + pinyin_table->add_index + (keys->len, (ChewingKey *)(keys->data), token); + + item.set_phrase_string(len_phrase, ucs4_phrase); + item.add_pronunciation((ChewingKey *)(keys->data), count); + phrase_index->add_phrase_item(token, &item); + phrase_index->add_unigram_frequency(token, + count * unigram_factor); + result = true; + } + } + } + + g_array_free(key_rests, TRUE); + g_array_free(keys, TRUE); + g_free(ucs4_phrase); + return result; +} + +void pinyin_end_add_phrases(import_iterator_t * iter){ + /* 
compact the content memory chunk of phrase index. */ + iter->m_context->m_phrase_index->compact(); + iter->m_context->m_modified = true; + delete iter; +} + +bool pinyin_save(pinyin_context_t * context){ + if (!context->m_user_dir) + return false; + + if (!context->m_modified) + return false; + + context->m_phrase_index->compact(); + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + /* skip the reserved zero phrase library. */ + for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(i, range); + + if (ERROR_NO_SUB_PHRASE_INDEX == retval) + continue; + + const pinyin_table_info_t * table_info = phrase_files + i; + + if (NOT_USED == table_info->m_file_type) + continue; + + const char * userfilename = table_info->m_user_filename; + + if (NULL == userfilename) + continue; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + /* system phrase library */ + MemoryChunk * chunk = new MemoryChunk; + MemoryChunk * log = new MemoryChunk; + const char * systemfilename = table_info->m_system_filename; + + /* check bin file in system dir. 
*/ + gchar * chunkfilename = g_build_filename(context->m_system_dir, + systemfilename, NULL); + chunk->load(chunkfilename); + g_free(chunkfilename); + context->m_phrase_index->diff(i, chunk, log); + + const char * userfilename = table_info->m_user_filename; + gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); + + gchar * tmppathname = g_build_filename(context->m_user_dir, + tmpfilename, NULL); + g_free(tmpfilename); + + gchar * chunkpathname = g_build_filename(context->m_user_dir, + userfilename, NULL); + log->save(tmppathname); + + int result = rename(tmppathname, chunkpathname); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmppathname, chunkpathname); + + g_free(chunkpathname); + g_free(tmppathname); + delete log; + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + MemoryChunk * chunk = new MemoryChunk; + context->m_phrase_index->store(i, chunk); + + const char * userfilename = table_info->m_user_filename; + gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); + gchar * tmppathname = g_build_filename(context->m_user_dir, + tmpfilename, NULL); + g_free(tmpfilename); + + gchar * chunkpathname = g_build_filename(context->m_user_dir, + userfilename, NULL); + + chunk->save(tmppathname); + + int result = rename(tmppathname, chunkpathname); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmppathname, chunkpathname); + + g_free(chunkpathname); + g_free(tmppathname); + delete chunk; + } + } + + /* save user pinyin table */ + gchar * tmpfilename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL); + unlink(tmpfilename); + gchar * filename = g_build_filename + (context->m_user_dir, USER_PINYIN_INDEX, NULL); + + MemoryChunk * chunk = new MemoryChunk; + context->m_pinyin_table->store(chunk); + chunk->save(tmpfilename); + delete chunk; + + int result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + 
tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + /* save user phrase table */ + tmpfilename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL); + unlink(tmpfilename); + filename = g_build_filename + (context->m_user_dir, USER_PHRASE_INDEX, NULL); + + chunk = new MemoryChunk; + context->m_phrase_table->store(chunk); + chunk->save(tmpfilename); + delete chunk; + + result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + /* save user bi-gram */ + tmpfilename = g_build_filename + (context->m_user_dir, USER_BIGRAM ".tmp", NULL); + unlink(tmpfilename); + filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); + context->m_user_bigram->save_db(tmpfilename); + + result = rename(tmpfilename, filename); + if (0 != result) + fprintf(stderr, "rename %s to %s failed.\n", + tmpfilename, filename); + + g_free(tmpfilename); + g_free(filename); + + mark_version(context); + + context->m_modified = false; + return true; +} + +bool pinyin_set_chewing_scheme(pinyin_context_t * context, + ChewingScheme scheme){ + delete context->m_chewing_parser; + context->m_chewing_parser = NULL; + + switch(scheme) { + case CHEWING_STANDARD: + case CHEWING_HSU: + case CHEWING_IBM: + case CHEWING_GINYIEH: + case CHEWING_ETEN: + case CHEWING_ETEN26: + case CHEWING_STANDARD_DVORAK: + case CHEWING_HSU_DVORAK: { + ChewingDiscreteParser2 * parser = new ChewingDiscreteParser2(); + parser->set_scheme(scheme); + context->m_chewing_parser = parser; + break; + } + case CHEWING_DACHEN_CP26: + context->m_chewing_parser = new ChewingDaChenCP26Parser2(); + break; + } + return true; +} + +bool pinyin_set_full_pinyin_scheme(pinyin_context_t * context, + FullPinyinScheme scheme){ + context->m_full_pinyin_scheme = scheme; + context->m_full_pinyin_parser->set_scheme(scheme); + return true; +} + +void pinyin_fini(pinyin_context_t * 
context){ + delete context->m_full_pinyin_parser; + delete context->m_chewing_parser; + delete context->m_pinyin_table; + delete context->m_phrase_table; + delete context->m_phrase_index; + delete context->m_system_bigram; + delete context->m_user_bigram; + delete context->m_pinyin_lookup; + delete context->m_phrase_lookup; + + g_free(context->m_system_dir); + g_free(context->m_user_dir); + context->m_modified = false; + + delete context; +} + +bool pinyin_mask_out(pinyin_context_t * context, + phrase_token_t mask, + phrase_token_t value) { + + context->m_pinyin_table->mask_out(mask, value); + context->m_phrase_table->mask_out(mask, value); + context->m_user_bigram->mask_out(mask, value); + + const pinyin_table_info_t * phrase_files = + context->m_system_table_info.get_table_info(); + + /* mask out the phrase index. */ + for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) { + PhraseIndexRange range; + int retval = context->m_phrase_index->get_range(index, range); + + if (ERROR_NO_SUB_PHRASE_INDEX == retval) + continue; + + const pinyin_table_info_t * table_info = phrase_files + index; + + if (NOT_USED == table_info->m_file_type) + continue; + + const char * userfilename = table_info->m_user_filename; + + if (NULL == userfilename) + continue; + + if (SYSTEM_FILE == table_info->m_file_type || + DICTIONARY == table_info->m_file_type) { + /* system phrase library */ + MemoryChunk * chunk = new MemoryChunk; + + const char * systemfilename = table_info->m_system_filename; + /* check bin file in system dir. 
*/ + gchar * chunkfilename = g_build_filename(context->m_system_dir, + systemfilename, NULL); + chunk->load(chunkfilename); + g_free(chunkfilename); + + context->m_phrase_index->load(index, chunk); + + const char * userfilename = table_info->m_user_filename; + + chunkfilename = g_build_filename(context->m_user_dir, + userfilename, NULL); + + MemoryChunk * log = new MemoryChunk; + log->load(chunkfilename); + g_free(chunkfilename); + + /* merge the chunk log with mask. */ + context->m_phrase_index->merge_with_mask(index, log, mask, value); + } + + if (USER_FILE == table_info->m_file_type) { + /* user phrase library */ + context->m_phrase_index->mask_out(index, mask, value); + } + } + + context->m_phrase_index->compact(); + return true; +} + +/* copy from options to context->m_options. */ +bool pinyin_set_options(pinyin_context_t * context, + pinyin_option_t options){ + context->m_options = options; + context->m_pinyin_table->set_options(context->m_options); + context->m_pinyin_lookup->set_options(context->m_options); + return true; +} + + +pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){ + pinyin_instance_t * instance = new pinyin_instance_t; + instance->m_context = context; + + instance->m_raw_full_pinyin = NULL; + + instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); + instance->m_pinyin_key_rests = + g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + instance->m_constraints = g_array_new + (TRUE, FALSE, sizeof(lookup_constraint_t)); + instance->m_match_results = + g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + instance->m_candidates = + g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); + + return instance; +} + +void pinyin_free_instance(pinyin_instance_t * instance){ + g_free(instance->m_raw_full_pinyin); + g_array_free(instance->m_prefixes, TRUE); + g_array_free(instance->m_pinyin_keys, TRUE); + 
g_array_free(instance->m_pinyin_key_rests, TRUE); + g_array_free(instance->m_constraints, TRUE); + g_array_free(instance->m_match_results, TRUE); + g_array_free(instance->m_candidates, TRUE); + + delete instance; +} + + +static bool pinyin_update_constraints(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + CandidateConstraints & constraints = instance->m_constraints; + + size_t key_len = constraints->len; + g_array_set_size(constraints, pinyin_keys->len); + for (size_t i = key_len; i < pinyin_keys->len; ++i ) { + lookup_constraint_t * constraint = + &g_array_index(constraints, lookup_constraint_t, i); + constraint->m_type = NO_CONSTRAINT; + } + + context->m_pinyin_lookup->validate_constraint + (constraints, pinyin_keys); + + return true; +} + + +bool pinyin_guess_sentence(pinyin_instance_t * instance){ + pinyin_context_t * & context = instance->m_context; + + g_array_set_size(instance->m_prefixes, 0); + g_array_append_val(instance->m_prefixes, sentence_start); + + pinyin_update_constraints(instance); + bool retval = context->m_pinyin_lookup->get_best_match + (instance->m_prefixes, + instance->m_pinyin_keys, + instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, + const char * prefix){ + pinyin_context_t * & context = instance->m_context; + + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + g_array_set_size(instance->m_prefixes, 0); + g_array_append_val(instance->m_prefixes, sentence_start); + + glong len_str = 0; + ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL); + GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); + + if (ucs4_str && len_str) { + /* add prefixes. 
*/ + for (ssize_t i = 1; i <= len_str; ++i) { + if (i > MAX_PHRASE_LENGTH) + break; + + ucs4_t * start = ucs4_str + len_str - i; + + PhraseTokens tokens; + memset(tokens, 0, sizeof(tokens)); + phrase_index->prepare_tokens(tokens); + int result = context->m_phrase_table->search(i, start, tokens); + int num = reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + if (result & SEARCH_OK) + g_array_append_vals(instance->m_prefixes, + tokenarray->data, tokenarray->len); + } + } + g_array_free(tokenarray, TRUE); + g_free(ucs4_str); + + pinyin_update_constraints(instance); + bool retval = context->m_pinyin_lookup->get_best_match + (instance->m_prefixes, + instance->m_pinyin_keys, + instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_phrase_segment(pinyin_instance_t * instance, + const char * sentence){ + pinyin_context_t * & context = instance->m_context; + + const glong num_of_chars = g_utf8_strlen(sentence, -1); + glong ucs4_len = 0; + ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL); + + g_return_val_if_fail(num_of_chars == ucs4_len, FALSE); + + bool retval = context->m_phrase_lookup->get_best_match + (ucs4_len, ucs4_str, instance->m_match_results); + + g_free(ucs4_str); + return retval; +} + +/* the returned sentence should be freed by g_free(). 
*/ +bool pinyin_get_sentence(pinyin_instance_t * instance, + char ** sentence){ + pinyin_context_t * & context = instance->m_context; + + bool retval = zhuyin::convert_to_utf8 + (context->m_phrase_index, instance->m_match_results, + NULL, false, *sentence); + + return retval; +} + +bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, + const char * onepinyin, + ChewingKey * onekey){ + pinyin_context_t * & context = instance->m_context; + + int pinyin_len = strlen(onepinyin); + bool retval = context->m_full_pinyin_parser->parse_one_key + ( context->m_options, *onekey, onepinyin, pinyin_len); + return retval; +} + +size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, + const char * pinyins){ + pinyin_context_t * & context = instance->m_context; + + g_free(instance->m_raw_full_pinyin); + instance->m_raw_full_pinyin = g_strdup(pinyins); + int pinyin_len = strlen(pinyins); + + int parse_len = context->m_full_pinyin_parser->parse + ( context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, pinyins, pinyin_len); + + return parse_len; +} + +bool pinyin_parse_chewing(pinyin_instance_t * instance, + const char * onechewing, + ChewingKey * onekey){ + pinyin_context_t * & context = instance->m_context; + + int chewing_len = strlen(onechewing); + bool retval = context->m_chewing_parser->parse_one_key + ( context->m_options, *onekey, onechewing, chewing_len ); + return retval; +} + +size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, + const char * chewings){ + pinyin_context_t * & context = instance->m_context; + int chewing_len = strlen(chewings); + + int parse_len = context->m_chewing_parser->parse + ( context->m_options, instance->m_pinyin_keys, + instance->m_pinyin_key_rests, chewings, chewing_len); + + return parse_len; +} + +bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, + const char key, const char ** symbol) { + pinyin_context_t * & context = instance->m_context; + return 
context->m_chewing_parser->in_chewing_scheme + (context->m_options, key, symbol); +} + +#if 0 +static gint compare_item_with_token(gconstpointer lhs, + gconstpointer rhs) { + lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; + lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; + + phrase_token_t token_lhs = item_lhs->m_token; + phrase_token_t token_rhs = item_rhs->m_token; + + return (token_lhs - token_rhs); +} +#endif + +static gint compare_item_with_frequency(gconstpointer lhs, + gconstpointer rhs) { + lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; + lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; + + guint32 freq_lhs = item_lhs->m_freq; + guint32 freq_rhs = item_rhs->m_freq; + + return -(freq_lhs - freq_rhs); /* in descendant order */ +} + +static phrase_token_t _get_previous_token(pinyin_instance_t * instance, + size_t offset) { + phrase_token_t prev_token = null_token; + ssize_t i; + + if (0 == offset) { + /* get previous token from prefixes. */ + prev_token = sentence_start; + size_t prev_token_len = 0; + + pinyin_context_t * context = instance->m_context; + TokenVector prefixes = instance->m_prefixes; + PhraseItem item; + + for (size_t i = 0; i < prefixes->len; ++i) { + phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); + if (sentence_start == token) + continue; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK == retval) { + size_t token_len = item.get_phrase_length(); + if (token_len > prev_token_len) { + /* found longer match, and save it. */ + prev_token = token; + prev_token_len = token_len; + } + } + } + } else { + /* get previous token from match results. 
*/ + assert (0 < offset); + + phrase_token_t cur_token = g_array_index + (instance->m_match_results, phrase_token_t, offset); + if (null_token != cur_token) { + for (i = offset - 1; i >= 0; --i) { + cur_token = g_array_index + (instance->m_match_results, phrase_token_t, i); + if (null_token != cur_token) { + prev_token = cur_token; + break; + } + } + } + } + + return prev_token; +} + +static void _append_items(pinyin_context_t * context, + PhraseIndexRanges ranges, + lookup_candidate_t * template_item, + CandidateVector items) { + /* reduce and append to a single GArray. */ + for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) { + if (NULL == ranges[m]) + continue; + + for (size_t n = 0; n < ranges[m]->len; ++n) { + PhraseIndexRange * range = + &g_array_index(ranges[m], PhraseIndexRange, n); + for (size_t k = range->m_range_begin; + k < range->m_range_end; ++k) { + lookup_candidate_t item; + item.m_candidate_type = template_item->m_candidate_type; + item.m_token = k; + item.m_orig_rest = template_item->m_orig_rest; + item.m_new_pinyins = g_strdup(template_item->m_new_pinyins); + item.m_freq = template_item->m_freq; + g_array_append_val(items, item); + } + } + } +} + +#if 0 +static void _remove_duplicated_items(CandidateVector items) { + /* remove the duplicated items. */ + phrase_token_t last_token = null_token, saved_token; + for (size_t n = 0; n < items->len; ++n) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, n); + + saved_token = item->m_token; + if (last_token == saved_token) { + g_array_remove_index(items, n); + n--; + } + last_token = saved_token; + } +} +#endif + +static void _compute_frequency_of_items(pinyin_context_t * context, + phrase_token_t prev_token, + SingleGram * merged_gram, + CandidateVector items) { + pinyin_option_t & options = context->m_options; + ssize_t i; + + PhraseItem cached_item; + /* compute all freqs. 
*/ + for (i = 0; i < items->len; ++i) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, i); + phrase_token_t & token = item->m_token; + + gfloat bigram_poss = 0; guint32 total_freq = 0; + if (options & DYNAMIC_ADJUST) { + if (null_token != prev_token) { + guint32 bigram_freq = 0; + merged_gram->get_total_freq(total_freq); + merged_gram->get_freq(token, bigram_freq); + if (0 != total_freq) + bigram_poss = bigram_freq / (gfloat)total_freq; + } + } + + /* compute the m_freq. */ + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + phrase_index->get_phrase_item(token, cached_item); + total_freq = phrase_index->get_phrase_index_total_freq(); + assert (0 < total_freq); + + gfloat lambda = context->m_system_table_info.get_lambda(); + + /* Note: possibility value <= 1.0. */ + guint32 freq = (lambda * bigram_poss + + (1 - lambda) * + cached_item.get_unigram_frequency() / + (gfloat) total_freq) * 256 * 256 * 256; + item->m_freq = freq; + } +} + +static bool _prepend_sentence_candidate(pinyin_instance_t * instance, + CandidateVector candidates) { + /* check whether the best match candidate exists. */ + gchar * sentence = NULL; + pinyin_get_sentence(instance, &sentence); + if (NULL == sentence) + return false; + g_free(sentence); + + /* prepend best match candidate to candidates. */ + lookup_candidate_t candidate; + candidate.m_candidate_type = BEST_MATCH_CANDIDATE; + g_array_prepend_val(candidates, candidate); + + return true; +} + +static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance, + size_t offset, + CandidateVector candidates) { + /* populate m_phrase_string in lookup_candidate_t. 
*/ + + for(size_t i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + + switch(candidate->m_candidate_type) { + case BEST_MATCH_CANDIDATE: { + gchar * sentence = NULL; + pinyin_get_sentence(instance, &sentence); + candidate->m_phrase_string = g_strdup + (g_utf8_offset_to_pointer(sentence, offset)); + g_free(sentence); + break; + } + case NORMAL_CANDIDATE: + pinyin_token_get_phrase + (instance, candidate->m_token, NULL, + &(candidate->m_phrase_string)); + break; + case ZOMBIE_CANDIDATE: + break; + } + } + + return true; +} + +static gint compare_indexed_item_with_phrase_string(gconstpointer lhs, + gconstpointer rhs, + gpointer userdata) { + size_t index_lhs = *((size_t *) lhs); + size_t index_rhs = *((size_t *) rhs); + CandidateVector candidates = (CandidateVector) userdata; + + lookup_candidate_t * candidate_lhs = + &g_array_index(candidates, lookup_candidate_t, index_lhs); + lookup_candidate_t * candidate_rhs = + &g_array_index(candidates, lookup_candidate_t, index_rhs); + + return -strcmp(candidate_lhs->m_phrase_string, + candidate_rhs->m_phrase_string); /* in descendant order */ +} + + +static bool _remove_duplicated_items_by_phrase_string +(pinyin_instance_t * instance, + CandidateVector candidates) { + size_t i; + /* create the GArray of indexed item */ + GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t)); + for (i = 0; i < candidates->len; ++i) + g_array_append_val(indices, i); + + /* sort the indices array by phrase array */ + g_array_sort_with_data + (indices, compare_indexed_item_with_phrase_string, candidates); + + /* mark duplicated items as zombie candidate */ + lookup_candidate_t * cur_item, * saved_item = NULL; + for (i = 0; i < indices->len; ++i) { + size_t cur_index = g_array_index(indices, size_t, i); + cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index); + + /* handle the first candidate */ + if (NULL == saved_item) { + saved_item = cur_item; + 
continue; + } + + if (0 == strcmp(saved_item->m_phrase_string, + cur_item->m_phrase_string)) { + /* found duplicated candidates */ + + /* keep best match candidate */ + if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + continue; + } + + if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + continue; + } + + /* keep the higher possiblity one + to quickly move the word forward in the candidate list */ + if (cur_item->m_freq > saved_item->m_freq) { + /* find better candidate */ + saved_item->m_candidate_type = ZOMBIE_CANDIDATE; + saved_item = cur_item; + continue; + } else { + cur_item->m_candidate_type = ZOMBIE_CANDIDATE; + continue; + } + } else { + /* keep the current candidate */ + saved_item = cur_item; + } + } + + g_array_free(indices, TRUE); + + /* remove zombie candidate from the returned candidates */ + for (i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + + if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) { + g_free(candidate->m_phrase_string); + g_free(candidate->m_new_pinyins); + g_array_remove_index(candidates, i); + i--; + } + } + + return true; +} + +static bool _free_candidates(CandidateVector candidates) { + /* free candidates */ + for (size_t i = 0; i < candidates->len; ++i) { + lookup_candidate_t * candidate = &g_array_index + (candidates, lookup_candidate_t, i); + g_free(candidate->m_phrase_string); + g_free(candidate->m_new_pinyins); + } + g_array_set_size(candidates, 0); + + return true; +} + +bool pinyin_guess_candidates(pinyin_instance_t * instance, + size_t offset) { + + pinyin_context_t * & context = instance->m_context; + pinyin_option_t & options = context->m_options; + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + + _free_candidates(instance->m_candidates); + + size_t pinyin_len = pinyin_keys->len - 
offset; + ssize_t i; + + /* lookup the previous token here. */ + phrase_token_t prev_token = null_token; + + if (options & DYNAMIC_ADJUST) { + prev_token = _get_previous_token(instance, offset); + } + + SingleGram merged_gram; + SingleGram * system_gram = NULL, * user_gram = NULL; + + if (options & DYNAMIC_ADJUST) { + if (null_token != prev_token) { + context->m_system_bigram->load(prev_token, system_gram); + context->m_user_bigram->load(prev_token, user_gram); + merge_single_gram(&merged_gram, system_gram, user_gram); + } + } + + PhraseIndexRanges ranges; + memset(ranges, 0, sizeof(ranges)); + context->m_phrase_index->prepare_ranges(ranges); + + GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); + + for (i = pinyin_len; i >= 1; --i) { + g_array_set_size(items, 0); + + ChewingKey * keys = &g_array_index + (pinyin_keys, ChewingKey, offset); + + /* do pinyin search. */ + int retval = context->m_pinyin_table->search + (i, keys, ranges); + + if ( !(retval & SEARCH_OK) ) + continue; + + lookup_candidate_t template_item; + _append_items(context, ranges, &template_item, items); + +#if 0 + g_array_sort(items, compare_item_with_token); + + _remove_duplicated_items(items); +#endif + + _compute_frequency_of_items(context, prev_token, &merged_gram, items); + + /* sort the candidates of the same length by frequency. 
*/ + g_array_sort(items, compare_item_with_frequency); + + /* transfer back items to tokens, and save it into candidates */ + for (size_t k = 0; k < items->len; ++k) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, k); + g_array_append_val(instance->m_candidates, *item); + } + +#if 0 + if (!(retval & SEARCH_CONTINUED)) + break; +#endif + } + + g_array_free(items, TRUE); + context->m_phrase_index->destroy_ranges(ranges); + if (system_gram) + delete system_gram; + if (user_gram) + delete user_gram; + + /* post process to remove duplicated candidates */ + + _prepend_sentence_candidate(instance, instance->m_candidates); + + _compute_phrase_strings_of_items(instance, offset, instance->m_candidates); + + _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); + + return true; +} + +int pinyin_choose_candidate(pinyin_instance_t * instance, + size_t offset, + lookup_candidate_t * candidate){ + pinyin_context_t * & context = instance->m_context; + + /* sync m_constraints to the length of m_pinyin_keys. */ + bool retval = context->m_pinyin_lookup->validate_constraint + (instance->m_constraints, instance->m_pinyin_keys); + + phrase_token_t token = candidate->m_token; + guint8 len = context->m_pinyin_lookup->add_constraint + (instance->m_constraints, offset, token); + + /* safe guard: validate the m_constraints again. 
*/ + retval = context->m_pinyin_lookup->validate_constraint + (instance->m_constraints, instance->m_pinyin_keys) && len; + + return offset + len; +} + +bool pinyin_clear_constraint(pinyin_instance_t * instance, + size_t offset){ + pinyin_context_t * & context = instance->m_context; + + bool retval = context->m_pinyin_lookup->clear_constraint + (instance->m_constraints, offset); + + return retval; +} + +/* Search the phrase table for the UTF-8 string phrase and hand the matching + * tokens to reduce_tokens(), which fills tokenarray. + * Returns whether the phrase table search reported SEARCH_OK. */ +bool pinyin_lookup_tokens(pinyin_instance_t * instance, + const char * phrase, GArray * tokenarray){ + pinyin_context_t * & context = instance->m_context; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + + /* the phrase table is searched by UCS-4 code points. */ + glong ucs4_len = 0; + ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL); + + PhraseTokens tokens; + memset(tokens, 0, sizeof(PhraseTokens)); + phrase_index->prepare_tokens(tokens); + int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens); + reduce_tokens(tokens, tokenarray); + phrase_index->destroy_tokens(tokens); + + /* g_utf8_to_ucs4() returns a newly allocated buffer; release it here, + * otherwise every lookup leaks it. g_free() accepts NULL. */ + g_free(ucs4_phrase); + + return SEARCH_OK & retval; +} + +bool pinyin_train(pinyin_instance_t * instance){ + if (!instance->m_context->m_user_dir) + return false; + + pinyin_context_t * & context = instance->m_context; + context->m_modified = true; + + bool retval = context->m_pinyin_lookup->train_result2 + (instance->m_pinyin_keys, instance->m_constraints, + instance->m_match_results); + + return retval; +} + +bool pinyin_reset(pinyin_instance_t * instance){ + g_free(instance->m_raw_full_pinyin); + instance->m_raw_full_pinyin = NULL; + + g_array_set_size(instance->m_prefixes, 0); + g_array_set_size(instance->m_pinyin_keys, 0); + g_array_set_size(instance->m_pinyin_key_rests, 0); + g_array_set_size(instance->m_constraints, 0); + g_array_set_size(instance->m_match_results, 0); + _free_candidates(instance->m_candidates); + + return true; +} + +bool pinyin_get_bopomofo_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str) { + *utf8_str = NULL; + if (0 == key->get_table_index()) + 
return false; + + *utf8_str = key->get_bopomofo_string(); + return true; +} + +bool pinyin_get_pinyin_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str) { + pinyin_context_t * context = instance->m_context; + + *utf8_str = NULL; + if (0 == key->get_table_index()) + return false; + + *utf8_str = key->get_pinyin_string(context->m_full_pinyin_scheme); + return true; +} + +bool pinyin_token_get_phrase(pinyin_instance_t * instance, + phrase_token_t token, + guint * len, + gchar ** utf8_str) { + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + ucs4_t buffer[MAX_PHRASE_LENGTH]; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + item.get_phrase_string(buffer); + guint length = item.get_phrase_length(); + if (len) + *len = length; + if (utf8_str) + *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); + return true; +} + +bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint * num){ + *num = 0; + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + *num = item.get_n_pronunciation(); + return true; +} + +bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint nth, + ChewingKeyVector keys){ + g_array_set_size(keys, 0); + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + ChewingKey buffer[MAX_PHRASE_LENGTH]; + guint32 freq = 0; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + item.get_nth_pronunciation(nth, buffer, freq); + guint8 len = item.get_phrase_length(); + g_array_append_vals(keys, buffer, len); + return true; +} + +bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint * freq) { + 
*freq = 0; + pinyin_context_t * & context = instance->m_context; + PhraseItem item; + + int retval = context->m_phrase_index->get_phrase_item(token, item); + if (ERROR_OK != retval) + return false; + + *freq = item.get_unigram_frequency(); + return true; +} + +bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint delta){ + pinyin_context_t * & context = instance->m_context; + int retval = context->m_phrase_index->add_unigram_frequency + (token, delta); + return ERROR_OK == retval; +} + +bool pinyin_get_n_candidate(pinyin_instance_t * instance, + guint * num) { + *num = instance->m_candidates->len; + return true; +} + +bool pinyin_get_candidate(pinyin_instance_t * instance, + guint index, + lookup_candidate_t ** candidate) { + CandidateVector & candidates = instance->m_candidates; + + *candidate = NULL; + + if (index >= candidates->len) + return false; + + *candidate = &g_array_index(candidates, lookup_candidate_t, index); + + return true; +} + +bool pinyin_get_candidate_type(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + lookup_candidate_type_t * type) { + *type = candidate->m_candidate_type; + return true; +} + +bool pinyin_get_candidate_string(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + const gchar ** utf8_str) { + *utf8_str = candidate->m_phrase_string; + return true; +} + +bool pinyin_get_n_pinyin(pinyin_instance_t * instance, + guint * num) { + *num = 0; + + if (instance->m_pinyin_keys->len != + instance->m_pinyin_key_rests->len) + return false; + + *num = instance->m_pinyin_keys->len; + return true; +} + +bool pinyin_get_pinyin_key(pinyin_instance_t * instance, + guint index, + ChewingKey ** key) { + ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys; + + *key = NULL; + + if (index >= pinyin_keys->len) + return false; + + *key = &g_array_index(pinyin_keys, ChewingKey, index); + + return true; +} + +bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance, + 
guint index, + ChewingKeyRest ** key_rest) { + ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests; + + *key_rest = NULL; + + if (index >= pinyin_key_rests->len) + return false; + + *key_rest = &g_array_index(pinyin_key_rests, ChewingKeyRest, index); + + return true; +} + +bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * begin, guint16 * end) { + if (begin) + *begin = key_rest->m_raw_begin; + + if (end) + *end = key_rest->m_raw_end; + + return true; +} + +bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * length) { + *length = key_rest->length(); + return true; +} + +bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance, + const gchar ** utf8_str) { + *utf8_str = instance->m_raw_full_pinyin; + return true; +} + +bool pinyin_get_n_phrase(pinyin_instance_t * instance, + guint * num) { + *num = instance->m_match_results->len; + return true; +} + +bool pinyin_get_phrase_token(pinyin_instance_t * instance, + guint index, + phrase_token_t * token){ + MatchResults & match_results = instance->m_match_results; + + *token = null_token; + + if (index >= match_results->len) + return false; + + *token = g_array_index(match_results, phrase_token_t, index); + + return true; +} + + +/** + * Note: prefix is the text before the pre-edit string. + */ diff --git a/src/zhuyin.h b/src/zhuyin.h new file mode 100644 index 0000000..a9601a0 --- /dev/null +++ b/src/zhuyin.h @@ -0,0 +1,663 @@ +/* + * libzhuyin + * Library to deal with zhuyin. + * + * Copyright (C) 2011 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef PINYIN_H +#define PINYIN_H + + +#include "novel_types.h" +#include "zhuyin_custom2.h" + + +G_BEGIN_DECLS + +typedef struct _ChewingKey ChewingKey; +typedef struct _ChewingKeyRest ChewingKeyRest; + +typedef struct _pinyin_context_t pinyin_context_t; +typedef struct _pinyin_instance_t pinyin_instance_t; +typedef struct _lookup_candidate_t lookup_candidate_t; + +typedef struct _import_iterator_t import_iterator_t; + +typedef enum _lookup_candidate_type_t{ + BEST_MATCH_CANDIDATE = 1, + NORMAL_CANDIDATE, + ZOMBIE_CANDIDATE +} lookup_candidate_type_t; + +/** + * pinyin_init: + * @systemdir: the system wide language model data directory. + * @userdir: the user's language model data directory. + * @returns: the newly created pinyin context, NULL if failed. + * + * Create a new pinyin context. + * + */ +pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir); + +/** + * pinyin_load_phrase_library: + * @context: the pinyin context. + * @index: the phrase index to be loaded. + * @returns: whether the load succeeded. + * + * Load the sub phrase library of the index. + * + */ +bool pinyin_load_phrase_library(pinyin_context_t * context, + guint8 index); + +/** + * pinyin_unload_phrase_library: + * @context: the pinyin context. + * @index: the phrase index to be unloaded. + * @returns: whether the unload succeeded. + * + * Unload the sub phrase library of the index. 
+ * + */ +bool pinyin_unload_phrase_library(pinyin_context_t * context, + guint8 index); + +/** + * pinyin_begin_add_phrases: + * @context: the pinyin context. + * @index: the phrase index to be imported. + * @returns: the import iterator. + * + * Begin to add phrases. + * + */ +import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, + guint8 index); + +/** + * pinyin_iterator_add_phrase: + * @iter: the import iterator. + * @phrase: the phrase string. + * @pinyin: the pinyin string. + * @count: the count of the phrase/pinyin pair, -1 to use the default value. + * @returns: whether the add operation succeeded. + * + * Add a pair of phrase and pinyin with count. + * + */ +bool pinyin_iterator_add_phrase(import_iterator_t * iter, + const char * phrase, + const char * pinyin, + gint count); + +/** + * pinyin_end_add_phrases: + * @iter: the import iterator. + * + * End adding phrases. + * + */ +void pinyin_end_add_phrases(import_iterator_t * iter); + +/** + * pinyin_save: + * @context: the pinyin context to be saved into user directory. + * @returns: whether the save succeeded. + * + * Save the user's self-learning information of the pinyin context. + * + */ +bool pinyin_save(pinyin_context_t * context); + +/** + * pinyin_set_chewing_scheme: + * @context: the pinyin context. + * @scheme: the chewing scheme. + * @returns: whether the set chewing scheme succeeded. + * + * Change the chewing scheme of the pinyin context. + * + */ +bool pinyin_set_chewing_scheme(pinyin_context_t * context, + ChewingScheme scheme); + +/** + * pinyin_set_full_pinyin_scheme: + * @context: the pinyin context. + * @scheme: the full pinyin scheme. + * @returns: whether the set full pinyin scheme succeeded. + * + * Change the full pinyin scheme of the pinyin context. + * + */ +bool pinyin_set_full_pinyin_scheme(pinyin_context_t * context, + FullPinyinScheme scheme); + +/** + * pinyin_fini: + * @context: the pinyin context. + * + * Finalize the pinyin context. 
+ * + */ +void pinyin_fini(pinyin_context_t * context); + + +/** + * pinyin_mask_out: + * @context: the pinyin context. + * @mask: the mask. + * @value: the value. + * @returns: whether the mask out operation is successful. + * + * Mask out the matched phrase tokens. + * + */ +bool pinyin_mask_out(pinyin_context_t * context, + phrase_token_t mask, + phrase_token_t value); + + +/** + * pinyin_set_options: + * @context: the pinyin context. + * @options: the pinyin options of the pinyin context. + * @returns: whether setting the options succeeded. + * + * Set the options of the pinyin context. + * + */ +bool pinyin_set_options(pinyin_context_t * context, + pinyin_option_t options); + +/** + * pinyin_alloc_instance: + * @context: the pinyin context. + * @returns: the newly allocated pinyin instance, NULL if failed. + * + * Allocate a new pinyin instance from the context. + * + */ +pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context); + +/** + * pinyin_free_instance: + * @instance: the pinyin instance. + * + * Free the pinyin instance. + * + */ +void pinyin_free_instance(pinyin_instance_t * instance); + + +/** + * pinyin_guess_sentence: + * @instance: the pinyin instance. + * @returns: whether the sentence is guessed successfully. + * + * Guess a sentence from the saved pinyin keys in the instance. + * + */ +bool pinyin_guess_sentence(pinyin_instance_t * instance); + +/** + * pinyin_guess_sentence_with_prefix: + * @instance: the pinyin instance. + * @prefix: the prefix before the sentence. + * @returns: whether the sentence is guessed successfully. + * + * Guess a sentence from the saved pinyin keys with a prefix. + * + */ +bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, + const char * prefix); + +/** + * pinyin_phrase_segment: + * @instance: the pinyin instance. + * @sentence: the utf-8 sentence to be segmented. + * @returns: whether the sentence is segmented successfully. 
+ * + * Segment a sentence and save the result in the instance. + * + */ +bool pinyin_phrase_segment(pinyin_instance_t * instance, + const char * sentence); + +/** + * pinyin_get_sentence: + * @instance: the pinyin instance. + * @sentence: the saved sentence in the instance. + * @returns: whether the sentence is already saved in the instance. + * + * Get the sentence from the instance. + * + * Note: the returned sentence should be freed by g_free(). + * + */ +bool pinyin_get_sentence(pinyin_instance_t * instance, + char ** sentence); + +/** + * pinyin_parse_full_pinyin: + * @instance: the pinyin instance. + * @onepinyin: a single full pinyin to be parsed. + * @onekey: the parsed key. + * @returns: whether the parse is successful. + * + * Parse a single full pinyin. + * + */ +bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, + const char * onepinyin, + ChewingKey * onekey); + +/** + * pinyin_parse_more_full_pinyins: + * @instance: the pinyin instance. + * @pinyins: the full pinyins to be parsed. + * @returns: the parsed length of the full pinyins. + * + * Parse multiple full pinyins and save them in the instance. + * + */ +size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, + const char * pinyins); + +/** + * pinyin_parse_chewing: + * @instance: the pinyin instance. + * @onechewing: the single chewing to be parsed. + * @onekey: the parsed key. + * @returns: whether the parse is successful. + * + * Parse a single chewing. + * + */ +bool pinyin_parse_chewing(pinyin_instance_t * instance, + const char * onechewing, + ChewingKey * onekey); + +/** + * pinyin_parse_more_chewings: + * @instance: the pinyin instance. + * @chewings: the chewings to be parsed. + * @returns: the parsed length of the chewings. + * + * Parse multiple chewings and save them in the instance. + * + */ +size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, + const char * chewings); + +/** + * pinyin_in_chewing_keyboard: + * @instance: the pinyin instance. 
+ * @key: the input key. + * @symbol: the chewing symbol. + * @returns: whether the key is in current chewing scheme. + * + * Check whether the input key is in current chewing scheme. + * + */ +bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, + const char key, const char ** symbol); +/** + * pinyin_guess_candidates: + * @instance: the pinyin instance. + * @offset: the offset in the pinyin keys. + * @returns: whether the list of candidates is generated. + * + * Guess the candidates at the offset. + * + */ +bool pinyin_guess_candidates(pinyin_instance_t * instance, + size_t offset); + +/** + * pinyin_choose_candidate: + * @instance: the pinyin instance. + * @offset: the offset in the pinyin keys. + * @candidate: the selected candidate. + * @returns: the cursor after the chosen candidate. + * + * Choose a full pinyin candidate at the offset. + * + */ +int pinyin_choose_candidate(pinyin_instance_t * instance, + size_t offset, + lookup_candidate_t * candidate); + +/** + * pinyin_clear_constraint: + * @instance: the pinyin instance. + * @offset: the offset in the pinyin keys. + * @returns: whether the constraint is cleared. + * + * Clear the previously chosen candidate. + * + */ +bool pinyin_clear_constraint(pinyin_instance_t * instance, + size_t offset); + +/** + * pinyin_lookup_tokens: + * @instance: the pinyin instance. + * @phrase: the phrase to be looked up. + * @tokenarray: the returned GArray of tokens. + * @returns: whether the lookup operation is successful. + * + * Look up the tokens for the UTF-8 phrase string. + * + */ +bool pinyin_lookup_tokens(pinyin_instance_t * instance, + const char * phrase, GArray * tokenarray); + +/** + * pinyin_train: + * @instance: the pinyin instance. + * @returns: whether the sentence is trained. + * + * Train the current user input sentence. + * + */ +bool pinyin_train(pinyin_instance_t * instance); + +/** + * pinyin_reset: + * @instance: the pinyin instance. + * @returns: whether the pinyin instance is reset. 
+ * + * Reset the pinyin instance. + * + */ +bool pinyin_reset(pinyin_instance_t * instance); + +/** + * pinyin_get_bopomofo_string: + * @instance: the pinyin instance. + * @key: the chewing key. + * @utf8_str: the chewing string. + * @returns: whether the get operation is successful. + * + * Get the chewing string of the key. + * + */ +bool pinyin_get_bopomofo_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str); + +/** + * pinyin_get_pinyin_string: + * @instance: the pinyin instance. + * @key: the pinyin key. + * @utf8_str: the pinyin string. + * @returns: whether the get operation is successful. + * + * Get the pinyin string of the key. + * + */ +bool pinyin_get_pinyin_string(pinyin_instance_t * instance, + ChewingKey * key, + gchar ** utf8_str); + +/** + * pinyin_token_get_phrase: + * @instance: the pinyin instance. + * @token: the phrase token. + * @len: the phrase length, may be NULL. + * @utf8_str: the phrase string, may be NULL; free the returned string with g_free(). + * @returns: whether the get operation is successful. + * + * Get the phrase length and UTF-8 string. + * + */ +bool pinyin_token_get_phrase(pinyin_instance_t * instance, + phrase_token_t token, + guint * len, + gchar ** utf8_str); + +/** + * pinyin_token_get_n_pronunciation: + * @instance: the pinyin instance. + * @token: the phrase token. + * @num: the number of pinyins. + * @returns: whether the get operation is successful. + * + * Get the number of the pinyins. + * + */ +bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint * num); + +/** + * pinyin_token_get_nth_pronunciation: + * @instance: the pinyin instance. + * @token: the phrase token. + * @nth: the index of the pinyin. + * @keys: the GArray of chewing key. + * @returns: whether the get operation is successful. + * + * Get the nth pinyin from the phrase. 
+ * + */ +bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance, + phrase_token_t token, + guint nth, + ChewingKeyVector keys); + +/** + * pinyin_token_get_unigram_frequency: + * @instance: the pinyin instance. + * @token: the phrase token. + * @freq: the unigram frequency of the phrase. + * @returns: whether the get operation is successful. + * + * Get the unigram frequency of the phrase. + * + */ +bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint * freq); + +/** + * pinyin_token_add_unigram_frequency: + * @instance: the pinyin instance. + * @token: the phrase token. + * @delta: the delta of the unigram frequency. + * @returns: whether the add operation is successful. + * + * Add delta to the unigram frequency of the phrase token. + * + */ +bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance, + phrase_token_t token, + guint delta); + +/** + * pinyin_get_n_candidate: + * @instance: the pinyin instance. + * @num: the number of the candidates. + * @returns: whether the get operation is successful. + * + * Get the number of the candidates. + * + */ +bool pinyin_get_n_candidate(pinyin_instance_t * instance, + guint * num); + +/** + * pinyin_get_candidate: + * @instance: the pinyin instance. + * @index: the index of the candidate. + * @candidate: the retrieved candidate, set to NULL when the index is out of range. + * @returns: whether the get operation is successful. + * + * Get the candidate of the index from the candidates. + * + */ +bool pinyin_get_candidate(pinyin_instance_t * instance, + guint index, + lookup_candidate_t ** candidate); + +/** + * pinyin_get_candidate_type: + * @instance: the pinyin instance. + * @candidate: the lookup candidate. + * @type: the type of the candidate. + * @returns: whether the get operation is successful. + * + * Get the type of the lookup candidate. 
+ * + */ +bool pinyin_get_candidate_type(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + lookup_candidate_type_t * type); + +/** + * pinyin_get_candidate_string: + * @instance: the pinyin instance. + * @candidate: the lookup candidate. + * @utf8_str: the string of the candidate, owned by the candidate; do not free it. + * @returns: whether the get operation is successful. + * + * Get the string of the candidate. + * + */ +bool pinyin_get_candidate_string(pinyin_instance_t * instance, + lookup_candidate_t * candidate, + const gchar ** utf8_str); + +/** + * pinyin_get_n_pinyin: + * @instance: the pinyin instance. + * @num: the number of the pinyins. + * @returns: whether the get operation is successful. + * + * Get the number of the pinyins. + * + */ +bool pinyin_get_n_pinyin(pinyin_instance_t * instance, + guint * num); + +/** + * pinyin_get_pinyin_key: + * @instance: the pinyin instance. + * @index: the index of the pinyin key. + * @key: the retrieved pinyin key. + * @returns: whether the get operation is successful. + * + * Get the pinyin key of the index from the pinyin keys. + * + */ +bool pinyin_get_pinyin_key(pinyin_instance_t * instance, + guint index, + ChewingKey ** key); + +/** + * pinyin_get_pinyin_key_rest: + * @instance: the pinyin instance. + * @index: the index of the pinyin key rest. + * @key_rest: the retrieved pinyin key rest. + * @returns: whether the get operation is successful. + * + * Get the pinyin key rest of the index from the pinyin key rests. + * + */ +bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance, + guint index, + ChewingKeyRest ** key_rest); + +/** + * pinyin_get_pinyin_key_rest_positions: + * @instance: the pinyin instance. + * @key_rest: the pinyin key rest. + * @begin: the begin position of the corresponding pinyin key. + * @end: the end position of the corresponding pinyin key. + * @returns: whether the get operation is successful. + * + * Get the positions of the pinyin key rest. 
+ * + */ +bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * begin, guint16 * end); + +/** + * pinyin_get_pinyin_key_rest_length: + * @instance: the pinyin instance. + * @key_rest: the pinyin key rest. + * @length: the length of the corresponding pinyin key. + * @returns: whether the get operation is successful. + * + * Get the length of the corresponding pinyin key. + * + */ +bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance, + ChewingKeyRest * key_rest, + guint16 * length); + +/** + * pinyin_get_raw_full_pinyin: + * @instance: the pinyin instance. + * @utf8_str: the modified raw full pinyin after choose candidate. + * @returns: whether the get operation is successful. + * + * Get the modified raw full pinyin after choose candidate. + * + */ +bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance, + const gchar ** utf8_str); + +/** + * pinyin_get_n_phrase: + * @instance: the pinyin instance. + * @num: the number of the phrase tokens. + * @returns: whether the get operation is successful. + * + * Get the number of the phrase tokens. + * + */ +bool pinyin_get_n_phrase(pinyin_instance_t * instance, + guint * num); + +/** + * pinyin_get_phrase_token: + * @instance: the pinyin instance. + * @index: the index of the phrase token. + * @token: the retrieved phrase token. + * @returns: whether the get operation is successful. + * + * Get the phrase token of the index from the phrase tokens. + * + */ +bool pinyin_get_phrase_token(pinyin_instance_t * instance, + guint index, + phrase_token_t * token); + +/* hack here. */ +typedef ChewingKey PinyinKey; +typedef ChewingKeyRest PinyinKeyPos; + + +G_END_DECLS + +#endif -- cgit