/* * libpinyin * Library to deal with pinyin. * * Copyright (C) 2011 Peng Wu * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "pinyin.h" #include #include #include #include #include "pinyin_internal.h" using namespace pinyin; /* a glue layer for input method integration. */ typedef GArray * CandidateVector; /* GArray of lookup_candidate_t */ struct _pinyin_context_t{ pinyin_option_t m_options; /* input parsers. */ FullPinyinParser2 * m_full_pinyin_parser; DoublePinyinParser2 * m_double_pinyin_parser; ZhuyinParser2 * m_chewing_parser; /* default tables. */ FacadeChewingTable2 * m_pinyin_table; FacadePhraseTable3 * m_phrase_table; FacadePhraseIndex * m_phrase_index; Bigram * m_system_bigram; Bigram * m_user_bigram; /* lookups. */ PhoneticLookup<3> * m_pinyin_lookup; PhraseLookup * m_phrase_lookup; /* addon tables. */ FacadeChewingTable2 * m_addon_pinyin_table; FacadePhraseTable3 * m_addon_phrase_table; FacadePhraseIndex * m_addon_phrase_index; char * m_system_dir; char * m_user_dir; bool m_modified; SystemTableInfo2 m_system_table_info; }; struct _pinyin_instance_t{ /* pointer of pinyin_context_t. */ pinyin_context_t * m_context; /* the tokens of phrases before the user input. */ TokenVector m_prefixes; /* cached parsed pinyin keys. */ PhoneticKeyMatrix m_matrix; size_t m_parsed_len; /* cached pinyin lookup variables. */ ForwardPhoneticConstraints * m_constraints; NBestMatchResults m_nbest_results; TokenVector m_phrase_result; CandidateVector m_candidates; }; struct _lookup_candidate_t{ lookup_candidate_type_t m_candidate_type; gchar * m_phrase_string; phrase_token_t m_token; guint8 m_phrase_length; gint8 m_nbest_index; /* only for NBEST_MATCH_CANDIDATE. */ guint16 m_begin; /* must contain the preceding "'" character. */ guint16 m_end; /* must not contain the following "'" character. */ guint32 m_freq; /* the amplifed gfloat numerical value. */ public: _lookup_candidate_t() { m_candidate_type = NORMAL_CANDIDATE; m_phrase_string = NULL; m_token = null_token; m_phrase_length = 0; m_nbest_index = -1; m_begin = 0; m_end = 0; m_freq = 0; } }; struct _import_iterator_t{ pinyin_context_t * m_context; guint8 m_phrase_index; }; struct _export_iterator_t{ pinyin_context_t * m_context; guint8 m_phrase_index; /* null token means no next item. */ phrase_token_t m_next_token; guint8 m_next_pronunciation; }; static bool _clean_user_files(const char * user_dir, const pinyin_table_info_t * phrase_files){ /* clean up files, if version mis-matches. */ for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { const pinyin_table_info_t * table_info = phrase_files + i; if (NOT_USED == table_info->m_file_type) continue; if (NULL == table_info->m_user_filename) continue; const char * userfilename = table_info->m_user_filename; /* remove dbin file. */ gchar * filename = g_build_filename(user_dir, userfilename, NULL); unlink(filename); g_free(filename); } return true; } static bool check_format(pinyin_context_t * context){ const char * user_dir = context->m_user_dir; UserTableInfo user_table_info; gchar * filename = g_build_filename (user_dir, USER_TABLE_INFO, NULL); user_table_info.load(filename); g_free(filename); bool exists = user_table_info.is_conform (&context->m_system_table_info); if (exists) return exists; const pinyin_table_info_t * phrase_files = NULL; phrase_files = context->m_system_table_info.get_default_tables(); _clean_user_files(user_dir, phrase_files); phrase_files = context->m_system_table_info.get_addon_tables(); _clean_user_files(user_dir, phrase_files); filename = g_build_filename (user_dir, USER_PINYIN_INDEX, NULL); unlink(filename); g_free(filename); filename = g_build_filename (user_dir, USER_PHRASE_INDEX, NULL); unlink(filename); g_free(filename); filename = g_build_filename (user_dir, USER_BIGRAM, NULL); unlink(filename); g_free(filename); return exists; } static bool mark_version(pinyin_context_t * context){ const char * userdir = context->m_user_dir; UserTableInfo user_table_info; user_table_info.make_conform(&context->m_system_table_info); gchar * filename = g_build_filename (userdir, USER_TABLE_INFO, NULL); bool retval = user_table_info.save(filename); g_free(filename); return retval; } static bool _load_phrase_library (const char * system_dir, const char * user_dir, FacadePhraseIndex * phrase_index, const pinyin_table_info_t * table_info){ /* check whether the sub phrase index is already loaded. */ PhraseIndexRange range; guint8 index = table_info->m_dict_index; int retval = phrase_index->get_range(index, range); if (ERROR_OK == retval) return false; if (SYSTEM_FILE == table_info->m_file_type) { /* system phrase library */ MemoryChunk * chunk = new MemoryChunk; const char * systemfilename = table_info->m_system_filename; /* check bin file in system dir. */ gchar * chunkfilename = g_build_filename(system_dir, systemfilename, NULL); #ifdef LIBPINYIN_USE_MMAP if (!chunk->mmap(chunkfilename)) fprintf(stderr, "mmap %s failed!\n", chunkfilename); #else if (!chunk->load(chunkfilename)) fprintf(stderr, "open %s failed!\n", chunkfilename); #endif g_free(chunkfilename); phrase_index->load(index, chunk); const char * userfilename = table_info->m_user_filename; chunkfilename = g_build_filename(user_dir, userfilename, NULL); MemoryChunk * log = new MemoryChunk; log->load(chunkfilename); g_free(chunkfilename); /* merge the chunk log. */ phrase_index->merge(index, log); return true; } if (DICTIONARY == table_info->m_file_type) { /* addon dictionary. */ MemoryChunk * chunk = new MemoryChunk; const char * systemfilename = table_info->m_system_filename; /* check bin file in system dir. */ gchar * chunkfilename = g_build_filename(system_dir, systemfilename, NULL); #ifdef LIBPINYIN_USE_MMAP if (!chunk->mmap(chunkfilename)) fprintf(stderr, "mmap %s failed!\n", chunkfilename); #else if (!chunk->load(chunkfilename)) fprintf(stderr, "open %s failed!\n", chunkfilename); #endif g_free(chunkfilename); phrase_index->load(index, chunk); return true; } if (USER_FILE == table_info->m_file_type) { /* user phrase library */ MemoryChunk * chunk = new MemoryChunk; const char * userfilename = table_info->m_user_filename; gchar * chunkfilename = g_build_filename(user_dir, userfilename, NULL); /* check bin file exists. if not, create a new one. */ if (chunk->load(chunkfilename)) { phrase_index->load(index, chunk); } else { delete chunk; phrase_index->create_sub_phrase(index); } g_free(chunkfilename); return true; } return false; } pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){ pinyin_context_t * context = new pinyin_context_t; context->m_options = USE_TONE; context->m_system_dir = g_strdup(systemdir); context->m_user_dir = g_strdup(userdir); context->m_modified = false; gchar * filename = g_build_filename (context->m_system_dir, SYSTEM_TABLE_INFO, NULL); if (!context->m_system_table_info.load(filename)) { fprintf(stderr, "load %s failed!\n", filename); return NULL; } g_free(filename); check_format(context); context->m_full_pinyin_parser = new FullPinyinParser2; context->m_double_pinyin_parser = new DoublePinyinParser2; context->m_chewing_parser = new ZhuyinSimpleParser2; /* load chewing table. */ context->m_pinyin_table = new FacadeChewingTable2; gchar * system_filename = g_build_filename (context->m_system_dir, SYSTEM_PINYIN_INDEX, NULL); gchar * user_filename = g_build_filename (context->m_user_dir, USER_PINYIN_INDEX, NULL); context->m_pinyin_table->load(system_filename, user_filename); g_free(user_filename); g_free(system_filename); /* load phrase table */ context->m_phrase_table = new FacadePhraseTable3; system_filename = g_build_filename (context->m_system_dir, SYSTEM_PHRASE_INDEX, NULL); user_filename = g_build_filename (context->m_user_dir, USER_PHRASE_INDEX, NULL); context->m_phrase_table->load(system_filename, user_filename); g_free(user_filename); g_free(system_filename); context->m_phrase_index = new FacadePhraseIndex; /* load all default tables. */ for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){ const pinyin_table_info_t * phrase_files = context->m_system_table_info.get_default_tables(); const pinyin_table_info_t * table_info = phrase_files + i; if (NOT_USED == table_info->m_file_type) continue; /* addon dictionary should not in default tables. */ assert(DICTIONARY != table_info->m_file_type); _load_phrase_library(context->m_system_dir, context->m_user_dir, context->m_phrase_index, table_info); } context->m_system_bigram = new Bigram; filename = g_build_filename(context->m_system_dir, SYSTEM_BIGRAM, NULL); context->m_system_bigram->attach(filename, ATTACH_READONLY); g_free(filename); context->m_user_bigram = new Bigram; filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); context->m_user_bigram->load_db(filename); g_free(filename); gfloat lambda = context->m_system_table_info.get_lambda(); context->m_pinyin_lookup = new PhoneticLookup<3> (lambda, context->m_pinyin_table, context->m_phrase_index, context->m_system_bigram, context->m_user_bigram); context->m_phrase_lookup = new PhraseLookup (lambda, context->m_phrase_table, context->m_phrase_index, context->m_system_bigram, context->m_user_bigram); /* load addon chewing table. */ context->m_addon_pinyin_table = new FacadeChewingTable2; system_filename = g_build_filename (context->m_system_dir, ADDON_SYSTEM_PINYIN_INDEX, NULL); context->m_addon_pinyin_table->load(system_filename, NULL); g_free(system_filename); /* load addon phrase table */ context->m_addon_phrase_table = new FacadePhraseTable3; system_filename = g_build_filename (context->m_system_dir, ADDON_SYSTEM_PHRASE_INDEX, NULL); context->m_addon_phrase_table->load(system_filename, NULL); g_free(system_filename); context->m_addon_phrase_index = new FacadePhraseIndex; /* don't load addon phrase libraries. */ return context; } bool pinyin_load_phrase_library(pinyin_context_t * context, guint8 index){ if (!(index < PHRASE_INDEX_LIBRARY_COUNT)) return false; const pinyin_table_info_t * phrase_files = context->m_system_table_info.get_default_tables(); FacadePhraseIndex * phrase_index = context->m_phrase_index; const pinyin_table_info_t * table_info = phrase_files + index; /* Only SYSTEM_FILE or USER_FILE is allowed here. */ assert(SYSTEM_FILE == table_info->m_file_type || USER_FILE == table_info->m_file_type); return _load_phrase_library(context->m_system_dir, context->m_user_dir, phrase_index, table_info); } bool pinyin_unload_phrase_library(pinyin_context_t * context, guint8 index){ assert(index < PHRASE_INDEX_LIBRARY_COUNT); /* default table. */ /* only GBK table can be unloaded. */ if (GBK_DICTIONARY != index) return false; context->m_phrase_index->unload(index); return true; } bool pinyin_load_addon_phrase_library(pinyin_context_t * context, guint8 index){ if (!(index < PHRASE_INDEX_LIBRARY_COUNT)) return false; const pinyin_table_info_t * phrase_files = context->m_system_table_info.get_addon_tables(); FacadePhraseIndex * phrase_index = context->m_addon_phrase_index; const pinyin_table_info_t * table_info = phrase_files + index; if (NOT_USED == table_info->m_file_type) return false; /* Only DICTIONARY is allowed here. */ assert(DICTIONARY == table_info->m_file_type); return _load_phrase_library(context->m_system_dir, context->m_user_dir, phrase_index, table_info); } bool pinyin_unload_addon_phrase_library(pinyin_context_t * context, guint8 index){ assert(index < PHRASE_INDEX_LIBRARY_COUNT); /* addon table. */ context->m_addon_phrase_index->unload(index); return true; } import_iterator_t * pinyin_begin_add_phrases(pinyin_context_t * context, guint8 index){ import_iterator_t * iter = new import_iterator_t; iter->m_context = context; iter->m_phrase_index = index; return iter; } static bool _add_phrase(pinyin_context_t * context, guint8 index, ChewingKeyVector keys, ucs4_t * phrase, glong phrase_length, gint count) { /* if -1 == count, use the default value. */ const gint default_count = 5; const guint32 unigram_factor = 3; if (-1 == count) count = default_count; FacadePhraseTable3 * phrase_table = context->m_phrase_table; FacadeChewingTable2 * pinyin_table = context->m_pinyin_table; FacadePhraseIndex * phrase_index = context->m_phrase_index; bool result = false; /* check whether the phrase exists in phrase table */ phrase_token_t token = null_token; GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); /* do phrase table search. */ PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index->prepare_tokens(tokens); int retval = phrase_table->search(phrase_length, phrase, tokens); int num = reduce_tokens(tokens, tokenarray); phrase_index->destroy_tokens(tokens); /* find the best token candidate. */ for (size_t i = 0; i < tokenarray->len; ++i) { phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i); if (null_token == token) { token = candidate; continue; } if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == index) { /* only one phrase string per sub phrase index. */ assert(PHRASE_INDEX_LIBRARY_INDEX(token) != index); token = candidate; continue; } } g_array_free(tokenarray, TRUE); PhraseItem item; /* check whether it exists in the same sub phrase index; */ if (null_token != token && PHRASE_INDEX_LIBRARY_INDEX(token) == index) { /* if so, remove the phrase, add the pinyin for the phrase item, then add it back;*/ phrase_index->get_phrase_item(token, item); assert(phrase_length == item.get_phrase_length()); ucs4_t tmp_phrase[MAX_PHRASE_LENGTH]; item.get_phrase_string(tmp_phrase); assert(0 == memcmp (phrase, tmp_phrase, sizeof(ucs4_t) * phrase_length)); PhraseItem * removed_item = NULL; retval = phrase_index->remove_phrase_item(token, removed_item); if (ERROR_OK == retval) { /* maybe check whether there are duplicated pronunciations here. */ removed_item->add_pronunciation((ChewingKey *)keys->data, count); phrase_index->add_phrase_item(token, removed_item); delete removed_item; result = true; } } else { /* if not exists in the same sub phrase index, get the maximum token, then add it directly with maximum token + 1; */ PhraseIndexRange range; retval = phrase_index->get_range(index, range); if (ERROR_OK == retval) { token = range.m_range_end; if (0x00000000 == (token & PHRASE_MASK)) token++; if (phrase_length == keys->len) { /* valid pinyin */ phrase_table->add_index(phrase_length, phrase, token); pinyin_table->add_index (keys->len, (ChewingKey *)(keys->data), token); item.set_phrase_string(phrase_length, phrase); item.add_pronunciation((ChewingKey *)(keys->data), count); phrase_index->add_phrase_item(token, &item); phrase_index->add_unigram_frequency(token, count * unigram_factor); result = true; } } } return result; } bool pinyin_iterator_add_phrase(import_iterator_t * iter, const char * phrase, const char * pinyin, gint count){ pinyin_context_t * context = iter->m_context; guint8 index = iter->m_phrase_index; bool result = false; if (NULL == phrase || NULL == pinyin) return result; glong phrase_length = 0; ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &phrase_length, NULL); pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; FullPinyinParser2 parser; ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); /* parse the pinyin. */ parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); if (phrase_length != keys->len) return result; if (0 == phrase_length || phrase_length >= MAX_PHRASE_LENGTH) return result; result = _add_phrase(context, index, keys, ucs4_phrase, phrase_length, count); g_array_free(key_rests, TRUE); g_array_free(keys, TRUE); g_free(ucs4_phrase); return result; } void pinyin_end_add_phrases(import_iterator_t * iter){ /* compact the content memory chunk of phrase index. */ iter->m_context->m_phrase_index->compact(); iter->m_context->m_modified = true; delete iter; } export_iterator_t * pinyin_begin_get_phrases(pinyin_context_t * context, guint index){ export_iterator_t * iter = new export_iterator_t; iter->m_context = context; iter->m_phrase_index = index; iter->m_next_token = null_token; iter->m_next_pronunciation = 0; /* probe next token. */ PhraseIndexRange range; int retval = iter->m_context->m_phrase_index->get_range (iter->m_phrase_index, range); if (retval != ERROR_OK) return iter; PhraseItem item; phrase_token_t token = range.m_range_begin; for (; token < range.m_range_end; ++token) { retval = iter->m_context->m_phrase_index->get_phrase_item (token, item); if (ERROR_OK == retval && item.get_n_pronunciation() >= 1) { iter->m_next_token = token; break; } } return iter; } bool pinyin_iterator_has_next_phrase(export_iterator_t * iter){ /* no next token. */ if (null_token == iter->m_next_token) return false; return true; } /* phrase, pinyin should be freed by g_free(). */ bool pinyin_iterator_get_next_phrase(export_iterator_t * iter, gchar ** phrase, gchar ** pinyin, gint * count){ /* count "-1" means default count. */ *phrase = NULL; *pinyin = NULL; *count = -1; PhraseItem item; int retval = iter->m_context->m_phrase_index->get_phrase_item (iter->m_next_token, item); /* assume valid next token from previous call. */ assert(ERROR_OK == retval); /* fill phrase and pronunciation pair. */ ucs4_t phrase_ucs4[MAX_PHRASE_LENGTH]; guint8 len = item.get_phrase_length(); assert(item.get_phrase_string(phrase_ucs4)); gchar * phrase_utf8 = g_ucs4_to_utf8 (phrase_ucs4, len, NULL, NULL, NULL); guint8 nth_pronun = iter->m_next_pronunciation; guint8 n_pronuns = item.get_n_pronunciation(); /* assume valid pronunciation from previous call. */ assert(nth_pronun < n_pronuns); ChewingKey keys[MAX_PHRASE_LENGTH]; guint32 freq = 0; assert(item.get_nth_pronunciation(nth_pronun, keys, freq)); GPtrArray * array = g_ptr_array_new(); for(size_t i = 0; i < len; ++i) { g_ptr_array_add(array, keys[i].get_pinyin_string()); } g_ptr_array_add(array, NULL); gchar ** strings = (gchar **)g_ptr_array_free(array, FALSE); gchar * pinyins = g_strjoinv("'", strings); g_strfreev(strings); /* use default value. */ *phrase = phrase_utf8; *pinyin = pinyins; if (freq > 0) *count = freq; /* probe next pronunciation. */ nth_pronun ++; if (nth_pronun < n_pronuns) { iter->m_next_pronunciation = nth_pronun; return true; } iter->m_next_pronunciation = 0; /* probe next token. */ PhraseIndexRange range; retval = iter->m_context->m_phrase_index->get_range (iter->m_phrase_index, range); if (retval != ERROR_OK) { iter->m_next_token = null_token; return true; } phrase_token_t token = iter->m_next_token + 1; iter->m_next_token = null_token; for (; token < range.m_range_end; ++token) { retval = iter->m_context->m_phrase_index->get_phrase_item (token, item); if (ERROR_OK == retval && item.get_n_pronunciation() >= 1) { iter->m_next_token = token; break; } } return true; } void pinyin_end_get_phrases(export_iterator_t * iter){ delete iter; } bool pinyin_save(pinyin_context_t * context){ if (!context->m_user_dir) return false; if (!context->m_modified) return false; context->m_phrase_index->compact(); const pinyin_table_info_t * phrase_files = context->m_system_table_info.get_default_tables(); /* skip the reserved zero phrase library. */ for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { PhraseIndexRange range; int retval = context->m_phrase_index->get_range(i, range); if (ERROR_NO_SUB_PHRASE_INDEX == retval) continue; const pinyin_table_info_t * table_info = phrase_files + i; if (NOT_USED == table_info->m_file_type) continue; const char * userfilename = table_info->m_user_filename; if (NULL == userfilename) continue; if (SYSTEM_FILE == table_info->m_file_type || DICTIONARY == table_info->m_file_type) { /* system phrase library */ MemoryChunk * chunk = new MemoryChunk; MemoryChunk * log = new MemoryChunk; const char * systemfilename = table_info->m_system_filename; /* check bin file in system dir. */ gchar * chunkfilename = g_build_filename(context->m_system_dir, systemfilename, NULL); #ifdef LIBPINYIN_USE_MMAP if (!chunk->mmap(chunkfilename)) fprintf(stderr, "mmap %s failed!\n", chunkfilename); #else if (!chunk->load(chunkfilename)) fprintf(stderr, "open %s failed!\n", chunkfilename); #endif g_free(chunkfilename); context->m_phrase_index->diff(i, chunk, log); const char * userfilename = table_info->m_user_filename; gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); gchar * tmppathname = g_build_filename(context->m_user_dir, tmpfilename, NULL); g_free(tmpfilename); gchar * chunkpathname = g_build_filename(context->m_user_dir, userfilename, NULL); log->save(tmppathname); int result = rename(tmppathname, chunkpathname); if (0 != result) fprintf(stderr, "rename %s to %s failed.\n", tmppathname, chunkpathname); g_free(chunkpathname); g_free(tmppathname); delete log; } if (USER_FILE == table_info->m_file_type) { /* user phrase library */ MemoryChunk * chunk = new MemoryChunk; context->m_phrase_index->store(i, chunk); const char * userfilename = table_info->m_user_filename; gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename); gchar * tmppathname = g_build_filename(context->m_user_dir, tmpfilename, NULL); g_free(tmpfilename); gchar * chunkpathname = g_build_filename(context->m_user_dir, userfilename, NULL); chunk->save(tmppathname); int result = rename(tmppathname, chunkpathname); if (0 != result) fprintf(stderr, "rename %s to %s failed.\n", tmppathname, chunkpathname); g_free(chunkpathname); g_free(tmppathname); delete chunk; } } /* save user pinyin table */ gchar * tmpfilename = g_build_filename (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL); unlink(tmpfilename); gchar * filename = g_build_filename (context->m_user_dir, USER_PINYIN_INDEX, NULL); context->m_pinyin_table->store(tmpfilename); int result = rename(tmpfilename, filename); if (0 != result) fprintf(stderr, "rename %s to %s failed.\n", tmpfilename, filename); g_free(tmpfilename); g_free(filename); /* save user phrase table */ tmpfilename = g_build_filename (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL); unlink(tmpfilename); filename = g_build_filename (context->m_user_dir, USER_PHRASE_INDEX, NULL); context->m_phrase_table->store(tmpfilename); result = rename(tmpfilename, filename); if (0 != result) fprintf(stderr, "rename %s to %s failed.\n", tmpfilename, filename); g_free(tmpfilename); g_free(filename); /* save user bi-gram */ tmpfilename = g_build_filename (context->m_user_dir, USER_BIGRAM ".tmp", NULL); unlink(tmpfilename); filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL); context->m_user_bigram->save_db(tmpfilename); result = rename(tmpfilename, filename); if (0 != result) fprintf(stderr, "rename %s to %s failed.\n", tmpfilename, filename); g_free(tmpfilename); g_free(filename); mark_version(context); context->m_modified = false; return true; } bool pinyin_set_full_pinyin_scheme(pinyin_context_t * context, FullPinyinScheme scheme){ context->m_full_pinyin_parser->set_scheme(scheme); return true; } bool pinyin_set_double_pinyin_scheme(pinyin_context_t * context, DoublePinyinScheme scheme){ context->m_double_pinyin_parser->set_scheme(scheme); return true; } bool pinyin_set_zhuyin_scheme(pinyin_context_t * context, ZhuyinScheme scheme){ delete context->m_chewing_parser; context->m_chewing_parser = NULL; switch(scheme) { case ZHUYIN_STANDARD: case ZHUYIN_IBM: case ZHUYIN_GINYIEH: case ZHUYIN_ETEN: case ZHUYIN_STANDARD_DVORAK: { ZhuyinSimpleParser2 * parser = new ZhuyinSimpleParser2(); parser->set_scheme(scheme); context->m_chewing_parser = parser; break; } case ZHUYIN_HSU: case ZHUYIN_ETEN26: case ZHUYIN_HSU_DVORAK: { ZhuyinDiscreteParser2 * parser = new ZhuyinDiscreteParser2(); parser->set_scheme(scheme); context->m_chewing_parser = parser; break; } case ZHUYIN_DACHEN_CP26: context->m_chewing_parser = new ZhuyinDaChenCP26Parser2(); break; default: assert(FALSE); } return true; } void pinyin_fini(pinyin_context_t * context){ delete context->m_full_pinyin_parser; delete context->m_double_pinyin_parser; delete context->m_chewing_parser; delete context->m_pinyin_table; delete context->m_phrase_table; delete context->m_phrase_index; delete context->m_system_bigram; delete context->m_user_bigram; delete context->m_pinyin_lookup; delete context->m_phrase_lookup; delete context->m_addon_pinyin_table; delete context->m_addon_phrase_table; delete context->m_addon_phrase_index; g_free(context->m_system_dir); g_free(context->m_user_dir); context->m_modified = false; delete context; } bool pinyin_mask_out(pinyin_context_t * context, phrase_token_t mask, phrase_token_t value) { context->m_pinyin_table->mask_out(mask, value); context->m_phrase_table->mask_out(mask, value); context->m_user_bigram->mask_out(mask, value); const pinyin_table_info_t * phrase_files = context->m_system_table_info.get_default_tables(); /* mask out the phrase index. */ for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) { PhraseIndexRange range; int retval = context->m_phrase_index->get_range(index, range); if (ERROR_NO_SUB_PHRASE_INDEX == retval) continue; const pinyin_table_info_t * table_info = phrase_files + index; if (NOT_USED == table_info->m_file_type) continue; const char * userfilename = table_info->m_user_filename; if (NULL == userfilename) continue; if (SYSTEM_FILE == table_info->m_file_type || DICTIONARY == table_info->m_file_type) { /* system phrase library */ MemoryChunk * chunk = new MemoryChunk; const char * systemfilename = table_info->m_system_filename; /* check bin file in system dir. */ gchar * chunkfilename = g_build_filename(context->m_system_dir, systemfilename, NULL); #ifdef LIBPINYIN_USE_MMAP if (!chunk->mmap(chunkfilename)) fprintf(stderr, "mmap %s failed!\n", chunkfilename); #else if (!chunk->load(chunkfilename)) fprintf(stderr, "open %s failed!\n", chunkfilename); #endif g_free(chunkfilename); context->m_phrase_index->load(index, chunk); const char * userfilename = table_info->m_user_filename; chunkfilename = g_build_filename(context->m_user_dir, userfilename, NULL); MemoryChunk * log = new MemoryChunk; log->load(chunkfilename); g_free(chunkfilename); /* merge the chunk log with mask. */ context->m_phrase_index->merge_with_mask(index, log, mask, value); } if (USER_FILE == table_info->m_file_type) { /* user phrase library */ context->m_phrase_index->mask_out(index, mask, value); } } context->m_phrase_index->compact(); return true; } /* copy from options to context->m_options. */ bool pinyin_set_options(pinyin_context_t * context, pinyin_option_t options){ context->m_options = options; #if 0 context->m_pinyin_table->set_options(context->m_options); context->m_pinyin_lookup->set_options(context->m_options); #endif return true; } pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){ pinyin_instance_t * instance = new pinyin_instance_t; instance->m_context = context; instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); instance->m_parsed_len = 0; instance->m_constraints = new ForwardPhoneticConstraints (context->m_phrase_index); instance->m_phrase_result = g_array_new (TRUE, TRUE, sizeof(phrase_token_t)); instance->m_candidates = g_array_new(TRUE, TRUE, sizeof(lookup_candidate_t)); return instance; } void pinyin_free_instance(pinyin_instance_t * instance){ g_array_free(instance->m_prefixes, TRUE); delete instance->m_constraints; g_array_free(instance->m_phrase_result, TRUE); g_array_free(instance->m_candidates, TRUE); delete instance; } pinyin_context_t * pinyin_get_context (pinyin_instance_t * instance){ return instance->m_context; } static bool pinyin_update_constraints(pinyin_instance_t * instance){ PhoneticKeyMatrix & matrix = instance->m_matrix; ForwardPhoneticConstraints * constraints = instance->m_constraints; constraints->validate_constraint(&matrix); return true; } bool pinyin_guess_sentence(pinyin_instance_t * instance){ pinyin_context_t * & context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; g_array_set_size(instance->m_prefixes, 0); g_array_append_val(instance->m_prefixes, sentence_start); pinyin_update_constraints(instance); bool retval = context->m_pinyin_lookup->get_nbest_match (instance->m_prefixes, &matrix, instance->m_constraints, &instance->m_nbest_results); return retval; } static void _compute_prefixes(pinyin_instance_t * instance, const char * prefix){ pinyin_context_t * & context = instance->m_context; FacadePhraseIndex * & phrase_index = context->m_phrase_index; glong len_str = 0; ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL); GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); if (ucs4_str && len_str) { /* add prefixes. */ for (ssize_t i = 1; i <= len_str; ++i) { if (i > MAX_PHRASE_LENGTH) break; ucs4_t * start = ucs4_str + len_str - i; PhraseTokens tokens; memset(tokens, 0, sizeof(tokens)); phrase_index->prepare_tokens(tokens); int result = context->m_phrase_table->search(i, start, tokens); int num = reduce_tokens(tokens, tokenarray); phrase_index->destroy_tokens(tokens); if (result & SEARCH_OK) g_array_append_vals(instance->m_prefixes, tokenarray->data, tokenarray->len); } } g_array_free(tokenarray, TRUE); g_free(ucs4_str); } bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, const char * prefix){ pinyin_context_t * & context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; g_array_set_size(instance->m_prefixes, 0); g_array_append_val(instance->m_prefixes, sentence_start); _compute_prefixes(instance, prefix); pinyin_update_constraints(instance); bool retval = context->m_pinyin_lookup->get_nbest_match (instance->m_prefixes, &matrix, instance->m_constraints, &instance->m_nbest_results); return retval; } bool pinyin_phrase_segment(pinyin_instance_t * instance, const char * sentence){ pinyin_context_t * & context = instance->m_context; const glong num_of_chars = g_utf8_strlen(sentence, -1); glong ucs4_len = 0; ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL); g_return_val_if_fail(num_of_chars == ucs4_len, FALSE); bool retval = context->m_phrase_lookup->get_best_match (ucs4_len, ucs4_str, instance->m_phrase_result); g_free(ucs4_str); return retval; } /* the returned sentence should be freed by g_free(). */ bool pinyin_get_sentence(pinyin_instance_t * instance, guint8 index, char ** sentence){ pinyin_context_t * & context = instance->m_context; NBestMatchResults & results = instance->m_nbest_results; if (0 == results.size()) return false; MatchResult result = NULL; assert(index < results.size()); assert(results.get_result(index, result)); bool retval = pinyin::convert_to_utf8 (context->m_phrase_index, result, NULL, false, *sentence); return retval; } bool pinyin_parse_full_pinyin(pinyin_instance_t * instance, const char * onepinyin, ChewingKey * onekey){ pinyin_context_t * & context = instance->m_context; int pinyin_len = strlen(onepinyin); bool retval = context->m_full_pinyin_parser->parse_one_key ( context->m_options, *onekey, onepinyin, pinyin_len); return retval; } size_t pinyin_parse_more_full_pinyins(pinyin_instance_t * instance, const char * pinyins){ pinyin_context_t * & context = instance->m_context; pinyin_option_t & options = context->m_options; PhoneticKeyMatrix & matrix = instance->m_matrix; ChewingKeyVector keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(TRUE, TRUE, sizeof(ChewingKeyRest)); int parsed_len = context->m_full_pinyin_parser->parse (options, keys, key_rests, pinyins, strlen(pinyins)); instance->m_parsed_len = parsed_len; fill_matrix(&matrix, keys, key_rests, parsed_len); resplit_step(options, &matrix); inner_split_step(options, &matrix); fuzzy_syllable_step(options, &matrix); g_array_free(key_rests, TRUE); g_array_free(keys, TRUE); return parsed_len; } bool pinyin_parse_double_pinyin(pinyin_instance_t * instance, const char * onepinyin, ChewingKey * onekey){ pinyin_context_t * & context = instance->m_context; int pinyin_len = strlen(onepinyin); bool retval = context->m_double_pinyin_parser->parse_one_key ( context->m_options, *onekey, onepinyin, pinyin_len); return retval; } size_t pinyin_parse_more_double_pinyins(pinyin_instance_t * instance, const char * pinyins){ pinyin_context_t * & context = instance->m_context; pinyin_option_t & options = context->m_options; PhoneticKeyMatrix & matrix = instance->m_matrix; ChewingKeyVector keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(TRUE, TRUE, sizeof(ChewingKeyRest)); int parsed_len = context->m_double_pinyin_parser->parse (options, keys, key_rests, pinyins, strlen(pinyins)); instance->m_parsed_len = parsed_len; fill_matrix(&matrix, keys, key_rests, parsed_len); fuzzy_syllable_step(options, &matrix); g_array_free(key_rests, TRUE); g_array_free(keys, TRUE); return parsed_len; } bool pinyin_parse_chewing(pinyin_instance_t * instance, const char * onechewing, ChewingKey * onekey){ pinyin_context_t * & context = instance->m_context; int chewing_len = strlen(onechewing); bool retval = context->m_chewing_parser->parse_one_key ( context->m_options, *onekey, onechewing, chewing_len ); return retval; } size_t pinyin_parse_more_chewings(pinyin_instance_t * instance, const char * chewings){ pinyin_context_t * & context = instance->m_context; pinyin_option_t & options = context->m_options; PhoneticKeyMatrix & matrix = instance->m_matrix; ChewingKeyVector keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(TRUE, TRUE, sizeof(ChewingKeyRest)); int parsed_len = context->m_chewing_parser->parse (options, keys, key_rests, chewings, strlen(chewings)); instance->m_parsed_len = parsed_len; fill_matrix(&matrix, keys, key_rests, parsed_len); fuzzy_syllable_step(options, &matrix); g_array_free(key_rests, TRUE); g_array_free(keys, TRUE); return parsed_len; } size_t pinyin_get_parsed_input_length(pinyin_instance_t * instance) { return instance->m_parsed_len; } bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, const char key, gchar *** symbols) { pinyin_context_t * & context = instance->m_context; return context->m_chewing_parser->in_chewing_scheme (context->m_options, key, *symbols); } static bool _token_get_phrase(FacadePhraseIndex * phrase_index, phrase_token_t token, guint * len, gchar ** utf8_str) { PhraseItem item; ucs4_t buffer[MAX_PHRASE_LENGTH]; int retval = phrase_index->get_phrase_item(token, item); if (ERROR_OK != retval) return false; item.get_phrase_string(buffer); guint length = item.get_phrase_length(); if (len) *len = length; if (utf8_str) *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL); return true; } #if 0 static gint compare_item_with_token(gconstpointer lhs, gconstpointer rhs) { lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; phrase_token_t token_lhs = item_lhs->m_token; phrase_token_t token_rhs = item_rhs->m_token; return (token_lhs - token_rhs); } #endif static gint compare_item_with_length_and_frequency(gconstpointer lhs, gconstpointer rhs) { lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs; lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs; guint8 len_lhs = item_lhs->m_phrase_length; guint8 len_rhs = item_rhs->m_phrase_length; if (len_lhs != len_rhs) return -(len_lhs - len_rhs); /* in descendant order */ guint32 freq_lhs = item_lhs->m_freq; guint32 freq_rhs = item_rhs->m_freq; return -(freq_lhs - freq_rhs); /* in descendant order */ } static phrase_token_t _get_previous_token(pinyin_instance_t * instance, size_t offset) { pinyin_context_t * context = instance->m_context; TokenVector prefixes = instance->m_prefixes; NBestMatchResults & results = instance->m_nbest_results; phrase_token_t prev_token = null_token; ssize_t i; if (0 == offset) { /* get previous token from prefixes. */ prev_token = sentence_start; size_t prev_token_len = 0; PhraseItem item; for (size_t i = 0; i < prefixes->len; ++i) { phrase_token_t token = g_array_index(prefixes, phrase_token_t, i); if (sentence_start == token) continue; int retval = context->m_phrase_index->get_phrase_item(token, item); if (ERROR_OK == retval) { size_t token_len = item.get_phrase_length(); if (token_len > prev_token_len) { /* found longer match, and save it. */ prev_token = token; prev_token_len = token_len; } } } } else { /* get previous token from match results. */ assert (0 < offset); /* no nbest match result. */ if (0 == results.size()) return prev_token; /* use the first candidate. */ MatchResult result = NULL; assert(results.get_result(0, result)); phrase_token_t cur_token = g_array_index (result, phrase_token_t, offset); if (null_token != cur_token) { for (i = offset - 1; i >= 0; --i) { cur_token = g_array_index(result, phrase_token_t, i); if (null_token != cur_token) { prev_token = cur_token; break; } } } } return prev_token; } static void _append_items(PhraseIndexRanges ranges, lookup_candidate_t * template_item, CandidateVector items) { /* reduce and append to a single GArray. */ for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) { if (NULL == ranges[m]) continue; for (size_t n = 0; n < ranges[m]->len; ++n) { PhraseIndexRange * range = &g_array_index(ranges[m], PhraseIndexRange, n); for (size_t k = range->m_range_begin; k < range->m_range_end; ++k) { lookup_candidate_t item; item.m_candidate_type = template_item->m_candidate_type; item.m_token = k; item.m_begin = template_item->m_begin; item.m_end = template_item->m_end; item.m_freq = template_item->m_freq; g_array_append_val(items, item); } } } } static void _compute_frequency_of_items(pinyin_context_t * context, phrase_token_t prev_token, SingleGram * merged_gram, CandidateVector items) { pinyin_option_t & options = context->m_options; ssize_t i; PhraseItem cached_item; /* compute all freqs. */ for (i = 0; i < items->len; ++i) { lookup_candidate_t * item = &g_array_index (items, lookup_candidate_t, i); phrase_token_t & token = item->m_token; gfloat bigram_poss = 0; guint32 total_freq = 0; gfloat lambda = context->m_system_table_info.get_lambda(); /* handle addon candidates first. */ if (ADDON_CANDIDATE == item->m_candidate_type) { total_freq = context->m_phrase_index-> get_phrase_index_total_freq(); /* assume the unigram of every addon phrases is 1. */ context->m_addon_phrase_index->get_phrase_item (token, cached_item); /* Note: possibility value <= 1.0. */ guint32 freq = ((1 - lambda) * cached_item.get_unigram_frequency() / (gfloat) total_freq) * 256 * 256 * 256; item->m_freq = freq; continue; } if (options & DYNAMIC_ADJUST) { if (null_token != prev_token) { guint32 bigram_freq = 0; merged_gram->get_total_freq(total_freq); merged_gram->get_freq(token, bigram_freq); if (0 != total_freq) bigram_poss = bigram_freq / (gfloat)total_freq; } } /* compute the m_freq. */ FacadePhraseIndex * & phrase_index = context->m_phrase_index; phrase_index->get_phrase_item(token, cached_item); total_freq = phrase_index->get_phrase_index_total_freq(); assert (0 < total_freq); /* Note: possibility value <= 1.0. */ guint32 freq = (lambda * bigram_poss + (1 - lambda) * cached_item.get_unigram_frequency() / (gfloat) total_freq) * 256 * 256 * 256; item->m_freq = freq; } } static bool _prepend_sentence_candidates(pinyin_instance_t * instance, CandidateVector candidates) { const size_t size = instance->m_nbest_results.size(); /* check whether the nbest match candidate exists. */ if (0 == size) return false; /* prepend nbest match candidates to candidates. */ for (ssize_t i = size - 1; i >= 0; --i) { lookup_candidate_t candidate; candidate.m_candidate_type = NBEST_MATCH_CANDIDATE; candidate.m_nbest_index = i; g_array_prepend_val(candidates, candidate); } return true; } static bool _compute_phrase_length(pinyin_context_t * context, CandidateVector candidates) { FacadePhraseIndex * phrase_index = context->m_phrase_index; FacadePhraseIndex * addon_phrase_index = context->m_addon_phrase_index; /* populate m_phrase_length in lookup_candidate_t. */ PhraseItem item; for(size_t i = 0; i < candidates->len; ++i) { lookup_candidate_t * candidate = &g_array_index (candidates, lookup_candidate_t, i); switch(candidate->m_candidate_type) { case NBEST_MATCH_CANDIDATE: assert(FALSE); case NORMAL_CANDIDATE: case PREDICTED_CANDIDATE: { phrase_index->get_phrase_item(candidate->m_token, item); candidate->m_phrase_length = item.get_phrase_length(); break; } case ADDON_CANDIDATE: { addon_phrase_index->get_phrase_item(candidate->m_token, item); candidate->m_phrase_length = item.get_phrase_length(); break; } case ZOMBIE_CANDIDATE: assert(FALSE); } } return true; } static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance, CandidateVector candidates) { /* populate m_phrase_string in lookup_candidate_t. */ for(size_t i = 0; i < candidates->len; ++i) { lookup_candidate_t * candidate = &g_array_index (candidates, lookup_candidate_t, i); switch(candidate->m_candidate_type) { case NBEST_MATCH_CANDIDATE: { gchar * sentence = NULL; pinyin_get_sentence(instance, candidate->m_nbest_index, &sentence); candidate->m_phrase_string = sentence; break; } case NORMAL_CANDIDATE: case PREDICTED_CANDIDATE: _token_get_phrase (instance->m_context->m_phrase_index, candidate->m_token, NULL, &(candidate->m_phrase_string)); break; case ADDON_CANDIDATE: _token_get_phrase (instance->m_context->m_addon_phrase_index, candidate->m_token, NULL, &(candidate->m_phrase_string)); break; case ZOMBIE_CANDIDATE: assert(FALSE); } } return true; } static gint compare_indexed_item_with_phrase_string(gconstpointer lhs, gconstpointer rhs, gpointer userdata) { size_t index_lhs = *((size_t *) lhs); size_t index_rhs = *((size_t *) rhs); CandidateVector candidates = (CandidateVector) userdata; lookup_candidate_t * candidate_lhs = &g_array_index(candidates, lookup_candidate_t, index_lhs); lookup_candidate_t * candidate_rhs = &g_array_index(candidates, lookup_candidate_t, index_rhs); return -strcmp(candidate_lhs->m_phrase_string, candidate_rhs->m_phrase_string); /* in descendant order */ } static bool _remove_duplicated_items_by_phrase_string (pinyin_instance_t * instance, CandidateVector candidates) { size_t i; /* create the GArray of indexed item */ GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t)); for (i = 0; i < candidates->len; ++i) g_array_append_val(indices, i); /* sort the indices array by phrase array */ g_array_sort_with_data (indices, compare_indexed_item_with_phrase_string, candidates); /* mark duplicated items as zombie candidate */ lookup_candidate_t * cur_item, * saved_item = NULL; for (i = 0; i < indices->len; ++i) { size_t cur_index = g_array_index(indices, size_t, i); cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index); /* handle the first candidate */ if (NULL == saved_item) { saved_item = cur_item; continue; } if (0 == strcmp(saved_item->m_phrase_string, cur_item->m_phrase_string)) { /* found duplicated candidates */ /* both are nbest match candidate */ if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type && NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { /* keep the high possiblity one */ if (saved_item->m_nbest_index < cur_item->m_nbest_index) { cur_item->m_candidate_type = ZOMBIE_CANDIDATE; } else { saved_item->m_candidate_type = ZOMBIE_CANDIDATE; saved_item = cur_item; } continue; } /* keep nbest match candidate */ if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type) { cur_item->m_candidate_type = ZOMBIE_CANDIDATE; continue; } if (NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) { saved_item->m_candidate_type = ZOMBIE_CANDIDATE; saved_item = cur_item; continue; } /* keep the higher possiblity one to quickly move the word forward in the candidate list */ if (cur_item->m_freq > saved_item->m_freq) { /* find better candidate */ saved_item->m_candidate_type = ZOMBIE_CANDIDATE; saved_item = cur_item; continue; } else { cur_item->m_candidate_type = ZOMBIE_CANDIDATE; continue; } } else { /* keep the current candidate */ saved_item = cur_item; } } g_array_free(indices, TRUE); /* remove zombie candidate from the returned candidates */ for (i = 0; i < candidates->len; ++i) { lookup_candidate_t * candidate = &g_array_index (candidates, lookup_candidate_t, i); if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) { g_free(candidate->m_phrase_string); g_array_remove_index(candidates, i); i--; } } return true; } static bool _free_candidates(CandidateVector candidates) { /* free candidates */ for (size_t i = 0; i < candidates->len; ++i) { lookup_candidate_t * candidate = &g_array_index (candidates, lookup_candidate_t, i); g_free(candidate->m_phrase_string); } g_array_set_size(candidates, 0); return true; } /* offset must at the beginning of zero ChewingKey "'". */ static bool _check_offset(PhoneticKeyMatrix & matrix, size_t offset) { const size_t start = offset; ChewingKey key; ChewingKeyRest key_rest; const ChewingKey zero_key; if (start > 0) { const size_t index = start - 1; const size_t size = matrix.get_column_size(index); if (1 == size) { /* assume only one zero ChewingKey "'" here, but no check. */ matrix.get_item(index, 0, key, key_rest); assert(zero_key != key); } } return true; } bool pinyin_guess_candidates(pinyin_instance_t * instance, size_t offset) { pinyin_context_t * & context = instance->m_context; pinyin_option_t & options = context->m_options; PhoneticKeyMatrix & matrix = instance->m_matrix; CandidateVector candidates = instance->m_candidates; _free_candidates(candidates); if (0 == matrix.size()) return false; /* lookup the previous token here. */ phrase_token_t prev_token = null_token; if (options & DYNAMIC_ADJUST) { prev_token = _get_previous_token(instance, offset); } SingleGram merged_gram; SingleGram * system_gram = NULL, * user_gram = NULL; if (options & DYNAMIC_ADJUST) { if (null_token != prev_token) { context->m_system_bigram->load(prev_token, system_gram); context->m_user_bigram->load(prev_token, user_gram); merge_single_gram(&merged_gram, system_gram, user_gram); } } PhraseIndexRanges ranges; memset(ranges, 0, sizeof(ranges)); context->m_phrase_index->prepare_ranges(ranges); PhraseIndexRanges addon_ranges; memset(addon_ranges, 0, sizeof(addon_ranges)); context->m_addon_phrase_index->prepare_ranges(addon_ranges); _check_offset(matrix, offset); /* matrix reserved one extra slot. */ const size_t start = offset; for (size_t end = start + 1; end < matrix.size();) { /* do pinyin search. */ context->m_phrase_index->clear_ranges(ranges); int retval = search_matrix(context->m_pinyin_table, &matrix, start, end, ranges); context->m_addon_phrase_index->clear_ranges(addon_ranges); retval = search_matrix(context->m_addon_pinyin_table, &matrix, start, end, addon_ranges) | retval; if ( !(retval & SEARCH_OK) ) { ++end; continue; } lookup_candidate_t template_item; template_item.m_begin = start; template_item.m_end = end; _append_items(ranges, &template_item, candidates); lookup_candidate_t addon_template_item; addon_template_item.m_candidate_type = ADDON_CANDIDATE; addon_template_item.m_begin = start; addon_template_item.m_end = end; _append_items(addon_ranges, &addon_template_item, candidates); if ( !(retval & SEARCH_CONTINUED) ) break; /* skip the consecutive zero ChewingKey "'", to avoid duplicates of candidates. */ ++end; ChewingKey key; ChewingKeyRest key_rest; const ChewingKey zero_key; for (; end < matrix.size(); ++end) { const size_t index = end - 1; const size_t size = matrix.get_column_size(index); /* assume only one zero ChewingKey "'" here, but no check. */ if (1 != size) break; matrix.get_item(index, 0, key, key_rest); if (zero_key != key) break; } } context->m_phrase_index->destroy_ranges(ranges); if (system_gram) delete system_gram; if (user_gram) delete user_gram; /* post process to sort the candidates */ _compute_phrase_length(context, candidates); _compute_frequency_of_items(context, prev_token, &merged_gram, candidates); /* sort the candidates by length and frequency. */ g_array_sort(candidates, compare_item_with_length_and_frequency); /* post process to remove duplicated candidates */ _prepend_sentence_candidates(instance, instance->m_candidates); _compute_phrase_strings_of_items(instance, instance->m_candidates); _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); return true; } bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance, const char * prefix) { const guint32 filter = 256; pinyin_context_t * context = instance->m_context; FacadePhraseIndex * phrase_index = context->m_phrase_index; CandidateVector candidates = instance->m_candidates; _free_candidates(candidates); _compute_prefixes(instance, prefix); phrase_token_t prev_token = _get_previous_token(instance, 0); if (null_token == prev_token) return false; /* merge single gram. */ SingleGram merged_gram; SingleGram * system_gram = NULL, * user_gram = NULL; context->m_system_bigram->load(prev_token, system_gram); context->m_user_bigram->load(prev_token, user_gram); merge_single_gram(&merged_gram, system_gram, user_gram); /* retrieve all items. */ BigramPhraseWithCountArray tokens = g_array_new (FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); merged_gram.retrieve_all(tokens); /* sort the longer word first. */ PhraseItem cached_item; for (size_t len = MAX_PHRASE_LENGTH; len > 0; --len) { /* append items. */ for (size_t k = 0; k < tokens->len; ++k){ BigramPhraseItemWithCount * phrase_item = &g_array_index (tokens, BigramPhraseItemWithCount, k); if (phrase_item->m_count < filter) continue; int result = phrase_index->get_phrase_item (phrase_item->m_token, cached_item); if (ERROR_NO_SUB_PHRASE_INDEX == result) continue; if (len != cached_item.get_phrase_length()) continue; lookup_candidate_t item; item.m_candidate_type = PREDICTED_CANDIDATE; item.m_token = phrase_item->m_token; g_array_append_val(candidates, item); } } if (system_gram) delete system_gram; if (user_gram) delete user_gram; /* post process to sort the candidates */ _compute_phrase_length(context, candidates); _compute_frequency_of_items(context, prev_token, &merged_gram, candidates); /* sort the candidates by length and frequency. */ g_array_sort(candidates, compare_item_with_length_and_frequency); /* post process to remove duplicated candidates */ _compute_phrase_strings_of_items(instance, instance->m_candidates); _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); return true; } int pinyin_choose_candidate(pinyin_instance_t * instance, size_t offset, lookup_candidate_t * candidate){ assert(PREDICTED_CANDIDATE != candidate->m_candidate_type); pinyin_context_t * context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; ForwardPhoneticConstraints * constraints = instance->m_constraints; NBestMatchResults & results = instance->m_nbest_results; if (NBEST_MATCH_CANDIDATE == candidate->m_candidate_type) { MatchResult best = NULL, other = NULL; assert(results.get_result(0, best)); assert(results.get_result(candidate->m_nbest_index, other)); constraints->diff_result(best, other); return matrix.size() - 1; } if (ADDON_CANDIDATE == candidate->m_candidate_type) { PhraseItem item; context->m_addon_phrase_index->get_phrase_item (candidate->m_token, item); guint8 len = item.get_phrase_length(); guint8 npron = item.get_n_pronunciation(); PhraseIndexRange range; context->m_phrase_index->get_range(ADDON_DICTIONARY, range); /* assume not over flow here. */ phrase_token_t token = range.m_range_end; /* add pinyin index. */ for (size_t i = 0; i < npron; ++i) { ChewingKey keys[MAX_PHRASE_LENGTH]; guint32 freq = 0; item.get_nth_pronunciation(i, keys, freq); context->m_pinyin_table->add_index(len, keys, token); } /* add phrase index. */ ucs4_t phrase[MAX_PHRASE_LENGTH]; item.get_phrase_string(phrase); context->m_phrase_table->add_index(len, phrase, token); context->m_phrase_index->add_phrase_item(token, &item); /* update the candidate. */ candidate->m_candidate_type = NORMAL_CANDIDATE; candidate->m_token = token; } /* sync m_constraints to the length of m_pinyin_keys. */ bool retval = constraints->validate_constraint(&matrix); phrase_token_t token = candidate->m_token; guint8 len = constraints->add_constraint (candidate->m_begin, candidate->m_end, token); /* safe guard: validate the m_constraints again. */ retval = constraints->validate_constraint(&matrix) && len; return offset + len; } bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance, lookup_candidate_t * candidate){ assert(PREDICTED_CANDIDATE == candidate->m_candidate_type); const guint32 initial_seed = 23 * 3; const guint32 unigram_factor = 7; pinyin_context_t * & context = instance->m_context; FacadePhraseIndex * & phrase_index = context->m_phrase_index; /* train uni-gram */ phrase_token_t token = candidate->m_token; int error = phrase_index->add_unigram_frequency (token, initial_seed * unigram_factor); if (ERROR_INTEGER_OVERFLOW == error) return false; phrase_token_t prev_token = _get_previous_token(instance, 0); if (null_token == prev_token) return false; SingleGram * user_gram = NULL; context->m_user_bigram->load(prev_token, user_gram); if (NULL == user_gram) user_gram = new SingleGram; /* train bi-gram */ guint32 total_freq = 0; assert(user_gram->get_total_freq(total_freq)); guint32 freq = 0; if (!user_gram->get_freq(token, freq)) { assert(user_gram->insert_freq(token, initial_seed)); } else { assert(user_gram->set_freq(token, freq + initial_seed)); } assert(user_gram->set_total_freq(total_freq + initial_seed)); context->m_user_bigram->store(prev_token, user_gram); delete user_gram; return true; } bool pinyin_clear_constraint(pinyin_instance_t * instance, size_t offset){ ForwardPhoneticConstraints * constraints = instance->m_constraints; bool retval = constraints->clear_constraint(offset); return retval; } bool pinyin_lookup_tokens(pinyin_instance_t * instance, const char * phrase, GArray * tokenarray){ pinyin_context_t * & context = instance->m_context; FacadePhraseIndex * & phrase_index = context->m_phrase_index; glong ucs4_len = 0; ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index->prepare_tokens(tokens); int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens); int num = reduce_tokens(tokens, tokenarray); phrase_index->destroy_tokens(tokens); return SEARCH_OK & retval; } bool pinyin_train(pinyin_instance_t * instance, guint8 index){ if (!instance->m_context->m_user_dir) return false; pinyin_context_t * context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; NBestMatchResults & results = instance->m_nbest_results; if (0 == results.size()) return false; context->m_modified = true; MatchResult result = NULL; assert(index < results.size()); assert(results.get_result(index, result)); bool retval = context->m_pinyin_lookup->train_result3 (&matrix, instance->m_constraints, result); return retval; } bool pinyin_reset(pinyin_instance_t * instance){ instance->m_parsed_len = 0; instance->m_matrix.clear_all(); g_array_set_size(instance->m_prefixes, 0); instance->m_constraints->clear(); instance->m_nbest_results.clear(); g_array_set_size(instance->m_phrase_result, 0); _free_candidates(instance->m_candidates); return true; } bool pinyin_get_zhuyin_string(pinyin_instance_t * instance, ChewingKey * key, gchar ** utf8_str) { *utf8_str = NULL; if (0 == key->get_table_index()) return false; *utf8_str = key->get_zhuyin_string(); return true; } bool pinyin_get_pinyin_string(pinyin_instance_t * instance, ChewingKey * key, gchar ** utf8_str) { *utf8_str = NULL; if (0 == key->get_table_index()) return false; *utf8_str = key->get_pinyin_string(); return true; } bool pinyin_get_luoma_pinyin_string(pinyin_instance_t * instance, ChewingKey * key, gchar ** utf8_str) { *utf8_str = NULL; if (0 == key->get_table_index()) return false; *utf8_str = key->get_luoma_pinyin_string(); return true; } bool pinyin_get_secondary_zhuyin_string(pinyin_instance_t * instance, ChewingKey * key, gchar ** utf8_str) { *utf8_str = NULL; if (0 == key->get_table_index()) return false; *utf8_str = key->get_secondary_zhuyin_string(); return true; } bool pinyin_get_pinyin_strings(pinyin_instance_t * instance, ChewingKey * key, gchar ** shengmu, gchar ** yunmu) { if (0 == key->get_table_index()) return false; if (shengmu) *shengmu = key->get_shengmu_string(); if (yunmu) *yunmu = key->get_yunmu_string(); return true; } bool pinyin_get_pinyin_is_incomplete(pinyin_instance_t * instance, ChewingKey * key) { if (CHEWING_ZERO_MIDDLE == key->m_middle && CHEWING_ZERO_FINAL == key->m_final) { assert(CHEWING_ZERO_TONE == key->m_tone); return true; } return false; } bool pinyin_token_get_phrase(pinyin_instance_t * instance, phrase_token_t token, guint * len, gchar ** utf8_str) { pinyin_context_t * & context = instance->m_context; return _token_get_phrase(context->m_phrase_index, token, len, utf8_str); } bool pinyin_token_get_n_pronunciation(pinyin_instance_t * instance, phrase_token_t token, guint * num){ *num = 0; pinyin_context_t * & context = instance->m_context; PhraseItem item; int retval = context->m_phrase_index->get_phrase_item(token, item); if (ERROR_OK != retval) return false; *num = item.get_n_pronunciation(); return true; } bool pinyin_token_get_nth_pronunciation(pinyin_instance_t * instance, phrase_token_t token, guint nth, ChewingKeyVector keys){ g_array_set_size(keys, 0); pinyin_context_t * & context = instance->m_context; PhraseItem item; ChewingKey buffer[MAX_PHRASE_LENGTH]; guint32 freq = 0; int retval = context->m_phrase_index->get_phrase_item(token, item); if (ERROR_OK != retval) return false; item.get_nth_pronunciation(nth, buffer, freq); guint8 len = item.get_phrase_length(); g_array_append_vals(keys, buffer, len); return true; } bool pinyin_token_get_unigram_frequency(pinyin_instance_t * instance, phrase_token_t token, guint * freq) { *freq = 0; pinyin_context_t * & context = instance->m_context; PhraseItem item; int retval = context->m_phrase_index->get_phrase_item(token, item); if (ERROR_OK != retval) return false; *freq = item.get_unigram_frequency(); return true; } bool pinyin_token_add_unigram_frequency(pinyin_instance_t * instance, phrase_token_t token, guint delta){ pinyin_context_t * & context = instance->m_context; int retval = context->m_phrase_index->add_unigram_frequency (token, delta); return ERROR_OK == retval; } bool pinyin_get_n_candidate(pinyin_instance_t * instance, guint * num) { *num = instance->m_candidates->len; return true; } bool pinyin_get_candidate(pinyin_instance_t * instance, guint index, lookup_candidate_t ** candidate) { CandidateVector & candidates = instance->m_candidates; *candidate = NULL; if (index >= candidates->len) return false; *candidate = &g_array_index(candidates, lookup_candidate_t, index); return true; } bool pinyin_get_candidate_type(pinyin_instance_t * instance, lookup_candidate_t * candidate, lookup_candidate_type_t * type) { *type = candidate->m_candidate_type; return true; } bool pinyin_get_candidate_string(pinyin_instance_t * instance, lookup_candidate_t * candidate, const gchar ** utf8_str) { *utf8_str = candidate->m_phrase_string; return true; } bool pinyin_get_candidate_nbest_index(pinyin_instance_t * instance, lookup_candidate_t * candidate, guint8 * index) { assert(NBEST_MATCH_CANDIDATE == candidate->m_candidate_type); *index = candidate->m_nbest_index; return true; } #if 0 bool pinyin_get_n_pinyin(pinyin_instance_t * instance, guint * num) { *num = 0; if (instance->m_pinyin_keys->len != instance->m_pinyin_key_rests->len) return false; *num = instance->m_pinyin_keys->len; return true; } #endif /* skip the beginning of zero ChewingKey "'". */ static size_t _compute_pinyin_start(PhoneticKeyMatrix & matrix, size_t offset) { size_t start = offset; ChewingKey key; ChewingKeyRest key_rest; const ChewingKey zero_key; for (; start < matrix.size() - 1; ++start) { size_t size = matrix.get_column_size(start); if (1 != size) break; matrix.get_item(start, 0, key, key_rest); if (zero_key != key) break; } return start; } bool pinyin_get_pinyin_key(pinyin_instance_t * instance, size_t offset, ChewingKey ** ppkey) { PhoneticKeyMatrix & matrix = instance->m_matrix; *ppkey = NULL; if (offset >= matrix.size() - 1) return false; if (0 == matrix.get_column_size(offset)) return false; _check_offset(matrix, offset); offset = _compute_pinyin_start(matrix, offset); static ChewingKey key; ChewingKeyRest key_rest; matrix.get_item(offset, 0, key, key_rest); *ppkey = &key; return true; } bool pinyin_get_pinyin_key_rest(pinyin_instance_t * instance, size_t offset, ChewingKeyRest ** ppkey_rest) { PhoneticKeyMatrix & matrix = instance->m_matrix; *ppkey_rest = NULL; if (offset >= matrix.size() - 1) return false; if (0 == matrix.get_column_size(offset)) return false; _check_offset(matrix, offset); offset = _compute_pinyin_start(matrix, offset); ChewingKey key; static ChewingKeyRest key_rest; matrix.get_item(offset, 0, key, key_rest); *ppkey_rest = &key_rest; return true; } bool pinyin_get_pinyin_key_rest_positions(pinyin_instance_t * instance, ChewingKeyRest * key_rest, guint16 * begin, guint16 * end) { if (begin) *begin = key_rest->m_raw_begin; if (end) *end = key_rest->m_raw_end; return true; } bool pinyin_get_pinyin_key_rest_length(pinyin_instance_t * instance, ChewingKeyRest * key_rest, guint16 * length) { *length = key_rest->length(); return true; } /* find the first zero ChewingKey "'". */ static size_t _compute_zero_start(PhoneticKeyMatrix & matrix, size_t offset) { ChewingKey key; ChewingKeyRest key_rest; const ChewingKey zero_key; ssize_t index = offset - 1; for (; index > 0; --index) { const size_t size = matrix.get_column_size(index); if (1 != size) break; matrix.get_item(index, 0, key, key_rest); if (zero_key == key) offset = index; else break; } return offset; } /* when lookup offset: get the previous non-zero ChewingKey, then the first zero ChewingKey. */ bool pinyin_get_pinyin_offset(pinyin_instance_t * instance, size_t cursor, size_t * poffset) { PhoneticKeyMatrix & matrix = instance->m_matrix; size_t offset = std_lite::min(cursor, instance->m_parsed_len); /* find the first ChewingKey. */ for (; offset > 0; --offset) { const size_t size = matrix.get_column_size(offset); if (size > 0) break; } offset = _compute_zero_start(matrix, offset); _check_offset(matrix, offset); *poffset = offset; return true; } bool pinyin_get_left_pinyin_offset(pinyin_instance_t * instance, size_t offset, size_t * pleft) { PhoneticKeyMatrix & matrix = instance->m_matrix; _check_offset(matrix, offset); /* find the ChewingKey ends at offset. */ size_t left = offset > 0 ? offset - 1 : 0; ChewingKey key; ChewingKeyRest key_rest; for (; left > 0; --left) { const size_t size = matrix.get_column_size(left); size_t i = 0; for (; i < size; ++i) { matrix.get_item(left, i, key, key_rest); if (offset == key_rest.m_raw_end) break; } if (i < size) break; } left = _compute_zero_start(matrix, left); _check_offset(matrix, left); *pleft = left; return true; } bool pinyin_get_right_pinyin_offset(pinyin_instance_t * instance, size_t offset, size_t * pright) { PhoneticKeyMatrix & matrix = instance->m_matrix; _check_offset(matrix, offset); /* find the first non-zero ChewingKey. */ size_t right = offset; ChewingKey key; ChewingKeyRest key_rest; const ChewingKey zero_key; for (size_t index = right; index < matrix.size() - 1; ++index) { const size_t size = matrix.get_column_size(index); if (1 != size) break; matrix.get_item(index, 0, key, key_rest); if (zero_key == key) right = index + 1; else break; } if (0 == matrix.get_column_size(right)) return false; matrix.get_item(right, 0, key, key_rest); right = key_rest.m_raw_end; _check_offset(matrix, right); *pright = right; return true; } static bool _pre_compute_tokens(pinyin_context_t * context, TokenVector cached_tokens, ucs4_t * phrase, size_t phrase_length) { FacadePhraseIndex * phrase_index = context->m_phrase_index; FacadePhraseTable3 * phrase_table = context->m_phrase_table; /* do phrase table search. */ PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index->prepare_tokens(tokens); for (size_t i = 0; i < phrase_length; ++i) { phrase_token_t token = null_token; ucs4_t character = phrase[i]; phrase_index->clear_tokens(tokens); int retval = phrase_table->search(1, &character, tokens); int num = get_first_token(tokens, token); /* en-counter un-known character, such as the emoji unicode. */ if (0 == num) { phrase_index->destroy_tokens(tokens); return false; } g_array_append_val(cached_tokens, token); } phrase_index->destroy_tokens(tokens); return true; } static bool _get_char_offset_recur(pinyin_instance_t * instance, TokenVector cached_tokens, size_t start, size_t offset, size_t * plength) { pinyin_context_t * context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; FacadePhraseIndex * phrase_index = context->m_phrase_index; size_t length = *plength; if (start > offset) return true; const size_t size = matrix.get_column_size(start); /* assume pinyin parsers will filter invalid keys. */ assert(size > 0); bool result = false; PhraseItem item; for (size_t i = 0; i < size; ++i) { ChewingKey key; ChewingKeyRest key_rest; matrix.get_item(start, i, key, key_rest); const size_t newstart = key_rest.m_raw_end; const ChewingKey zero_key; if (zero_key == key) { /* assume only one key here for "'" or the last key. */ assert(1 == size); return _get_char_offset_recur (instance, cached_tokens, newstart, offset, plength); } /* check pronunciation */ phrase_token_t token = g_array_index (cached_tokens, phrase_token_t, length); phrase_index->get_phrase_item(token, item); gfloat pinyin_poss = item.get_pronunciation_possibility(&key); if (pinyin_poss < FLT_EPSILON) continue; if (newstart > offset) return true; ++length; result = _get_char_offset_recur (instance, cached_tokens, newstart, offset, &length); if (result) { *plength = length; return result; } --length; } return result; } bool pinyin_get_character_offset(pinyin_instance_t * instance, const char * phrase, size_t offset, size_t * plength) { pinyin_context_t * context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; if (0 == matrix.size()) return false; assert(offset < matrix.size()); _check_offset(matrix, offset); if (NULL == phrase) return false; glong phrase_length = 0; ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &phrase_length, NULL); if (0 == phrase_length) return false; size_t length = 0; const size_t start = 0; /* pre-compute the tokens vector from phrase. */ TokenVector cached_tokens = g_array_new(TRUE, TRUE, sizeof(phrase_token_t)); bool retval = _pre_compute_tokens (context, cached_tokens, ucs4_phrase, phrase_length); if (!retval) { g_array_free(cached_tokens, TRUE); g_free(ucs4_phrase); return false; } assert(cached_tokens->len == phrase_length); bool result = _get_char_offset_recur (instance, cached_tokens, start, offset, &length); g_array_free(cached_tokens, TRUE); g_free(ucs4_phrase); *plength = length; return result; } #if 0 bool pinyin_get_character_offset(pinyin_instance_t * instance, size_t offset, size_t * plength) { pinyin_context_t * context = instance->m_context; FacadePhraseIndex * phrase_index = context->m_phrase_index; PhoneticKeyMatrix & matrix = instance->m_matrix; MatchResults results = instance->m_match_results; _check_offset(matrix, offset); size_t length = 0; PhraseItem item; for (size_t i = 0; i < offset; ++i) { phrase_token_t token = g_array_index(results, phrase_token_t, i); if (null_token == token) continue; int retval = phrase_index->get_phrase_item(token, item); assert(ERROR_OK == retval); guint8 len = item.get_phrase_length(); length += len; } *plength = length; return true; } #endif bool pinyin_get_n_phrase(pinyin_instance_t * instance, guint * num) { *num = instance->m_phrase_result->len; return true; } bool pinyin_get_phrase_token(pinyin_instance_t * instance, guint index, phrase_token_t * token){ MatchResult & result = instance->m_phrase_result; *token = null_token; if (index >= result->len) return false; *token = g_array_index(result, phrase_token_t, index); return true; } /* for auxiliary text: use slow string concatenation, and show the first ChewingKey and ChewingKeyRest together. */ static gchar * _get_aux_text_prefix(pinyin_instance_t * instance, size_t cursor, pinyin_option_t options) { PhoneticKeyMatrix & matrix = instance->m_matrix; gchar * prefix = g_strdup(""); assert(cursor < matrix.size()); size_t offset = 0; ChewingKey key; ChewingKeyRest key_rest; while (offset < matrix.size()) { offset = _compute_pinyin_start(matrix, offset); /* at the end of user input */ if (matrix.size() - 1 == offset) break; assert(matrix.get_column_size(offset) >= 1); matrix.get_item(offset, 0, key, key_rest); if (cursor < key_rest.m_raw_end) break; gchar * str = NULL; if (IS_PINYIN == options) str = key.get_pinyin_string(); else if (IS_ZHUYIN == options) str = key.get_zhuyin_string(); else assert(FALSE); gchar * newprefix = g_strconcat(prefix, str, " ", NULL); g_free(str); g_free(prefix); prefix = newprefix; offset = key_rest.m_raw_end; } return prefix; } static gchar * _get_aux_text_postfix(pinyin_instance_t * instance, size_t cursor, pinyin_option_t options) { PhoneticKeyMatrix & matrix = instance->m_matrix; gchar * postfix = g_strdup(""); assert(cursor < matrix.size()); size_t offset = 0; ChewingKey key; ChewingKeyRest key_rest; while (offset < matrix.size()) { offset = _compute_pinyin_start(matrix, offset); /* at the end of user input */ if (matrix.size() - 1 == offset) break; assert(matrix.get_column_size(offset) >= 1); matrix.get_item(offset, 0, key, key_rest); if (cursor > key_rest.m_raw_begin) { offset = key_rest.m_raw_end; continue; } gchar * str = NULL; if (IS_PINYIN == options) str = key.get_pinyin_string(); else if (IS_ZHUYIN == options) str = key.get_zhuyin_string(); else assert(FALSE); gchar * newpostfix = g_strconcat(postfix, str, " ", NULL); g_free(str); g_free(postfix); postfix = newpostfix; offset = key_rest.m_raw_end; } return postfix; } bool pinyin_get_full_pinyin_auxiliary_text(pinyin_instance_t * instance, size_t cursor, gchar ** aux_text) { PhoneticKeyMatrix & matrix = instance->m_matrix; if (0 == matrix.size()) { *aux_text = g_strdup(""); return false; } cursor = std_lite::min(cursor, instance->m_parsed_len); gchar * prefix = _get_aux_text_prefix (instance, cursor, IS_PINYIN); gchar * postfix = _get_aux_text_postfix (instance, cursor, IS_PINYIN); gchar * middle = NULL; assert(cursor < matrix.size()); size_t offset = 0; ChewingKey key; ChewingKeyRest key_rest; while(offset < matrix.size()) { size_t newoffset = _compute_pinyin_start(matrix, offset); /* at the pinyin boundary of user input */ if (offset <= cursor && cursor <= newoffset){ middle = g_strdup("|"); break; } offset = newoffset; assert(matrix.get_column_size(offset) >= 1); matrix.get_item(offset, 0, key, key_rest); /* at the middle of pinyin key */ const size_t begin = key_rest.m_raw_begin; const size_t end = key_rest.m_raw_end; const size_t len = cursor - begin; if (begin < cursor && cursor < end) { gchar * pinyin = key.get_pinyin_string(); gchar * left = g_strndup(pinyin, len); gchar * right = g_strdup(pinyin + len); middle = g_strconcat(left, "|", right, " ", NULL); g_free(left); g_free(right); g_free(pinyin); break; } offset = key_rest.m_raw_end; } gchar * auxtext = g_strconcat(prefix, middle, postfix, NULL); g_free(prefix); g_free(middle); g_free(postfix); *aux_text = auxtext; return true; } bool pinyin_get_double_pinyin_auxiliary_text(pinyin_instance_t * instance, size_t cursor, gchar ** aux_text) { PhoneticKeyMatrix & matrix = instance->m_matrix; if (0 == matrix.size()) { *aux_text = g_strdup(""); return false; } cursor = std_lite::min(cursor, instance->m_parsed_len); gchar * prefix = _get_aux_text_prefix (instance, cursor, IS_PINYIN); gchar * postfix = _get_aux_text_postfix (instance, cursor, IS_PINYIN); gchar * middle = NULL; /* no "'" support in double pinyin. */ assert(cursor < matrix.size()); size_t offset = 0; ChewingKey key; ChewingKeyRest key_rest; while(offset < matrix.size()) { /* at the pinyin boundary of user input */ if (cursor == offset) { middle = g_strdup("|"); break; } assert(matrix.get_column_size(offset) >= 1); matrix.get_item(offset, 0, key, key_rest); const size_t begin = key_rest.m_raw_begin; const size_t end = key_rest.m_raw_end; if (!(begin < cursor && cursor < end)) { offset = key_rest.m_raw_end; continue; } gchar * shengmu = key.get_shengmu_string(); gchar * yunmu = key.get_yunmu_string(); const size_t len = cursor - begin; switch(len) { case 1: middle = g_strconcat(shengmu, "|", yunmu, NULL); break; case 2: middle = g_strconcat(shengmu, yunmu, "|", NULL); break; default: assert(FALSE); } g_free(shengmu); g_free(yunmu); gchar * newmiddle = NULL; if (CHEWING_ZERO_TONE != key.m_tone) { newmiddle = g_strdup_printf("%s%d", middle, key.m_tone); g_free(middle); middle = newmiddle; } newmiddle = g_strconcat(middle, " ", NULL); g_free(middle); middle = newmiddle; offset = key_rest.m_raw_end; } gchar * auxtext = g_strconcat(prefix, middle, postfix, NULL); g_free(prefix); g_free(middle); g_free(postfix); *aux_text = auxtext; return true; } bool pinyin_get_chewing_auxiliary_text(pinyin_instance_t * instance, size_t cursor, gchar ** aux_text) { PhoneticKeyMatrix & matrix = instance->m_matrix; if (0 == matrix.size()) { *aux_text = g_strdup(""); return false; } cursor = std_lite::min(cursor, instance->m_parsed_len); gchar * prefix = _get_aux_text_prefix (instance, cursor, IS_ZHUYIN); gchar * postfix = _get_aux_text_postfix (instance, cursor, IS_ZHUYIN); gchar * middle = NULL; /* no "'" support in zhuyin */ assert(cursor < matrix.size()); size_t offset = 0; ChewingKey key; ChewingKeyRest key_rest; while(offset < matrix.size()) { /* at the pinyin boundary of user input */ if (cursor == offset) { middle = g_strdup("|"); break; } assert(matrix.get_column_size(offset) >= 1); matrix.get_item(offset, 0, key, key_rest); const size_t begin = key_rest.m_raw_begin; const size_t end = key_rest.m_raw_end; if (!(begin < cursor && cursor < end)) { offset = key_rest.m_raw_end; continue; } gchar * zhuyin = key.get_zhuyin_string(); const size_t len = cursor - begin; gchar * left = g_utf8_substring(zhuyin, 0, len); gchar * right = g_utf8_substring(zhuyin, len, end); middle = g_strconcat(left, "|", right, " ", NULL); g_free(left); g_free(right); g_free(zhuyin); offset = key_rest.m_raw_end; } gchar * auxtext = g_strconcat(prefix, middle, postfix, NULL); g_free(prefix); g_free(middle); g_free(postfix); *aux_text = auxtext; return true; } static bool _remember_phrase_recur(pinyin_instance_t * instance, ChewingKeyVector cached_keys, TokenVector cached_tokens, size_t start, ucs4_t * phrase, gint count) { pinyin_context_t * context = instance->m_context; PhoneticKeyMatrix & matrix = instance->m_matrix; FacadePhraseIndex * phrase_index = context->m_phrase_index; const guint8 index = USER_DICTIONARY; const size_t end = matrix.size() - 1; const glong phrase_length = cached_tokens->len; if (start > end) return false; /* only do remember phrase with 'start' and 'end' */ if (start == end) { if (cached_keys->len != phrase_length) return false; /* as cached_keys and phrase has the same length. */ assert(cached_keys->len > 0); assert(cached_keys->len <= MAX_PHRASE_LENGTH); return _add_phrase(context, index, cached_keys, phrase, phrase_length, count); } const size_t size = matrix.get_column_size(start); /* assume pinyin parsers will filter invalid keys. */ assert(size > 0); bool result = false; PhraseItem item; for (size_t i = 0; i < size; ++i) { ChewingKey key; ChewingKeyRest key_rest; matrix.get_item(start, i, key, key_rest); const size_t newstart = key_rest.m_raw_end; const ChewingKey zero_key; if (zero_key == key) { /* assume only one key here for "'" or the last key. */ assert(1 == size); return _remember_phrase_recur (instance, cached_keys, cached_tokens, newstart, phrase, count); } /* meet in-complete pinyin */ if (CHEWING_ZERO_MIDDLE == key.m_middle && CHEWING_ZERO_FINAL == key.m_final) { assert(CHEWING_ZERO_TONE == key.m_tone); return false; } /* check pronunciation */ if (cached_keys->len >= phrase_length) return false; phrase_token_t token = g_array_index (cached_tokens, phrase_token_t, cached_keys->len); phrase_index->get_phrase_item(token, item); gfloat pinyin_poss = item.get_pronunciation_possibility(&key); if (pinyin_poss < FLT_EPSILON) continue; /* push value */ g_array_append_val(cached_keys, key); result = _remember_phrase_recur (instance, cached_keys, cached_tokens, newstart, phrase, count) || result; /* pop value */ g_array_set_size(cached_keys, cached_keys->len - 1); } return result; } bool pinyin_remember_user_input(pinyin_instance_t * instance, const char * phrase, gint count) { pinyin_context_t * context = instance->m_context; if (NULL == phrase) return false; glong phrase_length = 0; ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &phrase_length, NULL); if (0 == phrase_length || phrase_length >= MAX_PHRASE_LENGTH) return false; const size_t start = 0; /* pre-compute the tokens vector from phrase. */ TokenVector cached_tokens = g_array_new(TRUE, TRUE, sizeof(phrase_token_t)); bool retval = _pre_compute_tokens (context, cached_tokens, ucs4_phrase, phrase_length); if (!retval) { g_array_free(cached_tokens, TRUE); g_free(ucs4_phrase); return false; } assert(cached_tokens->len == phrase_length); ChewingKeyVector cached_keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey)); bool result = _remember_phrase_recur (instance, cached_keys, cached_tokens, start, ucs4_phrase, count); g_array_free(cached_tokens, TRUE); g_array_free(cached_keys, TRUE); g_free(ucs4_phrase); return result; } bool pinyin_is_user_candidate(pinyin_instance_t * instance, lookup_candidate_t * candidate) { if (NORMAL_CANDIDATE != candidate->m_candidate_type) return false; phrase_token_t token = candidate->m_token; guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); if (USER_DICTIONARY != index) return false; return true; } bool pinyin_remove_user_candidate(pinyin_instance_t * instance, lookup_candidate_t * candidate) { pinyin_context_t * context = instance->m_context; FacadePhraseIndex * phrase_index = context->m_phrase_index; FacadePhraseTable3 * phrase_table = context->m_phrase_table; FacadeChewingTable2 * pinyin_table = context->m_pinyin_table; Bigram * user_bigram = context->m_user_bigram; assert(NORMAL_CANDIDATE == candidate->m_candidate_type); phrase_token_t token = candidate->m_token; guint8 index = PHRASE_INDEX_LIBRARY_INDEX(token); assert(USER_DICTIONARY == index); /* remove from phrase index */ PhraseItem * item = NULL; int retval = phrase_index->remove_phrase_item(token, item); assert(ERROR_OK == retval); /* remove from phrase table */ const guint8 length = item->get_phrase_length(); ucs4_t phrase[MAX_PHRASE_LENGTH]; item->get_phrase_string(phrase); retval = phrase_table->remove_index(length, phrase, token); assert(ERROR_OK == retval); /* remove from pinyin table */ const guint8 num = item->get_n_pronunciation(); ChewingKey keys[MAX_PHRASE_LENGTH]; guint32 freq = 0; for (size_t i = 0; i < num; ++i) { item->get_nth_pronunciation(i, keys, freq); retval = pinyin_table->remove_index(length, keys, token); assert(ERROR_OK == retval); } delete item; /* remove from user bigram */ phrase_token_t mask = PHRASE_INDEX_LIBRARY_MASK | PHRASE_MASK; user_bigram->mask_out(mask, token); return true; } /** * Note: prefix is the text before the pre-edit string. */