summary | refs | log | tree | commit | diff | stats
path: root/src/pinyin.cpp
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2024-09-30 10:54:56 +0800
committer: Peng Wu <alexepico@gmail.com> 2024-09-30 10:56:58 +0800
commit: 3534429abf2df68b03b988ff799a1873700a066e (patch)
tree: 1864b686473cd206e3b9e8de4796f17c2e7600e6 /src/pinyin.cpp
parent: 13574d4e9da3cb07fecdce86939f8f3842d6a669 (diff)
download: libpinyin-3534429abf2df68b03b988ff799a1873700a066e.tar.gz
libpinyin-3534429abf2df68b03b988ff799a1873700a066e.tar.xz
libpinyin-3534429abf2df68b03b988ff799a1873700a066e.zip
Refactor pinyin_guess_predicted_candidates function
Diffstat (limited to 'src/pinyin.cpp')
-rw-r--r-- src/pinyin.cpp 69
1 files changed, 51 insertions, 18 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index a8fda16..f08f0f8 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -66,6 +66,8 @@ struct _pinyin_context_t{
SystemTableInfo2 m_system_table_info;
UserTableInfo m_user_table_info;
+
+ PunctTable * m_system_punct_table;
};
struct _pinyin_instance_t{
@@ -431,6 +433,13 @@ pinyin_context_t * pinyin_init(const char * systemdir, const char * userdir){
/* don't load addon phrase libraries. */
+ /* load system punct table. */
+ context->m_system_punct_table = new PunctTable;
+ system_filename = g_build_filename
+ (context->m_system_dir, SYSTEM_PUNCT_TABLE, NULL);
+ context->m_system_punct_table->attach(system_filename, ATTACH_READONLY);
+ g_free(system_filename);
+
return context;
}
@@ -1203,6 +1212,7 @@ void pinyin_fini(pinyin_context_t * context){
delete context->m_addon_pinyin_table;
delete context->m_addon_phrase_table;
delete context->m_addon_phrase_index;
+ delete context->m_system_punct_table;
g_free(context->m_system_dir);
g_free(context->m_user_dir);
@@ -2290,8 +2300,8 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance,
return true;
}
-bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
- const char * prefix) {
+bool _compute_predicted_bigram_candidates(pinyin_instance_t * instance,
+ SingleGram * merged_gram) {
const guint32 length = 2;
const guint32 filter = 10;
@@ -2299,39 +2309,28 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
FacadePhraseIndex * phrase_index = context->m_phrase_index;
CandidateVector candidates = instance->m_candidates;
TokenVector prefixes = instance->m_prefixes;
- phrase_token_t prev_token = null_token;
-
- _free_candidates(candidates);
-
- /* search bigram candidate. */
- g_array_set_size(instance->m_prefixes, 0);
- _compute_prefixes(instance, prefix);
-
- if (0 == prefixes->len)
- return false;
/* merge single gram. */
- SingleGram merged_gram;
SingleGram * user_gram = NULL;
for (gint i = prefixes->len - 1; i >= 0; --i) {
- prev_token = g_array_index(prefixes, phrase_token_t, i);
+ phrase_token_t prev_token = g_array_index(prefixes, phrase_token_t, i);
context->m_user_bigram->load(prev_token, user_gram);
- merge_single_gram(&merged_gram, NULL, user_gram);
+ merge_single_gram(merged_gram, NULL, user_gram);
if (user_gram)
delete user_gram;
- if (merged_gram.get_length())
+ if (merged_gram->get_length())
break;
}
- if (0 != merged_gram.get_length()) {
+ if (0 != merged_gram->get_length()) {
/* retrieve all items. */
BigramPhraseWithCountArray tokens = g_array_new
(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
- merged_gram.retrieve_all(tokens);
+ merged_gram->retrieve_all(tokens);
/* sort the longer word first. */
PhraseItem cached_item;
@@ -2360,6 +2359,15 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
}
}
+ return true;
+}
+
+bool _compute_predicted_prefix_candidates(pinyin_instance_t * instance) {
+ pinyin_context_t * context = instance->m_context;
+ FacadePhraseIndex * phrase_index = context->m_phrase_index;
+ CandidateVector candidates = instance->m_candidates;
+ TokenVector prefixes = instance->m_prefixes;
+
/* search prefix candidate. */
GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
@@ -2392,6 +2400,31 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
g_array_free(tokenarray, TRUE);
+ return true;
+}
+
+bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
+ const char * prefix) {
+ pinyin_context_t * context = instance->m_context;
+ FacadePhraseIndex * phrase_index = context->m_phrase_index;
+ CandidateVector candidates = instance->m_candidates;
+ TokenVector prefixes = instance->m_prefixes;
+ phrase_token_t prev_token = null_token;
+
+ _free_candidates(candidates);
+
+ /* search bigram candidate. */
+ g_array_set_size(instance->m_prefixes, 0);
+ _compute_prefixes(instance, prefix);
+
+ if (0 == prefixes->len)
+ return false;
+
+ SingleGram merged_gram;
+ _compute_predicted_bigram_candidates(instance, &merged_gram);
+
+ _compute_predicted_prefix_candidates(instance);
+
/* post process to sort the candidates */
_compute_phrase_length(context, candidates);