summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2012-04-18 16:06:05 +0800
committerPeng Wu <alexepico@gmail.com>2012-04-18 16:07:35 +0800
commita586f8da45e11c0cd84d5b1ff0582369fb5333c1 (patch)
treed79aa0b8ed0b50f145737bc454a65e222851ef7b /src
parent1ca1f3a99d1c4fc342bc9f19da93ff822de37018 (diff)
downloadlibpinyin-a586f8da45e11c0cd84d5b1ff0582369fb5333c1.tar.gz
libpinyin-a586f8da45e11c0cd84d5b1ff0582369fb5333c1.tar.xz
libpinyin-a586f8da45e11c0cd84d5b1ff0582369fb5333c1.zip
write pinyin_get_full_pinyin_candidates in progress
Diffstat (limited to 'src')
-rw-r--r--src/pinyin.cpp146
-rw-r--r--src/storage/pinyin_parser2.cpp47
-rw-r--r--src/storage/pinyin_parser2.h5
3 files changed, 192 insertions, 6 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index bef326c..1cc0e3d 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -597,7 +597,7 @@ bool pinyin_get_candidates(pinyin_instance_t * instance,
if ( !(retval & SEARCH_OK) )
continue;
- /* reduce to a single GArray. */
+ /* reduce and append to a single GArray. */
for (size_t m = min_index; m <= max_index; ++m) {
for (size_t n = 0; n < ranges[m]->len; ++n) {
PhraseIndexRange * range =
@@ -693,10 +693,9 @@ bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
pinyin_context_t * & context = instance->m_context;
pinyin_option_t & options = context->m_options;
ChewingKeyVector & pinyin_keys = instance->m_pinyin_keys;
+ ChewingKeyRestVector & pinyin_key_rests = instance->m_pinyin_key_rests;
g_array_set_size(candidates, 0);
- ChewingKey * keys = &g_array_index
- (pinyin_keys, ChewingKey, offset);
size_t pinyin_len = pinyin_keys->len - offset;
ssize_t i;
@@ -733,8 +732,147 @@ bool pinyin_get_full_pinyin_candidates(pinyin_instance_t * instance,
if (pinyin_len == 1) {
if (options & USE_DIVIDED_TABLE) {
+ g_array_set_size(items, 0);
/* handle "^xian$" -> "xi'an" here */
- assert(FALSE);
+
+ /* because there is only one pinyin left,
+ * the following for-loop will not produce 2 character candidates.
+ * the if-branch will fill the candidate list with
+ * 2 character candidates.
+ */
+
+ ChewingKey * key = &g_array_index(pinyin_keys, ChewingKey, offset);
+ ChewingKeyRest * rest = &g_array_index(pinyin_key_rests,
+ ChewingKeyRest, offset);
+ ChewingKeyRest orig_rest = *rest;
+ guint16 tone = CHEWING_ZERO_TONE;
+
+ const divided_table_item_t * item = NULL;
+
+ /* back up tone */
+ if (options & USE_TONE) {
+ tone = key->m_tone;
+ if (CHEWING_ZERO_TONE != tone) {
+ key->m_tone = CHEWING_ZERO_TONE;
+ rest->m_raw_end --;
+ }
+ }
+
+ item = context->m_full_pinyin_parser->retrieve_divided_item
+ (options, offset, pinyin_keys, pinyin_key_rests,
+ instance->m_raw_full_pinyin,
+ strlen(instance->m_raw_full_pinyin));
+
+ ChewingKey divided_keys[2];
+ assert(context->m_full_pinyin_parser->
+ parse_one_key(options, divided_keys[0], item->m_new_keys[0],
+ strlen(item->m_new_keys[0])));
+ assert(context->m_full_pinyin_parser->
+ parse_one_key(options, divided_keys[1], item->m_new_keys[1],
+ strlen(item->m_new_keys[1])));
+
+ gchar * new_pinyins = g_strdup_printf
+ ("%s'%s", item->m_new_keys[0], item->m_new_keys[1]);
+
+ /* propagate the tone */
+ if (options & USE_TONE) {
+ if (CHEWING_ZERO_TONE != tone) {
+ assert(0 < tone && tone <= 5);
+ gchar * tmp_str = g_strdup_printf
+ ("%s%d", new_pinyins, tone);
+ divided_keys[1].m_tone = tone;
+ g_free(new_pinyins);
+ new_pinyins = tmp_str;
+ }
+ }
+
+ /* do pinyin search. */
+ int retval = context->m_pinyin_table->search
+ (2, divided_keys, ranges);
+
+ if (retval & SEARCH_OK) {
+ /* reduce and append to a single GArray. */
+ for (size_t m = min_index; m <= max_index; ++m) {
+ for (size_t n = 0; n < ranges[m]->len; ++n) {
+ PhraseIndexRange * range =
+ &g_array_index(ranges[m], PhraseIndexRange, n);
+ for (size_t k = range->m_range_begin;
+ k < range->m_range_end; ++k) {
+ lookup_candidate_t item;
+ item.m_candidate_type = DIVIDED_CANDIDATE;
+ item.m_token = k; item.m_orig_rest = orig_rest;
+ item.m_new_pinyins = g_strdup(new_pinyins);
+ g_array_append_val(items, item);
+ }
+ }
+ }
+ g_free(new_pinyins);
+
+ g_array_sort(items, compare_item_with_token);
+
+ /* remove the duplicated items. */
+ phrase_token_t last_token = null_token;
+ for (size_t n = 0; n < items->len; ++n) {
+ lookup_candidate_t * item = &g_array_index
+ (items, lookup_candidate_t, n);
+ if (last_token == item->m_token) {
+ g_array_remove_index(items, n);
+ n--;
+ }
+ last_token = item->m_token;
+ }
+
+ PhraseItem cached_item;
+ /* compute all freqs. */
+ for (i = 0; i < items->len; ++i) {
+ lookup_candidate_t * item = &g_array_index
+ (items, lookup_candidate_t, i);
+ phrase_token_t & token = item->m_token;
+
+ gfloat bigram_poss = 0; guint32 total_freq = 0;
+ if (options & DYNAMIC_ADJUST) {
+ if (null_token != prev_token) {
+ guint32 bigram_freq = 0;
+ merged_gram.get_total_freq(total_freq);
+ merged_gram.get_freq(token, bigram_freq);
+ if (0 != total_freq)
+ bigram_poss = bigram_freq / (gfloat)total_freq;
+ }
+ }
+
+ /* compute the m_freq. */
+ FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+ phrase_index->get_phrase_item(token, cached_item);
+ total_freq = phrase_index->get_phrase_index_total_freq();
+ assert (0 < total_freq);
+
+ /* Note: possibility value <= 1.0. */
+ guint32 freq = (LAMBDA_PARAMETER * bigram_poss +
+ (1 - LAMBDA_PARAMETER) *
+ cached_item.get_unigram_frequency() /
+ (gfloat) total_freq) * 256 * 256 * 256;
+ item->m_freq = freq;
+ }
+
+ /* sort the candidates of the same length by frequency. */
+ g_array_sort(items, compare_item_with_frequency);
+
+ /* transfer back items to tokens, and save it into candidates */
+ for (i = 0; i < items->len; ++i) {
+ lookup_candidate_t * item = &g_array_index
+ (items, lookup_candidate_t, i);
+ g_array_append_val(candidates, item->m_token);
+ }
+
+ }
+
+ /* restore tones */
+ if (options & USE_TONE) {
+ if (CHEWING_ZERO_TONE != tone) {
+ key->m_tone = tone;
+ rest->m_raw_end ++;
+ }
+ }
}
}
diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp
index 662ee1b..b4175b1 100644
--- a/src/storage/pinyin_parser2.cpp
+++ b/src/storage/pinyin_parser2.cpp
@@ -484,7 +484,7 @@ bool FullPinyinParser2::post_process2(pinyin_option_t options,
int len) const {
int i;
assert(keys->len == key_rests->len);
- gint16 num_keys = keys->len;
+ gint num_keys = keys->len;
ChewingKey * cur_key = NULL, * next_key = NULL;
ChewingKeyRest * cur_rest = NULL, * next_rest = NULL;
@@ -561,7 +561,7 @@ bool FullPinyinParser2::post_process2(pinyin_option_t options,
assert(parse_one_key(options, *next_key, onepinyin, len));
}
- /* save back tones */
+ /* restore tones */
if (options & USE_TONE) {
if (CHEWING_ZERO_TONE != next_tone) {
next_key->m_tone = next_tone;
@@ -573,6 +573,49 @@ bool FullPinyinParser2::post_process2(pinyin_option_t options,
return true;
}
+/**
+ * FullPinyinParser2::retrieve_divided_item:
+ * @options: the pinyin options; not consulted by this lookup.
+ * @offset: index of the key to examine; must be less than keys->len.
+ * @keys: the parsed keys; only its length is checked here.
+ * @key_rests: the raw-input spans; the entry at @offset selects the
+ *             text inside @str that is matched against the table.
+ * @str: the raw full pinyin input the spans in @key_rests index into.
+ * @len: accepted for interface compatibility; not consulted here.
+ * @returns: the first divided_table entry whose m_orig_key has the same
+ *           length and the same characters as the raw text of the key
+ *           at @offset, or NULL when no entry matches.
+ *
+ * Looks up the divided table for the raw pinyin at @offset.
+ * The keys and key rests are not modified by the lookup.
+ */
+const divided_table_item_t * FullPinyinParser2::retrieve_divided_item
+(pinyin_option_t options, size_t offset,
+ ChewingKeyVector & keys, ChewingKeyRestVector & key_rests,
+ const char * str, int len) const {
+    assert(keys->len == key_rests->len);
+    /* compare unsigned against unsigned: keys->len is a guint. */
+    assert(offset < keys->len);
+
+    ChewingKeyRest * rest = &g_array_index(key_rests,
+                                           ChewingKeyRest, offset);
+
+    /* the raw text of this key does not change while scanning the table. */
+    const char * onepinyin = str + rest->m_raw_begin;
+
+    /* lookup divided table */
+    for (size_t k = 0; k < G_N_ELEMENTS(divided_table); ++k) {
+        const divided_table_item_t * item = divided_table + k;
+
+        /* sanity check: every table entry is expected to carry
+         * a positive m_new_freq. */
+        assert(item->m_new_freq > 0);
+
+        /* named key_len so that the len parameter is not shadowed. */
+        const size_t key_len = strlen(item->m_orig_key);
+
+        /* the lengths must agree before the characters are compared,
+         * so that strncmp never reads beyond the span of this key. */
+        if (rest->length() != key_len)
+            continue;
+
+        /* found the match */
+        if (0 == strncmp(onepinyin, item->m_orig_key, key_len))
+            return item;
+    }
+
+    /* no entry of the divided table matches this key. */
+    return NULL;
+}
+
+
#define IS_KEY(x) (('a' <= x && x <= 'z') || x == ';')
bool DoublePinyinParser2::parse_one_key(pinyin_option_t options,
diff --git a/src/storage/pinyin_parser2.h b/src/storage/pinyin_parser2.h
index 09469e7..77ed3d4 100644
--- a/src/storage/pinyin_parser2.h
+++ b/src/storage/pinyin_parser2.h
@@ -152,6 +152,11 @@ protected:
const char * str, int len) const;
public:
+ const divided_table_item_t * retrieve_divided_item
+ (pinyin_option_t options, size_t offset,
+ ChewingKeyVector & keys, ChewingKeyRestVector & key_rests,
+ const char * str, int len) const;
+public:
FullPinyinParser2();
virtual ~FullPinyinParser2() {
g_array_free(m_parse_steps, TRUE);