From 5c520717118df51ed6977e556fa846495419af24 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 28 Mar 2013 12:21:41 +0800 Subject: write add_pronunciation --- src/pinyin.cpp | 6 ++--- src/storage/phrase_index.cpp | 60 +++++++++++++++++++++++++++++++++++++------- src/storage/phrase_index.h | 9 ++++--- 3 files changed, 59 insertions(+), 16 deletions(-) (limited to 'src') diff --git a/src/pinyin.cpp b/src/pinyin.cpp index 1c31b1a..918d4b9 100644 --- a/src/pinyin.cpp +++ b/src/pinyin.cpp @@ -415,8 +415,8 @@ bool pinyin_iterator_add_phrase(import_iterator_t * iter, retval = phrase_index->remove_phrase_item(token, removed_item); if (ERROR_OK == retval) { /* maybe check whether there are duplicated pronunciations here. */ - removed_item->append_pronunciation((ChewingKey *)keys->data, - count); + removed_item->add_pronunciation((ChewingKey *)keys->data, + count); phrase_index->add_phrase_item(token, removed_item); delete removed_item; result = true; @@ -439,7 +439,7 @@ bool pinyin_iterator_add_phrase(import_iterator_t * iter, (keys->len, (ChewingKey *)(keys->data), token); item.set_phrase_string(len_phrase, ucs4_phrase); - item.append_pronunciation((ChewingKey *)(keys->data), count); + item.add_pronunciation((ChewingKey *)(keys->data), count); phrase_index->add_phrase_item(token, &item); phrase_index->add_unigram_frequency(token, count * unigram_factor); diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp index 67afb97..c462fef 100644 --- a/src/storage/phrase_index.cpp +++ b/src/storage/phrase_index.cpp @@ -42,6 +42,7 @@ bool PhraseItem::get_nth_pronunciation(size_t index, ChewingKey * keys, (offset + phrase_length * sizeof(ChewingKey), &freq , sizeof(guint32)); } +#if 0 void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){ guint8 phrase_length = get_phrase_length(); set_n_pronunciation(get_n_pronunciation() + 1); @@ -49,6 +50,43 @@ void PhraseItem::append_pronunciation(ChewingKey * keys, guint32 freq){ phrase_length * sizeof(ChewingKey)); m_chunk.set_content(m_chunk.size(), &freq, sizeof(guint32)); } +#endif + +bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){ + guint8 phrase_length = get_phrase_length(); + guint8 npron = get_n_pronunciation(); + size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t); + char * buf_begin = (char *) m_chunk.begin(); + guint32 total_freq = 0; + + for (int i = 0; i < npron; ++i) { + char * chewing_begin = buf_begin + offset + + i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); + guint32 * freq = (guint32 *)(chewing_begin + + phrase_length * sizeof(ChewingKey)); + + total_freq += *freq; + + if (0 == pinyin_exact_compare2 + (keys, (ChewingKey *)chewing_begin, phrase_length)) { + /* found the exact match pinyin keys. */ + + /* protect against total_freq overflow. */ + if (delta > 0 && total_freq > total_freq + delta) + return false; + + *freq += delta; + total_freq += delta; + return true; + } + } + + set_n_pronunciation(npron + 1); + m_chunk.set_content(m_chunk.size(), keys, + phrase_length * sizeof(ChewingKey)); + m_chunk.set_content(m_chunk.size(), &delta, sizeof(guint32)); + return true; +} void PhraseItem::remove_nth_pronunciation(size_t index){ guint8 phrase_length = get_phrase_length(); @@ -74,21 +112,25 @@ void PhraseItem::increase_pronunciation_possibility(pinyin_option_t options, gint32 delta){ guint8 phrase_length = get_phrase_length(); guint8 npron = get_n_pronunciation(); - size_t offset = phrase_item_header + phrase_length * sizeof ( ucs4_t ); + size_t offset = phrase_item_header + phrase_length * sizeof(ucs4_t); char * buf_begin = (char *) m_chunk.begin(); guint32 total_freq = 0; - for ( int i = 0 ; i < npron ; ++i){ + + for (int i = 0; i < npron; ++i) { char * chewing_begin = buf_begin + offset + - i * ( phrase_length * sizeof(ChewingKey) + sizeof(guint32) ); + i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32)); guint32 * freq = (guint32 *)(chewing_begin + phrase_length * sizeof(ChewingKey)); total_freq += *freq; - if ( 0 == pinyin_compare_with_ambiguities2 - (options, keys, - (ChewingKey *)chewing_begin, phrase_length) ){ - //protect against total_freq overflow. - if ( delta > 0 && total_freq > total_freq + delta ) + + if (0 == pinyin_compare_with_ambiguities2 + (options, keys, + (ChewingKey *)chewing_begin, phrase_length)) { + + /* protect against total_freq overflow. */ + if (delta > 0 && total_freq > total_freq + delta) return; + *freq += delta; total_freq += delta; } @@ -515,7 +557,7 @@ bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){ parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); if (item_ptr->get_phrase_length() == keys->len) { - item_ptr->append_pronunciation((ChewingKey *)keys->data, freq); + item_ptr->add_pronunciation((ChewingKey *)keys->data, freq); } else { fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n", pinyin, phrase); diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h index 3654369..6a14ff7 100644 --- a/src/storage/phrase_index.h +++ b/src/storage/phrase_index.h @@ -213,14 +213,15 @@ public: /* out */ guint32 & freq); /** - * PhraseItem::append_pronunciation: + * PhraseItem::add_pronunciation: * @keys: the pronunciation keys. - * @freq: the frequency of the pronunciation. + * @delta: the delta of the frequency of the pronunciation. + * @returns: whether the add operation is successful. * - * Append one pronunciation. + * Add one pronunciation. * */ - void append_pronunciation(ChewingKey * keys, guint32 freq); + bool add_pronunciation(ChewingKey * keys, guint32 delta); /** * PhraseItem::remove_nth_pronunciation: -- cgit