diff options
author | Peng Wu <alexepico@gmail.com> | 2012-10-25 15:09:40 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-10-25 15:09:40 +0800 |
commit | fb62ce874969fe4111f3592f55e2c0dcbf2e6d9c (patch) | |
tree | 1f8561f395fecc9163cc2f4e399faf4d75f24edf /src/storage | |
parent | 15d4a0e6298eca0def77b9d80d94ce6c4beccc95 (diff) | |
download | libpinyin-fb62ce874969fe4111f3592f55e2c0dcbf2e6d9c.tar.gz libpinyin-fb62ce874969fe4111f3592f55e2c0dcbf2e6d9c.tar.xz libpinyin-fb62ce874969fe4111f3592f55e2c0dcbf2e6d9c.zip |
write mask_out_phrase_index_logger
Diffstat (limited to 'src/storage')
-rw-r--r-- | src/storage/phrase_index.cpp | 150 | ||||
-rw-r--r-- | src/storage/phrase_index.h | 9 |
2 files changed, 159 insertions, 0 deletions
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp
index 0424b3a..ccf9354 100644
--- a/src/storage/phrase_index.cpp
+++ b/src/storage/phrase_index.cpp
@@ -589,4 +589,215 @@ const pinyin_table_info_t pinyin_phrase_files[PHRASE_INDEX_LIBRARY_COUNT] =
     {NULL, NULL, "user.bin", USER_FILE}
 };
 
+
+
+/**
+ * _peek_header:
+ * @logger: the logger to scan, starting at its current read position.
+ * @old_total_freq: reset to 0 on entry; receives the first guint32 of the
+ *                  old chunk of the LOG_MODIFY_HEADER record, if one is found.
+ *
+ * Walks the remaining records of @logger and counts the header records.
+ *
+ * Returns: true only when exactly one LOG_MODIFY_HEADER record was seen.
+ */
+static bool _peek_header(PhraseIndexLogger * logger,
+                         guint32 & old_total_freq){
+    old_total_freq = 0;
+
+    size_t header_count = 0;
+    LOG_TYPE log_type; phrase_token_t token;
+    MemoryChunk oldchunk, newchunk;
+
+    while (logger->has_next_record()) {
+        bool retval = logger->next_record
+            (log_type, token, &oldchunk, &newchunk);
+
+        if (!retval)
+            break;
+
+        if (LOG_MODIFY_HEADER != log_type)
+            continue;
+
+        ++header_count;
+
+        oldchunk.get_content(0, &old_total_freq, sizeof(guint32));
+    }
+
+    /* 1 for normal case, 0 for corrupted file. */
+    assert(1 >= header_count);
+
+    return 1 == header_count;
+}
+
+/**
+ * _compute_new_header:
+ * @logger: the logger to scan, starting at its current read position.
+ * @mask: bit mask applied to each record's token.
+ * @value: records whose masked token equals this value are skipped.
+ * @new_total_freq: running total, adjusted in place; the caller seeds it.
+ *
+ * For every add/remove/modify record that is not masked out, adds the
+ * unigram frequency of the new item and subtracts that of the old item.
+ * Header records are ignored.
+ *
+ * Not static: it is declared a friend of PhraseItem in phrase_index.h so
+ * that it can reach the private m_chunk member.
+ *
+ * Returns: always true.
+ */
+bool _compute_new_header(PhraseIndexLogger * logger,
+                         phrase_token_t mask,
+                         phrase_token_t value,
+                         guint32 & new_total_freq) {
+
+    LOG_TYPE log_type; phrase_token_t token;
+    MemoryChunk oldchunk, newchunk;
+    PhraseItem olditem, newitem;
+
+    while(logger->has_next_record()) {
+        bool retval = logger->next_record
+            (log_type, token, &oldchunk, &newchunk);
+
+        if (!retval)
+            break;
+
+        if (LOG_MODIFY_HEADER == log_type)
+            continue;
+
+        if ((token & mask) == value)
+            continue;
+
+        switch(log_type) {
+        case LOG_ADD_RECORD:{
+            assert( 0 == oldchunk.size() );
+            newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+                                      NULL);
+            new_total_freq += newitem.get_unigram_frequency();
+            break;
+        }
+        case LOG_REMOVE_RECORD:{
+            assert( 0 == newchunk.size() );
+            olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+                                      NULL);
+            new_total_freq -= olditem.get_unigram_frequency();
+            break;
+        }
+        case LOG_MODIFY_RECORD:{
+            olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+                                      NULL);
+            new_total_freq -= olditem.get_unigram_frequency();
+
+            newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+                                      NULL);
+            new_total_freq += newitem.get_unigram_frequency();
+            break;
+        }
+        default:
+            assert(false);
+        }
+    }
+
+    return true;
+}
+
+/**
+ * _write_header:
+ * @logger: the logger that receives the header record.
+ * @old_total_freq: value stored as the old chunk of the record.
+ * @new_total_freq: value stored as the new chunk of the record.
+ *
+ * Appends one LOG_MODIFY_HEADER record, keyed by null_token, to @logger.
+ *
+ * Returns: always true.
+ */
+static bool _write_header(PhraseIndexLogger * logger,
+                          guint32 & old_total_freq,
+                          guint32 & new_total_freq) {
+    MemoryChunk oldheader, newheader;
+    oldheader.set_content(0, &old_total_freq, sizeof(guint32));
+    newheader.set_content(0, &new_total_freq, sizeof(guint32));
+    logger->append_record(LOG_MODIFY_HEADER, null_token,
+                          &oldheader, &newheader);
+    return true;
+}
+
+/**
+ * _mask_out_records:
+ * @oldlogger: the logger to read, starting at its current read position.
+ * @mask: bit mask applied to each record's token.
+ * @value: records whose masked token equals this value are dropped.
+ * @newlogger: the logger that receives the surviving records.
+ *
+ * Copies every non-header record that is not masked out from @oldlogger
+ * to @newlogger, unchanged.
+ *
+ * Returns: always true.
+ */
+static bool _mask_out_records(PhraseIndexLogger * oldlogger,
+                              phrase_token_t mask,
+                              phrase_token_t value,
+                              PhraseIndexLogger * newlogger) {
+    LOG_TYPE log_type; phrase_token_t token;
+    MemoryChunk oldchunk, newchunk;
+
+    while(oldlogger->has_next_record()) {
+        bool retval = oldlogger->next_record
+            (log_type, token, &oldchunk, &newchunk);
+
+        if (!retval)
+            break;
+
+        if (LOG_MODIFY_HEADER == log_type)
+            continue;
+
+        if ((token & mask) == value)
+            continue;
+
+        newlogger->append_record(log_type, token, &oldchunk, &newchunk);
+    }
+
+    return true;
+}
+
+/**
+ * mask_out_phrase_index_logger:
+ * @oldlogger: the source logger; it is rewound and re-read here, which is
+ *             why it cannot be a pointer to const.
+ * @mask: bit mask applied to each record's token.
+ * @value: records whose masked token equals this value are removed.
+ *
+ * Builds a new logger holding a recomputed header record followed by the
+ * records of @oldlogger that are not masked out.
+ *
+ * Returns: a logger allocated with new; no records are appended to it
+ *          when _peek_header() fails on @oldlogger.
+ */
+PhraseIndexLogger * mask_out_phrase_index_logger
+(PhraseIndexLogger * oldlogger, phrase_token_t mask,
+ phrase_token_t value) {
+    PhraseIndexLogger * newlogger = new PhraseIndexLogger;
+    guint32 old_total_freq = 0, new_total_freq = 0;
+
+    /* peek the header value. */
+    if (!_peek_header(oldlogger, old_total_freq))
+        return newlogger;
+
+    new_total_freq = old_total_freq;
+
+    /* compute the new header based on add/modify/remove records. */
+    oldlogger->rewind();
+    if (!_compute_new_header(oldlogger, mask, value, new_total_freq))
+        return newlogger;
+
+    /* write out the modify header record. */
+    _write_header(newlogger, old_total_freq, new_total_freq);
+
+    /* mask out the matched records. */
+    oldlogger->rewind();
+    _mask_out_records(oldlogger, mask, value, newlogger);
+
+    return newlogger;
+}
+
 };
diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h
index acc7c4a..e9f4764 100644
--- a/src/storage/phrase_index.h
+++ b/src/storage/phrase_index.h
@@ -61,6 +61,11 @@ const size_t phrase_item_header = sizeof(guint8) + sizeof(guint8) + sizeof(guint
  */
 class PhraseItem{
     friend class SubPhraseIndex;
+    friend bool _compute_new_header(PhraseIndexLogger * logger,
+                                    phrase_token_t mask,
+                                    phrase_token_t value,
+                                    guint32 & new_total_freq);
+
 private:
     MemoryChunk m_chunk;
     bool set_n_pronunciation(guint8 n_prouns);
@@ -799,6 +804,18 @@ typedef struct {
 } pinyin_table_info_t;
 
 extern const pinyin_table_info_t pinyin_phrase_files[PHRASE_INDEX_LIBRARY_COUNT];
+
+/**
+ * mask_out_phrase_index_logger:
+ * @oldlogger: the source logger (non-const: the implementation rewinds it).
+ * @mask: bit mask applied to each record's token.
+ * @value: records whose masked token equals this value are removed.
+ *
+ * Returns: a newly allocated logger without the masked-out records.
+ */
+PhraseIndexLogger * mask_out_phrase_index_logger
+(PhraseIndexLogger * oldlogger, phrase_token_t mask,
+ phrase_token_t value);
 
 };
 