From 806ee677ed908de317f0bbf377279d2083dce731 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Mon, 22 Aug 2011 18:23:12 +0800 Subject: write test case for phrase index logger --- src/storage/phrase_index.cpp | 32 +++++++++++++++++++++-- src/storage/phrase_index_logger.h | 39 +++++++++++++++++++++++++--- tests/storage/test_phrase_index_logger.cpp | 41 +++++++++++++++++++++++++++++- 3 files changed, 105 insertions(+), 7 deletions(-) diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp index 5517169..b433904 100644 --- a/src/storage/phrase_index.cpp +++ b/src/storage/phrase_index.cpp @@ -185,7 +185,8 @@ bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){ if ( !sub_phrases ){ sub_phrases = new SubPhraseIndex; } - + + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); bool retval = sub_phrases->load(chunk, 0, chunk->size()); if ( !retval ) return retval; @@ -233,10 +234,14 @@ bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){ if ( !sub_phrases ) return false; + m_total_freq -= sub_phrases->get_phrase_index_total_freq(); PhraseIndexLogger logger; logger.load(log); - return sub_phrases->merge(&logger); + bool retval = sub_phrases->merge(&logger); + m_total_freq += sub_phrases->get_phrase_index_total_freq(); + + return retval; } bool SubPhraseIndex::load(MemoryChunk * chunk, @@ -297,6 +302,16 @@ bool SubPhraseIndex::store(MemoryChunk * new_chunk, } bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){ + /* diff the header */ + MemoryChunk oldheader, newheader; + guint32 total_freq = oldone->get_phrase_index_total_freq(); + oldheader.set_content(0, &total_freq, sizeof(guint32)); + total_freq = get_phrase_index_total_freq(); + newheader.set_content(0, &total_freq, sizeof(guint32)); + logger->append_record(LOG_MODIFY_HEADER, null_token, + &oldheader, &newheader); + + /* diff phrase items */ PhraseIndexRange oldrange, currange, range; oldone->get_range(oldrange); get_range(currange); range.m_range_begin = std_lite::min(oldrange.m_range_begin, @@ -388,10 +403,23 @@ bool SubPhraseIndex::merge(PhraseIndexLogger * logger){ } break; } + case LOG_MODIFY_HEADER:{ + guint32 total_freq = get_phrase_index_total_freq(); + guint32 tmp_freq = 0; + assert(null_token == token); + assert(oldchunk.size() == newchunk.size()); + oldchunk.get_content(0, &tmp_freq, sizeof(guint32)); + if (total_freq != tmp_freq) + return false; + newchunk.get_content(0, &tmp_freq, sizeof(guint32)); + m_total_freq = tmp_freq; + break; + } default: assert(false); } } + return true; } bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){ diff --git a/src/storage/phrase_index_logger.h b/src/storage/phrase_index_logger.h index 3cff9b8..95f8e8b 100644 --- a/src/storage/phrase_index_logger.h +++ b/src/storage/phrase_index_logger.h @@ -31,6 +31,8 @@ * File Format * Logger Record type: add/remove/modify * + * Modify Header: header/null token/len/old data chunk/new data chunk + * * Add Record: add/token/len/data chunk * Remove Record: remove/token/len/data chunk * Modify Record: modify/token/old len/new len/old data chunk/new data chunk @@ -41,8 +43,9 @@ namespace pinyin{ enum LOG_TYPE{ LOG_ADD_RECORD = 1, - LOG_REMOVE_RECORD = 2, - LOG_MODIFY_RECORD = 3 + LOG_REMOVE_RECORD, + LOG_MODIFY_RECORD, + LOG_MODIFY_HEADER }; class PhraseIndexLogger{ @@ -127,6 +130,19 @@ public: offset += newlen; break; } + case LOG_MODIFY_HEADER:{ + assert(token == null_token); + size_t len = 0; + m_chunk->get_content(offset, &len, sizeof(size_t)); + offset += sizeof(size_t); + oldone->set_content(0, ((char *)m_chunk->begin()) + offset, + len); + offset += len; + newone->set_content(0, ((char *)m_chunk->begin()) + offset, + len); + offset += len; + break; + } default: assert(false); } @@ -178,9 +194,24 @@ public: chunk.set_content(offset, &newlen, sizeof(size_t)); offset += sizeof(size_t); chunk.set_content(offset, oldone->begin(), oldone->size()); - offset += oldone->size(); + offset += oldlen; chunk.set_content(offset, newone->begin(), newone->size()); - offset += newone->size(); + offset += newlen; + break; + } + case LOG_MODIFY_HEADER:{ + assert(NULL != oldone); + assert(NULL != newone); + assert(null_token == token); + size_t oldlen = oldone->size(); + size_t newlen = newone->size(); + assert(oldlen == newlen); + chunk.set_content(offset, &oldlen, sizeof(size_t)); + offset += sizeof(size_t); + chunk.set_content(offset, oldone->begin(), oldone->size()); + offset += oldlen; + chunk.set_content(offset, newone->begin(), newone->size()); + offset += newlen; break; } default: diff --git a/tests/storage/test_phrase_index_logger.cpp b/tests/storage/test_phrase_index_logger.cpp index 4248db4..965d2c6 100644 --- a/tests/storage/test_phrase_index_logger.cpp +++ b/tests/storage/test_phrase_index_logger.cpp @@ -22,7 +22,46 @@ #include "pinyin.h" +/* TODO: check whether gb_char.bin and gb_char2.bin should be the same. */ + int main(int argc, char * argv[]){ - + FacadePhraseIndex phrase_index; + MemoryChunk * chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + + PhraseIndexRange range; + phrase_index.get_range(1, range); + for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) { + phrase_index.add_unigram_frequency(i, 1); + } + + printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq()); + + MemoryChunk * new_chunk = new MemoryChunk; + phrase_index.store(1, new_chunk); + new_chunk->save("/tmp/gb_char.bin"); + delete new_chunk; + + chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + new_chunk = new MemoryChunk; + phrase_index.diff(1, chunk, new_chunk); + new_chunk->save("/tmp/gb_char.dbin"); + delete new_chunk; + + chunk = new MemoryChunk; + chunk->load("../../data/gb_char.bin"); + phrase_index.load(1, chunk); + new_chunk = new MemoryChunk; + new_chunk->load("/tmp/gb_char.dbin"); + phrase_index.merge(1, new_chunk); + chunk = new MemoryChunk; + phrase_index.store(1, chunk); + chunk->save("/tmp/gb_char2.bin"); + delete chunk; + + printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq()); + return 0; } -- cgit