summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-08-22 18:23:12 +0800
committer Peng Wu <alexepico@gmail.com> 2011-08-22 18:37:35 +0800
commit 806ee677ed908de317f0bbf377279d2083dce731 (patch)
tree 8ff0e588fb086cc29d7adf1fa2e38e861055ca5e
parent fe1980851c95afa18300c5cf9d8bbda842b784d2 (diff)
download libpinyin-806ee677ed908de317f0bbf377279d2083dce731.tar.gz
libpinyin-806ee677ed908de317f0bbf377279d2083dce731.tar.xz
libpinyin-806ee677ed908de317f0bbf377279d2083dce731.zip
write test case for phrase index logger
-rw-r--r-- src/storage/phrase_index.cpp 32
-rw-r--r-- src/storage/phrase_index_logger.h 39
-rw-r--r-- tests/storage/test_phrase_index_logger.cpp 41
3 files changed, 105 insertions, 7 deletions
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp
index 5517169..b433904 100644
--- a/src/storage/phrase_index.cpp
+++ b/src/storage/phrase_index.cpp
@@ -185,7 +185,8 @@ bool FacadePhraseIndex::load(guint8 phrase_index, MemoryChunk * chunk){
if ( !sub_phrases ){
sub_phrases = new SubPhraseIndex;
}
-
+
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
bool retval = sub_phrases->load(chunk, 0, chunk->size());
if ( !retval )
return retval;
@@ -233,10 +234,14 @@ bool FacadePhraseIndex::merge(guint8 phrase_index, MemoryChunk * log){
if ( !sub_phrases )
return false;
+ m_total_freq -= sub_phrases->get_phrase_index_total_freq();
PhraseIndexLogger logger;
logger.load(log);
- return sub_phrases->merge(&logger);
+ bool retval = sub_phrases->merge(&logger);
+ m_total_freq += sub_phrases->get_phrase_index_total_freq();
+
+ return retval;
}
bool SubPhraseIndex::load(MemoryChunk * chunk,
@@ -297,6 +302,16 @@ bool SubPhraseIndex::store(MemoryChunk * new_chunk,
}
bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
+ /* diff the header */
+ MemoryChunk oldheader, newheader;
+ guint32 total_freq = oldone->get_phrase_index_total_freq();
+ oldheader.set_content(0, &total_freq, sizeof(guint32));
+ total_freq = get_phrase_index_total_freq();
+ newheader.set_content(0, &total_freq, sizeof(guint32));
+ logger->append_record(LOG_MODIFY_HEADER, null_token,
+ &oldheader, &newheader);
+
+ /* diff phrase items */
PhraseIndexRange oldrange, currange, range;
oldone->get_range(oldrange); get_range(currange);
range.m_range_begin = std_lite::min(oldrange.m_range_begin,
@@ -388,10 +403,23 @@ bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
}
break;
}
+ case LOG_MODIFY_HEADER:{
+ guint32 total_freq = get_phrase_index_total_freq();
+ guint32 tmp_freq = 0;
+ assert(null_token == token);
+ assert(oldchunk.size() == newchunk.size());
+ oldchunk.get_content(0, &tmp_freq, sizeof(guint32));
+ if (total_freq != tmp_freq)
+ return false;
+ newchunk.get_content(0, &tmp_freq, sizeof(guint32));
+ m_total_freq = tmp_freq;
+ break;
+ }
default:
assert(false);
}
}
+ return true;
}
bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
diff --git a/src/storage/phrase_index_logger.h b/src/storage/phrase_index_logger.h
index 3cff9b8..95f8e8b 100644
--- a/src/storage/phrase_index_logger.h
+++ b/src/storage/phrase_index_logger.h
@@ -31,6 +31,8 @@
* File Format
* Logger Record type: add/remove/modify
*
+ * Modify Header: header/null token/len/old data chunk/new data chunk
+ *
* Add Record: add/token/len/data chunk
* Remove Record: remove/token/len/data chunk
* Modify Record: modify/token/old len/new len/old data chunk/new data chunk
@@ -41,8 +43,9 @@ namespace pinyin{
enum LOG_TYPE{
LOG_ADD_RECORD = 1,
- LOG_REMOVE_RECORD = 2,
- LOG_MODIFY_RECORD = 3
+ LOG_REMOVE_RECORD,
+ LOG_MODIFY_RECORD,
+ LOG_MODIFY_HEADER
};
class PhraseIndexLogger{
@@ -127,6 +130,19 @@ public:
offset += newlen;
break;
}
+ case LOG_MODIFY_HEADER:{
+ assert(token == null_token);
+ size_t len = 0;
+ m_chunk->get_content(offset, &len, sizeof(size_t));
+ offset += sizeof(size_t);
+ oldone->set_content(0, ((char *)m_chunk->begin()) + offset,
+ len);
+ offset += len;
+ newone->set_content(0, ((char *)m_chunk->begin()) + offset,
+ len);
+ offset += len;
+ break;
+ }
default:
assert(false);
}
@@ -178,9 +194,24 @@ public:
chunk.set_content(offset, &newlen, sizeof(size_t));
offset += sizeof(size_t);
chunk.set_content(offset, oldone->begin(), oldone->size());
- offset += oldone->size();
+ offset += oldlen;
chunk.set_content(offset, newone->begin(), newone->size());
- offset += newone->size();
+ offset += newlen;
+ break;
+ }
+ case LOG_MODIFY_HEADER:{
+ assert(NULL != oldone);
+ assert(NULL != newone);
+ assert(null_token == token);
+ size_t oldlen = oldone->size();
+ size_t newlen = newone->size();
+ assert(oldlen == newlen);
+ chunk.set_content(offset, &oldlen, sizeof(size_t));
+ offset += sizeof(size_t);
+ chunk.set_content(offset, oldone->begin(), oldone->size());
+ offset += oldlen;
+ chunk.set_content(offset, newone->begin(), newone->size());
+ offset += newlen;
break;
}
default:
diff --git a/tests/storage/test_phrase_index_logger.cpp b/tests/storage/test_phrase_index_logger.cpp
index 4248db4..965d2c6 100644
--- a/tests/storage/test_phrase_index_logger.cpp
+++ b/tests/storage/test_phrase_index_logger.cpp
@@ -22,7 +22,46 @@
#include "pinyin.h"
+/* TODO: check whether gb_char.bin and gb_char2.bin should be the same. */
+
int main(int argc, char * argv[]){
-
+ FacadePhraseIndex phrase_index;
+ MemoryChunk * chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ phrase_index.load(1, chunk);
+
+ PhraseIndexRange range;
+ phrase_index.get_range(1, range);
+ for (size_t i = range.m_range_begin; i < range.m_range_end; ++i ) {
+ phrase_index.add_unigram_frequency(i, 1);
+ }
+
+ printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq());
+
+ MemoryChunk * new_chunk = new MemoryChunk;
+ phrase_index.store(1, new_chunk);
+ new_chunk->save("/tmp/gb_char.bin");
+ delete new_chunk;
+
+ chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ new_chunk = new MemoryChunk;
+ phrase_index.diff(1, chunk, new_chunk);
+ new_chunk->save("/tmp/gb_char.dbin");
+ delete new_chunk;
+
+ chunk = new MemoryChunk;
+ chunk->load("../../data/gb_char.bin");
+ phrase_index.load(1, chunk);
+ new_chunk = new MemoryChunk;
+ new_chunk->load("/tmp/gb_char.dbin");
+ phrase_index.merge(1, new_chunk);
+ chunk = new MemoryChunk;
+ phrase_index.store(1, chunk);
+ chunk->save("/tmp/gb_char2.bin");
+ delete chunk;
+
+ printf("total freq:%d\n", phrase_index.get_phrase_index_total_freq());
+
return 0;
}