summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-08-10 12:58:13 +0800
committerPeng Wu <alexepico@gmail.com>2011-08-10 12:58:13 +0800
commitec332e5bcac3af0520803813a48ae4ee93c5713c (patch)
tree068d85466e66028d97033e6bc343342e0731c249
parente4fa0bfd58813a248674bd41b5dcf61e9be3bdeb (diff)
downloadlibpinyin-ec332e5bcac3af0520803813a48ae4ee93c5713c.tar.gz
libpinyin-ec332e5bcac3af0520803813a48ae4ee93c5713c.tar.xz
libpinyin-ec332e5bcac3af0520803813a48ae4ee93c5713c.zip
write phrase index logger format in progress
-rw-r--r--src/storage/phrase_index.cpp73
-rw-r--r--src/storage/phrase_index.h19
-rw-r--r--src/storage/phrase_index_logger.h12
3 files changed, 93 insertions, 11 deletions
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp
index 59b166f..2be0b5c 100644
--- a/src/storage/phrase_index.cpp
+++ b/src/storage/phrase_index.cpp
@@ -296,6 +296,79 @@ bool SubPhraseIndex::store(MemoryChunk * new_chunk,
return true;
}
+/*
+ * Compare this sub phrase index against oldone and append one record to
+ * logger for every token whose phrase item differs between the two.
+ *
+ * The tokens visited are those in [min(begin), max(end)) of the two
+ * ranges reported by get_range().  For each token:
+ *   present in both, contents differ -> LOG_MODIFY_RECORD (old, new)
+ *   present only in oldone           -> LOG_REMOVE_RECORD (old, NULL)
+ *   present only in this index       -> LOG_ADD_RECORD    (NULL, new)
+ *   identical, or absent from both   -> nothing is logged
+ *
+ * Always returns true.
+ */
+bool SubPhraseIndex::diff(SubPhraseIndex * oldone, PhraseIndexLogger * logger){
+    PhraseIndexRange before, after, span;
+    oldone->get_range(before);
+    get_range(after);
+
+    /* cover every token either index may hold. */
+    span.m_range_begin = std_lite::min(before.m_range_begin,
+                                       after.m_range_begin);
+    span.m_range_end = std_lite::max(before.m_range_end,
+                                     after.m_range_end);
+
+    PhraseItem olditem, newitem;
+    phrase_token_t token = span.m_range_begin;
+
+    for ( ; token < span.m_range_end; ++token ){
+        const bool in_old =
+            ERROR_OK == oldone->get_phrase_item(token, olditem);
+        const bool in_new =
+            ERROR_OK == get_phrase_item(token, newitem);
+
+        /* both empty: do nothing. */
+        if ( !in_old && !in_new )
+            continue;
+
+        /* add phrase item. */
+        if ( !in_old ){
+            logger->append_record(LOG_ADD_RECORD, token,
+                                  NULL, &(newitem.m_chunk));
+            continue;
+        }
+
+        /* remove phrase item. */
+        if ( !in_new ){
+            logger->append_record(LOG_REMOVE_RECORD, token,
+                                  &(olditem.m_chunk), NULL);
+            continue;
+        }
+
+        /* compare phrase item: identical content needs no record. */
+        if ( olditem == newitem )
+            continue;
+
+        logger->append_record(LOG_MODIFY_RECORD, token,
+                              &(olditem.m_chunk), &(newitem.m_chunk));
+    }
+
+    return true;
+}
+
+/*
+ * Replay the records held by logger onto this sub phrase index.
+ *
+ * Records are read with has_next_record()/next_record() until the logger
+ * has none left:
+ *   LOG_ADD_RECORD    - the new content is added under the token.
+ *   LOG_REMOVE_RECORD - the token is removed; the removed content must
+ *                       equal the old content stored in the record.
+ *   LOG_MODIFY_RECORD - the content currently stored under the token must
+ *                       equal the old content stored in the record; it is
+ *                       then replaced by the new content.
+ *
+ * Returns true when every record was applied.  Returns false as soon as a
+ * record does not match the current state of the index or has an unknown
+ * type; records applied before that point are not rolled back.
+ */
+bool SubPhraseIndex::merge(PhraseIndexLogger * logger){
+    LOG_TYPE log_type; phrase_token_t token;
+    MemoryChunk oldchunk, newchunk;
+    PhraseItem olditem, newitem, curitem, * tmpitem;
+
+    while(logger->has_next_record()){
+        logger->next_record(log_type, token, &oldchunk, &newchunk);
+
+        switch(log_type){
+        case LOG_ADD_RECORD:{
+            assert( 0 == oldchunk.size() );
+            newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+                                      NULL);
+            add_phrase_item(token, &newitem);
+            break;
+        }
+        case LOG_REMOVE_RECORD:{
+            assert( 0 == newchunk.size() );
+            tmpitem = NULL;
+            remove_phrase_item(token, tmpitem);
+            /* nothing handed back: there was no item to remove. */
+            if ( NULL == tmpitem )
+                return false;
+
+            olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+                                      NULL);
+            const bool matched = (olditem == *tmpitem);
+            /* NOTE(review): assumes remove_phrase_item hands back a
+             * heap-allocated item owned by the caller - confirm. */
+            delete tmpitem;
+            if ( !matched )
+                return false;
+            break;
+        }
+        case LOG_MODIFY_RECORD:{
+            /* check the stored content before touching the index. */
+            if ( ERROR_OK != get_phrase_item(token, curitem) )
+                return false;
+            olditem.m_chunk.set_chunk(oldchunk.begin(), oldchunk.size(),
+                                      NULL);
+            if ( curitem != olditem )
+                return false;
+
+            /* replace: drop the current content, then add the new one. */
+            tmpitem = NULL;
+            remove_phrase_item(token, tmpitem);
+            /* NOTE(review): same ownership assumption as above. */
+            delete tmpitem;
+
+            newitem.m_chunk.set_chunk(newchunk.begin(), newchunk.size(),
+                                      NULL);
+            add_phrase_item(token, &newitem);
+            break;
+        }
+        default:
+            assert(false);
+            /* still fail when assert is compiled out. */
+            return false;
+        }
+    }
+
+    return true;
+}
+
bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
if ( !sub_phrases ){
diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h
index f2648e8..d853aee 100644
--- a/src/storage/phrase_index.h
+++ b/src/storage/phrase_index.h
@@ -67,10 +67,12 @@ public:
memset(m_chunk.begin(), 0, m_chunk.size());
}
- PhraseItem(MemoryChunk chunk){
- m_chunk = chunk;
- assert ( m_chunk.size() >= phrase_item_header);
+#if 0
+ PhraseItem(MemoryChunk & chunk){
+ m_chunk.set_content(0, chunk->begin(), chunk->size());
+ assert ( m_chunk.size() >= phrase_item_header);
}
+#endif
/* functions */
guint8 get_phrase_length(){
@@ -134,6 +136,17 @@ public:
*/
void append_pronunciation(PinyinKey * pinyin, guint32 freq);
void remove_nth_pronunciation(size_t index);
+
+    /* Two phrase items are equal when their chunks have the same size
+     * and the same bytes. */
+    bool operator == (PhraseItem & rhs){
+        return m_chunk.size() == rhs.m_chunk.size() &&
+            0 == memcmp(m_chunk.begin(), rhs.m_chunk.begin(),
+                        m_chunk.size());
+    }
+
+    /* Negation of operator ==. */
+    bool operator != (PhraseItem & rhs){
+        if ( *this == rhs )
+            return false;
+        return true;
+    }
};
/*
diff --git a/src/storage/phrase_index_logger.h b/src/storage/phrase_index_logger.h
index c961112..3cff9b8 100644
--- a/src/storage/phrase_index_logger.h
+++ b/src/storage/phrase_index_logger.h
@@ -87,8 +87,8 @@ public:
}
/* prolog: has_next_record() returned true. */
- bool next(LOG_TYPE & log_type, phrase_token_t & token,
- MemoryChunk * oldone, MemoryChunk * newone){
+ bool next_record(LOG_TYPE & log_type, phrase_token_t & token,
+ MemoryChunk * oldone, MemoryChunk * newone){
size_t offset = m_offset;
m_chunk->get_content(offset, &log_type, sizeof(LOG_TYPE));
offset += sizeof(LOG_TYPE);
@@ -97,8 +97,7 @@ public:
switch(log_type){
case LOG_ADD_RECORD:{
- assert( NULL == oldone);
- assert( NULL != newone);
+ oldone->set_size(0);
size_t len = 0;
m_chunk->get_content(offset, &len, sizeof(size_t));
offset += sizeof(size_t);
@@ -107,8 +106,7 @@ public:
break;
}
case LOG_REMOVE_RECORD:{
- assert( NULL != oldone);
- assert( NULL == newone);
+ newone->set_size(0);
size_t len = 0;
m_chunk->get_content(offset, &len, sizeof(size_t));
offset += sizeof(size_t);
@@ -117,8 +115,6 @@ public:
break;
}
case LOG_MODIFY_RECORD:{
- assert( NULL != oldone);
- assert( NULL != newone);
size_t oldlen = 0, newlen = 0;
m_chunk->get_content(offset, &oldlen, sizeof(size_t));
offset += sizeof(size_t);