From b78429d78df745dd327b6dada6b9bd71ea5df84e Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Mon, 22 Jul 2013 11:37:11 +0800 Subject: import libpinyin code --- src/storage/phrase_large_table2.cpp | 809 ++++++++++++++++++++++++++++++++++++ 1 file changed, 809 insertions(+) create mode 100644 src/storage/phrase_large_table2.cpp (limited to 'src/storage/phrase_large_table2.cpp') diff --git a/src/storage/phrase_large_table2.cpp b/src/storage/phrase_large_table2.cpp new file mode 100644 index 0000000..f7d8ae2 --- /dev/null +++ b/src/storage/phrase_large_table2.cpp @@ -0,0 +1,809 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include "phrase_large_table2.h" + + +/* class definition */ + +namespace pinyin{ + +class PhraseLengthIndexLevel2{ +protected: + GArray * m_phrase_array_indexes; +public: + PhraseLengthIndexLevel2(); + ~PhraseLengthIndexLevel2(); + + /* load/store method */ + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end); + + /* search method */ + int search(int phrase_length, /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const; + + /* add_index/remove_index method */ + int add_index(int phrase_length, /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token); + int remove_index(int phrase_length, /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token); + + /* get length method */ + int get_length() const; + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + + +template +struct PhraseIndexItem2{ + phrase_token_t m_token; + ucs4_t m_phrase[phrase_length]; +public: + PhraseIndexItem2(const ucs4_t phrase[], phrase_token_t token){ + memmove(m_phrase, phrase, sizeof(ucs4_t) * phrase_length); + m_token = token; + } +}; + + +template +class PhraseArrayIndexLevel2{ +protected: + typedef PhraseIndexItem2 IndexItem; + +protected: + MemoryChunk m_chunk; +public: + bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end); + bool store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end); + + /* search method */ + int search(/* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const; + + /* add_index/remove_index method */ + int add_index(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token); + int remove_index(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token); + + /* get length method */ + int get_length() const; + + /* mask out method */ + bool mask_out(phrase_token_t mask, phrase_token_t value); +}; + +}; + +using namespace pinyin; + +/* class implementation */ + +template +static int phrase_compare2(const PhraseIndexItem2 &lhs, + const PhraseIndexItem2 &rhs){ + ucs4_t * phrase_lhs = (ucs4_t *) lhs.m_phrase; + ucs4_t * phrase_rhs = (ucs4_t *) rhs.m_phrase; + + return memcmp(phrase_lhs, phrase_rhs, sizeof(ucs4_t) * phrase_length); +} + +template +static bool phrase_less_than2(const PhraseIndexItem2 & lhs, + const PhraseIndexItem2 & rhs){ + return 0 > phrase_compare2(lhs, rhs); +} + +PhraseBitmapIndexLevel2::PhraseBitmapIndexLevel2(){ + memset(m_phrase_length_indexes, 0, sizeof(m_phrase_length_indexes)); +} + +void PhraseBitmapIndexLevel2::reset(){ + for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; i++){ + PhraseLengthIndexLevel2 * & length_array = + m_phrase_length_indexes[i]; + if ( length_array ) + delete length_array; + length_array = NULL; + } +} + + +/* search method */ + +int PhraseBitmapIndexLevel2::search(int phrase_length, + /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + assert(phrase_length > 0); + + int result = SEARCH_NONE; + /* use the first 8-bit of the lower 16-bit for bitmap index, + * as most the higher 16-bit are zero. + */ + guint8 first_key = (phrase[0] & 0xFF00) >> 8; + + PhraseLengthIndexLevel2 * phrase_array = m_phrase_length_indexes[first_key]; + if ( phrase_array ) + return phrase_array->search(phrase_length, phrase, tokens); + return result; +} + +PhraseLengthIndexLevel2::PhraseLengthIndexLevel2(){ + m_phrase_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *)); +} + +PhraseLengthIndexLevel2::~PhraseLengthIndexLevel2(){ +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2 * & array = g_array_index \ + (m_phrase_array_indexes, \ + PhraseArrayIndexLevel2 *, len - 1); \ + if ( array ) { \ + delete array; \ + array = NULL; \ + } \ + break; \ + } + + for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i){ + switch (i){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } + } + g_array_free(m_phrase_array_indexes, TRUE); +#undef CASE +} + +int PhraseLengthIndexLevel2::search(int phrase_length, + /* in */ const ucs4_t phrase[], + /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + if(m_phrase_array_indexes->len < phrase_length) + return result; + if (m_phrase_array_indexes->len > phrase_length) + result |= SEARCH_CONTINUED; + +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2 * array = g_array_index \ + (m_phrase_array_indexes, PhraseArrayIndexLevel2 *, len - 1); \ + if ( !array ) \ + return result; \ + result |= array->search(phrase, tokens); \ + return result; \ + } + + switch ( phrase_length ){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } +#undef CASE +} + +template +int PhraseArrayIndexLevel2::search +(/* in */ const ucs4_t phrase[], /* out */ PhraseTokens tokens) const { + int result = SEARCH_NONE; + + IndexItem * chunk_begin = NULL, * chunk_end = NULL; + chunk_begin = (IndexItem *) m_chunk.begin(); + chunk_end = (IndexItem *) m_chunk.end(); + + /* do the search */ + IndexItem search_elem(phrase, -1); + std_lite::pair range; + range = std_lite::equal_range + (chunk_begin, chunk_end, search_elem, + phrase_less_than2); + + const IndexItem * const begin = range.first; + const IndexItem * const end = range.second; + if (begin == end) + return result; + + const IndexItem * iter = NULL; + GArray * array = NULL; + + for (iter = begin; iter != end; ++iter) { + phrase_token_t token = iter->m_token; + + /* filter out disabled sub phrase indices. */ + array = tokens[PHRASE_INDEX_LIBRARY_INDEX(token)]; + if (NULL == array) + continue; + + result |= SEARCH_OK; + + g_array_append_val(array, token); + } + + return result; +} + + +/* add/remove index method */ + +int PhraseBitmapIndexLevel2::add_index(int phrase_length, + /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token){ + guint8 first_key = (phrase[0] & 0xFF00) >> 8; + + PhraseLengthIndexLevel2 * & length_array = + m_phrase_length_indexes[first_key]; + + if ( !length_array ){ + length_array = new PhraseLengthIndexLevel2(); + } + return length_array->add_index(phrase_length, phrase, token); +} + +int PhraseBitmapIndexLevel2::remove_index(int phrase_length, + /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token){ + guint8 first_key = (phrase[0] & 0xFF00) >> 8; + + PhraseLengthIndexLevel2 * & length_array = + m_phrase_length_indexes[first_key]; + + if (NULL == length_array) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + + int retval = length_array->remove_index(phrase_length, phrase, token); + + /* remove empty array. */ + if (0 == length_array->get_length()) { + delete length_array; + length_array = NULL; + } + + return retval; +} + +int PhraseLengthIndexLevel2::add_index(int phrase_length, + /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token) { + if (phrase_length >= MAX_PHRASE_LENGTH) + return ERROR_PHRASE_TOO_LONG; + + if (m_phrase_array_indexes->len < phrase_length) + g_array_set_size(m_phrase_array_indexes, phrase_length); + +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2 * & array = g_array_index \ + (m_phrase_array_indexes, PhraseArrayIndexLevel2 *, len - 1); \ + if ( !array ) \ + array = new PhraseArrayIndexLevel2; \ + return array->add_index(phrase, token); \ + } + + switch(phrase_length){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } + +#undef CASE +} + +int PhraseLengthIndexLevel2::remove_index(int phrase_length, + /* in */ const ucs4_t phrase[], + /* in */ phrase_token_t token) { + if (phrase_length >= MAX_PHRASE_LENGTH) + return ERROR_PHRASE_TOO_LONG; + + if (m_phrase_array_indexes->len < phrase_length) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2 * & array = g_array_index \ + (m_phrase_array_indexes, \ + PhraseArrayIndexLevel2 *, len - 1); \ + if (NULL == array) \ + return ERROR_REMOVE_ITEM_DONOT_EXISTS; \ + int retval = array->remove_index(phrase, token); \ + \ + /* remove empty array. */ \ + if (0 == array->get_length()) { \ + delete array; \ + array = NULL; \ + \ + /* shrink self array. */ \ + g_array_set_size(m_phrase_array_indexes, \ + get_length()); \ + } \ + return retval; \ + } + + switch(phrase_length){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } +#undef CASE +} + +template +int PhraseArrayIndexLevel2::add_index +(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token){ + IndexItem * begin, * end; + + IndexItem add_elem(phrase, token); + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + std_lite::pair range; + range = std_lite::equal_range + (begin, end, add_elem, phrase_less_than2); + + IndexItem * cur_elem; + for (cur_elem = range.first; + cur_elem != range.second; ++cur_elem) { + if (cur_elem->m_token == token) + return ERROR_INSERT_ITEM_EXISTS; + if (cur_elem->m_token > token) + break; + } + + int offset = (cur_elem - begin) * sizeof(IndexItem); + m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem)); + return ERROR_OK; +} + +template +int PhraseArrayIndexLevel2::remove_index +(/* in */ const ucs4_t phrase[], /* in */ phrase_token_t token) { + IndexItem * begin, * end; + + IndexItem remove_elem(phrase, token); + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + std_lite::pair range; + range = std_lite::equal_range + (begin, end, remove_elem, phrase_less_than2); + + IndexItem * cur_elem; + for (cur_elem = range.first; + cur_elem != range.second; ++cur_elem) { + if (cur_elem->m_token == token) + break; + } + + if (cur_elem == range.second) + return ERROR_REMOVE_ITEM_DONOT_EXISTS; + + int offset = (cur_elem - begin) * sizeof(IndexItem); + m_chunk.remove_content(offset, sizeof(IndexItem)); + return ERROR_OK; +} + + +/* load text method */ + +bool PhraseLargeTable2::load_text(FILE * infile){ + char pinyin[256]; + char phrase[256]; + phrase_token_t token; + size_t freq; + + while (!feof(infile)) { + int num = fscanf(infile, "%s %s %u %ld", + pinyin, phrase, &token, &freq); + + if (4 != num) + continue; + + if (feof(infile)) + break; + + glong phrase_len = g_utf8_strlen(phrase, -1); + ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); + add_index(phrase_len, new_phrase, token); + + g_free(new_phrase); + } + return true; +} + + +/* load/store method */ + +bool PhraseBitmapIndexLevel2::load(MemoryChunk * chunk, + table_offset_t offset, + table_offset_t end){ + reset(); + char * buf_begin = (char *) chunk->begin(); + table_offset_t phrase_begin, phrase_end; + table_offset_t * index = (table_offset_t *) (buf_begin + offset); + phrase_end = *index; + + for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) { + phrase_begin = phrase_end; + index++; + phrase_end = *index; + if ( phrase_begin == phrase_end ) //null pointer + continue; + + /* after reset() all phrases are null pointer. */ + PhraseLengthIndexLevel2 * phrases = new PhraseLengthIndexLevel2; + m_phrase_length_indexes[i] = phrases; + + phrases->load(chunk, phrase_begin, phrase_end - 1); + assert( phrase_end <= end ); + assert( *(buf_begin + phrase_end - 1) == c_separate); + } + offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t); + assert( c_separate == *(buf_begin + offset) ); + return true; +} + +bool PhraseBitmapIndexLevel2::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & end){ + table_offset_t phrase_end; + table_offset_t index = offset; + offset += (PHRASE_NUMBER_OF_BITMAP_INDEX + 1) * sizeof(table_offset_t); + //add '#' + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset +=sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + for ( size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) { + PhraseLengthIndexLevel2 * phrases = m_phrase_length_indexes[i]; + if ( !phrases ) { //null pointer + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + continue; + } + phrases->store(new_chunk, offset, phrase_end); //has a end '#' + offset = phrase_end; + //add '#' + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + } + end = offset; + return true; +} + +bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk, + table_offset_t offset, + table_offset_t end) { + char * buf_begin = (char *) chunk->begin(); + guint32 nindex = *((guint32 *)(buf_begin + offset)); + table_offset_t * index = (table_offset_t *) + (buf_begin + offset + sizeof(guint32)); + + table_offset_t phrase_begin, phrase_end = *index; + g_array_set_size(m_phrase_array_indexes, 0); + for (size_t i = 1; i <= nindex; ++i) { + phrase_begin = phrase_end; + index++; + phrase_end = *index; + if ( phrase_begin == phrase_end ){ + void * null = NULL; + g_array_append_val(m_phrase_array_indexes, null); + continue; + } + +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2 * phrase = \ + new PhraseArrayIndexLevel2; \ + phrase->load(chunk, phrase_begin, phrase_end - 1); \ + assert( *(buf_begin + phrase_end - 1) == c_separate ); \ + assert( phrase_end <= end ); \ + g_array_append_val(m_phrase_array_indexes, phrase); \ + break; \ + } + switch ( i ){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } +#undef CASE + } + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + assert ( c_separate == * (buf_begin + offset) ); + return true; +} + +bool PhraseLengthIndexLevel2::store(MemoryChunk * new_chunk, + table_offset_t offset, + table_offset_t & end) { + guint32 nindex = m_phrase_array_indexes->len; + new_chunk->set_content(offset, &nindex, sizeof(guint32)); + table_offset_t index = offset + sizeof(guint32); + + offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t); + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + + table_offset_t phrase_end; + for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) { +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2 * phrase = g_array_index \ + (m_phrase_array_indexes, PhraseArrayIndexLevel2 *, len - 1); \ + if ( !phrase ){ \ + new_chunk->set_content \ + (index, &offset, sizeof(table_offset_t)); \ + index += sizeof(table_offset_t); \ + continue; \ + } \ + phrase->store(new_chunk, offset, phrase_end); \ + offset = phrase_end; \ + break; \ + } + switch ( i ){ + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } + //add '#' + new_chunk->set_content(offset, &c_separate, sizeof(char)); + offset += sizeof(char); + new_chunk->set_content(index, &offset, sizeof(table_offset_t)); + index += sizeof(table_offset_t); + +#undef CASE + } + end = offset; + return true; +} + +template +bool PhraseArrayIndexLevel2:: +load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end){ + char * buf_begin = (char *) chunk->begin(); + m_chunk.set_chunk(buf_begin + offset, end - offset, NULL); + return true; +} + +template +bool PhraseArrayIndexLevel2:: +store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) { + new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size()); + end = offset + m_chunk.size(); + return true; +} + + +/* get length method */ + +int PhraseLengthIndexLevel2::get_length() const { + int length = m_phrase_array_indexes->len; + + /* trim trailing zero. */ + for (int i = length - 1; i >= 0; --i) { + void * array = g_array_index(m_phrase_array_indexes, void *, i); + + if (NULL != array) + break; + + --length; + } + + return length; +} + +template +int PhraseArrayIndexLevel2::get_length() const { + IndexItem * chunk_begin = NULL, * chunk_end = NULL; + chunk_begin = (IndexItem *) m_chunk.begin(); + chunk_end = (IndexItem *) m_chunk.end(); + + return chunk_end - chunk_begin; +} + + +/* mask out method */ + +bool PhraseBitmapIndexLevel2::mask_out(phrase_token_t mask, + phrase_token_t value){ + for (size_t i = 0; i < PHRASE_NUMBER_OF_BITMAP_INDEX; ++i) { + PhraseLengthIndexLevel2 * & length_array = + m_phrase_length_indexes[i]; + + if (NULL == length_array) + continue; + + length_array->mask_out(mask, value); + + if (0 == length_array->get_length()) { + delete length_array; + length_array = NULL; + } + } + + return true; +} + +bool PhraseLengthIndexLevel2::mask_out(phrase_token_t mask, + phrase_token_t value){ +#define CASE(len) case len: \ + { \ + PhraseArrayIndexLevel2 * & array = g_array_index \ + (m_phrase_array_indexes, \ + PhraseArrayIndexLevel2 *, len - 1); \ + \ + if (NULL == array) \ + continue; \ + \ + array->mask_out(mask, value); \ + \ + if (0 == array->get_length()) { \ + delete array; \ + array = NULL; \ + } \ + break; \ + } + + for (size_t i = 1; i <= m_phrase_array_indexes->len; ++i) { + switch (i) { + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + default: + assert(false); + } + } + /* shrink self array. */ + g_array_set_size(m_phrase_array_indexes, get_length()); +#undef CASE + return true; +} + +template +bool PhraseArrayIndexLevel2::mask_out +(phrase_token_t mask, phrase_token_t value) { + IndexItem * begin = NULL, * end = NULL; + begin = (IndexItem *) m_chunk.begin(); + end = (IndexItem *) m_chunk.end(); + + for (IndexItem * cur = begin; cur != end; ++cur) { + if ((cur->m_token & mask) != value) + continue; + + int offset = (cur - begin) * sizeof(IndexItem); + m_chunk.remove_content(offset, sizeof(IndexItem)); + + /* update chunk end. */ + end = (IndexItem *) m_chunk.end(); + --cur; + } + + return true; +} -- cgit