From f41d1fdf83408e042ab07925710a8913bad0c27c Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 3 Aug 2010 10:42:47 +0800 Subject: import from pinyin. --- src/include/Makefile.am | 22 ++++ src/include/memory_chunk.h | 264 +++++++++++++++++++++++++++++++++++++++++ src/include/novel_types.h | 117 +++++++++++++++++++ src/include/stl_lite.h | 285 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 688 insertions(+) create mode 100644 src/include/Makefile.am create mode 100755 src/include/memory_chunk.h create mode 100755 src/include/novel_types.h create mode 100644 src/include/stl_lite.h (limited to 'src/include') diff --git a/src/include/Makefile.am b/src/include/Makefile.am new file mode 100644 index 0000000..bb605ee --- /dev/null +++ b/src/include/Makefile.am @@ -0,0 +1,22 @@ +## Makefile.am -- Process this file with automake to produce Makefile.in +## Copyright (C) 2007 Peng Wu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +MAINTAINERCLEANFILES = Makefile.in + +noinst_HEADERS = memory_chunk.h \ + novel_types.h \ + stl_lite.h diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h new file mode 100755 index 0000000..3571256 --- /dev/null +++ b/src/include/memory_chunk.h @@ -0,0 +1,264 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef MEMORY_CHUNK_H +#define MEMORY_CHUNK_H + +#include +#include +#include +#include + +#include +#include "stl_lite.h" + +/* for unmanaged mode + * m_free_func == free , when memory is allocated by malloc + * m_free_func == NULL, + * when memory is in small protion of allocated area + * m_free_func == other, + * malloc then free. + */ + +class MemoryChunk{ + typedef void (* free_func_t)(void *); +private: + char * m_data_begin; + char * m_data_end; //one data pass the end. + char * m_allocated; //one data pass the end. + free_func_t m_free_func; + +private: + void reset(){ + if ( m_free_func ) + (*m_free_func)(m_data_begin); + m_data_begin = NULL; + m_data_end = NULL; + m_allocated = NULL; + m_free_func = NULL; + } + + void ensure_has_space(size_t new_size){ + int delta_size = m_data_begin + new_size - m_data_end; + if ( delta_size <= 0 ) return; + ensure_has_more_space ( delta_size ); + } + + /* enlarge function */ + void ensure_has_more_space(size_t extra_size){ + if ( 0 == extra_size ) return; + size_t newsize; + size_t cursize = size(); + if ( m_free_func != free ) { + /* copy on resize */ + newsize = cursize + extra_size; + /* do the copy */ + char * tmp = (char *) malloc(newsize); + assert(tmp); + memset(tmp, 0, newsize); + memmove(tmp, m_data_begin, cursize); + /* free the origin memory */ + if ( m_free_func){ + (*m_free_func)(m_data_begin); + } + + /* change varibles */ + m_data_begin = tmp; + m_data_end = m_data_begin + cursize; + m_allocated = m_data_begin + newsize; + m_free_func = free; + return; + } + /* the memory area is managed by this memory chunk */ + if ( extra_size <= (size_t) (m_allocated - m_data_end)) + return; + newsize = std_lite::max( capacity()<<1, cursize + extra_size); + m_data_begin = (char *) realloc(m_data_begin, newsize); + assert(m_data_begin); + memset(m_data_begin + cursize, 0, newsize - cursize); + m_data_end = m_data_begin + cursize; + m_allocated = m_data_begin + newsize; + return; + } + +public: + /* constructors */ + MemoryChunk(){ + m_data_begin = NULL; + m_data_end = NULL; + m_allocated = NULL; + m_free_func = NULL; + } + + /* destructors */ + ~MemoryChunk(){ + reset(); + } + + /* read access method */ + void* begin() const{ + return m_data_begin; + } + + void* end() const{ + return m_data_end; + } + + size_t size(){ + return m_data_end - m_data_begin; + } + + void set_size(size_t newsize){ + ensure_has_space(newsize); + m_data_end = m_data_begin + newsize; + } + + size_t capacity(){ + return m_allocated - m_data_begin; + } + + /* + * Transfer management of a memory chunk allocated by other part system + * to the memory chunk. + */ + void set_chunk(void* begin, size_t length, free_func_t free_func){ + if ( m_free_func ) + m_free_func( m_data_begin ); + + m_data_begin = (char *) begin; + m_data_end = (char *) m_data_begin + length; + m_allocated = (char *) m_data_begin + length; + m_free_func = free_func; + } + + /* subchunk + * use set_buffer internally. + * new chunk need to be deleted. + */ + MemoryChunk * get_sub_chunk(size_t offset, size_t length){ + MemoryChunk * retval = new MemoryChunk(); + char * begin_pos = m_data_begin + offset; + retval->set_chunk(begin_pos, length, NULL); + return retval; + } + /* write function + * Data are written directly to the memory area. + */ + bool set_content(size_t offset, const void * data, size_t len){ + size_t cursize = std_lite::max(size(), offset + len); + ensure_has_space(offset + len); + memmove(m_data_begin + offset, data, len); + m_data_end = m_data_begin + cursize; + return true; + } + /* insert function + * Data are written to the memory area, + * the original content are moved towards the rear. + * parameter offset start from zero. + */ + bool insert_content(size_t offset, const void * data, size_t length){ + ensure_has_more_space(length); + size_t move_size = size() - offset; + memmove(m_data_begin + offset + length, m_data_begin + offset, move_size); + memmove(m_data_begin + offset, data, length); + m_data_end += length; + return true; + } + /* remove function + * Data are removed directly, + * the following content are moved towards the front. + */ + bool remove_content(size_t offset, size_t length){ + size_t move_size = size() - offset - length; + memmove(m_data_begin + offset, m_data_begin + offset + length, move_size); + m_data_end -= length; + return true; + } + + /* get_content function + * Get the binary data + */ + bool get_content(size_t offset, void * buffer, size_t length){ + if ( size() < offset + length ) + return false; + memcpy( buffer, m_data_begin + offset, length); + return true; + } + + /* compact memory, reduce the size */ + void compact_memory(){ + if ( m_free_func != free ) + return; + size_t newsize = size(); + m_data_begin = (char *) realloc(m_data_begin, newsize); + m_allocated = m_data_begin + newsize; + } + + /* file storage functions */ + bool load(const char * filename){ + /* free old data */ + reset(); + + struct stat stat_buf; + + int retval = stat(filename, &stat_buf); + + if ( retval ) + return false; + + FILE* file = fopen(filename, "r"); + if ( !file ) + return false; + int data_len = stat_buf.st_size; + void* data = malloc(data_len); + if ( !data ){ + fclose(file); + return false; + } + + data_len = fread(data, 1, data_len, file); + set_chunk(data, data_len, free); + //Fixes memory chunk end. + if ( stat_buf.st_size > data_len ) + m_allocated = (char *) m_data_begin + stat_buf.st_size; + fclose(file); + return true; + } + + bool save(const char * filename){ + FILE* file = fopen(filename, "w"); + if ( !file ) + return false; + + size_t data_len = fwrite(begin(), 1, size(), file); + if ( data_len != size()){ + fclose(file); + return false; + } + + fsync(fileno(file)); + fclose(file); + return true; + } +}; + +#endif diff --git a/src/include/novel_types.h b/src/include/novel_types.h new file mode 100755 index 0000000..a992e8e --- /dev/null +++ b/src/include/novel_types.h @@ -0,0 +1,117 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef NOVEL_TYPES_H +#define NOVEL_TYPES_H + +#include +#include + +typedef guint32 phrase_token_t; +typedef gunichar2 utf16_t; + +/* + * Phrase Index Library Definition + * Reserve 4-bits for future usage. + */ + +#define PHRASE_MASK 0x00FFFFFF +#define PHRASE_INDEX_LIBRARY_MASK 0x0F000000 +#define PHRASE_INDEX_LIBRARY_COUNT (1<<4) +#define PHRASE_INDEX_LIBRARY_INDEX(token) ((token&PHRASE_INDEX_LIBRARY_MASK)>>24) +#define PHRASE_INDEX_MAKE_TOKEN(phrase_index, token) \ + ( ( (phrase_index<<24) & PHRASE_INDEX_LIBRARY_MASK)|(token & PHRASE_MASK)) + + +/* + * PhraseIndexRanges definitions + */ + +struct PhraseIndexRange{ + phrase_token_t m_range_begin; + phrase_token_t m_range_end; /* pass the last item like stl */ +}; + +/*Array of PhraseIndexRange*/ +typedef GArray * PhraseIndexRanges[PHRASE_INDEX_LIBRARY_COUNT]; + +/* + * PinYin Table Definition + */ +class MemoryChunk; + + +/* For both PinYin Table and Phrase Table */ +enum SearchResult{ + SEARCH_NONE = 0x00, /* found nothing */ + SEARCH_OK = 0x01 , /* found items */ + SEARCH_CONTINUED = 0x02 /* has longer word in the storage to search */ +}; + +enum AddIndexResult{ + INSERT_OK = 0 , /* insert ok */ + INSERT_ITEM_EXISTS /* item already exists */ +}; + +enum RemoveIndexResult{ + REMOVE_OK = 0, /* remove ok */ + REMOVE_ITEM_DONOT_EXISTS /* item don't exists */ +}; +/* + * n-gram Definition + * no B parameter(there are duplicated items in uni-gram and bi-gram) + * used in system n-gram and user n-gram. + * using delta technique. + */ + +struct BigramPhraseItem{ + phrase_token_t m_token; + gfloat m_freq; /* P(W2|W1) */ +}; + +typedef GArray * BigramPhraseArray; /* Array of HighLevelPhraseItem */ + +/* + * n-gram Definition + * n-gram library + */ + +enum AttachOption{ + ATTACH_NEW_FILE = 1, + ATTACH_READ = 2, + ATTACH_READ_WRITE = 3 +}; + +#define MAX_PHRASE_LENGTH 16 + +const phrase_token_t sentence_start = 1; +const phrase_token_t token_min = 0; +const phrase_token_t token_max = UINT_MAX; + +const char c_separate = '#'; +typedef guint32 table_offset_t; + +typedef double parameter_t; + +#define LAMBDA_PARAMETER 0.588792 + +#endif diff --git a/src/include/stl_lite.h b/src/include/stl_lite.h new file mode 100644 index 0000000..0612782 --- /dev/null +++ b/src/include/stl_lite.h @@ -0,0 +1,285 @@ +#ifndef STL_LITE_H +#define STL_LITE_H + +#include +#include +#include + +namespace std_lite{ + + /** + * @brief This does what you think it does. + * @param a A thing of arbitrary type. + * @param b Another thing of arbitrary type. + * @return The lesser of the parameters. + * + * This is the simple classic generic implementation. It will work on + * temporary expressions, since they are only evaluated once, unlike a + * preprocessor macro. + */ + template + inline const _Tp& + min(const _Tp& __a, const _Tp& __b) + { + //return __b < __a ? __b : __a; + if (__b < __a) + return __b; + return __a; + } + + + /** + * @brief This does what you think it does. + * @param a A thing of arbitrary type. + * @param b Another thing of arbitrary type. + * @return The greater of the parameters. + * + * This is the simple classic generic implementation. It will work on + * temporary expressions, since they are only evaluated once, unlike a + * preprocessor macro. + */ + template + inline const _Tp& + max(const _Tp& __a, const _Tp& __b) + { + //return __a < __b ? __b : __a; + if (__a < __b) + return __b; + return __a; + } + + /** + * This is one of the @link s20_3_1_base functor base classes@endlink. + */ + template + struct binary_function + { + typedef _Arg1 first_argument_type; ///< the type of the first argument + /// (no surprises here) + + typedef _Arg2 second_argument_type; ///< the type of the second argument + typedef _Result result_type; ///< type of the return type + }; + /** @} */ + + /// pair holds two objects of arbitrary type. + template + struct pair + { + typedef _T1 first_type; ///< @c first_type is the first bound type + typedef _T2 second_type; ///< @c second_type is the second bound type + + _T1 first; ///< @c first is a copy of the first object + _T2 second; ///< @c second is a copy of the second object + + // _GLIBCXX_RESOLVE_LIB_DEFECTS + // 265. std::pair::pair() effects overly restrictive + /** The default constructor creates @c first and @c second using their + * respective default constructors. */ + pair() + : first(), second() { } + + /** Two objects may be passed to a @c pair constructor to be copied. */ + pair(const _T1& __a, const _T2& __b) + : first(__a), second(__b) { } + + /** There is also a templated copy ctor for the @c pair class itself. */ + template + pair(const pair<_U1, _U2>& __p) + : first(__p.first), second(__p.second) { } + }; + + /// Two pairs of the same type are equal iff their members are equal. + template + inline bool + operator==(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) + { return __x.first == __y.first && __x.second == __y.second; } + + /// + template + inline bool + operator<(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) + { return __x.first < __y.first + || (!(__y.first < __x.first) && __x.second < __y.second); } + + /// Uses @c operator== to find the result. + template + inline bool + operator!=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) + { return !(__x == __y); } + + /// Uses @c operator< to find the result. + template + inline bool + operator>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) + { return __y < __x; } + + /// Uses @c operator< to find the result. + template + inline bool + operator<=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) + { return !(__y < __x); } + + /// Uses @c operator< to find the result. + template + inline bool + operator>=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) + { return !(__x < __y); } + + /** + * @brief A convenience wrapper for creating a pair from two objects. + * @param x The first object. + * @param y The second object. + * @return A newly-constructed pair<> object of the appropriate type. + * + * The standard requires that the objects be passed by reference-to-const, + * but LWG issue #181 says they should be passed by const value. We follow + * the LWG by default. + */ + // _GLIBCXX_RESOLVE_LIB_DEFECTS + // 181. make_pair() unintended behavior + template + inline pair<_T1, _T2> + make_pair(_T1 __x, _T2 __y) + { return pair<_T1, _T2>(__x, __y); } + + /** + * @brief Finds the first position in which @a val could be inserted + * without changing the ordering. + * @param first An iterator. + * @param last Another iterator. + * @param val The search term. + * @param comp A functor to use for comparisons. + * @return An iterator pointing to the first element "not less than" @a val, + * or end() if every element is less than @a val. + * @ingroup binarysearch + * + * The comparison function should have the same effects on ordering as + * the function used for the initial sort. + */ + template + _ForwardIterator + lower_bound(_ForwardIterator __first, _ForwardIterator __last, + const _Tp& __val, _Compare __comp) + { + typedef size_t _DistanceType; + + _DistanceType __len = __last - __first; + _DistanceType __half; + _ForwardIterator __middle; + + while (__len > 0) + { + __half = __len >> 1; + __middle = __first; + __middle += __half; + if (__comp(*__middle, __val)) + { + __first = __middle; + ++__first; + __len = __len - __half - 1; + } + else + __len = __half; + } + return __first; + } + + /** + * @brief Finds the last position in which @a val could be inserted + * without changing the ordering. + * @param first An iterator. + * @param last Another iterator. + * @param val The search term. + * @param comp A functor to use for comparisons. + * @return An iterator pointing to the first element greater than @a val, + * or end() if no elements are greater than @a val. + * @ingroup binarysearch + * + * The comparison function should have the same effects on ordering as + * the function used for the initial sort. + */ + template + _ForwardIterator + upper_bound(_ForwardIterator __first, _ForwardIterator __last, + const _Tp& __val, _Compare __comp) + { + typedef size_t _DistanceType; + _DistanceType __len = __last - __first; + _DistanceType __half; + _ForwardIterator __middle; + + while (__len > 0) + { + __half = __len >> 1; + __middle = __first; + __middle += __half; + if (__comp(__val, *__middle)) + __len = __half; + else + { + __first = __middle; + ++__first; + __len = __len - __half - 1; + } + } + return __first; + } + + /** + * @brief Finds the largest subrange in which @a val could be inserted + * at any place in it without changing the ordering. + * @param first An iterator. + * @param last Another iterator. + * @param val The search term. + * @param comp A functor to use for comparisons. + * @return An pair of iterators defining the subrange. + * @ingroup binarysearch + * + * This is equivalent to + * @code + * std::make_pair(lower_bound(first, last, val, comp), + * upper_bound(first, last, val, comp)) + * @endcode + * but does not actually call those functions. + */ + template + pair<_ForwardIterator, _ForwardIterator> + equal_range(_ForwardIterator __first, _ForwardIterator __last, + const _Tp& __val, + _Compare __comp) + { + + typedef size_t _DistanceType; + + _DistanceType __len = __last - __first; + _DistanceType __half; + _ForwardIterator __middle, __left, __right; + + while (__len > 0) + { + __half = __len >> 1; + __middle = __first; + __middle += __half; + if (__comp(*__middle, __val)) + { + __first = __middle; + ++__first; + __len = __len - __half - 1; + } + else if (__comp(__val, *__middle)) + __len = __half; + else + { + __left = lower_bound(__first, __middle, __val, __comp); + __first += __len; + __right = upper_bound(++__middle, __first, __val, __comp); + return pair<_ForwardIterator, _ForwardIterator>(__left, __right); + } + } + return pair<_ForwardIterator, _ForwardIterator>(__first, __first); + } + + +} +#endif -- cgit