diff options
author | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2010-08-03 10:42:47 +0800 |
commit | f41d1fdf83408e042ab07925710a8913bad0c27c (patch) | |
tree | 1757833ac4cdd0830834d2f9ef92be07c0bc1a5b /src/storage/ngram.cpp | |
parent | 34acf9be9033e0dc0a5905999133482c20b6cbf3 (diff) | |
download | libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.gz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.tar.xz libpinyin-f41d1fdf83408e042ab07925710a8913bad0c27c.zip |
import from pinyin.
Diffstat (limited to 'src/storage/ngram.cpp')
-rw-r--r-- | src/storage/ngram.cpp | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp new file mode 100644 index 0000000..7fdc58f --- /dev/null +++ b/src/storage/ngram.cpp @@ -0,0 +1,283 @@ +/* + * novel-pinyin, + * A Simplified Chinese Sentence-Based Pinyin Input Method Engine + * Based On Markov Model. + * + * Copyright (C) 2006-2007 Peng Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <stdio.h> +#include "memory_chunk.h" +#include "novel_types.h" +#include "ngram.h" + +struct SingleGramItem{ + phrase_token_t m_token; + guint32 m_freq; +}; + +SingleGram::SingleGram(){ + m_chunk.set_size(sizeof(guint32)); + memset(m_chunk.begin(), 0, sizeof(guint32)); +} + +SingleGram::SingleGram(void * buffer, size_t length){ + m_chunk.set_chunk(buffer, length, NULL); +} + +bool SingleGram::set_total_freq(guint32 m_total){ + char * buf_begin = (char *)m_chunk.begin(); + *((guint32 *)buf_begin) = m_total; + return true; +} + +bool SingleGram::get_total_freq(guint32 & m_total){ + char * buf_begin = (char *)m_chunk.begin(); + m_total = *((guint32 *)buf_begin); + return true; +} + +bool SingleGram::prune(){ +#if 1 + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *)m_chunk.end(); + + size_t nitem = 0; + for ( SingleGramItem * cur = begin; cur != end; ++cur){ + cur->m_freq--; + nitem++; + if ( cur->m_freq == 0 ){ + size_t offset = sizeof(guint32) + (cur - begin) + * sizeof(SingleGramItem) ; + m_chunk.remove_content(offset, sizeof(SingleGramItem)); + } + } + guint32 total_freq; + assert(get_total_freq(total_freq)); + assert(set_total_freq(total_freq - nitem)); +#endif + return true; +} + +bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){ + return lhs.m_token < rhs.m_token; +} + +bool SingleGram::search(/* in */ PhraseIndexRange * range, + /* out */ BigramPhraseArray array){ + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = range->m_range_begin; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + guint32 total_freq; + BigramPhraseItem bigram_item; + assert(get_total_freq(total_freq)); + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token >= range->m_range_end ) + break; + bigram_item.m_token = cur_item->m_token; + bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq; + g_array_append_val(array, bigram_item); + } + return true; +} + +bool SingleGram::get_freq(/* in */ phrase_token_t token, + /* out */ guint32 & freq){ + freq = 0; + const SingleGramItem * begin = (const SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + const SingleGramItem * end = (const SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + for ( ; cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ) + return false; + if ( cur_item->m_token == token ){ + freq = cur_item -> m_freq; + return true; + } + } + return false; +} + +bool SingleGram::set_freq(/* in */ phrase_token_t token, + guint32 freq){ + SingleGramItem * begin = (SingleGramItem *) + ((const char *)(m_chunk.begin()) + sizeof(guint32)); + SingleGramItem * end = (SingleGramItem *)m_chunk.end(); + SingleGramItem compare_item; + compare_item.m_token = token; + SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than); + + SingleGramItem insert_item; + insert_item.m_token = token; + insert_item.m_freq = freq; + for ( ;cur_item != end; ++cur_item){ + if ( cur_item->m_token > token ){ + size_t offset = sizeof(guint32) + + sizeof(SingleGramItem) * (cur_item - begin); + m_chunk.insert_content(offset, &insert_item, + sizeof(SingleGramItem)); + return true; + } + if ( cur_item->m_token == token ){ + cur_item -> m_freq = freq; + return true; + } + } + m_chunk.insert_content(m_chunk.size(), &insert_item, + sizeof(SingleGramItem)); + return true; +} + + +bool Bigram::attach(const char * systemfile, const char * userfile){ + reset(); + if ( systemfile ){ + int ret = db_create(&m_system, NULL, 0); + if ( ret != 0 ) + assert(false); + + m_system->open(m_system, NULL, systemfile, NULL, + DB_HASH, DB_RDONLY, 0664); + if ( ret != 0) + return false; + } + + if ( userfile ){ + int ret = db_create(&m_user, NULL, 0); + if ( ret != 0 ) + assert(false); + + m_user->open(m_user, NULL, userfile, NULL, DB_HASH, DB_CREATE, 0664); + if ( ret != 0) + return false; + } + return true; +} + +bool Bigram::load(phrase_token_t index, SingleGram * & system_gram, SingleGram * & user_gram){ + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + + system_gram = NULL; user_gram = NULL; + if ( m_system ){ + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + int ret = m_system->get(m_system, NULL, &db_key, &db_data, 0); + if ( ret == 0 ) + system_gram = new SingleGram(db_data.data, db_data.size); + } + if ( m_user ){ + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + int ret = m_user->get(m_user, NULL, &db_key, &db_data, 0); + if ( ret == 0 ) + user_gram = new SingleGram(db_data.data, db_data.size); + } + return true; +} + +bool Bigram::store(phrase_token_t index, SingleGram * user_gram){ + if ( !m_user ) + return false; + DBT db_key; + memset(&db_key, 0, sizeof(DBT)); + db_key.data = &index; + db_key.size = sizeof(phrase_token_t); + DBT db_data; + memset(&db_data, 0, sizeof(DBT)); + db_data.data = user_gram->m_chunk.begin(); + db_data.size = user_gram->m_chunk.size(); + + int ret = m_user->put(m_user, NULL, &db_key, &db_data, 0); + return ret == 0; +} + +bool Bigram::get_all_items(GArray * system, GArray * user){ + bool retval = false; + g_array_set_size(system, 0); + g_array_set_size(user, 0); + if ( m_system ){ + DBC * cursorp; + DBT key, data; + int ret; + /* Get a cursor */ + m_system->cursor(m_system, NULL, &cursorp, 0); + + /* Initialize our DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + assert(key.size == sizeof(phrase_token_t)); + phrase_token_t * token = (phrase_token_t *)key.data; + g_array_append_val(system, *token); + } + + if (ret != DB_NOTFOUND) { + fprintf(stderr, "system db error, exit!"); + exit(1); + } + + /* Cursors must be closed */ + if (cursorp != NULL) + cursorp->c_close(cursorp); + + retval = true; + } + if ( m_user ){ + DBC * cursorp; + DBT key, data; + int ret; + /* Get a cursor */ + m_user->cursor(m_user, NULL, &cursorp, 0); + + /* Initialize out DBTs. */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* Iterate over the database, retrieving each record in turn. */ + while((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) { + assert(key.size == sizeof(phrase_token_t)); + phrase_token_t * token = (phrase_token_t *) key.data; + g_array_append_val(user, *token); + } + + if (ret != DB_NOTFOUND){ + fprintf(stderr, "user db error, exit!"); + exit(1); + } + + /* Cursor must be closed */ + if ( cursorp != NULL) + cursorp->c_close(cursorp); + + retval = true; + } + return retval; +} |