summaryrefslogtreecommitdiffstats
path: root/src/storage/ngram.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/storage/ngram.h')
-rw-r--r--src/storage/ngram.h329
1 files changed, 329 insertions, 0 deletions
diff --git a/src/storage/ngram.h b/src/storage/ngram.h
new file mode 100644
index 0000000..e4045a9
--- /dev/null
+++ b/src/storage/ngram.h
@@ -0,0 +1,329 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2006-2007 Peng Wu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef NGRAM_H
+#define NGRAM_H
+
+#include <db.h>
+
+namespace pinyin{
+
+class Bigram;
+
+/** Note:
+ * The system single gram contains the trained freqs.
+ * The user single gram contains the delta freqs.
+ * During the Viterbi beam search, use merge_single_gram to merge the system
+ * single gram and the user single gram.
+ */
+
+
+/**
+ * SingleGram:
+ *
+ * The single gram in the bi-gram.
+ *
+ */
+class SingleGram{
+ friend class Bigram;
+ friend bool merge_single_gram(SingleGram * merged,
+ const SingleGram * system,
+ const SingleGram * user);
+
+private:
+ MemoryChunk m_chunk;
+ SingleGram(void * buffer, size_t length);
+public:
+ /**
+ * SingleGram::SingleGram:
+ *
+ * The constructor of the SingleGram.
+ *
+ */
+ SingleGram();
+ /**
+ * SingleGram::retrieve_all:
+ * @array: the GArray to store the retrieved bi-gram phrase item.
+ * @returns: whether the retrieve operation is successful.
+ *
+ * Retrieve all bi-gram phrase items in this single gram.
+ *
+ */
+ bool retrieve_all(/* out */ BigramPhraseWithCountArray array) const;
+
+ /**
+ * SingleGram::search:
+ * @range: the token range.
+ * @array: the GArray to store the matched bi-gram phrase item.
+ * @returns: whether the search operation is successful.
+ *
+ * Search the bi-gram phrase items according to the token range.
+ *
+ * Note: the array result may contain many items.
+ *
+ */
+ bool search(/* in */ PhraseIndexRange * range,
+ /* out */ BigramPhraseArray array) const;
+
+ /**
+ * SingleGram::insert_freq:
+ * @token: the phrase token.
+ * @freq: the freq of this token.
+ * @returns: whether the insert operation is successful.
+ *
+ * Insert the token with the freq.
+ *
+ */
+ bool insert_freq(/* in */ phrase_token_t token,
+ /* in */ guint32 freq);
+
+ /**
+ * SingleGram::remove_freq:
+ * @token: the phrase token.
+ * @freq: the freq of the removed token.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove the token.
+ *
+ */
+ bool remove_freq(/* in */ phrase_token_t token,
+ /* out */ guint32 & freq);
+
+ /**
+ * SingleGram::get_freq:
+ * @token: the phrase token.
+ * @freq: the freq of the token.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the freq of the token.
+ *
+ */
+ bool get_freq(/* in */ phrase_token_t token,
+ /* out */ guint32 & freq) const;
+
+ /**
+ * SingleGram::set_freq:
+ * @token: the phrase token.
+ * @freq: the freq of the token.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the freq of the token.
+ *
+ */
+ bool set_freq(/* in */ phrase_token_t token,
+ /* in */ guint32 freq);
+
+ /**
+ * SingleGram::get_total_freq:
+ * @total: the total freq of this single gram.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the total freq of this single gram.
+ *
+ */
+ bool get_total_freq(guint32 & total) const;
+
+ /**
+ * SingleGram::set_total_freq:
+ * @total: the total freq of this single gram.
+ * @returns: whether the set operation is successful.
+ *
+ * Set the total freq of this single gram.
+ *
+ */
+ bool set_total_freq(guint32 total);
+
+ /**
+ * SingleGram::get_length:
+ * @returns: the number of items in this single gram.
+ *
+ * Get the number of items in this single gram.
+ *
+ */
+ guint32 get_length();
+
+ /**
+ * SingleGram::mask_out:
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: the number of removed items.
+ *
+ * Mask out the matched items in this single gram.
+ *
+ */
+ guint32 mask_out(phrase_token_t mask, phrase_token_t value);
+
+ /**
+ * SingleGram::prune:
+ * @returns: whether the prune operation is successful.
+ *
+ * Obsoleted by Katz k mixture model pruning.
+ *
+ */
+ bool prune();
+};
+
+
+/**
+ * Bigram:
+ *
+ * The Bi-gram class.
+ *
+ */
+class Bigram{
+private:
+ DB * m_db;
+
+ void reset(){
+ if ( m_db ){
+ m_db->sync(m_db, 0);
+ m_db->close(m_db, 0);
+ m_db = NULL;
+ }
+ }
+
+public:
+ /**
+ * Bigram::Bigram:
+ *
+ * The constructor of the Bigram.
+ *
+ */
+ Bigram(){
+ m_db = NULL;
+ }
+
+ /**
+ * Bigram::~Bigram:
+ *
+ * The destructor of the Bigram.
+ *
+ */
+ ~Bigram(){
+ reset();
+ }
+
+ /**
+ * Bigram::load_db:
+ * @dbfile: the Berkeley DB file name.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the Berkeley DB into memory.
+ *
+ */
+ bool load_db(const char * dbfile);
+
+ /**
+ * Bigram::save_db:
+ * @dbfile: the Berkeley DB file name.
+ * @returns: whether the save operation is successful.
+ *
+ * Save the in-memory Berkeley DB into disk.
+ *
+ */
+ bool save_db(const char * dbfile);
+
+ /**
+ * Bigram::attach:
+ * @dbfile: the Berkeley DB file name.
+ * @flags: the flags of enum ATTACH_FLAG.
+ * @returns: whether the attach operation is successful.
+ *
+ * Attach this Bigram with the Berkeley DB.
+ *
+ */
+ bool attach(const char * dbfile, guint32 flags);
+
+ /**
+ * Bigram::load:
+ * @index: the previous token in the bi-gram.
+ * @single_gram: the single gram of the previous token.
+ * @returns: whether the load operation is successful.
+ *
+ * Load the single gram of the previous token.
+ *
+ */
+ bool load(/* in */ phrase_token_t index,
+ /* out */ SingleGram * & single_gram);
+
+ /**
+ * Bigram::store:
+ * @index: the previous token in the bi-gram.
+ * @single_gram: the single gram of the previous token.
+ * @returns: whether the store operation is successful.
+ *
+ * Store the single gram of the previous token.
+ *
+ */
+ bool store(/* in */ phrase_token_t index,
+ /* in */ SingleGram * single_gram);
+
+ /**
+ * Bigram::remove:
+ * @index: the previous token in the bi-gram.
+ * @returns: whether the remove operation is successful.
+ *
+ * Remove the single gram of the previous token.
+ *
+ */
+ bool remove(/* in */ phrase_token_t index);
+
+ /**
+ * Bigram::get_all_items:
+ * @items: the GArray to store all previous tokens.
+ * @returns: whether the get operation is successful.
+ *
+ * Get the array of all previous tokens for parameter estimation.
+ *
+ */
+ bool get_all_items(/* out */ GArray * items);
+
+ /**
+ * Bigram::mask_out:
+ * @mask: the mask.
+ * @value: the value.
+ * @returns: whether the mask out operation is successful.
+ *
+ * Mask out the matched items.
+ *
+ */
+ bool mask_out(phrase_token_t mask, phrase_token_t value);
+};
+
+/**
+ * merge_single_gram:
+ * @merged: the merged single gram of system and user single gram.
+ * @system: the system single gram to be merged.
+ * @user: the user single gram to be merged.
+ * @returns: whether the merge operation is successful.
+ *
+ * Merge the system and user single gram into one merged single gram.
+ *
+ * Note: Please keep system and user single gram
+ * when using merged single gram.
+ *
+ */
+bool merge_single_gram(SingleGram * merged, const SingleGram * system,
+ const SingleGram * user);
+
+};
+
+#endif