diff options
Diffstat (limited to 'utils/training/k_mixture_model.h')
-rw-r--r-- | utils/training/k_mixture_model.h | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/utils/training/k_mixture_model.h b/utils/training/k_mixture_model.h new file mode 100644 index 0000000..ad8d3d8 --- /dev/null +++ b/utils/training/k_mixture_model.h @@ -0,0 +1,172 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2011 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef K_MIXTURE_MODEL +#define K_MIXTURE_MODEL + +#include <math.h> +#include "novel_types.h" +#include "flexible_ngram.h" + +namespace pinyin{ + +typedef guint32 corpus_count_t; + +/* Note: storage parameters: N, T, n_r. + * N: the total number of documents. + * T: the total number of instances of the word or phrase. + * n_r: the number of documents having exactly <b>r</b> occurrences. + * only n_0, n_1 are used here. + */ + +static inline parameter_t compute_alpha(corpus_count_t N, corpus_count_t n_0){ + parameter_t alpha = 1 - n_0 / (parameter_t) N; + return alpha; +} + +static inline parameter_t compute_gamma(corpus_count_t N, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t gamma = 1 - n_1 / (parameter_t) (N - n_0); + return gamma; +} + +static inline parameter_t compute_B(corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + /* Note: re-check this, to see if we can remove if statement. */ + /* Please consider B_2 is no less than 2 in paper. */ +#if 1 + if ( 0 == T - n_1 && 0 == N - n_0 - n_1 ) + return 2; +#endif + + parameter_t B = (T - n_1 ) / (parameter_t) (N - n_0 - n_1); + return B; +} + +/* three parameters model */ +static inline parameter_t compute_Pr_G_3(corpus_count_t k, + parameter_t alpha, + parameter_t gamma, + parameter_t B){ + if ( k == 0 ) + return 1 - alpha; + + if ( k == 1 ) + return alpha * (1 - gamma); + + if ( k > 1 ) { + return (alpha * gamma / (B - 1)) * pow((1 - 1 / (B - 1)) , k - 2); + } + + assert(false); +} + +static inline parameter_t compute_Pr_G_3_with_count(corpus_count_t k, + corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t alpha = compute_alpha(N, n_0); + parameter_t gamma = compute_gamma(N, n_0, n_1); + parameter_t B = compute_B(N, T, n_0, n_1); + + return compute_Pr_G_3(k, alpha, gamma, B); +} + +/* two parameters model */ +static inline parameter_t compute_Pr_G_2(corpus_count_t k, + parameter_t alpha, + parameter_t B){ + parameter_t gamma = 1 - 1 / (B - 1); + return compute_Pr_G_3(k, alpha, gamma, B); +} + +static inline parameter_t compute_Pr_G_2_with_count(corpus_count_t k, + corpus_count_t N, + corpus_count_t T, + corpus_count_t n_0, + corpus_count_t n_1){ + parameter_t alpha = compute_alpha(N, n_0); + parameter_t B = compute_B(N, T, n_0, n_1); + return compute_Pr_G_2(k, alpha, B); +} + +#define K_MIXTURE_MODEL_MAGIC_NUMBER "KMMP" + +typedef struct{ + /* the total number of instances of all words. */ + guint32 m_WC; + /* the total number of documents. */ + guint32 m_N; + /* the total freq of uni-gram. */ + guint32 m_total_freq; +} KMixtureModelMagicHeader; + +typedef struct{ + /* the total number of instances of word W1. */ + guint32 m_WC; + /* the freq of uni-gram. see m_total_freq in magic header also. */ + guint32 m_freq; +} KMixtureModelArrayHeader; + +typedef struct{ + /* the total number of all W1,W2 word pair. */ + guint32 m_WC; + + /* the total number of instances of the word or phrase. + (two word phrase) */ + /* guint32 m_T; Please use m_WC instead. + alias of m_WC, always the same. */ + + /* n_r: the number of documents having exactly r occurrences. */ + /* guint32 m_n_0; + Note: compute this value using the following equation. + m_n_0 = KMixtureModelMagicHeader.m_N - m_N_n_0; + m_N_n_0, the number of documents which contains the word or phrase. + (two word phrase) */ + guint32 m_N_n_0; + guint32 m_n_1; + + /* maximum instances of the word or phrase (two word phrase) + in previous documents last seen. */ + guint32 m_Mr; +} KMixtureModelArrayItem; + +typedef FlexibleBigram<KMixtureModelMagicHeader, + KMixtureModelArrayHeader, + KMixtureModelArrayItem> +KMixtureModelBigram; + +typedef FlexibleSingleGram<KMixtureModelArrayHeader, + KMixtureModelArrayItem> +KMixtureModelSingleGram; + +typedef KMixtureModelSingleGram::ArrayItemWithToken +KMixtureModelArrayItemWithToken; + +}; + + +#endif |