From c2b6bcf988dcf6f8af9c02514cf37e41624fb24b Mon Sep 17 00:00:00 2001
From: Peng Wu
Date: Wed, 14 Dec 2016 09:34:30 +0800
Subject: import word recognizer

---
 docs/wordrecog     |  96 +++++++++++++++++++++++++++++++++++++++++++
 docs/wordrecogimpl | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 213 insertions(+)
 create mode 100644 docs/wordrecog
 create mode 100644 docs/wordrecogimpl

diff --git a/docs/wordrecog b/docs/wordrecog
new file mode 100644
index 0000000..4c4c580
--- /dev/null
+++ b/docs/wordrecog
@@ -0,0 +1,96 @@
+New Word Recognizer
+
+== An n-gram based new word recognition algorithm
+
+== Terms
+1. Partial Word:
+   * a word sequence that occurs often enough in the uni-gram to reach the P(W) threshold;
+   * P(W) is defined as the uni-gram probability of the merged word;
+   * this information comes from scanning the bi-gram table;
+2. New Word:
+   * the word is a Partial Word;
+   * and it occurs after (and before) enough distinct words in the bi-gram to reach the entropy threshold;
+   * the entropy is defined as follows:
+     let the new word be NW, and let the words before (or after) NW be W_1, W_2, ..., W_n;
+     let C_i be the bi-gram count of W_i next to NW, with NC the total count of NW;
+     then the entropy H(NW) is:
+       NC = \sum_{i=1}^{n} C_i
+       p(W_i) = C_i / NC
+       H(NW) = -\sum_{i=1}^{n} p(W_i) \log_b p(W_i)
+   * this information comes from scanning the bi-gram table;
+
+== Choosing the thresholds
+1. The thresholds are chosen only from the P(W) and H(NW) values of known words; the probabilities of single characters are not considered;
+2. The P(W) threshold is the P(W) value of the known word ranked at the last 10% position;
+3. The H(NW) threshold is the H(NW) value of the known word ranked at the last 10% position;
+   * compute separate thresholds for the words before and after NW, and handle the before and after H(NW) separately;
+
+== Training
+
+== Data sets
+1. Data set A is used to discover Partial Words;
+2. The fully processed data set A is called data set B;
+3. Data set B is used to discover New Words;
+
+== Training steps
+1. Determine the P(W) and H(NW) thresholds;
+2. Repeatedly scan data set A to discover Partial Words, add them to data set A's Partial Word List, and regenerate data set A with the new words; (scan the ngram table of 2-gram.db)
+   Note: when regenerating the data set, scan all the n-gram information, merge the matching sequences, and put the new sequences into the corresponding lower-order n-gram tables;
+3. Repeat the above until no new Partial Word appears;
+4. Scan data set A's Partial Word List to discover New Words, and add them to data set B's New Word List;
+   Note: since a New Word is already a Partial Word, there is no need to add it to the Partial Word List; it should already be there;
+
+Note: besides the recognized words, the Partial Word List and the New Word List also keep the original sequences from before the merge; several different sequences of the same word may be stored.
+
+== Original idea
+1. We first considered storing the data as plain text, but python is rather slow, so it could take too long;
+2. Given the requirements above, it should be enough to store only the n-gram information, up to a sufficient N, e.g. 32;
+   Uni-gram example: Word Freq;
+   Bi-gram example: Word1 Word2 Freq;
+   ...
+   N-gram example: Word1 Word2 ... WordN Freq;
+
+== Improved design
+1. Replace the plain text format with sqlite storage;
+2. Implement it with python + sqlite;
+
+== Database Schema
+1. Each n-gram has its own separate sqlite database file;
+
+== Database DDL
+1. All the n-gram schemas look like:
+   * CREATE TABLE ngram (
+       words TEXT NOT NULL,
+       freq INTEGER NOT NULL
+     );
+2. The bi-gram has one special table: (converted from the ngram table of 2-gram.db)
+   * CREATE TABLE bigram (
+       prefix TEXT NOT NULL,
+       postfix TEXT NOT NULL,
+       freq INTEGER NOT NULL
+     );
+
+== Database DML
+1. The relevant SQL statements are as follows:
+   1. Partial Word condition, queried in the bi-gram sqlite database:
+      * SELECT words FROM ngram WHERE freq > threshold;
+   2. New Word condition:
+      * get all the New Word candidates;
+        * copy the Partial Word List from data set A;
+      * compute the information entropy for each New Word candidate;
+        * SELECT prefix, freq FROM bigram WHERE postfix = "$candidate";
+        * SELECT postfix, freq FROM bigram WHERE prefix = "$candidate";
+      * a New Word is found if both:
+        the before entropy > the before threshold;
+        and the after entropy > the after threshold;
+   3. When a new Partial Word is found: (convert the higher-order n-grams to sqlite fts3 tables)
+      * in the higher-order n-grams, merge the word sequences that contain the new Partial Word;
+        * SELECT words, freq FROM ngram_fts WHERE words MATCH '" $prefix $postfix "';
+      * handle the returned words case by case;
+      * put the merged word sequences into the lower-order n-gram sqlite databases;
+
+== Note:
+During training, if two different sequences can form the same word and only one of them has been recognized, merge only that sequence, because the other sequence may not meet the Partial Word conditions; if both sequences are recognized as Partial Words, merge both; ...
+
+Also, two sequences with different segmentations can be regarded as carrying different word meanings, i.e. as two different words.
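+
+== Example: computing H(NW)
+A minimal sketch of the entropy test above, in python + sqlite as per the
+improved design; it assumes the bigram table from the Database DDL section,
+and the function name, the candidate and the threshold values are
+illustrative only:
+
+    import math
+    import sqlite3
+
+    def side_entropy(conn, candidate, side):
+        # side = "postfix" gives the before entropy (the words before the
+        # candidate, via SELECT prefix, freq FROM bigram WHERE postfix = ?);
+        # side = "prefix" gives the after entropy.
+        other = "prefix" if side == "postfix" else "postfix"
+        rows = conn.execute("SELECT %s, freq FROM bigram WHERE %s = ?"
+                            % (other, side), (candidate,)).fetchall()
+        nc = sum(freq for _, freq in rows)        # NC = \sum_{i=1}^{n} C_i
+        if nc == 0:
+            return 0.0
+        entropy = 0.0
+        for _, freq in rows:
+            p = float(freq) / nc                  # p(W_i) = C_i / NC
+            entropy -= p * math.log(p)            # natural log as the base b
+        return entropy
+
+    conn = sqlite3.connect("2-gram.db")
+    before_threshold = after_threshold = 1.0      # from the last 10% passes
+    candidate = u"..."                            # from the Partial Word List
+    is_new_word = (side_entropy(conn, candidate, "postfix") > before_threshold
+                   and side_entropy(conn, candidate, "prefix") > after_threshold)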
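+
+== Example: matching sequences in the fts table
+A sketch of the MATCH query from the Database DML section; it assumes the
+ngram table of a higher-order database has been mirrored into an fts3 table
+named ngram_fts, and the file name and the word pair are illustrative:
+
+    import sqlite3
+
+    conn = sqlite3.connect("3-gram.db")
+    # one-off mirror of the ngram table into fts3, so that MATCH can find
+    # the word sequences containing the pair to merge:
+    conn.execute("CREATE VIRTUAL TABLE IF NOT EXISTS ngram_fts"
+                 " USING fts3(words, freq)")
+    conn.execute("INSERT INTO ngram_fts SELECT words, freq FROM ngram")
+
+    prefix, postfix = u"word1", u"word2"   # the new Partial Word pair
+    # the quoted phrase query only matches the two words when adjacent:
+    rows = conn.execute("SELECT words, freq FROM ngram_fts WHERE words MATCH ?",
+                        ('"%s %s"' % (prefix, postfix),))
+    for words, freq in rows:
+        # handle each returned sequence case by case, then write the merged
+        # sequence into the lower-order n-gram database;
+        pass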
diff --git a/docs/wordrecogimpl b/docs/wordrecogimpl
new file mode 100644
index 0000000..1176608
--- /dev/null
+++ b/docs/wordrecogimpl
@@ -0,0 +1,117 @@
+Word Recognizer Implementation
+
+
+== New Tools ==
+* prepare.py - prepare the initial sqlite databases;
+* populate.py - convert the corpus files to the n-gram sqlite format;
+* partialword.py - recognize partial words;
+* newword.py - filter out the new words;
+* markpinyin.py - mark pinyin according to the n-gram sequences;
+
+
+== Data Flow ==
+prepare.py => populate.py => threshold.py => partialword.py => newword.py;
+
+== Implementation ==
+
+=== populate.py ===
+word history = [ W1, W2, ..., Wn ]
+store each word history and its freq into the ngram table.
+
+multi-pass processing (1...N)
+steps:
+    for each index file:
+        for each pass over the ngram tables:
+            for each word history from the corpus file:
+                UPDATE ngram SET freq = freq + 1 WHERE words = "word history";
+                OR INSERT INTO ngram VALUES ("word history", 1);
+
+
+=== partialword.py ===
+get partial word threshold pass:
+    for each word from the libpinyin dictionaries:
+        get the word's uni-gram frequency from the ngram table in 1-gram.db;
+        store the (word, freq) pair into an array;
+    sort the word array by freq;
+    get the threshold from the freq of the word at the last 10% position;
+    (see the threshold selection sketch after the newword.py section)
+
+get partial words pass:
+    words = set([])
+
+    while True:
+        get all partial word candidates from the ngram table of 2-gram.db;
+        skip all existing or already merged words;
+        if there are no new partial word candidates:
+            break;
+        save all partial word candidates to the "partialword.txt" file;
+
+        for each index file:
+            for each pass over the ngram tables from N down to 1:
+                convert the ngram table to an sqlite fts table;
+                combine the merged words from the higher-order gram into the lower-order gram;
+                for each new partial word:
+                    select the matched word sequences from the ngram fts table;
+                    update or insert the merged word sequences into the lower-order gram;
+                    delete the original word sequences (from before the merge) from the higher-order gram;
+
+        remember all partial word candidates as merged words;
+
+
+=== newword.py ===
+get new word prefix entropy threshold pass:
+    for each word from the libpinyin dictionaries:
+        get the prefix information entropy of the word from the bigram table;
+        store the (word, entropy) pair into an array;
+    sort the word array by entropy;
+    get the prefix entropy threshold from the entropy of the word at the last 50% position;
+
+get new word postfix entropy threshold pass:
+    for each word from the libpinyin dictionaries:
+        get the postfix information entropy of the word from the bigram table;
+        store the (word, entropy) pair into an array;
+    sort the word array by entropy;
+    get the postfix entropy threshold from the entropy of the word at the last 50% position;
+
+filter out new words pass:
+    for each new word candidate (the partial words):
+        compute the prefix information entropy of the word from the bigram table;
+        if entropy < threshold:
+            continue;
+        compute the postfix information entropy of the word from the bigram table;
+        if entropy < threshold:
+            continue;
+        save the new word candidate as a new word; (newword.txt)
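+
+=== threshold selection sketch ===
+The threshold passes above (the last 10% position for partial words, the
+last 50% position for the prefix and postfix entropies) share the same
+shape; a minimal sketch, assuming the (word, value) pairs were already
+read from the databases and that "last" means the low end of a descending
+sort (the function name and the sample pairs are illustrative):
+
+    def pick_threshold(pairs, tail_ratio):
+        # pairs: (word, value) tuples, where value is a uni-gram freq or
+        # a side entropy; return the value at the boundary position of
+        # the last tail_ratio portion.
+        values = sorted((value for word, value in pairs), reverse=True)
+        cut = int(len(values) * (1.0 - tail_ratio))
+        return values[min(cut, len(values) - 1)]
+
+    pairs = [(u"w1", 5.0), (u"w2", 3.0), (u"w3", 1.0), (u"w4", 0.5)]
+    print(pick_threshold(pairs, 0.5))   # -> 1.0, the boundary of the last 50%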
+
+
+=== markpinyin.py ===
+mark pinyin according to the merged word sequences;
+
+an atomic word comes from the libpinyin dictionaries.
+a merged word comes from the new words (in partialword.txt).
+
+merge pinyin helper:
+    merge all pairs with the same pinyin, and sum their freq;
+
+steps:
+    for each new word:
+        if it is an atomic word:
+            return all (pinyin, freq) pairs;
+        if it is a merged word sequence:
+            for each merged pair:
+                for each prefix:
+                    for each postfix:
+                        pinyin = prefix pinyin + "'" + postfix pinyin;
+                        freq = default freq * merged possibility * prefix possibility * postfix possibility;
+            return all (pinyin, freq) pairs;
+
+==== notes ====
+oldwords.txt: phrase ␣ pinyin without tone ␣ pinyin freq
+partialwords.txt: prefix ␣ postfix ␣ phrase ␣ merge freq
+newwords.txt: phrase
+
+for the new words, recursively divide the pinyin freq into atomic phrases
+according to the merge freq of partialwords.txt and the pinyin freq of
+oldwords.txt;
+    if an atomic phrase is in the old words,
+        then divide the pinyin freq by the old pinyin freq;
+    combine the same pinyin and phrase into one entry, freq = sum freq / all freq;
+    the total pinyin freq defaults to 100;
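+
+=== merge pinyin helper sketch ===
+A minimal sketch of the merge pinyin helper and the default total of 100
+from the notes above; the function name and the sample pairs are
+illustrative:
+
+    from collections import defaultdict
+
+    def merge_pinyin(pairs, total=100.0):
+        # merge all (pinyin, freq) pairs with the same pinyin, sum their
+        # freq, then renormalize so the total pinyin freq is `total`.
+        merged = defaultdict(float)
+        for pinyin, freq in pairs:
+            merged[pinyin] += freq
+        all_freq = sum(merged.values())
+        return dict((py, total * f / all_freq) for py, f in merged.items())
+
+    # two segmentations yielding the same pinyin are combined into one:
+    print(merge_pinyin([("hao'kan", 30.0), ("hao'kan", 20.0), ("hao'kang", 10.0)]))
+    # -> {"hao'kan": 83.33..., "hao'kang": 16.66...}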