diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-29 14:40:25 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-29 14:41:08 +0800 |
commit | 93037fada3ffffa0d0d0db8aaa85970fe6fbe95d (patch) | |
tree | 2fec8ee4d7572d87a2604f2dce75064350ca02c8 | |
parent | a508523904d2f42783b83be5b23be63f3fe3b521 (diff) | |
download | trainer-93037fada3ffffa0d0d0db8aaa85970fe6fbe95d.tar.gz trainer-93037fada3ffffa0d0d0db8aaa85970fe6fbe95d.tar.xz trainer-93037fada3ffffa0d0d0db8aaa85970fe6fbe95d.zip |
write markPinyin
-rw-r--r-- | lib/myconfig.py | 6 | ||||
-rw-r--r-- | markpinyin.py | 93 |
2 files changed, 89 insertions, 10 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py index 9a98852..ea02f79 100644 --- a/lib/myconfig.py +++ b/lib/myconfig.py @@ -171,3 +171,9 @@ class MyConfig: def getRecognizedWordFileName(self): return "recognized.txt" + + def getDefaultPinyinTotalFrequency(self): + return 100 + + def getMinimumPinyinFrequency(self): + return 3 diff --git a/markpinyin.py b/markpinyin.py index cf03e23..5903fef 100644 --- a/markpinyin.py +++ b/markpinyin.py @@ -2,7 +2,6 @@ import os import sqlite3 from argparse import ArgumentParser -from operator import itemgetter import utils from myconfig import MyConfig from dirwalk import walkIndex @@ -10,14 +9,20 @@ from dirwalk import walkIndex config = MyConfig() +#default pinyin total frequency +default = config.getDefaultPinyinTotalFrequency() + +#minimum pinyin frequency +minimum = config.getMinimumPinyinFrequency() + #change cwd to the word recognizer directory words_dir = config.getWordRecognizerDir() os.chdir(words_dir) #chdir done -atomic_words_list = [] -merged_words_list = [] +atomic_words_dict = {} +merged_words_dict = {} def load_atomic_words(filename): @@ -31,11 +36,12 @@ def load_atomic_words(filename): (word, pinyin, freq) = oneline.split(None, 2) freq = int(freq) - atomic_words_list.append((word, pinyin, freq)) + if word in atomic_words_dict: + atomic_words_dict[word].append((pinyin, freq)) + else: + atomic_words_dict[word] = [(pinyin, freq)] wordsfile.close() - #ascending sort - atomic_words_list.sort(key=itemgetter(0, 1)) def load_merged_words(filename): @@ -49,14 +55,81 @@ def load_merged_words(filename): (word, prefix, postfix, freq) = oneline.split(None, 3) freq = int(freq) - merged_words_list.append((word, prefix, postfix, freq)) + if word in merged_words_dict: + merged_words_dict[word].append((prefix, postfix, freq)) + else: + merged_words_dict[word] = [(prefix, postfix, freq)] wordsfile.close() - #ascending sort - merged_words_list.sort(key=itemgetter(0, 1, 2)) + +def mergePinyin(pinyin_list): + print(pinyin_list) + pinyins = {} + + for (pinyin, freq) in pinyin_list: + if pinyin in pinyins: + pinyins[pinyin] += freq + else: + pinyins[pinyin] = freq + + pinyins = list(pinyins.items()) + total_freq = sum([ freq for pinyin, freq in pinyins ]) + + results = [] + for (pinyin, freq) in pinyins: + freq = default * freq / total_freq + freq = int(freq) + if freq < minimum: + freq = minimum + results.append((pinyin, freq)) + print(results) + return results + + +def markAtomicWord(word): + assert word in atomic_words_dict + results = atomic_words_dict[word] + return mergePinyin(results) + + +def markMergedWord(word): + assert word in merged_words_dict + + merged_list = merged_words_dict[word] + print(merged_list) + merged_sum = sum([ freq for prefix, postfix, freq in merged_list ]) + + results = [] + for (prefix, postfix, freq) in merged_list: + prefix_list = markPinyin(prefix) + prefix_sum = sum([ freq for pinyin, freq in prefix_list ]) + + postfix_list = markPinyin(postfix) + postfix_sum = sum([ freq for pinyin, freq in postfix_list ]) + + for prefix_pinyin, prefix_freq in prefix_list: + for postfix_pinyin, postfix_freq in postfix_list: + merged_pinyin = prefix_pinyin + "'" + postfix_pinyin + merged_freq = default * freq * prefix_freq * postfix_freq / \ + merged_sum / prefix_sum / postfix_sum + results.append((merged_pinyin, merged_freq)) + + return mergePinyin(results) + + +def markPinyin(word): + print(word) + + if word in atomic_words_dict: + return markAtomicWord(word) + elif word in merged_words_dict: + return markMergedWord(word) + else: + assert False, "missed word.\n" #loading old words load_atomic_words(config.getWordsWithPinyinFileName()) -#print(atomic_words_list) +#print(atomic_words_dict) + |