diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-31 16:58:26 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-31 16:58:26 +0800 |
commit | aa75fc05cced38be5bee6d4b8cc572859d3761c8 (patch) | |
tree | 66403600d7d2764898e090aa40967920031e3292 /tools/merge.py | |
parent | 343e4b777d77aa33ffed1d241c763fe5335439fb (diff) | |
download | trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.tar.gz trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.tar.xz trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.zip |
write merge.py in progress
Diffstat (limited to 'tools/merge.py')
-rw-r--r-- | tools/merge.py | 105 |
1 files changed, 105 insertions, 0 deletions
#!/usr/bin/python3
"""Merge words that were recognized in several dictionary files.

Each dictionary file holds one entry per line, made of three
whitespace-separated fields: ``word pinyin freq``.

Intended call order:

1. ``load_recognized_word`` on every dictionary file, to count in how many
   files each word appears.
2. ``filter_recognized_words`` on every dictionary file, to move the entries
   of frequently seen words out of the file and into ``merged_words_dict``.
3. ``save_merged_words`` once, to write the merged entries to a new file.
"""
import os

#minimum duplicates in recognized dictionaries to be merged
threshold = 3

#minimum pinyin frequency
minimum = 3

#default pinyin total frequency
default = 100

#word -> number of dictionary files in which the word was seen
words_dict = {}


def load_recognized_word(filename):
    """Count the words of one dictionary file into ``words_dict``.

    A word is counted once per file, however many lines (pinyins) it has
    there, so ``words_dict[word]`` ends up being the number of loaded files
    that contain ``word``.

    Raises ``ValueError`` when a non-empty line has fewer than three fields.
    """
    print(filename)

    words = set()
    with open(filename, "r") as wordfile:
        for oneline in wordfile:
            # Text mode already translates every line ending to "\n".
            oneline = oneline.rstrip("\n")

            if not oneline:
                continue

            (word, pinyin, freq) = oneline.split(None, 2)
            words.add(word)

    for word in words:
        words_dict[word] = words_dict.get(word, 0) + 1


#word -> list of (pinyin, freq) pairs collected from all filtered files
merged_words_dict = {}


def filter_recognized_words(filename):
    """Move the entries of widely recognized words out of ``filename``.

    Entries whose word was counted at least ``threshold`` times in
    ``words_dict`` are appended to ``merged_words_dict`` as ``(pinyin, freq)``
    pairs; every other non-empty line is kept.  The file is then rewritten
    in place with only the kept lines.

    Raises ``ValueError`` when a non-empty line has fewer than three fields
    or when its frequency field is not an integer.
    """
    print(filename)
    lines = []

    #loading
    with open(filename, "r") as wordfile:
        for oneline in wordfile:
            oneline = oneline.rstrip("\n")

            if not oneline:
                continue

            (word, pinyin, freq) = oneline.split(None, 2)
            freq = int(freq)

            # Words unknown to words_dict count as zero occurrences.
            if words_dict.get(word, 0) < threshold:
                lines.append(oneline)
                continue

            merged_words_dict.setdefault(word, []).append((pinyin, freq))

    #saving
    # "\n" rather than os.linesep: a text-mode file translates "\n" to the
    # platform line ending itself, writing os.linesep would double the "\r"
    # on Windows.
    with open(filename, "w") as wordfile:
        wordfile.writelines(oneline + "\n" for oneline in lines)


def save_merged_words(filename):
    """Write the content of ``merged_words_dict`` to ``filename``.

    For every word the frequencies of identical pinyins are summed, then
    rescaled so that all pinyins of the word total about ``default``.
    Pinyins whose rescaled frequency is below ``minimum`` are dropped.
    Output lines are tab-separated: ``word<TAB>pinyin<TAB>freq``.

    ``filename`` is created, or truncated when it already exists.
    """
    print(filename)

    with open(filename, "w") as wordfile:
        for word, pairs in merged_words_dict.items():
            # Sum the frequencies reported for the same pinyin.
            pinyins = {}
            for pinyin, freq in pairs:
                pinyins[pinyin] = pinyins.get(pinyin, 0) + freq

            freqsum = sum(pinyins.values())
            if freqsum <= 0:
                # Nothing to normalise against; avoids a division by zero.
                continue

            for pinyin, freq in pinyins.items():
                freq = int(default * freq / freqsum)

                if freq < minimum:
                    continue

                oneline = '\t'.join((word, pinyin, str(freq)))
                wordfile.write(oneline + "\n")