summaryrefslogtreecommitdiffstats
path: root/tools/merge.py
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2013-01-31 16:58:26 +0800
committerPeng Wu <alexepico@gmail.com>2013-01-31 16:58:26 +0800
commitaa75fc05cced38be5bee6d4b8cc572859d3761c8 (patch)
tree66403600d7d2764898e090aa40967920031e3292 /tools/merge.py
parent343e4b777d77aa33ffed1d241c763fe5335439fb (diff)
downloadtrainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.tar.gz
trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.tar.xz
trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.zip
write merge.py in progress
Diffstat (limited to 'tools/merge.py')
-rw-r--r--tools/merge.py105
1 files changed, 105 insertions, 0 deletions
diff --git a/tools/merge.py b/tools/merge.py
new file mode 100644
index 0000000..c018bfe
--- /dev/null
+++ b/tools/merge.py
@@ -0,0 +1,105 @@
+#!/usr/bin/python3
+import os
+
+#minimum duplicates in recognized dictionaries to be merged
+threshold = 3
+
+#minimum pinyin frequency
+minimum = 3
+
+#default pinyin total frequency
+default = 100
+
+words_dict = {}
+
+def load_recognized_word(filename):
+ print(filename)
+
+ words = set([])
+ wordfile = open(filename, "r")
+ for oneline in wordfile.readlines():
+ oneline = oneline.rstrip(os.linesep)
+
+ if len(oneline) == 0:
+ continue
+
+ (word, pinyin, freq) = oneline.split(None, 2)
+
+ if not word in words:
+ words.add(word)
+
+ wordfile.close()
+
+ for word in words:
+ if word in words_dict:
+ words_dict[word] += 1
+ else:
+ words_dict[word] = 1
+
+
+merged_words_dict = {}
+
+def filter_recognized_words(filename):
+ print(filename)
+ lines = []
+
+ #loading
+ wordfile = open(filename, "r")
+ for oneline in wordfile.readlines():
+ oneline = oneline.rstrip(os.linesep)
+
+ if len(oneline) == 0:
+ continue
+
+ (word, pinyin, freq) = oneline.split(None, 2)
+ freq = int(freq)
+
+ if not word in words_dict:
+ lines.append(oneline)
+ continue
+
+ occurs = words_dict[word]
+ if occurs < threshold:
+ lines.append(oneline)
+ continue
+
+ if word in merged_words_dict:
+ merged_words_dict[word].append((pinyin, freq))
+ else:
+ merged_words_dict[word] = [(pinyin, freq)]
+
+ wordfile.close()
+
+ #saving
+ wordfile = open(filename, "w")
+ for oneline in lines:
+ wordfile.writelines([oneline, os.linesep])
+ wordfile.close()
+
+
+def save_merged_words(filename):
+ print(filename)
+
+ wordfile = open(filename, "r")
+ for word, pairs in merged_words_dict.items():
+ pinyins = {}
+ for pinyin, freq in pairs:
+ if pinyin in pinyins:
+ pinyins[pinyin] += freq
+ else:
+ pinyins[pinyin] = freq
+
+ freqsum = sum([ freq for pinyin, freq in pinyins.items() ])
+
+ for pinyin, freq in pairs.items():
+ freq = int(default * freq / freqsum)
+
+ if freq < minimum:
+ continue
+
+ freq = str(freq)
+
+ oneline = '\t'.join(word, pinyin, freq)
+ wordfile.writelines([oneline, os.linesep])
+
+ wordfile.close()