write merge.py in progress

author: Peng Wu <alexepico@gmail.com> 2013-01-31 16:58:26 +0800
committer: Peng Wu <alexepico@gmail.com> 2013-01-31 16:58:26 +0800
commit: aa75fc05cced38be5bee6d4b8cc572859d3761c8 (patch)
tree: 66403600d7d2764898e090aa40967920031e3292 /tools/merge.py
parent: 343e4b777d77aa33ffed1d241c763fe5335439fb (diff)
download: trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.tar.gz
trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.tar.xz
trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.zip
1 files changed, 105 insertions, 0 deletions
diff --git a/tools/merge.py b/tools/merge.py
new file mode 100644
index 0000000..c018bfe
--- /dev/null
+++ b/tools/merge.py
@@ -0,0 +1,105 @@
+#!/usr/bin/python3
+import os
+
+# Minimum number of recognized dictionaries a word must appear in to be merged.
+threshold = 3
+
+# Minimum scaled frequency a pinyin needs in order to be written when merging.
+minimum = 3
+
+# Total frequency that each merged word's pinyin frequencies are scaled to.
+default = 100
+
+words_dict = {}  # word -> number of loaded dictionary files that contain it
+
+def load_recognized_word(filename):
+    """Count one more dictionary occurrence for every word in filename.
+
+    Each non-blank line must hold three whitespace-separated fields
+    (word, pinyin, freq); a ValueError is raised otherwise. A word is
+    counted once per file in the module-level words_dict, however many
+    lines (pinyins) it has in that file.
+    """
+    print(filename)
+
+    words = set()
+    # "with" closes the file even when a malformed line raises.
+    with open(filename, "r") as wordfile:
+        for oneline in wordfile:
+            # Skip empty and whitespace-only lines instead of crashing.
+            if not oneline.strip():
+                continue
+
+            (word, _pinyin, _freq) = oneline.split(None, 2)
+            words.add(word)
+
+    for word in words:
+        words_dict[word] = words_dict.get(word, 0) + 1
+
+
+merged_words_dict = {}  # word -> list of (pinyin, freq) pairs awaiting merge
+
+def filter_recognized_words(filename):
+    """Move frequently recognized words out of filename for merging.
+
+    Lines whose word was counted in at least `threshold` dictionaries
+    (per words_dict) are removed from the file and their (pinyin, freq)
+    pairs are appended to merged_words_dict[word]; every other non-blank
+    line is written back unchanged. The file is rewritten in place.
+
+    Raises ValueError if a line lacks the three fields word, pinyin and
+    freq, or if freq is not an integer.
+    """
+    print(filename)
+    lines = []
+
+    #loading
+    with open(filename, "r") as wordfile:
+        for oneline in wordfile:
+            # Text mode always yields "\n" line endings, not os.linesep.
+            oneline = oneline.rstrip("\n")
+            # Blank (including whitespace-only) lines are dropped.
+            if not oneline.strip():
+                continue
+
+            (word, pinyin, freq) = oneline.split(None, 2)
+            freq = int(freq)
+
+            if word not in words_dict or words_dict[word] < threshold:
+                lines.append(oneline)
+                continue
+
+            merged_words_dict.setdefault(word, []).append((pinyin, freq))
+
+    #saving: write "\n" and let text mode translate it; writing
+    #os.linesep would produce "\r\r\n" on Windows.
+    with open(filename, "w") as wordfile:
+        wordfile.writelines(oneline + "\n" for oneline in lines)
+
+
+def save_merged_words(filename):
+    """Write the merged words to filename as "word<TAB>pinyin<TAB>freq".
+
+    For each word in merged_words_dict the frequencies of identical
+    pinyins are summed, then scaled so that they total about `default`
+    per word. Pinyins whose scaled frequency is below `minimum` are
+    omitted. Any existing content of filename is overwritten.
+    """
+    print(filename)
+
+    # Open for writing ("r" made every write fail).
+    with open(filename, "w") as wordfile:
+        for word, pairs in merged_words_dict.items():
+            pinyins = {}
+            for pinyin, freq in pairs:
+                pinyins[pinyin] = pinyins.get(pinyin, 0) + freq
+
+            freqsum = sum(pinyins.values())
+            if freqsum == 0:
+                continue  # nothing to normalize; avoids dividing by zero
+
+            # Iterate the per-pinyin totals, not the raw (pinyin, freq) list.
+            for pinyin, freq in pinyins.items():
+                freq = int(default * freq / freqsum)
+                if freq >= minimum:
+                    wordfile.write('\t'.join((word, pinyin, str(freq))) + "\n")
author	Peng Wu <alexepico@gmail.com>	2013-01-31 16:58:26 +0800
committer	Peng Wu <alexepico@gmail.com>	2013-01-31 16:58:26 +0800
commit	aa75fc05cced38be5bee6d4b8cc572859d3761c8 (patch)
tree	66403600d7d2764898e090aa40967920031e3292 /tools/merge.py
parent	343e4b777d77aa33ffed1d241c763fe5335439fb (diff)
download	trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.tar.gz trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.tar.xz trainer-aa75fc05cced38be5bee6d4b8cc572859d3761c8.zip