summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2016-07-25 15:58:40 +0800
committerPeng Wu <alexepico@gmail.com>2016-07-25 15:58:40 +0800
commitc798526a5c4138e09afd10620d645a7c133ef7fe (patch)
treed8ff7ede6fed535b9a12cd1226d951a00e65d63f /tools
parenta2262b86a4e2d043efd2ef1795e7d76c814ade65 (diff)
downloadtrainer-c798526a5c4138e09afd10620d645a7c133ef7fe.tar.gz
trainer-c798526a5c4138e09afd10620d645a7c133ef7fe.tar.xz
trainer-c798526a5c4138e09afd10620d645a7c133ef7fe.zip
write striptones.py
Diffstat (limited to 'tools')
-rw-r--r--tools/striptones.py78
1 files changed, 78 insertions, 0 deletions
diff --git a/tools/striptones.py b/tools/striptones.py
new file mode 100644
index 0000000..b05de15
--- /dev/null
+++ b/tools/striptones.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python3
+import os
+from argparse import ArgumentParser
+
+from distill import strip_tone
+
+'''
+This tool accepts the same format as utils/storage/gen_pinyin_table.cpp.
+
+The addon dictionaries already have their pinyin tones removed by distill.py.
+'''
+
+# Module-level state shared by add_words_dict, load_phrase and save_phrase.
+# words_list records every word in input order (duplicates included) so the
+# output keeps that order; words_dict maps each word to its list of
+# (pinyin, freq) tuples and lets save_phrase print each word only once.
+words_list = []
+words_dict = {}
+
+
+def add_words_dict(word, pinyin, freq):
+    """Record one (word, pinyin, freq) entry in words_dict.
+
+    The pinyin is first passed through strip_tone.  A word that is not in
+    words_dict yet gets a fresh one-element list.  For a known word, an
+    existing entry with the same stripped pinyin absorbs the new frequency
+    (and a 'Collapse' line is printed); otherwise the pair is appended.
+    """
+    toneless = strip_tone(pinyin)
+    entries = words_dict.get(word)
+
+    # First sighting of this word: nothing to merge with.
+    if entries is None:
+        words_dict[word] = [(toneless, freq)]
+        return
+
+    merged = False
+    for index, (known_pinyin, known_freq) in enumerate(entries):
+        if known_pinyin != toneless:
+            continue
+        # print out the collapsed word and pinyin pair
+        print('Collapse: {0} and {1}'.format(word, toneless))
+        freq += known_freq
+        entries[index] = (toneless, freq)
+        merged = True
+
+    if not merged:
+        entries.append((toneless, freq))
+
+
+def load_phrase(filename):
+    """Read phrases from filename into words_list and words_dict.
+
+    Each non-blank line holds three whitespace-separated fields: word,
+    pinyin and an integer frequency.  Blank lines are skipped; any other
+    line that does not match raises ValueError.
+    """
+    # 'with' closes the file even when a line fails to parse.
+    with open(filename, "r") as phrasefile:
+        # Iterate lazily instead of loading every line with readlines().
+        for oneline in phrasefile:
+            fields = oneline.split(None, 2)
+            if not fields:
+                # Blank or whitespace-only line: nothing to record.
+                continue
+            (word, pinyin, freq) = fields
+            # int() ignores the trailing newline left in the last field.
+            freq = int(freq)
+            # save the word order into word list
+            words_list.append(word)
+            add_words_dict(word, pinyin, freq)
+
+def save_phrase(filename):
+    """Write the collected phrases to filename, one entry per line.
+
+    Words are written in the order recorded in words_list, as
+    tab-separated word, pinyin and frequency.  Each word is removed from
+    words_dict once written, so a word that occurs several times in
+    words_list is only written at its first position.
+    """
+    # 'with' closes the file even if a write fails.
+    with open(filename, "w") as phrasefile:
+        for word in words_list:
+            # pop() both fetches the entries and marks the word as written.
+            pinyins = words_dict.pop(word, None)
+            if pinyins is None:
+                continue
+
+            # Write "\n", not os.linesep: the file is opened in text mode,
+            # which already translates "\n" to the platform line ending.
+            phrasefile.writelines(
+                "\t".join((word, pinyin, str(freq))) + "\n"
+                for (pinyin, freq) in pinyins)
+
+
+if __name__ == "__main__":
+    # Command line: striptones.py INFILE OUTFILE
+    cli = ArgumentParser(
+        description='strip tones from gen_pinyin_table input file.')
+    for name, text in (('infile', 'input file'),
+                       ('outfile', 'output file')):
+        cli.add_argument(name, help=text)
+
+    options = cli.parse_args()
+    print(options)
+
+    # Load everything first, then write the merged result.
+    load_phrase(options.infile)
+    save_phrase(options.outfile)