diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-25 15:27:55 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-25 15:31:26 +0800 |
commit | a508523904d2f42783b83be5b23be63f3fe3b521 (patch) | |
tree | ffa391225514ccdb6940578bf663a2f106d6193c /markpinyin.py | |
parent | 1638a0ef2818b4fab0454d037d3e75fe635154a2 (diff) | |
download | trainer-a508523904d2f42783b83be5b23be63f3fe3b521.tar.gz trainer-a508523904d2f42783b83be5b23be63f3fe3b521.tar.xz trainer-a508523904d2f42783b83be5b23be63f3fe3b521.zip |
write markpinyin.py in progress
Diffstat (limited to 'markpinyin.py')
-rw-r--r-- | markpinyin.py | 46 |
1 files changed, 46 insertions, 0 deletions
# markpinyin.py as it reads after this commit, restored from the collapsed diff.
# Only the lines visible in the diff hunks are reproduced here.
import os
import sqlite3
from argparse import ArgumentParser
from operator import itemgetter
import utils
from myconfig import MyConfig
from dirwalk import walkIndex

# NOTE(review): the two diff hunks are not contiguous, so the lines between
# the imports and the statements below are not shown here; `config` is
# presumably created in that elided region -- confirm against the full file.

words_dir = config.getWordRecognizerDir()
os.chdir(words_dir)
#chdir done

# Module-level accumulators filled by the loaders below.
atomic_words_list = []   # (word, pinyin, freq) tuples
merged_words_list = []   # (word, prefix, postfix, freq) tuples


def load_atomic_words(filename):
    """Append the records found in *filename* to ``atomic_words_list``.

    Each non-blank line must hold three whitespace-separated fields:
    ``word pinyin freq``, where ``freq`` is an integer.  After loading,
    ``atomic_words_list`` is sorted in ascending order by (word, pinyin).

    Raises:
        ValueError: if a line has fewer than three fields or ``freq`` is
            not an integer.
    """
    # `with` guarantees the file is closed even when a malformed line raises.
    with open(filename) as wordsfile:
        for oneline in wordsfile:
            oneline = oneline.rstrip(os.linesep)

            # Skip empty and whitespace-only lines; the latter would
            # otherwise fail the tuple unpacking below.
            if not oneline.strip():
                continue

            (word, pinyin, freq) = oneline.split(None, 2)
            atomic_words_list.append((word, pinyin, int(freq)))

    #ascending sort
    atomic_words_list.sort(key=itemgetter(0, 1))


def load_merged_words(filename):
    """Append the records found in *filename* to ``merged_words_list``.

    Each non-blank line must hold four whitespace-separated fields:
    ``word prefix postfix freq``, where ``freq`` is an integer.  After
    loading, ``merged_words_list`` is sorted in ascending order by
    (word, prefix, postfix).

    Raises:
        ValueError: if a line has fewer than four fields or ``freq`` is
            not an integer.
    """
    # `with` guarantees the file is closed even when a malformed line raises.
    with open(filename) as wordsfile:
        for oneline in wordsfile:
            oneline = oneline.rstrip(os.linesep)

            # Skip empty and whitespace-only lines; the latter would
            # otherwise fail the tuple unpacking below.
            if not oneline.strip():
                continue

            (word, prefix, postfix, freq) = oneline.split(None, 3)
            merged_words_list.append((word, prefix, postfix, int(freq)))

    #ascending sort
    merged_words_list.sort(key=itemgetter(0, 1, 2))


#loading old words
load_atomic_words(config.getWordsWithPinyinFileName())
#print(atomic_words_list)