diff options
author | Peng Wu <alexepico@gmail.com> | 2013-01-25 15:27:55 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-01-25 15:31:26 +0800 |
commit | a508523904d2f42783b83be5b23be63f3fe3b521 (patch) | |
tree | ffa391225514ccdb6940578bf663a2f106d6193c | |
parent | 1638a0ef2818b4fab0454d037d3e75fe635154a2 (diff) | |
download | trainer-a508523904d2f42783b83be5b23be63f3fe3b521.tar.gz trainer-a508523904d2f42783b83be5b23be63f3fe3b521.tar.xz trainer-a508523904d2f42783b83be5b23be63f3fe3b521.zip |
write markpinyin.py in progress
-rw-r--r-- | lib/myconfig.py | 4 | ||||
-rw-r--r-- | markpinyin.py | 46 |
2 files changed, 50 insertions, 0 deletions
```diff
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 3459070..9a98852 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -14,6 +14,7 @@ class MyConfig:
         'PartialWordThresholdEpoch': 4, \
         'PartialWordEpoch': 5, \
         'NewWordEpoch': 6, \
+        'MarkPinyinEpoch': 7, \
         }
 
     def getEpochs(self):
@@ -167,3 +168,6 @@ class MyConfig:
 
     def getNewWordFileName(self):
         return "newword.txt"
+
+    def getRecognizedWordFileName(self):
+        return "recognized.txt"
diff --git a/markpinyin.py b/markpinyin.py
index ff52181..cf03e23 100644
--- a/markpinyin.py
+++ b/markpinyin.py
@@ -2,6 +2,7 @@
 import os
 import sqlite3
 from argparse import ArgumentParser
+from operator import itemgetter
 import utils
 from myconfig import MyConfig
 from dirwalk import walkIndex
@@ -14,3 +15,48 @@ words_dir = config.getWordRecognizerDir()
 os.chdir(words_dir)
 #chdir done
 
+
+atomic_words_list = []
+merged_words_list = []
+
+
+def load_atomic_words(filename):
+    wordsfile = open(filename)
+    for oneline in wordsfile.readlines():
+        oneline = oneline.rstrip(os.linesep)
+
+        if len(oneline) == 0:
+            continue
+
+        (word, pinyin, freq) = oneline.split(None, 2)
+        freq = int(freq)
+
+        atomic_words_list.append((word, pinyin, freq))
+
+    wordsfile.close()
+    #ascending sort
+    atomic_words_list.sort(key=itemgetter(0, 1))
+
+
+def load_merged_words(filename):
+    wordsfile = open(filename)
+    for oneline in wordsfile.readlines():
+        oneline = oneline.rstrip(os.linesep)
+
+        if len(oneline) == 0:
+            continue
+
+        (word, prefix, postfix, freq) = oneline.split(None, 3)
+        freq = int(freq)
+
+        merged_words_list.append((word, prefix, postfix, freq))
+
+    wordsfile.close()
+
+    #ascending sort
+    merged_words_list.sort(key=itemgetter(0, 1, 2))
+
+
+#loading old words
+load_atomic_words(config.getWordsWithPinyinFileName())
+#print(atomic_words_list)
```