diff options
-rw-r--r-- | lib/myconfig.py | 4 | ||||
-rw-r--r-- | markpinyin.py | 46 |
2 files changed, 50 insertions, 0 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 3459070..9a98852 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -14,6 +14,7 @@ class MyConfig:
         'PartialWordThresholdEpoch': 4, \
         'PartialWordEpoch': 5, \
         'NewWordEpoch': 6, \
+        'MarkPinyinEpoch': 7, \
     }
 
     def getEpochs(self):
@@ -167,3 +168,6 @@ class MyConfig:
 
     def getNewWordFileName(self):
         return "newword.txt"
+
+    def getRecognizedWordFileName(self):
+        return "recognized.txt"
diff --git a/markpinyin.py b/markpinyin.py
index ff52181..cf03e23 100644
--- a/markpinyin.py
+++ b/markpinyin.py
@@ -2,6 +2,7 @@
 import os
 import sqlite3
 from argparse import ArgumentParser
+from operator import itemgetter
 import utils
 from myconfig import MyConfig
 from dirwalk import walkIndex
@@ -14,3 +15,48 @@ words_dir = config.getWordRecognizerDir()
 os.chdir(words_dir)
 #chdir done
 
+
+atomic_words_list = []
+merged_words_list = []
+
+
+def load_atomic_words(filename):
+    wordsfile = open(filename)
+    for oneline in wordsfile.readlines():
+        oneline = oneline.rstrip(os.linesep)
+
+        if len(oneline) == 0:
+            continue
+
+        (word, pinyin, freq) = oneline.split(None, 2)
+        freq = int(freq)
+
+        atomic_words_list.append((word, pinyin, freq))
+
+    wordsfile.close()
+    #ascending sort
+    atomic_words_list.sort(key=itemgetter(0, 1))
+
+
+def load_merged_words(filename):
+    wordsfile = open(filename)
+    for oneline in wordsfile.readlines():
+        oneline = oneline.rstrip(os.linesep)
+
+        if len(oneline) == 0:
+            continue
+
+        (word, prefix, postfix, freq) = oneline.split(None, 3)
+        freq = int(freq)
+
+        merged_words_list.append((word, prefix, postfix, freq))
+
+    wordsfile.close()
+
+    #ascending sort
+    merged_words_list.sort(key=itemgetter(0, 1, 2))
+
+
+#loading old words
+load_atomic_words(config.getWordsWithPinyinFileName())
+#print(atomic_words_list)