summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2013-01-25 15:27:55 +0800
committer Peng Wu <alexepico@gmail.com> 2013-01-25 15:31:26 +0800
commit a508523904d2f42783b83be5b23be63f3fe3b521 (patch)
tree ffa391225514ccdb6940578bf663a2f106d6193c
parent 1638a0ef2818b4fab0454d037d3e75fe635154a2 (diff)
download trainer-a508523904d2f42783b83be5b23be63f3fe3b521.tar.gz
trainer-a508523904d2f42783b83be5b23be63f3fe3b521.tar.xz
trainer-a508523904d2f42783b83be5b23be63f3fe3b521.zip
write markpinyin.py in progress
-rw-r--r-- lib/myconfig.py 4
-rw-r--r-- markpinyin.py 46
2 files changed, 50 insertions, 0 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index 3459070..9a98852 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -14,6 +14,7 @@ class MyConfig:
'PartialWordThresholdEpoch': 4, \
'PartialWordEpoch': 5, \
'NewWordEpoch': 6, \
+ 'MarkPinyinEpoch': 7, \
}
def getEpochs(self):
@@ -167,3 +168,6 @@ class MyConfig:
def getNewWordFileName(self):
return "newword.txt"
+
+ def getRecognizedWordFileName(self):
+ return "recognized.txt"
diff --git a/markpinyin.py b/markpinyin.py
index ff52181..cf03e23 100644
--- a/markpinyin.py
+++ b/markpinyin.py
@@ -2,6 +2,7 @@
import os
import sqlite3
from argparse import ArgumentParser
+from operator import itemgetter
import utils
from myconfig import MyConfig
from dirwalk import walkIndex
@@ -14,3 +15,48 @@ words_dir = config.getWordRecognizerDir()
os.chdir(words_dir)
#chdir done
+
+atomic_words_list = []
+merged_words_list = []
+
+
def load_atomic_words(filename):
    """Load atomic words from *filename* into ``atomic_words_list``.

    Every non-blank line is split on whitespace into three fields,
    ``word pinyin freq``.  ``freq`` is converted to ``int`` and the tuple
    ``(word, pinyin, freq)`` is appended to the module-level
    ``atomic_words_list``.  Once the whole file has been read the list is
    sorted in place, ascending by ``(word, pinyin)``.

    Empty and whitespace-only lines are skipped.

    Raises:
        ValueError: a line has fewer than three fields, or its last field
            is not an integer.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm that matches how the words file is written.
    # The with-block guarantees the handle is closed even when a malformed
    # line raises part-way through the file.
    with open(filename) as wordsfile:
        for oneline in wordsfile:
            # split(None, ...) ignores leading whitespace and the trailing
            # newline is tolerated by int(), so no explicit rstrip is needed.
            fields = oneline.split(None, 2)
            if not fields:
                # blank or whitespace-only line
                continue

            (word, pinyin, freq) = fields
            atomic_words_list.append((word, pinyin, int(freq)))

    #ascending sort
    atomic_words_list.sort(key=itemgetter(0, 1))
+
+
def load_merged_words(filename):
    """Load merged words from *filename* into ``merged_words_list``.

    Every non-blank line is split on whitespace into four fields,
    ``word prefix postfix freq``.  ``freq`` is converted to ``int`` and the
    tuple ``(word, prefix, postfix, freq)`` is appended to the module-level
    ``merged_words_list``.  Once the whole file has been read the list is
    sorted in place, ascending by ``(word, prefix, postfix)``.

    Empty and whitespace-only lines are skipped.

    Raises:
        ValueError: a line has fewer than four fields, or its last field
            is not an integer.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm that matches how the words file is written.
    # The with-block guarantees the handle is closed even when a malformed
    # line raises part-way through the file.
    with open(filename) as wordsfile:
        for oneline in wordsfile:
            # split(None, ...) ignores leading whitespace and the trailing
            # newline is tolerated by int(), so no explicit rstrip is needed.
            fields = oneline.split(None, 3)
            if not fields:
                # blank or whitespace-only line
                continue

            (word, prefix, postfix, freq) = fields
            merged_words_list.append((word, prefix, postfix, int(freq)))

    #ascending sort
    merged_words_list.sort(key=itemgetter(0, 1, 2))
+
+
+#loading old words
+load_atomic_words(config.getWordsWithPinyinFileName())
+#print(atomic_words_list)