summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2011-07-18 15:31:48 +0800
committer: Peng Wu <alexepico@gmail.com> 2011-07-18 15:31:48 +0800
commit: fa2643cf811583b489209ca51e5ecade1cc1a080 (patch)
tree: c3e5382155295212f8fa64c107667ed68b575475
parent: 31975aa62056940ad4c05c5ad8c0cef504151793 (diff)
download: trainer-fa2643cf811583b489209ca51e5ecade1cc1a080.tar.gz
trainer-fa2643cf811583b489209ca51e5ecade1cc1a080.tar.xz
trainer-fa2643cf811583b489209ca51e5ecade1cc1a080.zip
wrote segment.py
-rw-r--r-- lib/myconfig.py 9
-rwxr-xr-x[-rw-r--r--] segment.py 42
2 files changed, 43 insertions, 8 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index ec264fc..4ed631f 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -28,12 +28,12 @@ class MyConfig:
def getFinalModelDir(self):
return self.m_trainer_dir + os.sep + 'finals'
- m_tools_dir = '/media/data/Program/trainer/tools'
+ m_tools_dir = '/media/data/Program/trainer/tools/libpinyin'
def getToolsDir(self):
return self.m_tools_dir
- m_evals_dir = '/media/data/Program/trainer/evals'
+ m_evals_dir = '/media/data/Program/trainer/evals/libpinyin'
def getEvalsDir(self):
return self.m_evals_dir
@@ -51,6 +51,11 @@ class MyConfig:
def getSegmentPostfix(self):
return self.m_segment_postfix
+ m_segment_report_postfix = '.segment.report'
+
+ def getSegmentReportPostfix(self):
+ return self.m_segment_report_postfix
+
#For both index page, item page and binary model file
m_status_postfix = '.status'
diff --git a/segment.py b/segment.py
index 942a70e..6c6c173 100644..100755
--- a/segment.py
+++ b/segment.py
@@ -1,24 +1,46 @@
#!/usr/bin/python3
import os
import os.path
+from argparse import ArgumentParser
from subprocess import Popen, PIPE
+from myconfig import MyConfig
+config = MyConfig()
+
+#change cwd to the libpinyin utils/segment directory
+libpinyin_dir = config.getToolsDir()
+libpinyin_sub_dir = os.path.join(libpinyin_dir, 'utils', 'segment')
+os.chdir(libpinyin_sub_dir)
+#chdir done
+
def handleError(error):
sys.exit(error)
-def segmentOneText(infile, outfile):
- pass
+def segmentOneText(infile, outfile, reportfile):
+ cmdline = './ngseg <"' + infile + '" 2>"' + reportfile + '"'
+ subprocess = Popen(cmdline, shell=True, stdout=PIPE, \
+ close_fds=True)
+
+ with open(outfile, 'wb') as f:
+ f.writelines(subprocess.stdout.readlines())
+ f.close()
+
+ os.waitpid(subprocess.pid, 0)
def handleOneIndex(indexpath):
indexfile = open(indexpath, 'r')
for oneline in indexfile.readlines():
(title, textpath) = oneline.split('#')
+ #remove tailing '\n'
+ textpath = textpath.rstrip(os.linesep)
infile = config.getTextDir() + textpath
outfile = config.getTextDir() + textpath + config.getSegmentPostfix()
- print("Processing " + title)
- segmentOneText(infile, outfile)
- print("Processed "+ title)
+ reportfile = config.getTextDir() + textpath + \
+ config.getSegmentReportPostfix()
+ print("Processing " + title + '#' + textpath)
+ segmentOneText(infile, outfile, reportfile)
+ print("Processed "+ title + '#' + textpath)
indexfile.close()
def walkThroughIndex(path):
@@ -32,4 +54,12 @@ def walkThroughIndex(path):
if __name__ == '__main__':
- pass
+ parser = ArgumentParser(description='Segment all raw corpus documents.')
+ parser.add_argument('indexdir', action='store', \
+ help='index directory')
+
+ args = parser.parse_args()
+ walkThroughIndex(args.indexdir)
+ print('done')
+
+