summary refs log tree commit diff stats
path: root/segment.py
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2011-07-18 15:31:48 +0800
committer Peng Wu <alexepico@gmail.com> 2011-07-18 15:31:48 +0800
commit fa2643cf811583b489209ca51e5ecade1cc1a080 (patch)
tree c3e5382155295212f8fa64c107667ed68b575475 /segment.py
parent 31975aa62056940ad4c05c5ad8c0cef504151793 (diff)
download trainer-fa2643cf811583b489209ca51e5ecade1cc1a080.tar.gz
trainer-fa2643cf811583b489209ca51e5ecade1cc1a080.tar.xz
trainer-fa2643cf811583b489209ca51e5ecade1cc1a080.zip
wrote segment.py
Diffstat (limited to 'segment.py')
-rwxr-xr-x[-rw-r--r--] segment.py 42
1 files changed, 36 insertions, 6 deletions
diff --git a/segment.py b/segment.py
index 942a70e..6c6c173 100644..100755
--- a/segment.py
+++ b/segment.py
@@ -1,24 +1,46 @@
#!/usr/bin/python3
import os
import os.path
+from argparse import ArgumentParser
from subprocess import Popen, PIPE
+from myconfig import MyConfig
+config = MyConfig()
+
+#change cwd to the libpinyin utils/segment directory
+libpinyin_dir = config.getToolsDir()
+libpinyin_sub_dir = os.path.join(libpinyin_dir, 'utils', 'segment')
+os.chdir(libpinyin_sub_dir)
+#chdir done
+
def handleError(error):
sys.exit(error)
-def segmentOneText(infile, outfile):
- pass
+def segmentOneText(infile, outfile, reportfile):
+ cmdline = './ngseg <"' + infile + '" 2>"' + reportfile + '"'
+ subprocess = Popen(cmdline, shell=True, stdout=PIPE, \
+ close_fds=True)
+
+ with open(outfile, 'wb') as f:
+ f.writelines(subprocess.stdout.readlines())
+ f.close()
+
+ os.waitpid(subprocess.pid, 0)
def handleOneIndex(indexpath):
indexfile = open(indexpath, 'r')
for oneline in indexfile.readlines():
(title, textpath) = oneline.split('#')
+ #remove tailing '\n'
+ textpath = textpath.rstrip(os.linesep)
infile = config.getTextDir() + textpath
outfile = config.getTextDir() + textpath + config.getSegmentPostfix()
- print("Processing " + title)
- segmentOneText(infile, outfile)
- print("Processed "+ title)
+ reportfile = config.getTextDir() + textpath + \
+ config.getSegmentReportPostfix()
+ print("Processing " + title + '#' + textpath)
+ segmentOneText(infile, outfile, reportfile)
+ print("Processed "+ title + '#' + textpath)
indexfile.close()
def walkThroughIndex(path):
@@ -32,4 +54,12 @@ def walkThroughIndex(path):
if __name__ == '__main__':
- pass
+ parser = ArgumentParser(description='Segment all raw corpus documents.')
+ parser.add_argument('indexdir', action='store', \
+ help='index directory')
+
+ args = parser.parse_args()
+ walkThroughIndex(args.indexdir)
+ print('done')
+
+