diff options
author | Peng Wu <alexepico@gmail.com> | 2013-04-18 14:51:13 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2013-04-18 15:28:46 +0800 |
commit | 632e0ed5fc56094bf9482a2b4485e27eee390439 (patch) | |
tree | 13e9f30d7a6809b4556e41cb7b9e222748d75900 | |
parent | 3179af0ea400d9c6653fda7d2c1bc8f6cef1f75b (diff) | |
download | trainer-632e0ed5fc56094bf9482a2b4485e27eee390439.tar.gz trainer-632e0ed5fc56094bf9482a2b4485e27eee390439.tar.xz trainer-632e0ed5fc56094bf9482a2b4485e27eee390439.zip |
write mergeseq.py
-rw-r--r-- | lib/myconfig.py | 10 | ||||
-rw-r--r-- | mergeseq.py | 107 | ||||
-rwxr-xr-x | segment.py | 6 |
3 files changed, 121 insertions, 2 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py index fb77c0c..5bb3f83 100644 --- a/lib/myconfig.py +++ b/lib/myconfig.py @@ -14,6 +14,7 @@ class MyConfig: 'PartialWordEpoch': 1, \ 'NewWordEpoch': 1, \ 'MarkPinyinEpoch': 1, \ + 'MergeSequenceEpoch': 1, \ } def getEpochs(self): @@ -107,6 +108,15 @@ class MyConfig: def getTextPostfix(self): return '.text' + def getBackupPostfix(self): + return '.backup' + + def getMergedPostfix(self): + return '.merged' + + def getMergedReportPostfix(self): + return '.merged.report' + def getFinalModelFileName(self): return 'interpolation2.text' diff --git a/mergeseq.py b/mergeseq.py new file mode 100644 index 0000000..2064b15 --- /dev/null +++ b/mergeseq.py @@ -0,0 +1,107 @@ +#!/usr/bin/python3 +import os +import os.path +import sys +from argparse import ArgumentParser +from subprocess import Popen, PIPE +import utils +from myconfig import MyConfig + + +config = MyConfig() + +#change cwd to the libpinyin data directory +libpinyin_dir = config.getToolsDir() +libpinyin_sub_dir = os.path.join(libpinyin_dir, 'data') +os.chdir(libpinyin_sub_dir) +#chdir done + + +def handleError(error): + sys.exit(error) + + +def mergeOneText(infile, outfile, reportfile): + infilestatuspath = infile + config.getStatusPostfix() + infilestatus = utils.load_status(infilestatuspath) + if not utils.check_epoch(infilestatus, 'Segment'): + raise utils.EpochError('Please segment first.\n') + if utils.check_epoch(infilestatus, 'MergeSequence'): + return + + infile = infile + config.getSegmentPostfix() + + #begin processing + cmdline = ['../utils/segment/mergeseq', \ + '-o', outfile, infile] + + subprocess = Popen(cmdline, shell=False, stderr=PIPE, \ + close_fds=True) + + lines = subprocess.stderr.readlines() + if lines: + print('found error report') + with open(reportfile, 'wb') as f: + f.writelines(lines) + + os.waitpid(subprocess.pid, 0) + #end processing + + utils.sign_epoch(infilestatus, 'MergeSequence') + utils.store_status(infilestatuspath, infilestatus) + + +def handleOneIndex(indexpath): + indexstatuspath = indexpath + config.getStatusPostfix() + indexstatus = utils.load_status(indexstatuspath) + if not utils.check_epoch(indexstatus, 'Segment'): + raise utils.EpochError('Please segment first.\n') + if utils.check_epoch(indexstatus, 'MergeSequence'): + return + + #begin processing + indexfile = open(indexpath, 'r') + for oneline in indexfile.readlines(): + #remove tailing '\n' + oneline = oneline.rstrip(os.linesep) + (title, textpath) = oneline.split('#') + + infile = config.getTextDir() + textpath + outfile = config.getTextDir() + textpath + config.getMergedPostfix() + reportfile = config.getTextDir() + textpath + \ + config.getMergedReportPostfix() + + print("Processing " + title + '#' + textpath) + mergeOneText(infile, outfile, reportfile) + print("Processed " + title + '#' + textpath) + + indexfile.close() + #end processing + + utils.sign_epoch(indexstatus, 'MergeSequence') + utils.store_status(indexstatuspath, indexstatus) + + +def walkThroughIndex(path): + for root, dirs, files in os.walk(path, topdown=True, onerror=handleError): + for onefile in files: + filepath = os.path.join(root, onefile) + if onefile.endswith(config.getIndexPostfix()): + handleOneIndex(filepath) + elif onefile.endswith(config.getStatusPostfix()): + pass + else: + print('Unexpected file:' + filepath) + +if __name__ == '__main__': + parser = ArgumentParser(description='Merge all corpus segmented documents.') + parser.add_argument('--indexdir', action='store', \ + help='index directory', \ + default=config.getTextIndexDir()) + + args = parser.parse_args() + print(args) + walkThroughIndex(args.indexdir) + print('done') + + @@ -10,7 +10,7 @@ from myconfig import MyConfig config = MyConfig() -#change cwd to the libpinyin utils/segment directory +#change cwd to the libpinyin data directory libpinyin_dir = config.getToolsDir() libpinyin_sub_dir = os.path.join(libpinyin_dir, 'data') os.chdir(libpinyin_sub_dir) @@ -43,7 +43,6 @@ def segmentOneText(infile, outfile, reportfile, fast): print('found error report') with open(reportfile, 'wb') as f: f.writelines(lines) - f.close() os.waitpid(subprocess.pid, 0) #end processing @@ -64,13 +63,16 @@ def handleOneIndex(indexpath, fast): #remove tailing '\n' oneline = oneline.rstrip(os.linesep) (title, textpath) = oneline.split('#') + infile = config.getTextDir() + textpath outfile = config.getTextDir() + textpath + config.getSegmentPostfix() reportfile = config.getTextDir() + textpath + \ config.getSegmentReportPostfix() + print("Processing " + title + '#' + textpath) segmentOneText(infile, outfile, reportfile, fast) print("Processed " + title + '#' + textpath) + indexfile.close() #end processing |