summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2013-04-18 14:51:13 +0800
committer Peng Wu <alexepico@gmail.com> 2013-04-18 15:28:46 +0800
commit 632e0ed5fc56094bf9482a2b4485e27eee390439 (patch)
tree 13e9f30d7a6809b4556e41cb7b9e222748d75900
parent 3179af0ea400d9c6653fda7d2c1bc8f6cef1f75b (diff)
download trainer-632e0ed5fc56094bf9482a2b4485e27eee390439.tar.gz
trainer-632e0ed5fc56094bf9482a2b4485e27eee390439.tar.xz
trainer-632e0ed5fc56094bf9482a2b4485e27eee390439.zip
write mergeseq.py
-rw-r--r-- lib/myconfig.py 10
-rw-r--r-- mergeseq.py 107
-rwxr-xr-x segment.py 6
3 files changed, 121 insertions, 2 deletions
diff --git a/lib/myconfig.py b/lib/myconfig.py
index fb77c0c..5bb3f83 100644
--- a/lib/myconfig.py
+++ b/lib/myconfig.py
@@ -14,6 +14,7 @@ class MyConfig:
'PartialWordEpoch': 1, \
'NewWordEpoch': 1, \
'MarkPinyinEpoch': 1, \
+ 'MergeSequenceEpoch': 1, \
}
def getEpochs(self):
@@ -107,6 +108,15 @@ class MyConfig:
def getTextPostfix(self):
return '.text'
+ # Returns the constant postfix '.backup' (presumably appended to a path
+ # to name its backup copy -- no caller is visible in this patch).
+ def getBackupPostfix(self):
+ return '.backup'
+
+ # Returns the constant postfix '.merged'; mergeseq.py appends it to a
+ # text path to build the output file name of the merge step.
+ def getMergedPostfix(self):
+ return '.merged'
+
+ # Returns the constant postfix '.merged.report'; mergeseq.py appends it
+ # to a text path to build the name of the merge step's error report.
+ def getMergedReportPostfix(self):
+ return '.merged.report'
+
def getFinalModelFileName(self):
return 'interpolation2.text'
diff --git a/mergeseq.py b/mergeseq.py
new file mode 100644
index 0000000..2064b15
--- /dev/null
+++ b/mergeseq.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python3
+import os
+import os.path
+import sys
+from argparse import ArgumentParser
+from subprocess import Popen, PIPE
+import utils
+from myconfig import MyConfig
+
+
+# Shared configuration object used by every function in this script.
+config = MyConfig()
+
+# Change cwd to the libpinyin data directory (<tools dir>/data).
+# This happens at import time; mergeOneText() starts the external tool via
+# the relative path '../utils/segment/mergeseq', which is resolved against
+# this directory.
+libpinyin_dir = config.getToolsDir()
+libpinyin_sub_dir = os.path.join(libpinyin_dir, 'data')
+os.chdir(libpinyin_sub_dir)
+# chdir done
+
+
+def handleError(error):
+    """Terminate the script, handing *error* to the interpreter's exit path.
+
+    Passed as the ``onerror`` callback of ``os.walk`` in walkThroughIndex,
+    so a directory that cannot be listed aborts the whole run.
+    """
+    raise SystemExit(error)
+
+
+def mergeOneText(infile, outfile, reportfile):
+    """Run the external ``mergeseq`` tool on one segmented text.
+
+    infile     -- path of the text without postfix; the status file
+                  (infile + status postfix) is consulted, and the segmented
+                  file (infile + segment postfix) is the tool's input.
+    outfile    -- path handed to the tool's ``-o`` option.
+    reportfile -- receives whatever the tool wrote to stderr; it is only
+                  created when stderr was non-empty.
+
+    Returns None.  Does nothing when the 'MergeSequence' epoch is already
+    signed.  Raises utils.EpochError when the 'Segment' epoch is missing and
+    CalledProcessError when the tool exits with a non-zero status; in that
+    case the epoch stays unsigned so that a later run retries this text.
+    """
+    # local import: the file-level import only provides Popen and PIPE
+    from subprocess import CalledProcessError
+
+    infilestatuspath = infile + config.getStatusPostfix()
+    infilestatus = utils.load_status(infilestatuspath)
+    if not utils.check_epoch(infilestatus, 'Segment'):
+        raise utils.EpochError('Please segment first.\n')
+    if utils.check_epoch(infilestatus, 'MergeSequence'):
+        return
+
+    infile = infile + config.getSegmentPostfix()
+
+    #begin processing
+    # the tool path is relative to the cwd set at module import
+    cmdline = ['../utils/segment/mergeseq', '-o', outfile, infile]
+
+    proc = Popen(cmdline, shell=False, stderr=PIPE, close_fds=True)
+    # communicate() drains stderr, closes the pipe and reaps the child, so
+    # the Popen object also knows the exit status afterwards
+    errors = proc.communicate()[1]
+
+    if errors:
+        print('found error report')
+        with open(reportfile, 'wb') as f:
+            f.write(errors)
+
+    # do not mark the text as merged when the tool itself failed
+    if proc.returncode != 0:
+        raise CalledProcessError(proc.returncode, cmdline)
+    #end processing
+
+    utils.sign_epoch(infilestatus, 'MergeSequence')
+    utils.store_status(infilestatuspath, infilestatus)
+
+
+def handleOneIndex(indexpath):
+    """Merge every text listed in one index file.
+
+    Each non-empty line of the index has the form ``title#textpath``.  For
+    every entry the input, output and report paths are built from the text
+    directory plus ``textpath`` and passed to mergeOneText().
+
+    Returns None.  Does nothing when the index's 'MergeSequence' epoch is
+    already signed; raises utils.EpochError when its 'Segment' epoch is
+    missing.  The index epoch is signed only after all entries succeeded.
+    """
+    indexstatuspath = indexpath + config.getStatusPostfix()
+    indexstatus = utils.load_status(indexstatuspath)
+    if not utils.check_epoch(indexstatus, 'Segment'):
+        raise utils.EpochError('Please segment first.\n')
+    if utils.check_epoch(indexstatus, 'MergeSequence'):
+        return
+
+    #begin processing
+    # 'with' closes the index file even when mergeOneText() raises
+    with open(indexpath, 'r') as indexfile:
+        for oneline in indexfile:
+            #remove trailing '\n'
+            oneline = oneline.rstrip(os.linesep)
+            if not oneline:
+                # tolerate blank lines instead of failing on the unpacking
+                continue
+            (title, textpath) = oneline.split('#')
+
+            infile = config.getTextDir() + textpath
+            outfile = infile + config.getMergedPostfix()
+            reportfile = infile + config.getMergedReportPostfix()
+
+            print("Processing " + title + '#' + textpath)
+            mergeOneText(infile, outfile, reportfile)
+            print("Processed " + title + '#' + textpath)
+    #end processing
+
+    utils.sign_epoch(indexstatus, 'MergeSequence')
+    utils.store_status(indexstatuspath, indexstatus)
+
+
+def walkThroughIndex(path):
+    """Walk the tree below *path* and process every index file in it.
+
+    Files ending in the index postfix go to handleOneIndex(); files ending
+    in the status postfix are skipped silently; anything else is reported
+    on stdout.  Directory errors are routed to handleError().
+    """
+    walker = os.walk(path, topdown=True, onerror=handleError)
+    for dirpath, _subdirs, filenames in walker:
+        for name in filenames:
+            fullpath = os.path.join(dirpath, name)
+            if name.endswith(config.getIndexPostfix()):
+                handleOneIndex(fullpath)
+                continue
+            if name.endswith(config.getStatusPostfix()):
+                # status files live next to the index files; nothing to do
+                continue
+            print('Unexpected file:' + fullpath)
+
+if __name__ == '__main__':
+    # Command line: optional --indexdir, which defaults to the configured
+    # text index directory.
+    argparser = ArgumentParser(
+        description='Merge all corpus segmented documents.')
+    argparser.add_argument(
+        '--indexdir',
+        action='store',
+        default=config.getTextIndexDir(),
+        help='index directory')
+
+    options = argparser.parse_args()
+    # echo the parsed options before starting the walk
+    print(options)
+    walkThroughIndex(options.indexdir)
+    print('done')
+
+
diff --git a/segment.py b/segment.py
index fe0ba68..f753ef0 100755
--- a/segment.py
+++ b/segment.py
@@ -10,7 +10,7 @@ from myconfig import MyConfig
config = MyConfig()
-#change cwd to the libpinyin utils/segment directory
+#change cwd to the libpinyin data directory
libpinyin_dir = config.getToolsDir()
libpinyin_sub_dir = os.path.join(libpinyin_dir, 'data')
os.chdir(libpinyin_sub_dir)
@@ -43,7 +43,6 @@ def segmentOneText(infile, outfile, reportfile, fast):
print('found error report')
with open(reportfile, 'wb') as f:
f.writelines(lines)
- f.close()
os.waitpid(subprocess.pid, 0)
#end processing
@@ -64,13 +63,16 @@ def handleOneIndex(indexpath, fast):
#remove tailing '\n'
oneline = oneline.rstrip(os.linesep)
(title, textpath) = oneline.split('#')
+
infile = config.getTextDir() + textpath
outfile = config.getTextDir() + textpath + config.getSegmentPostfix()
reportfile = config.getTextDir() + textpath + \
config.getSegmentReportPostfix()
+
print("Processing " + title + '#' + textpath)
segmentOneText(infile, outfile, reportfile, fast)
print("Processed " + title + '#' + textpath)
+
indexfile.close()
#end processing