diff options
author | Aurélien Bompard <aurelien@bompard.org> | 2012-09-25 19:40:20 +0200 |
---|---|---|
committer | Aurélien Bompard <aurelien@bompard.org> | 2012-09-25 19:40:20 +0200 |
commit | ca1967c915458c7e6b54a43767a8b50dea277fb9 (patch) | |
tree | fe04ad19824cafeb191e311467abae8a2bee08e4 | |
parent | d723e64bdb3a39a16e9416ba608ad76bc8390e95 (diff) | |
download | kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.tar.gz kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.tar.xz kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.zip |
Refactor the scrubbing function
-rw-r--r-- | kittystore/scrub.py | 737 | ||||
-rw-r--r-- | kittystore/storm/schema/__init__.py | 10 |
2 files changed, 298 insertions, 449 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py index ac93baa..e7f4dde 100644 --- a/kittystore/scrub.py +++ b/kittystore/scrub.py @@ -25,6 +25,7 @@ import binascii import tempfile from cStringIO import StringIO from types import IntType, StringType +from mimetypes import guess_all_extensions from email.Utils import parsedate from email.Parser import HeaderParser @@ -39,106 +40,32 @@ from email.Charset import Charset #from Mailman.i18n import _ #from Mailman.Logging.Syslog import syslog #from Mailman.Utils import sha_new -from mailman.core.i18n import _ + from mailman.utilities.string import websafe, oneline +# TODO: don't do translations here, the system locale has no meaning to the +# web user +from mailman.core.i18n import _ + +# Path characters for common platforms +pre = re.compile(r'[/\\:]') +# All other characters to strip out of Content-Disposition: filenames +# (essentially anything that isn't an alphanum, dot, dash, or underscore). +sre = re.compile(r'[^-\w.]') +# Regexp to strip out leading dots +dre = re.compile(r'^\.*') -## Path characters for common platforms -#pre = re.compile(r'[/\\:]') -## All other characters to strip out of Content-Disposition: filenames -## (essentially anything that isn't an alphanum, dot, dash, or underscore). -#sre = re.compile(r'[^-\w.]') -## Regexp to strip out leading dots -#dre = re.compile(r'^\.*') -# BR = '<br>\n' -#SPACE = ' ' -# -#try: -# True, False -#except NameError: -# True = 1 -# False = 0 -# -# -#try: -# from mimetypes import guess_all_extensions -#except ImportError: -# import mimetypes -# def guess_all_extensions(ctype, strict=True): -# # BAW: sigh, guess_all_extensions() is new in Python 2.3 -# all = [] -# def check(map): -# for e, t in map.items(): -# if t == ctype: -# all.append(e) -# check(mimetypes.types_map) -# # Python 2.1 doesn't have common_types. Sigh, sigh. -# if not strict and hasattr(mimetypes, 'common_types'): -# check(mimetypes.common_types) -# return all -# -# -# -#def guess_extension(ctype, ext): -# # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot, -# # and .wiz are all mapped to application/msword. This sucks for finding -# # the best reverse mapping. If the extension is one of the giving -# # mappings, we'll trust that, otherwise we'll just guess. :/ -# all = guess_all_extensions(ctype, strict=False) -# if ext in all: -# return ext -# return all and all[0] -# -# -#def safe_strftime(fmt, t): -# try: -# return time.strftime(fmt, t) -# except (TypeError, ValueError, OverflowError): -# return None -# -# -#def calculate_attachments_dir(mlist, msg, msgdata): -# # Calculate the directory that attachments for this message will go -# # under. To avoid inode limitations, the scheme will be: -# # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files> -# # Start by calculating the date-based and msgid-hash components. -# fmt = '%Y%m%d' -# datestr = msg.get('Date') -# if datestr: -# now = parsedate(datestr) -# else: -# now = time.gmtime(msgdata.get('received_time', time.time())) -# datedir = safe_strftime(fmt, now) -# if not datedir: -# datestr = msgdata.get('X-List-Received-Date') -# if datestr: -# datedir = safe_strftime(fmt, datestr) -# if not datedir: -# # What next? Unixfrom, I guess. -# parts = msg.get_unixfrom().split() -# try: -# month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, -# 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12, -# }.get(parts[3], 0) -# day = int(parts[4]) -# year = int(parts[6]) -# except (IndexError, ValueError): -# # Best we can do I think -# month = day = year = 0 -# datedir = '%04d%02d%02d' % (year, month, day) -# assert datedir -# # As for the msgid hash, we'll base this part on the Message-ID: so that -# # all attachments for the same message end up in the same directory (we'll -# # uniquify the filenames in that directory as needed). We use the first 2 -# # and last 2 bytes of the SHA1 hash of the message id as the basis of the -# # directory name. Clashes here don't really matter too much, and that -# # still gives us a 32-bit space to work with. -# msgid = msg['message-id'] -# if msgid is None: -# msgid = msg['Message-ID'] = Utils.unique_message_id(mlist) -# # We assume that the message id actually /is/ unique! -# digest = sha_new(msgid).hexdigest() -# return os.path.join('attachments', datedir, digest[:4] + digest[-4:]) + + +def guess_extension(ctype, ext): + # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot, + # and .wiz are all mapped to application/msword. This sucks for finding + # the best reverse mapping. If the extension is one of the giving + # mappings, we'll trust that, otherwise we'll just guess. :/ + all = guess_all_extensions(ctype, strict=False) + if ext in all: + return ext + return all and all[0] def replace_payload_by_text(msg, text, charset): @@ -149,59 +76,64 @@ def replace_payload_by_text(msg, text, charset): #if isinstance(charset, unicode): # # email 3.0.1 (python 2.4) doesn't like unicode # charset = charset.encode('us-ascii') - msg.set_payload(text, charset) + #msg.set_payload(text, charset) + msg.set_payload('TODO: display attachment here and remove message subpart') -def save_attachment(mlist, msg, filter_html=True): - # Store name, content-type and size - return "TODO: handle attachments and return a link here" -def scrub_message(mlist, msg): - sanitize = 1 # TODO: implement other options - outer = True - charset = None - #lcset = Utils.GetCharSet(mlist.preferred_language) - #lcset_out = Charset(lcset).output_charset or lcset - lcset = "utf-8" - # Now walk over all subparts of this message and scrub out various types - format = delsp = None - for part in msg.walk(): - ctype = part.get_content_type() - # If the part is text/plain, we leave it alone - if ctype == 'text/plain': - # We need to choose a charset for the scrubbed message, so we'll - # arbitrarily pick the charset of the first text/plain part in the - # message. - # MAS: Also get the RFC 3676 stuff from this part. This seems to - # work OK for scrub_nondigest. It will also work as far as - # scrubbing messages for the archive is concerned, but pipermail - # doesn't pay any attention to the RFC 3676 parameters. The plain - # format digest is going to be a disaster in any case as some of - # messages will be format="flowed" and some not. ToDigest creates - # its own Content-Type: header for the plain digest which won't - # have RFC 3676 parameters. If the message Content-Type: headers - # are retained for display in the digest, the parameters will be - # there for information, but not for the MUA. This is the best we - # can do without having get_payload() process the parameters. - if charset is None: - charset = part.get_content_charset(lcset) - format = part.get_param('format') - delsp = part.get_param('delsp') - # TK: if part is attached then check charset and scrub if none - if part.get('content-disposition') and \ - not part.get_content_charset(): - omask = os.umask(002) - try: - url = save_attachment(mlist, part) - finally: - os.umask(omask) - filename = part.get_filename(_('not available')) - filename = oneline(filename, lcset) - replace_payload_by_text(part, _("""\ -An embedded and charset-unspecified text was scrubbed... -Name: %(filename)s -URL: %(url)s -"""), lcset) - elif ctype == 'text/html' and isinstance(sanitize, IntType): + +class Scrubber(object): + """ + Scrubs a single message, extracts attachments, and store them in the + database. + """ + + def __init__(self, mlist, msg, store): + self.mlist = mlist + self.msg = msg + self.store = store + + + def scrub(self): + sanitize = 1 # TODO: implement other options + outer = True + charset = None + #lcset = Utils.GetCharSet(self.mlist.preferred_language) + #lcset_out = Charset(lcset).output_charset or lcset + lcset = "utf-8" + # Now walk over all subparts of this message and scrub out various types + format = delsp = None + for part_num, part in enumerate(self.msg.walk()): + ctype = part.get_content_type() + # If the part is text/plain, we leave it alone + if ctype == 'text/plain': + # We need to choose a charset for the scrubbed message, so we'll + # arbitrarily pick the charset of the first text/plain part in the + # message. + # MAS: Also get the RFC 3676 stuff from this part. This seems to + # work OK for scrub_nondigest. It will also work as far as + # scrubbing messages for the archive is concerned, but pipermail + # doesn't pay any attention to the RFC 3676 parameters. The plain + # format digest is going to be a disaster in any case as some of + # messages will be format="flowed" and some not. ToDigest creates + # its own Content-Type: header for the plain digest which won't + # have RFC 3676 parameters. If the message Content-Type: headers + # are retained for display in the digest, the parameters will be + # there for information, but not for the MUA. This is the best we + # can do without having get_payload() process the parameters. + if charset is None: + charset = part.get_content_charset(lcset) + format = part.get_param('format') + delsp = part.get_param('delsp') + # TK: if part is attached then check charset and scrub if none + if part.get('content-disposition') and \ + not part.get_content_charset(): + self.save_attachment(part, part_num) + replace_payload_by_text(part, _("""\ + An embedded and charset-unspecified text was scrubbed... + Name: %(filename)s + URL: %(url)s + """), lcset) + elif ctype == 'text/html' and isinstance(sanitize, IntType): # if sanitize == 0: # if outer: # raise DiscardMessage @@ -217,314 +149,221 @@ URL: %(url)s # # Pull it out as an attachment but leave it unescaped. This # # is dangerous, but perhaps useful for heavily moderated # # lists. -# omask = os.umask(002) -# try: -# url = save_attachment(mlist, part, filter_html=False) -# finally: -# os.umask(omask) +# self.save_attachment(part, part_num, filter_html=False) # replace_payload_by_text(part, _("""\ #An HTML attachment was scrubbed... #URL: %(url)s #"""), lcset) # else: - if sanitize == 1: - # HTML-escape it and store it as an attachment, but make it - # look a /little/ bit prettier. :( - payload = websafe(part.get_payload(decode=True)) - # For whitespace in the margin, change spaces into - # non-breaking spaces, and tabs into 8 of those. Then use a - # mono-space font. Still looks hideous to me, but then I'd - # just as soon discard them. - def doreplace(s): - return s.expandtabs(8).replace(' ', ' ') - lines = [doreplace(s) for s in payload.split('\n')] - payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n' - part.set_payload(payload) - # We're replacing the payload with the decoded payload so this - # will just get in the way. - del part['content-transfer-encoding'] - omask = os.umask(002) - try: - url = save_attachment(mlist, part, filter_html=False) - finally: - os.umask(omask) + if sanitize == 1: + # HTML-escape it and store it as an attachment, but make it + # look a /little/ bit prettier. :( + payload = websafe(part.get_payload(decode=True)) + # For whitespace in the margin, change spaces into + # non-breaking spaces, and tabs into 8 of those. Then use a + # mono-space font. Still looks hideous to me, but then I'd + # just as soon discard them. + def doreplace(s): + return s.expandtabs(8).replace(' ', ' ') + lines = [doreplace(s) for s in payload.split('\n')] + payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n' + part.set_payload(payload) + # We're replacing the payload with the decoded payload so this + # will just get in the way. + del part['content-transfer-encoding'] + self.save_attachment(part, part_num, filter_html=False) + replace_payload_by_text(part, _("""\ + An HTML attachment was scrubbed... + URL: %(url)s + """), lcset) + elif ctype == 'message/rfc822': + # This part contains a submessage, so it too needs scrubbing + submsg = part.get_payload(0) + self.save_attachment(part, part_num) + subject = submsg.get('subject', _('no subject')) + subject = oneline(subject, lcset) + date = submsg.get('date', _('no date')) + who = submsg.get('from', _('unknown sender')) + size = len(str(submsg)) replace_payload_by_text(part, _("""\ -An HTML attachment was scrubbed... -URL: %(url)s -"""), lcset) - elif ctype == 'message/rfc822': - # This part contains a submessage, so it too needs scrubbing - submsg = part.get_payload(0) - omask = os.umask(002) - try: - url = save_attachment(mlist, part) - finally: - os.umask(omask) - subject = submsg.get('subject', _('no subject')) - subject = oneline(subject, lcset) - date = submsg.get('date', _('no date')) - who = submsg.get('from', _('unknown sender')) - size = len(str(submsg)) - replace_payload_by_text(part, _("""\ -An embedded message was scrubbed... -From: %(who)s -Subject: %(subject)s -Date: %(date)s -Size: %(size)s -URL: %(url)s -"""), lcset) - # If the message isn't a multipart, then we'll strip it out as an - # attachment that would have to be separately downloaded. Pipermail - # will transform the url into a hyperlink. - elif part.get_payload() and not part.is_multipart(): - payload = part.get_payload(decode=True) - ctype = part.get_content_type() - # XXX Under email 2.5, it is possible that payload will be None. - # This can happen when you have a Content-Type: multipart/* with - # only one part and that part has two blank lines between the - # first boundary and the end boundary. In email 3.0 you end up - # with a string in the payload. I think in this case it's safe to - # ignore the part. - if payload is None: - continue - size = len(payload) - omask = os.umask(002) - try: - url = save_attachment(mlist, part) - finally: - os.umask(omask) - desc = part.get('content-description', _('not available')) - desc = oneline(desc, lcset) - filename = part.get_filename(_('not available')) - filename = oneline(filename, lcset) - replace_payload_by_text(part, _("""\ -A non-text attachment was scrubbed... -Name: %(filename)s -Type: %(ctype)s -Size: %(size)d bytes -Desc: %(desc)s -URL: %(url)s -"""), lcset) - outer = False - # We still have to sanitize multipart messages to flat text because - # Pipermail can't handle messages with list payloads. This is a kludge; - # def (n) clever hack ;). - if msg.is_multipart(): - # By default we take the charset of the first text/plain part in the - # message, but if there was none, we'll use the list's preferred - # language's charset. - if not charset or charset == 'us-ascii': - charset = lcset_out - else: - # normalize to the output charset if input/output are different - charset = Charset(charset).output_charset or charset - # We now want to concatenate all the parts which have been scrubbed to - # text/plain, into a single text/plain payload. We need to make sure - # all the characters in the concatenated string are in the same - # encoding, so we'll use the 'replace' key in the coercion call. - # BAW: Martin's original patch suggested we might want to try - # generalizing to utf-8, and that's probably a good idea (eventually). - text = [] - for part in msg.walk(): - # TK: bug-id 1099138 and multipart - # MAS test payload - if part may fail if there are no headers. - if not part.get_payload() or part.is_multipart(): - continue - # All parts should be scrubbed to text/plain by now, except - # if sanitize == 2, there could be text/html parts so keep them - # but skip any other parts. - partctype = part.get_content_type() - if partctype <> 'text/plain' and (partctype <> 'text/html' or - sanitize <> 2): - text.append(_('Skipped content of type %(partctype)s\n')) - continue - try: - t = part.get_payload(decode=True) or '' - # MAS: TypeError exception can occur if payload is None. This - # was observed with a message that contained an attached - # message/delivery-status part. Because of the special parsing - # of this type, this resulted in a text/plain sub-part with a - # null body. See bug 1430236. - except (binascii.Error, TypeError): - t = part.get_payload() or '' - # TK: get_content_charset() returns 'iso-2022-jp' for internally - # crafted (scrubbed) 'euc-jp' text part. So, first try - # get_charset(), then get_content_charset() for the parts - # which are already embeded in the incoming message. - partcharset = part.get_charset() - if partcharset: - partcharset = str(partcharset) + An embedded message was scrubbed... + From: %(who)s + Subject: %(subject)s + Date: %(date)s + Size: %(size)s + URL: %(url)s + """), lcset) + # If the message isn't a multipart, then we'll strip it out as an + # attachment that would have to be separately downloaded. Pipermail + # will transform the url into a hyperlink. + elif part.get_payload() and not part.is_multipart(): + payload = part.get_payload(decode=True) + ctype = part.get_content_type() + # XXX Under email 2.5, it is possible that payload will be None. + # This can happen when you have a Content-Type: multipart/* with + # only one part and that part has two blank lines between the + # first boundary and the end boundary. In email 3.0 you end up + # with a string in the payload. I think in this case it's safe to + # ignore the part. + if payload is None: + continue + size = len(payload) + self.save_attachment(part, part_num) + desc = part.get('content-description', _('not available')) + desc = oneline(desc, lcset) + filename = part.get_filename(_('not available')) + filename = oneline(filename, lcset) + replace_payload_by_text(part, _("""\ + A non-text attachment was scrubbed... + Name: %(filename)s + Type: %(ctype)s + Size: %(size)d bytes + Desc: %(desc)s + URL: %(url)s + """), lcset) + outer = False + # We still have to sanitize multipart messages to flat text because + # Pipermail can't handle messages with list payloads. This is a kludge; + # def (n) clever hack ;). + if self.msg.is_multipart(): + # By default we take the charset of the first text/plain part in the + # message, but if there was none, we'll use the list's preferred + # language's charset. + if not charset or charset == 'us-ascii': + charset = lcset_out else: - partcharset = part.get_content_charset() - if partcharset and partcharset <> charset: - try: - t = unicode(t, partcharset, 'replace') - except (UnicodeError, LookupError, ValueError, - AssertionError): - # We can get here if partcharset is bogus in come way. - # Replace funny characters. We use errors='replace' - t = unicode(t, 'ascii', 'replace') + # normalize to the output charset if input/output are different + charset = Charset(charset).output_charset or charset + # We now want to concatenate all the parts which have been scrubbed to + # text/plain, into a single text/plain payload. We need to make sure + # all the characters in the concatenated string are in the same + # encoding, so we'll use the 'replace' key in the coercion call. + # BAW: Martin's original patch suggested we might want to try + # generalizing to utf-8, and that's probably a good idea (eventually). + text = [] + for part in self.msg.walk(): + # TK: bug-id 1099138 and multipart + # MAS test payload - if part may fail if there are no headers. + if not part.get_payload() or part.is_multipart(): + continue + # All parts should be scrubbed to text/plain by now, except + # if sanitize == 2, there could be text/html parts so keep them + # but skip any other parts. + partctype = part.get_content_type() + if partctype <> 'text/plain' and (partctype <> 'text/html' or + sanitize <> 2): + text.append(_('Skipped content of type %(partctype)s\n')) + continue try: - # Should use HTML-Escape, or try generalizing to UTF-8 - t = t.encode(charset, 'replace') - except (UnicodeError, LookupError, ValueError, - AssertionError): - # if the message charset is bogus, use the list's. - t = t.encode(lcset, 'replace') - # Separation is useful - if isinstance(t, StringType): - if not t.endswith('\n'): - t += '\n' - text.append(t) - # Now join the text and set the payload - sep = _('-------------- next part --------------\n') - # The i18n separator is in the list's charset. Coerce it to the - # message charset. - try: - sep = sep.encode(charset, 'replace') - except (UnicodeError, LookupError, ValueError, - AssertionError): - pass - replace_payload_by_text(msg, sep.join(text), charset) - if format: - msg.set_param('Format', format) - if delsp: - msg.set_param('DelSp', delsp) - return msg + t = part.get_payload(decode=True) or '' + # MAS: TypeError exception can occur if payload is None. This + # was observed with a message that contained an attached + # message/delivery-status part. Because of the special parsing + # of this type, this resulted in a text/plain sub-part with a + # null body. See bug 1430236. + except (binascii.Error, TypeError): + t = part.get_payload() or '' + # TK: get_content_charset() returns 'iso-2022-jp' for internally + # crafted (scrubbed) 'euc-jp' text part. So, first try + # get_charset(), then get_content_charset() for the parts + # which are already embeded in the incoming message. + partcharset = part.get_charset() + if partcharset: + partcharset = str(partcharset) + else: + partcharset = part.get_content_charset() + if partcharset and partcharset <> charset: + try: + t = unicode(t, partcharset, 'replace') + except (UnicodeError, LookupError, ValueError, + AssertionError): + # We can get here if partcharset is bogus in come way. + # Replace funny characters. We use errors='replace' + t = unicode(t, 'ascii', 'replace') + try: + # Should use HTML-Escape, or try generalizing to UTF-8 + t = t.encode(charset, 'replace') + except (UnicodeError, LookupError, ValueError, + AssertionError): + # if the message charset is bogus, use the list's. + t = t.encode(lcset, 'replace') + # Separation is useful + if isinstance(t, StringType): + if not t.endswith('\n'): + t += '\n' + text.append(t) + # Now join the text and set the payload + sep = _('-------------- next part --------------\n') + # The i18n separator is in the list's charset. Coerce it to the + # message charset. + try: + sep = sep.encode(charset, 'replace') + except (UnicodeError, LookupError, ValueError, + AssertionError): + pass + replace_payload_by_text(self.msg, sep.join(text), charset) + if format: + self.msg.set_param('Format', format) + if delsp: + self.msg.set_param('DelSp', delsp) + return self.msg -# -#def makedirs(dir): -# # Create all the directories to store this attachment in -# try: -# os.makedirs(dir, 02775) -# # Unfortunately, FreeBSD seems to be broken in that it doesn't honor -# # the mode arg of mkdir(). -# def twiddle(arg, dirname, names): -# os.chmod(dirname, 02775) -# os.path.walk(dir, twiddle, None) -# except OSError, e: -# if e.errno <> errno.EEXIST: raise -# -# -# -#def save_attachment(mlist, msg, dir, filter_html=True): -# fsdir = os.path.join(mlist.archive_dir(), dir) -# makedirs(fsdir) -# # Figure out the attachment type and get the decoded data -# decodedpayload = msg.get_payload(decode=True) -# # BAW: mimetypes ought to handle non-standard, but commonly found types, -# # e.g. image/jpg (should be image/jpeg). For now we just store such -# # things as application/octet-streams since that seems the safest. -# ctype = msg.get_content_type() -# # i18n file name is encoded -# lcset = Utils.GetCharSet(mlist.preferred_language) -# filename = Utils.oneline(msg.get_filename(''), lcset) -# filename, fnext = os.path.splitext(filename) -# # For safety, we should confirm this is valid ext for content-type -# # but we can use fnext if we introduce fnext filtering -# if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION: -# # HTML message doesn't have filename :-( -# ext = fnext or guess_extension(ctype, fnext) -# else: -# ext = guess_extension(ctype, fnext) -# if not ext: -# # We don't know what it is, so assume it's just a shapeless -# # application/octet-stream, unless the Content-Type: is -# # message/rfc822, in which case we know we'll coerce the type to -# # text/plain below. -# if ctype == 'message/rfc822': -# ext = '.txt' -# else: -# ext = '.bin' -# # Allow only alphanumerics, dash, underscore, and dot -# ext = sre.sub('', ext) -# path = None -# # We need a lock to calculate the next attachment number -# lockfile = os.path.join(fsdir, 'attachments.lock') -# lock = LockFile.LockFile(lockfile) -# lock.lock() -# try: -# # Now base the filename on what's in the attachment, uniquifying it if -# # necessary. -# if not filename or mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME: -# filebase = 'attachment' -# else: -# # Sanitize the filename given in the message headers -# parts = pre.split(filename) -# filename = parts[-1] -# # Strip off leading dots -# filename = dre.sub('', filename) -# # Allow only alphanumerics, dash, underscore, and dot -# filename = sre.sub('', filename) -# # If the filename's extension doesn't match the type we guessed, -# # which one should we go with? For now, let's go with the one we -# # guessed so attachments can't lie about their type. Also, if the -# # filename /has/ no extension, then tack on the one we guessed. -# # The extension was removed from the name above. -# filebase = filename -# # Now we're looking for a unique name for this file on the file -# # system. If msgdir/filebase.ext isn't unique, we'll add a counter -# # after filebase, e.g. msgdir/filebase-cnt.ext -# counter = 0 -# extra = '' -# while True: -# path = os.path.join(fsdir, filebase + extra + ext) -# # Generally it is not a good idea to test for file existance -# # before just trying to create it, but the alternatives aren't -# # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't -# # NFS-safe). Besides, we have an exclusive lock now, so we're -# # guaranteed that no other process will be racing with us. -# if os.path.exists(path): -# counter += 1 -# extra = '-%04d' % counter -# else: -# break -# finally: -# lock.unlock() -# # `path' now contains the unique filename for the attachment. There's -# # just one more step we need to do. If the part is text/html and -# # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be -# # here), then send the attachment through the filter program for -# # sanitization -# if filter_html and ctype == 'text/html': -# base, ext = os.path.splitext(path) -# tmppath = base + '-tmp' + ext -# fp = open(tmppath, 'w') -# try: -# fp.write(decodedpayload) -# fp.close() -# cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath} -# progfp = os.popen(cmd, 'r') -# decodedpayload = progfp.read() -# status = progfp.close() -# if status: -# syslog('error', -# 'HTML sanitizer exited with non-zero status: %s', -# status) -# finally: -# os.unlink(tmppath) -# # BAW: Since we've now sanitized the document, it should be plain -# # text. Blarg, we really want the sanitizer to tell us what the type -# # if the return data is. :( -# ext = '.txt' -# path = base + '.txt' -# # Is it a message/rfc822 attachment? -# elif ctype == 'message/rfc822': -# submsg = msg.get_payload() -# # BAW: I'm sure we can eventually do better than this. :( -# decodedpayload = Utils.websafe(str(submsg)) -# fp = open(path, 'w') -# fp.write(decodedpayload) -# fp.close() -# # Now calculate the url -# baseurl = mlist.GetBaseArchiveURL() -# # Private archives will likely have a trailing slash. Normalize. -# if baseurl[-1] <> '/': -# baseurl += '/' -# # A trailing space in url string may save users who are using -# # RFC-1738 compliant MUA (Not Mozilla). -# # Trailing space will definitely be a problem with format=flowed. -# # Bracket the URL instead. -# url = '<' + baseurl + '%s/%s%s%s>' % (dir, filebase, extra, ext) -# return url + def save_attachment(self, part, counter, filter_html=True): + # Store name, content-type and size + # Figure out the attachment type and get the decoded data + decodedpayload = part.get_payload(decode=True) + # BAW: mimetypes ought to handle non-standard, but commonly found types, + # e.g. image/jpg (should be image/jpeg). For now we just store such + # things as application/octet-streams since that seems the safest. + ctype = part.get_content_type() + # i18n file name is encoded + #lcset = Utils.GetCharSet(self.mlist.preferred_language) + lcset = "utf-8" + filename = oneline(part.get_filename(''), lcset) + filename, fnext = os.path.splitext(filename) + # For safety, we should confirm this is valid ext for content-type + # but we can use fnext if we introduce fnext filtering + # TODO: re-implement this + #if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION: + # # HTML message doesn't have filename :-( + # ext = fnext or guess_extension(ctype, fnext) + #else: + # ext = guess_extension(ctype, fnext) + ext = fnext or guess_extension(ctype, fnext) + if not ext: + # We don't know what it is, so assume it's just a shapeless + # application/octet-stream, unless the Content-Type: is + # message/rfc822, in which case we know we'll coerce the type to + # text/plain below. + if ctype == 'message/rfc822': + ext = '.txt' + else: + ext = '.bin' + # Allow only alphanumerics, dash, underscore, and dot + ext = sre.sub('', ext) + # Now base the filename on what's in the attachment, uniquifying it if + # necessary. + if not filename: + filebase = 'attachment' + else: + # Sanitize the filename given in the message headers + parts = pre.split(filename) + filename = parts[-1] + # Strip off leading dots + filename = dre.sub('', filename) + # Allow only alphanumerics, dash, underscore, and dot + filename = sre.sub('', filename) + # If the filename's extension doesn't match the type we guessed, + # which one should we go with? For now, let's go with the one we + # guessed so attachments can't lie about their type. Also, if the + # filename /has/ no extension, then tack on the one we guessed. + # The extension was removed from the name above. + filebase = filename + # TODO: bring back the HTML sanitizer feature + if ctype == 'message/rfc822': + submsg = part.get_payload() + # BAW: I'm sure we can eventually do better than this. :( + decodedpayload = websafe(str(submsg)) + msg_id = self.msg['Message-Id'].strip("<>") + self.store.add_attachment(self.mlist, msg_id, counter, decodedpayload) diff --git a/kittystore/storm/schema/__init__.py b/kittystore/storm/schema/__init__.py index c301e88..daba29f 100644 --- a/kittystore/storm/schema/__init__.py +++ b/kittystore/storm/schema/__init__.py @@ -60,6 +60,16 @@ CREATES = { "full" BYTEA NOT NULL, archived_date TIMESTAMP WITHOUT TIME ZONE DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (list_name, message_id) + );""", """ + CREATE TABLE "attachment" ( + list_name VARCHAR(255) NOT NULL, + message_id VARCHAR(255) NOT NULL, + counter INTEGER NOT NULL, + content_type VARCHAR(255) NOT NULL, + name VARCHAR(255), + size INTEGER NOT NULL, + content BYTEA NOT NULL, + PRIMARY KEY (list_name, message_id, counter) );""", 'CREATE INDEX "ix_email_list_name" ON "email" USING btree (list_name);', 'CREATE UNIQUE INDEX "ix_email_message_id" ON "email" USING btree (message_id);', |