summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAurélien Bompard <aurelien@bompard.org>2012-09-25 19:40:20 +0200
committerAurélien Bompard <aurelien@bompard.org>2012-09-25 19:40:20 +0200
commitca1967c915458c7e6b54a43767a8b50dea277fb9 (patch)
treefe04ad19824cafeb191e311467abae8a2bee08e4
parentd723e64bdb3a39a16e9416ba608ad76bc8390e95 (diff)
downloadkittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.tar.gz
kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.tar.xz
kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.zip
Refactor the scrubbing function
-rw-r--r--kittystore/scrub.py737
-rw-r--r--kittystore/storm/schema/__init__.py10
2 files changed, 298 insertions, 449 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
index ac93baa..e7f4dde 100644
--- a/kittystore/scrub.py
+++ b/kittystore/scrub.py
@@ -25,6 +25,7 @@ import binascii
import tempfile
from cStringIO import StringIO
from types import IntType, StringType
+from mimetypes import guess_all_extensions
from email.Utils import parsedate
from email.Parser import HeaderParser
@@ -39,106 +40,32 @@ from email.Charset import Charset
#from Mailman.i18n import _
#from Mailman.Logging.Syslog import syslog
#from Mailman.Utils import sha_new
-from mailman.core.i18n import _
+
from mailman.utilities.string import websafe, oneline
+# TODO: don't do translations here, the system locale has no meaning to the
+# web user
+from mailman.core.i18n import _
+
+# Path characters for common platforms
+pre = re.compile(r'[/\\:]')
+# All other characters to strip out of Content-Disposition: filenames
+# (essentially anything that isn't an alphanum, dot, dash, or underscore).
+sre = re.compile(r'[^-\w.]')
+# Regexp to strip out leading dots
+dre = re.compile(r'^\.*')
-## Path characters for common platforms
-#pre = re.compile(r'[/\\:]')
-## All other characters to strip out of Content-Disposition: filenames
-## (essentially anything that isn't an alphanum, dot, dash, or underscore).
-#sre = re.compile(r'[^-\w.]')
-## Regexp to strip out leading dots
-#dre = re.compile(r'^\.*')
-#
BR = '<br>\n'
-#SPACE = ' '
-#
-#try:
-# True, False
-#except NameError:
-# True = 1
-# False = 0
-#
-#
-#try:
-# from mimetypes import guess_all_extensions
-#except ImportError:
-# import mimetypes
-# def guess_all_extensions(ctype, strict=True):
-# # BAW: sigh, guess_all_extensions() is new in Python 2.3
-# all = []
-# def check(map):
-# for e, t in map.items():
-# if t == ctype:
-# all.append(e)
-# check(mimetypes.types_map)
-# # Python 2.1 doesn't have common_types. Sigh, sigh.
-# if not strict and hasattr(mimetypes, 'common_types'):
-# check(mimetypes.common_types)
-# return all
-#
-#
-#
-#def guess_extension(ctype, ext):
-# # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
-# # and .wiz are all mapped to application/msword. This sucks for finding
-# # the best reverse mapping. If the extension is one of the giving
-# # mappings, we'll trust that, otherwise we'll just guess. :/
-# all = guess_all_extensions(ctype, strict=False)
-# if ext in all:
-# return ext
-# return all and all[0]
-#
-#
-#def safe_strftime(fmt, t):
-# try:
-# return time.strftime(fmt, t)
-# except (TypeError, ValueError, OverflowError):
-# return None
-#
-#
-#def calculate_attachments_dir(mlist, msg, msgdata):
-# # Calculate the directory that attachments for this message will go
-# # under. To avoid inode limitations, the scheme will be:
-# # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
-# # Start by calculating the date-based and msgid-hash components.
-# fmt = '%Y%m%d'
-# datestr = msg.get('Date')
-# if datestr:
-# now = parsedate(datestr)
-# else:
-# now = time.gmtime(msgdata.get('received_time', time.time()))
-# datedir = safe_strftime(fmt, now)
-# if not datedir:
-# datestr = msgdata.get('X-List-Received-Date')
-# if datestr:
-# datedir = safe_strftime(fmt, datestr)
-# if not datedir:
-# # What next? Unixfrom, I guess.
-# parts = msg.get_unixfrom().split()
-# try:
-# month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
-# 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
-# }.get(parts[3], 0)
-# day = int(parts[4])
-# year = int(parts[6])
-# except (IndexError, ValueError):
-# # Best we can do I think
-# month = day = year = 0
-# datedir = '%04d%02d%02d' % (year, month, day)
-# assert datedir
-# # As for the msgid hash, we'll base this part on the Message-ID: so that
-# # all attachments for the same message end up in the same directory (we'll
-# # uniquify the filenames in that directory as needed). We use the first 2
-# # and last 2 bytes of the SHA1 hash of the message id as the basis of the
-# # directory name. Clashes here don't really matter too much, and that
-# # still gives us a 32-bit space to work with.
-# msgid = msg['message-id']
-# if msgid is None:
-# msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
-# # We assume that the message id actually /is/ unique!
-# digest = sha_new(msgid).hexdigest()
-# return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
+
+
+def guess_extension(ctype, ext):
+ # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
+ # and .wiz are all mapped to application/msword. This sucks for finding
+    # the best reverse mapping.  If the extension is one of the given
+ # mappings, we'll trust that, otherwise we'll just guess. :/
+ all = guess_all_extensions(ctype, strict=False)
+ if ext in all:
+ return ext
+ return all and all[0]
def replace_payload_by_text(msg, text, charset):
@@ -149,59 +76,64 @@ def replace_payload_by_text(msg, text, charset):
#if isinstance(charset, unicode):
# # email 3.0.1 (python 2.4) doesn't like unicode
# charset = charset.encode('us-ascii')
- msg.set_payload(text, charset)
+ #msg.set_payload(text, charset)
+ msg.set_payload('TODO: display attachment here and remove message subpart')
-def save_attachment(mlist, msg, filter_html=True):
- # Store name, content-type and size
- return "TODO: handle attachments and return a link here"
-def scrub_message(mlist, msg):
- sanitize = 1 # TODO: implement other options
- outer = True
- charset = None
- #lcset = Utils.GetCharSet(mlist.preferred_language)
- #lcset_out = Charset(lcset).output_charset or lcset
- lcset = "utf-8"
- # Now walk over all subparts of this message and scrub out various types
- format = delsp = None
- for part in msg.walk():
- ctype = part.get_content_type()
- # If the part is text/plain, we leave it alone
- if ctype == 'text/plain':
- # We need to choose a charset for the scrubbed message, so we'll
- # arbitrarily pick the charset of the first text/plain part in the
- # message.
- # MAS: Also get the RFC 3676 stuff from this part. This seems to
- # work OK for scrub_nondigest. It will also work as far as
- # scrubbing messages for the archive is concerned, but pipermail
- # doesn't pay any attention to the RFC 3676 parameters. The plain
- # format digest is going to be a disaster in any case as some of
- # messages will be format="flowed" and some not. ToDigest creates
- # its own Content-Type: header for the plain digest which won't
- # have RFC 3676 parameters. If the message Content-Type: headers
- # are retained for display in the digest, the parameters will be
- # there for information, but not for the MUA. This is the best we
- # can do without having get_payload() process the parameters.
- if charset is None:
- charset = part.get_content_charset(lcset)
- format = part.get_param('format')
- delsp = part.get_param('delsp')
- # TK: if part is attached then check charset and scrub if none
- if part.get('content-disposition') and \
- not part.get_content_charset():
- omask = os.umask(002)
- try:
- url = save_attachment(mlist, part)
- finally:
- os.umask(omask)
- filename = part.get_filename(_('not available'))
- filename = oneline(filename, lcset)
- replace_payload_by_text(part, _("""\
-An embedded and charset-unspecified text was scrubbed...
-Name: %(filename)s
-URL: %(url)s
-"""), lcset)
- elif ctype == 'text/html' and isinstance(sanitize, IntType):
+
+class Scrubber(object):
+ """
+    Scrubs a single message, extracts attachments, and stores them in the
+ database.
+ """
+
+ def __init__(self, mlist, msg, store):
+ self.mlist = mlist
+ self.msg = msg
+ self.store = store
+
+
+ def scrub(self):
+ sanitize = 1 # TODO: implement other options
+ outer = True
+ charset = None
+ #lcset = Utils.GetCharSet(self.mlist.preferred_language)
+ #lcset_out = Charset(lcset).output_charset or lcset
+ lcset = "utf-8"
+ # Now walk over all subparts of this message and scrub out various types
+ format = delsp = None
+ for part_num, part in enumerate(self.msg.walk()):
+ ctype = part.get_content_type()
+ # If the part is text/plain, we leave it alone
+ if ctype == 'text/plain':
+ # We need to choose a charset for the scrubbed message, so we'll
+ # arbitrarily pick the charset of the first text/plain part in the
+ # message.
+ # MAS: Also get the RFC 3676 stuff from this part. This seems to
+ # work OK for scrub_nondigest. It will also work as far as
+ # scrubbing messages for the archive is concerned, but pipermail
+ # doesn't pay any attention to the RFC 3676 parameters. The plain
+                # format digest is going to be a disaster in any case as some of the
+ # messages will be format="flowed" and some not. ToDigest creates
+ # its own Content-Type: header for the plain digest which won't
+ # have RFC 3676 parameters. If the message Content-Type: headers
+ # are retained for display in the digest, the parameters will be
+ # there for information, but not for the MUA. This is the best we
+ # can do without having get_payload() process the parameters.
+ if charset is None:
+ charset = part.get_content_charset(lcset)
+ format = part.get_param('format')
+ delsp = part.get_param('delsp')
+ # TK: if part is attached then check charset and scrub if none
+ if part.get('content-disposition') and \
+ not part.get_content_charset():
+ self.save_attachment(part, part_num)
+ replace_payload_by_text(part, _("""\
+ An embedded and charset-unspecified text was scrubbed...
+ Name: %(filename)s
+ URL: %(url)s
+ """), lcset)
+ elif ctype == 'text/html' and isinstance(sanitize, IntType):
# if sanitize == 0:
# if outer:
# raise DiscardMessage
@@ -217,314 +149,221 @@ URL: %(url)s
# # Pull it out as an attachment but leave it unescaped. This
# # is dangerous, but perhaps useful for heavily moderated
# # lists.
-# omask = os.umask(002)
-# try:
-# url = save_attachment(mlist, part, filter_html=False)
-# finally:
-# os.umask(omask)
+# self.save_attachment(part, part_num, filter_html=False)
# replace_payload_by_text(part, _("""\
#An HTML attachment was scrubbed...
#URL: %(url)s
#"""), lcset)
# else:
- if sanitize == 1:
- # HTML-escape it and store it as an attachment, but make it
- # look a /little/ bit prettier. :(
- payload = websafe(part.get_payload(decode=True))
- # For whitespace in the margin, change spaces into
- # non-breaking spaces, and tabs into 8 of those. Then use a
- # mono-space font. Still looks hideous to me, but then I'd
- # just as soon discard them.
- def doreplace(s):
- return s.expandtabs(8).replace(' ', '&nbsp;')
- lines = [doreplace(s) for s in payload.split('\n')]
- payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
- part.set_payload(payload)
- # We're replacing the payload with the decoded payload so this
- # will just get in the way.
- del part['content-transfer-encoding']
- omask = os.umask(002)
- try:
- url = save_attachment(mlist, part, filter_html=False)
- finally:
- os.umask(omask)
+ if sanitize == 1:
+ # HTML-escape it and store it as an attachment, but make it
+ # look a /little/ bit prettier. :(
+ payload = websafe(part.get_payload(decode=True))
+ # For whitespace in the margin, change spaces into
+ # non-breaking spaces, and tabs into 8 of those. Then use a
+ # mono-space font. Still looks hideous to me, but then I'd
+ # just as soon discard them.
+ def doreplace(s):
+ return s.expandtabs(8).replace(' ', '&nbsp;')
+ lines = [doreplace(s) for s in payload.split('\n')]
+ payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
+ part.set_payload(payload)
+ # We're replacing the payload with the decoded payload so this
+ # will just get in the way.
+ del part['content-transfer-encoding']
+ self.save_attachment(part, part_num, filter_html=False)
+ replace_payload_by_text(part, _("""\
+ An HTML attachment was scrubbed...
+ URL: %(url)s
+ """), lcset)
+ elif ctype == 'message/rfc822':
+ # This part contains a submessage, so it too needs scrubbing
+ submsg = part.get_payload(0)
+ self.save_attachment(part, part_num)
+ subject = submsg.get('subject', _('no subject'))
+ subject = oneline(subject, lcset)
+ date = submsg.get('date', _('no date'))
+ who = submsg.get('from', _('unknown sender'))
+ size = len(str(submsg))
replace_payload_by_text(part, _("""\
-An HTML attachment was scrubbed...
-URL: %(url)s
-"""), lcset)
- elif ctype == 'message/rfc822':
- # This part contains a submessage, so it too needs scrubbing
- submsg = part.get_payload(0)
- omask = os.umask(002)
- try:
- url = save_attachment(mlist, part)
- finally:
- os.umask(omask)
- subject = submsg.get('subject', _('no subject'))
- subject = oneline(subject, lcset)
- date = submsg.get('date', _('no date'))
- who = submsg.get('from', _('unknown sender'))
- size = len(str(submsg))
- replace_payload_by_text(part, _("""\
-An embedded message was scrubbed...
-From: %(who)s
-Subject: %(subject)s
-Date: %(date)s
-Size: %(size)s
-URL: %(url)s
-"""), lcset)
- # If the message isn't a multipart, then we'll strip it out as an
- # attachment that would have to be separately downloaded. Pipermail
- # will transform the url into a hyperlink.
- elif part.get_payload() and not part.is_multipart():
- payload = part.get_payload(decode=True)
- ctype = part.get_content_type()
- # XXX Under email 2.5, it is possible that payload will be None.
- # This can happen when you have a Content-Type: multipart/* with
- # only one part and that part has two blank lines between the
- # first boundary and the end boundary. In email 3.0 you end up
- # with a string in the payload. I think in this case it's safe to
- # ignore the part.
- if payload is None:
- continue
- size = len(payload)
- omask = os.umask(002)
- try:
- url = save_attachment(mlist, part)
- finally:
- os.umask(omask)
- desc = part.get('content-description', _('not available'))
- desc = oneline(desc, lcset)
- filename = part.get_filename(_('not available'))
- filename = oneline(filename, lcset)
- replace_payload_by_text(part, _("""\
-A non-text attachment was scrubbed...
-Name: %(filename)s
-Type: %(ctype)s
-Size: %(size)d bytes
-Desc: %(desc)s
-URL: %(url)s
-"""), lcset)
- outer = False
- # We still have to sanitize multipart messages to flat text because
- # Pipermail can't handle messages with list payloads. This is a kludge;
- # def (n) clever hack ;).
- if msg.is_multipart():
- # By default we take the charset of the first text/plain part in the
- # message, but if there was none, we'll use the list's preferred
- # language's charset.
- if not charset or charset == 'us-ascii':
- charset = lcset_out
- else:
- # normalize to the output charset if input/output are different
- charset = Charset(charset).output_charset or charset
- # We now want to concatenate all the parts which have been scrubbed to
- # text/plain, into a single text/plain payload. We need to make sure
- # all the characters in the concatenated string are in the same
- # encoding, so we'll use the 'replace' key in the coercion call.
- # BAW: Martin's original patch suggested we might want to try
- # generalizing to utf-8, and that's probably a good idea (eventually).
- text = []
- for part in msg.walk():
- # TK: bug-id 1099138 and multipart
- # MAS test payload - if part may fail if there are no headers.
- if not part.get_payload() or part.is_multipart():
- continue
- # All parts should be scrubbed to text/plain by now, except
- # if sanitize == 2, there could be text/html parts so keep them
- # but skip any other parts.
- partctype = part.get_content_type()
- if partctype <> 'text/plain' and (partctype <> 'text/html' or
- sanitize <> 2):
- text.append(_('Skipped content of type %(partctype)s\n'))
- continue
- try:
- t = part.get_payload(decode=True) or ''
- # MAS: TypeError exception can occur if payload is None. This
- # was observed with a message that contained an attached
- # message/delivery-status part. Because of the special parsing
- # of this type, this resulted in a text/plain sub-part with a
- # null body. See bug 1430236.
- except (binascii.Error, TypeError):
- t = part.get_payload() or ''
- # TK: get_content_charset() returns 'iso-2022-jp' for internally
- # crafted (scrubbed) 'euc-jp' text part. So, first try
- # get_charset(), then get_content_charset() for the parts
- # which are already embeded in the incoming message.
- partcharset = part.get_charset()
- if partcharset:
- partcharset = str(partcharset)
+ An embedded message was scrubbed...
+ From: %(who)s
+ Subject: %(subject)s
+ Date: %(date)s
+ Size: %(size)s
+ URL: %(url)s
+ """), lcset)
+ # If the message isn't a multipart, then we'll strip it out as an
+ # attachment that would have to be separately downloaded. Pipermail
+ # will transform the url into a hyperlink.
+ elif part.get_payload() and not part.is_multipart():
+ payload = part.get_payload(decode=True)
+ ctype = part.get_content_type()
+ # XXX Under email 2.5, it is possible that payload will be None.
+ # This can happen when you have a Content-Type: multipart/* with
+ # only one part and that part has two blank lines between the
+ # first boundary and the end boundary. In email 3.0 you end up
+ # with a string in the payload. I think in this case it's safe to
+ # ignore the part.
+ if payload is None:
+ continue
+ size = len(payload)
+ self.save_attachment(part, part_num)
+ desc = part.get('content-description', _('not available'))
+ desc = oneline(desc, lcset)
+ filename = part.get_filename(_('not available'))
+ filename = oneline(filename, lcset)
+ replace_payload_by_text(part, _("""\
+ A non-text attachment was scrubbed...
+ Name: %(filename)s
+ Type: %(ctype)s
+ Size: %(size)d bytes
+ Desc: %(desc)s
+ URL: %(url)s
+ """), lcset)
+ outer = False
+ # We still have to sanitize multipart messages to flat text because
+ # Pipermail can't handle messages with list payloads. This is a kludge;
+ # def (n) clever hack ;).
+ if self.msg.is_multipart():
+ # By default we take the charset of the first text/plain part in the
+ # message, but if there was none, we'll use the list's preferred
+ # language's charset.
+ if not charset or charset == 'us-ascii':
+ charset = lcset_out
else:
- partcharset = part.get_content_charset()
- if partcharset and partcharset <> charset:
- try:
- t = unicode(t, partcharset, 'replace')
- except (UnicodeError, LookupError, ValueError,
- AssertionError):
- # We can get here if partcharset is bogus in come way.
- # Replace funny characters. We use errors='replace'
- t = unicode(t, 'ascii', 'replace')
+ # normalize to the output charset if input/output are different
+ charset = Charset(charset).output_charset or charset
+ # We now want to concatenate all the parts which have been scrubbed to
+ # text/plain, into a single text/plain payload. We need to make sure
+ # all the characters in the concatenated string are in the same
+ # encoding, so we'll use the 'replace' key in the coercion call.
+ # BAW: Martin's original patch suggested we might want to try
+ # generalizing to utf-8, and that's probably a good idea (eventually).
+ text = []
+ for part in self.msg.walk():
+ # TK: bug-id 1099138 and multipart
+ # MAS test payload - if part may fail if there are no headers.
+ if not part.get_payload() or part.is_multipart():
+ continue
+ # All parts should be scrubbed to text/plain by now, except
+ # if sanitize == 2, there could be text/html parts so keep them
+ # but skip any other parts.
+ partctype = part.get_content_type()
+ if partctype <> 'text/plain' and (partctype <> 'text/html' or
+ sanitize <> 2):
+ text.append(_('Skipped content of type %(partctype)s\n'))
+ continue
try:
- # Should use HTML-Escape, or try generalizing to UTF-8
- t = t.encode(charset, 'replace')
- except (UnicodeError, LookupError, ValueError,
- AssertionError):
- # if the message charset is bogus, use the list's.
- t = t.encode(lcset, 'replace')
- # Separation is useful
- if isinstance(t, StringType):
- if not t.endswith('\n'):
- t += '\n'
- text.append(t)
- # Now join the text and set the payload
- sep = _('-------------- next part --------------\n')
- # The i18n separator is in the list's charset. Coerce it to the
- # message charset.
- try:
- sep = sep.encode(charset, 'replace')
- except (UnicodeError, LookupError, ValueError,
- AssertionError):
- pass
- replace_payload_by_text(msg, sep.join(text), charset)
- if format:
- msg.set_param('Format', format)
- if delsp:
- msg.set_param('DelSp', delsp)
- return msg
+ t = part.get_payload(decode=True) or ''
+ # MAS: TypeError exception can occur if payload is None. This
+ # was observed with a message that contained an attached
+ # message/delivery-status part. Because of the special parsing
+ # of this type, this resulted in a text/plain sub-part with a
+ # null body. See bug 1430236.
+ except (binascii.Error, TypeError):
+ t = part.get_payload() or ''
+ # TK: get_content_charset() returns 'iso-2022-jp' for internally
+ # crafted (scrubbed) 'euc-jp' text part. So, first try
+ # get_charset(), then get_content_charset() for the parts
+            # which are already embedded in the incoming message.
+ partcharset = part.get_charset()
+ if partcharset:
+ partcharset = str(partcharset)
+ else:
+ partcharset = part.get_content_charset()
+ if partcharset and partcharset <> charset:
+ try:
+ t = unicode(t, partcharset, 'replace')
+ except (UnicodeError, LookupError, ValueError,
+ AssertionError):
+                # We can get here if partcharset is bogus in some way.
+ # Replace funny characters. We use errors='replace'
+ t = unicode(t, 'ascii', 'replace')
+ try:
+ # Should use HTML-Escape, or try generalizing to UTF-8
+ t = t.encode(charset, 'replace')
+ except (UnicodeError, LookupError, ValueError,
+ AssertionError):
+ # if the message charset is bogus, use the list's.
+ t = t.encode(lcset, 'replace')
+ # Separation is useful
+ if isinstance(t, StringType):
+ if not t.endswith('\n'):
+ t += '\n'
+ text.append(t)
+ # Now join the text and set the payload
+ sep = _('-------------- next part --------------\n')
+ # The i18n separator is in the list's charset. Coerce it to the
+ # message charset.
+ try:
+ sep = sep.encode(charset, 'replace')
+ except (UnicodeError, LookupError, ValueError,
+ AssertionError):
+ pass
+ replace_payload_by_text(self.msg, sep.join(text), charset)
+ if format:
+ self.msg.set_param('Format', format)
+ if delsp:
+ self.msg.set_param('DelSp', delsp)
+ return self.msg
-#
-#def makedirs(dir):
-# # Create all the directories to store this attachment in
-# try:
-# os.makedirs(dir, 02775)
-# # Unfortunately, FreeBSD seems to be broken in that it doesn't honor
-# # the mode arg of mkdir().
-# def twiddle(arg, dirname, names):
-# os.chmod(dirname, 02775)
-# os.path.walk(dir, twiddle, None)
-# except OSError, e:
-# if e.errno <> errno.EEXIST: raise
-#
-#
-#
-#def save_attachment(mlist, msg, dir, filter_html=True):
-# fsdir = os.path.join(mlist.archive_dir(), dir)
-# makedirs(fsdir)
-# # Figure out the attachment type and get the decoded data
-# decodedpayload = msg.get_payload(decode=True)
-# # BAW: mimetypes ought to handle non-standard, but commonly found types,
-# # e.g. image/jpg (should be image/jpeg). For now we just store such
-# # things as application/octet-streams since that seems the safest.
-# ctype = msg.get_content_type()
-# # i18n file name is encoded
-# lcset = Utils.GetCharSet(mlist.preferred_language)
-# filename = Utils.oneline(msg.get_filename(''), lcset)
-# filename, fnext = os.path.splitext(filename)
-# # For safety, we should confirm this is valid ext for content-type
-# # but we can use fnext if we introduce fnext filtering
-# if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
-# # HTML message doesn't have filename :-(
-# ext = fnext or guess_extension(ctype, fnext)
-# else:
-# ext = guess_extension(ctype, fnext)
-# if not ext:
-# # We don't know what it is, so assume it's just a shapeless
-# # application/octet-stream, unless the Content-Type: is
-# # message/rfc822, in which case we know we'll coerce the type to
-# # text/plain below.
-# if ctype == 'message/rfc822':
-# ext = '.txt'
-# else:
-# ext = '.bin'
-# # Allow only alphanumerics, dash, underscore, and dot
-# ext = sre.sub('', ext)
-# path = None
-# # We need a lock to calculate the next attachment number
-# lockfile = os.path.join(fsdir, 'attachments.lock')
-# lock = LockFile.LockFile(lockfile)
-# lock.lock()
-# try:
-# # Now base the filename on what's in the attachment, uniquifying it if
-# # necessary.
-# if not filename or mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME:
-# filebase = 'attachment'
-# else:
-# # Sanitize the filename given in the message headers
-# parts = pre.split(filename)
-# filename = parts[-1]
-# # Strip off leading dots
-# filename = dre.sub('', filename)
-# # Allow only alphanumerics, dash, underscore, and dot
-# filename = sre.sub('', filename)
-# # If the filename's extension doesn't match the type we guessed,
-# # which one should we go with? For now, let's go with the one we
-# # guessed so attachments can't lie about their type. Also, if the
-# # filename /has/ no extension, then tack on the one we guessed.
-# # The extension was removed from the name above.
-# filebase = filename
-# # Now we're looking for a unique name for this file on the file
-# # system. If msgdir/filebase.ext isn't unique, we'll add a counter
-# # after filebase, e.g. msgdir/filebase-cnt.ext
-# counter = 0
-# extra = ''
-# while True:
-# path = os.path.join(fsdir, filebase + extra + ext)
-# # Generally it is not a good idea to test for file existance
-# # before just trying to create it, but the alternatives aren't
-# # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
-# # NFS-safe). Besides, we have an exclusive lock now, so we're
-# # guaranteed that no other process will be racing with us.
-# if os.path.exists(path):
-# counter += 1
-# extra = '-%04d' % counter
-# else:
-# break
-# finally:
-# lock.unlock()
-# # `path' now contains the unique filename for the attachment. There's
-# # just one more step we need to do. If the part is text/html and
-# # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
-# # here), then send the attachment through the filter program for
-# # sanitization
-# if filter_html and ctype == 'text/html':
-# base, ext = os.path.splitext(path)
-# tmppath = base + '-tmp' + ext
-# fp = open(tmppath, 'w')
-# try:
-# fp.write(decodedpayload)
-# fp.close()
-# cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
-# progfp = os.popen(cmd, 'r')
-# decodedpayload = progfp.read()
-# status = progfp.close()
-# if status:
-# syslog('error',
-# 'HTML sanitizer exited with non-zero status: %s',
-# status)
-# finally:
-# os.unlink(tmppath)
-# # BAW: Since we've now sanitized the document, it should be plain
-# # text. Blarg, we really want the sanitizer to tell us what the type
-# # if the return data is. :(
-# ext = '.txt'
-# path = base + '.txt'
-# # Is it a message/rfc822 attachment?
-# elif ctype == 'message/rfc822':
-# submsg = msg.get_payload()
-# # BAW: I'm sure we can eventually do better than this. :(
-# decodedpayload = Utils.websafe(str(submsg))
-# fp = open(path, 'w')
-# fp.write(decodedpayload)
-# fp.close()
-# # Now calculate the url
-# baseurl = mlist.GetBaseArchiveURL()
-# # Private archives will likely have a trailing slash. Normalize.
-# if baseurl[-1] <> '/':
-# baseurl += '/'
-# # A trailing space in url string may save users who are using
-# # RFC-1738 compliant MUA (Not Mozilla).
-# # Trailing space will definitely be a problem with format=flowed.
-# # Bracket the URL instead.
-# url = '<' + baseurl + '%s/%s%s%s>' % (dir, filebase, extra, ext)
-# return url
+ def save_attachment(self, part, counter, filter_html=True):
+ # Store name, content-type and size
+ # Figure out the attachment type and get the decoded data
+ decodedpayload = part.get_payload(decode=True)
+ # BAW: mimetypes ought to handle non-standard, but commonly found types,
+ # e.g. image/jpg (should be image/jpeg). For now we just store such
+ # things as application/octet-streams since that seems the safest.
+ ctype = part.get_content_type()
+ # i18n file name is encoded
+ #lcset = Utils.GetCharSet(self.mlist.preferred_language)
+ lcset = "utf-8"
+ filename = oneline(part.get_filename(''), lcset)
+ filename, fnext = os.path.splitext(filename)
+ # For safety, we should confirm this is valid ext for content-type
+ # but we can use fnext if we introduce fnext filtering
+ # TODO: re-implement this
+ #if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
+ # # HTML message doesn't have filename :-(
+ # ext = fnext or guess_extension(ctype, fnext)
+ #else:
+ # ext = guess_extension(ctype, fnext)
+ ext = fnext or guess_extension(ctype, fnext)
+ if not ext:
+ # We don't know what it is, so assume it's just a shapeless
+ # application/octet-stream, unless the Content-Type: is
+ # message/rfc822, in which case we know we'll coerce the type to
+ # text/plain below.
+ if ctype == 'message/rfc822':
+ ext = '.txt'
+ else:
+ ext = '.bin'
+ # Allow only alphanumerics, dash, underscore, and dot
+ ext = sre.sub('', ext)
+ # Now base the filename on what's in the attachment, uniquifying it if
+ # necessary.
+ if not filename:
+ filebase = 'attachment'
+ else:
+ # Sanitize the filename given in the message headers
+ parts = pre.split(filename)
+ filename = parts[-1]
+ # Strip off leading dots
+ filename = dre.sub('', filename)
+ # Allow only alphanumerics, dash, underscore, and dot
+ filename = sre.sub('', filename)
+ # If the filename's extension doesn't match the type we guessed,
+ # which one should we go with? For now, let's go with the one we
+ # guessed so attachments can't lie about their type. Also, if the
+ # filename /has/ no extension, then tack on the one we guessed.
+ # The extension was removed from the name above.
+ filebase = filename
+ # TODO: bring back the HTML sanitizer feature
+ if ctype == 'message/rfc822':
+ submsg = part.get_payload()
+ # BAW: I'm sure we can eventually do better than this. :(
+ decodedpayload = websafe(str(submsg))
+ msg_id = self.msg['Message-Id'].strip("<>")
+ self.store.add_attachment(self.mlist, msg_id, counter, decodedpayload)
diff --git a/kittystore/storm/schema/__init__.py b/kittystore/storm/schema/__init__.py
index c301e88..daba29f 100644
--- a/kittystore/storm/schema/__init__.py
+++ b/kittystore/storm/schema/__init__.py
@@ -60,6 +60,16 @@ CREATES = {
"full" BYTEA NOT NULL,
archived_date TIMESTAMP WITHOUT TIME ZONE DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (list_name, message_id)
+ );""", """
+ CREATE TABLE "attachment" (
+ list_name VARCHAR(255) NOT NULL,
+ message_id VARCHAR(255) NOT NULL,
+ counter INTEGER NOT NULL,
+ content_type VARCHAR(255) NOT NULL,
+ name VARCHAR(255),
+ size INTEGER NOT NULL,
+ content BYTEA NOT NULL,
+ PRIMARY KEY (list_name, message_id, counter)
);""",
'CREATE INDEX "ix_email_list_name" ON "email" USING btree (list_name);',
'CREATE UNIQUE INDEX "ix_email_message_id" ON "email" USING btree (message_id);',