# Copyright (C) 2011-2012 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
"""Cleanse a message for archiving."""
import os
import re
import binascii
from types import IntType
from mimetypes import guess_all_extensions
from email.utils import unquote
from mailman.utilities.string import oneline
# Path separator characters for common platforms (POSIX, Windows, classic
# Mac); used to keep only the last path component of a supplied filename.
pre = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
sre = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots (so a scrubbed name is never a dotfile).
dre = re.compile(r'^\.*')
# HTML line break used when joining lines of an HTML-escaped payload.  The
# literal had lost its "<br>" markup and was split over two lines, which
# left an unterminated string; it is restored here as a single literal.
BR = '<br>\n'
# Separator line that introduces an attachment section in already-scrubbed
# text; matching it lets callers cut the text off at that point.
NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')
def guess_extension(ctype, ext):
    """Choose a file extension for content type *ctype*.

    mimetypes maps several extensions to the same type (e.g. .doc, .dot
    and .wiz all map to application/msword), which makes a reliable
    reverse mapping impossible.  If *ext* is one of the extensions known
    for *ctype* it is trusted and returned unchanged; otherwise the first
    known extension is returned as a best guess.

    When no extension is known for *ctype* the (falsy) empty list that
    mimetypes produced is returned, so callers should test the result for
    truthiness.
    """
    candidates = guess_all_extensions(ctype, strict=False)
    if ext in candidates:
        return ext
    if not candidates:
        # Nothing known for this type: hand back the empty list itself.
        return candidates
    return candidates[0]
def get_charset(message, default="ascii", guess=False):
    """
    Get the message charset.

    From: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/

    Lookup order:

    1. ``message.get_content_charset()`` (the Content-Type charset);
    2. ``message.get_charset()``, converted to a plain string;
    3. if *guess* is true, the first of ascii, utf-8 and iso-8859-15 that
       decodes the transfer-decoded payload without error;
    4. otherwise *default*.

    *default* is returned as-is (it may be None).  It is also returned
    when *guess* is true but the message has no decodable payload.
    """
    declared = message.get_content_charset()
    if declared:
        return declared
    declared = message.get_charset()
    if declared:
        # For stdlib email messages this is an email.charset.Charset
        # object rather than a codec name; str() yields the name, and is a
        # no-op if a string was returned.
        return str(declared)
    if not guess:
        return default
    # Try to guess the encoding (best effort mode)
    text = message.get_payload(decode=True)
    if text is None:
        # No decodable payload (stdlib messages return None here for
        # multipart containers): there is nothing to guess from.
        return default
    for encoding in ("ascii", "utf-8", "iso-8859-15"):
        try:
            text.decode(encoding)
        except UnicodeDecodeError:
            continue
        return encoding
    return default
class Scrubber(object):
    """
    Scrubs a single message: extracts its attachments and returns the
    remaining text together with those attachments.

    See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
    """

    def __init__(self, mlist, msg):
        """Store the mailing list and the message to scrub.

        *mlist* is only stored; no method of this class reads it.
        *msg* must provide the message API used below (walk(),
        is_multipart(), get_payload(), get_content_type(), ...).
        """
        self.mlist = mlist
        self.msg = msg

    def scrub(self):
        """Extract the attachments and return ``(text, attachments)``.

        *text* is the decoded text of the message, cut off at the first
        "next part" separator line if one is present.  *attachments* is a
        list of the tuples produced by parse_attachment().

        Note that the payload of every part that is pulled out as an
        attachment from a text/plain, text/html or message/rfc822 part is
        replaced by an empty string, i.e. ``self.msg`` is modified.
        """
        attachments = []
        sanitize = 1  # TODO: implement other options
        # Walk over all subparts of this message and scrub out various
        # types.
        for part_num, part in enumerate(self.msg.walk()):
            ctype = part.get_content_type()
            if ctype == 'text/plain':
                # text/plain is left alone unless it is explicitly
                # flagged as an attachment.
                disposition = part.get('content-disposition')
                if disposition and disposition.strip().startswith("attachment"):
                    attachments.append(self.parse_attachment(part, part_num))
                    part.set_payload('')
            elif ctype == 'text/html' and isinstance(sanitize, int):
                # Only mode 1 is implemented.  The modes that were planned
                # but are not available (TODO):
                #   0: discard the message, or replace the HTML part by a
                #      "HTML attachment scrubbed and removed" notice;
                #   2: leave the HTML part in place;
                #   3: store the HTML unescaped as an attachment and
                #      replace the part by a notice containing its URL.
                if sanitize == 1:
                    # Don't HTML-escape it, this is the frontend's job:
                    # the part is stored untouched as an attachment.
                    attachments.append(self.parse_attachment(
                        part, part_num, filter_html=False))
                    part.set_payload('')
            elif ctype == 'message/rfc822':
                # This part contains a submessage; store it as an
                # attachment.  Emptying the payload also keeps walk() from
                # descending into the submessage.
                attachments.append(self.parse_attachment(part, part_num))
                part.set_payload('')
            elif part.get_payload() and not part.is_multipart():
                # Any other non-multipart part is stripped out as an
                # attachment that has to be downloaded separately.
                #
                # XXX Under email 2.5, it is possible that payload will be
                # None.  This can happen when you have a Content-Type:
                # multipart/* with only one part and that part has two
                # blank lines between the first boundary and the end
                # boundary.  In email 3.0 you end up with a string in the
                # payload.  In this case it is safe to ignore the part.
                if part.get_payload(decode=True) is None:
                    continue
                attachments.append(self.parse_attachment(part, part_num))

        # Multipart messages still have to be flattened to a single text
        # because Pipermail can't handle messages with list payloads.
        if self.msg.is_multipart():
            # Concatenate all the parts which are still text/plain.  Every
            # part is decoded to unicode with errors='replace' so that
            # the concatenated result has a single representation.
            text = []
            for part in self.msg.walk():
                # TK: bug-id 1099138 and multipart
                # MAS: test the payload first; "if part" may fail if there
                # are no headers.
                if not part.get_payload() or part.is_multipart():
                    continue
                # All parts should be scrubbed to text/plain by now,
                # except if sanitize == 2, in which case text/html parts
                # are kept too.  Any other part is skipped.
                partctype = part.get_content_type()
                if partctype != 'text/plain' and (partctype != 'text/html' or
                                                  sanitize != 2):
                    continue
                try:
                    t = part.get_payload(decode=True) or ''
                # MAS: TypeError exception can occur if payload is None.
                # This was observed with a message that contained an
                # attached message/delivery-status part.  Because of the
                # special parsing of this type, this resulted in a
                # text/plain sub-part with a null body.  See bug 1430236.
                except (binascii.Error, TypeError):
                    t = part.get_payload() or ''
                partcharset = get_charset(part, guess=True)
                try:
                    t = unicode(t, partcharset, 'replace')
                except (UnicodeError, LookupError, ValueError,
                        AssertionError):
                    # We can get here if partcharset is bogus in some way.
                    # Replace funny characters.  We use errors='replace'
                    t = unicode(t, 'ascii', 'replace')
                # Separation is useful
                if isinstance(t, basestring):
                    if not t.endswith('\n'):
                        t += '\n'
                    text.append(t)
            text = "\n".join(text)
        else:
            text = self.msg.get_payload(decode=True)
            charset = get_charset(self.msg, guess=True)
            try:
                text = text.decode(charset, "replace")
            except (UnicodeError, LookupError, ValueError,
                    AssertionError):
                # The declared charset is bogus (e.g. an unknown codec
                # name).  Fall back to ASCII exactly like the multipart
                # branch above instead of failing on the whole message.
                text = text.decode('ascii', "replace")

        # Drop everything from the first "next part" separator onwards.
        next_part_match = NEXT_PART.search(text)
        if next_part_match:
            text = text[0:next_part_match.start(0)]
        return (text, attachments)

    def parse_attachment(self, part, counter, filter_html=True):
        """Describe *part* as an attachment.

        Returns the tuple ``(counter, name, content_type, charset,
        payload)`` where *name* is a sanitized file name (only
        alphanumerics, dash, underscore and dot), *charset* is the
        declared charset of the part or None, and *payload* is the
        transfer-decoded payload (for message/rfc822 parts: the str() of
        the part's payload).

        *counter* is passed through unchanged.  *filter_html* is accepted
        for compatibility but currently unused (TODO: bring back the HTML
        sanitizer feature).
        """
        # Figure out the attachment type and get the decoded data
        decodedpayload = part.get_payload(decode=True)
        # BAW: mimetypes ought to handle non-standard, but commonly found
        # types, e.g. image/jpg (should be image/jpeg).  For now we just
        # store such things as application/octet-streams since that seems
        # the safest.
        ctype = part.get_content_type()
        charset = get_charset(part, default=None, guess=False)
        # i18n file name is encoded
        try:
            filename = oneline(part.get_filename(''), in_unicode=True)
        except TypeError:
            # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
            # (accented filenames)
            filename = "attachment.bin"
        filename, fnext = os.path.splitext(filename)
        # The extension given in the message wins; only when there is none
        # do we guess one from the content type.
        # TODO: re-implement the option to always use the guessed
        # extension so that attachments can't lie about their type.
        ext = fnext or guess_extension(ctype, fnext)
        if not ext:
            # We don't know what it is, so assume it's just a shapeless
            # application/octet-stream, unless the Content-Type: is
            # message/rfc822, which is stored as text below.
            if ctype == 'message/rfc822':
                ext = '.txt'
            else:
                ext = '.bin'
        # Allow only alphanumerics, dash, underscore, and dot
        ext = sre.sub('', ext)
        # Now base the filename on what's in the attachment.
        filebase = ''
        if filename:
            # Sanitize the filename given in the message headers: keep
            # only the last path component ...
            filebase = pre.split(filename)[-1]
            # ... strip off leading dots ...
            filebase = dre.sub('', filebase)
            # ... and allow only alphanumerics, dash, underscore, and dot.
            # i18n filenames are not supported yet,
            # see https://bugs.launchpad.net/bugs/1060951
            filebase = sre.sub('', filebase)
        if not filebase:
            # Either no filename was given, or sanitizing removed every
            # character of it (e.g. a name made only of stripped
            # characters); never return a name that is just an extension.
            filebase = 'attachment'
        if ctype == 'message/rfc822':
            submsg = part.get_payload()
            # Don't HTML-escape it, this is the frontend's job.
            # NOTE(review): for a parsed message/rfc822 part the payload
            # is presumably a list holding the inner message, in which
            # case str() gives the list repr rather than the message
            # source -- confirm and flatten the submessage if so.
            decodedpayload = str(submsg)
        return (counter, filebase + ext, ctype, charset, decodedpayload)