Port the scrubbing function from Mailman 2.1

The function works without crashing, I still need to implement save_attachment(), add the required DB tables and wire everything together.
author: Aurélien Bompard <aurelien@bompard.org> 2012-09-19 19:00:08 +0200
committer: Aurélien Bompard <aurelien@bompard.org> 2012-09-24 22:49:51 +0200
commit: ffb9af409c8ebda208803cfdc2c20fe6ec68b76d (patch)
tree: 8e8b4576c74bb89945ed6f2ec0bee3ad516bc72f
parent: 69b1a92df2a3c35ae872100d6607d782db86df2f (diff)
download: kittystore-ffb9af409c8ebda208803cfdc2c20fe6ec68b76d.tar.gz
kittystore-ffb9af409c8ebda208803cfdc2c20fe6ec68b76d.tar.xz
kittystore-ffb9af409c8ebda208803cfdc2c20fe6ec68b76d.zip
2 files changed, 533 insertions, 2 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
new file mode 100644
index 0000000..ac93baa
--- /dev/null
+++ b/kittystore/scrub.py
@@ -0,0 +1,530 @@
+# Copyright (C) 2001-2011 by the Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+# USA.
+
+"""Cleanse a message for archiving."""
+
+import os
+import re
+import time
+import errno
+import binascii
+import tempfile
+from cStringIO import StringIO
+from types import IntType, StringType
+
+from email.Utils import parsedate
+from email.Parser import HeaderParser
+from email.Generator import Generator
+from email.Charset import Charset
+
+#from Mailman import mm_cfg
+#from Mailman import Utils
+#from Mailman import LockFile
+#from Mailman import Message
+#from Mailman.Errors import DiscardMessage
+#from Mailman.i18n import _
+#from Mailman.Logging.Syslog import syslog
+#from Mailman.Utils import sha_new
+from mailman.core.i18n import _
+from mailman.utilities.string import websafe, oneline
+
+## Path characters for common platforms
+#pre = re.compile(r'[/\\:]')
+## All other characters to strip out of Content-Disposition: filenames
+## (essentially anything that isn't an alphanum, dot, dash, or underscore).
+#sre = re.compile(r'[^-\w.]')
+## Regexp to strip out leading dots
+#dre = re.compile(r'^\.*')
+#
+BR = '<br>\n'
+#SPACE = ' '
+#
+#try:
+#    True, False
+#except NameError:
+#    True = 1
+#    False = 0
+#
+#
+#try:
+#    from mimetypes import guess_all_extensions
+#except ImportError:
+#    import mimetypes
+#    def guess_all_extensions(ctype, strict=True):
+#        # BAW: sigh, guess_all_extensions() is new in Python 2.3
+#        all = []
+#        def check(map):
+#            for e, t in map.items():
+#                if t == ctype:
+#                    all.append(e)
+#        check(mimetypes.types_map)
+#        # Python 2.1 doesn't have common_types.  Sigh, sigh.
+#        if not strict and hasattr(mimetypes, 'common_types'):
+#            check(mimetypes.common_types)
+#        return all
+#
+#
+#
+#def guess_extension(ctype, ext):
+#    # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
+#    # and .wiz are all mapped to application/msword.  This sucks for finding
+#    # the best reverse mapping.  If the extension is one of the giving
+#    # mappings, we'll trust that, otherwise we'll just guess. :/
+#    all = guess_all_extensions(ctype, strict=False)
+#    if ext in all:
+#        return ext
+#    return all and all[0]
+#
+#
+#def safe_strftime(fmt, t):
+#    try:
+#        return time.strftime(fmt, t)
+#    except (TypeError, ValueError, OverflowError):
+#        return None
+#
+#
+#def calculate_attachments_dir(mlist, msg, msgdata):
+#    # Calculate the directory that attachments for this message will go
+#    # under.  To avoid inode limitations, the scheme will be:
+#    # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
+#    # Start by calculating the date-based and msgid-hash components.
+#    fmt = '%Y%m%d'
+#    datestr = msg.get('Date')
+#    if datestr:
+#        now = parsedate(datestr)
+#    else:
+#        now = time.gmtime(msgdata.get('received_time', time.time()))
+#    datedir = safe_strftime(fmt, now)
+#    if not datedir:
+#        datestr = msgdata.get('X-List-Received-Date')
+#        if datestr:
+#            datedir = safe_strftime(fmt, datestr)
+#    if not datedir:
+#        # What next?  Unixfrom, I guess.
+#        parts = msg.get_unixfrom().split()
+#        try:
+#            month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
+#                     'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
+#                     }.get(parts[3], 0)
+#            day = int(parts[4])
+#            year = int(parts[6])
+#        except (IndexError, ValueError):
+#            # Best we can do I think
+#            month = day = year = 0
+#        datedir = '%04d%02d%02d' % (year, month, day)
+#    assert datedir
+#    # As for the msgid hash, we'll base this part on the Message-ID: so that
+#    # all attachments for the same message end up in the same directory (we'll
+#    # uniquify the filenames in that directory as needed).  We use the first 2
+#    # and last 2 bytes of the SHA1 hash of the message id as the basis of the
+#    # directory name.  Clashes here don't really matter too much, and that
+#    # still gives us a 32-bit space to work with.
+#    msgid = msg['message-id']
+#    if msgid is None:
+#        msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
+#    # We assume that the message id actually /is/ unique!
+#    digest = sha_new(msgid).hexdigest()
+#    return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
+
+
+def replace_payload_by_text(msg, text, charset):
+    # TK: This is a common function in replacing the attachment and the main
+    # message by a text (scrubbing).
+    del msg['content-type']
+    del msg['content-transfer-encoding']
+    #if isinstance(charset, unicode):
+    #    # email 3.0.1 (python 2.4) doesn't like unicode
+    #    charset = charset.encode('us-ascii')
+    msg.set_payload(text, charset)
+
+def save_attachment(mlist, msg, filter_html=True):
+    # Store name, content-type and size
+    return "TODO: handle attachments and return a link here"
+
+def scrub_message(mlist, msg):
+    sanitize = 1 # TODO: implement other options
+    outer = True
+    charset = None
+    #lcset = Utils.GetCharSet(mlist.preferred_language)
+    #lcset_out = Charset(lcset).output_charset or lcset
+    lcset = "utf-8"
+    # Now walk over all subparts of this message and scrub out various types
+    format = delsp = None
+    for part in msg.walk():
+        ctype = part.get_content_type()
+        # If the part is text/plain, we leave it alone
+        if ctype == 'text/plain':
+            # We need to choose a charset for the scrubbed message, so we'll
+            # arbitrarily pick the charset of the first text/plain part in the
+            # message.
+            # MAS: Also get the RFC 3676 stuff from this part. This seems to
+            # work OK for scrub_nondigest.  It will also work as far as
+            # scrubbing messages for the archive is concerned, but pipermail
+            # doesn't pay any attention to the RFC 3676 parameters.  The plain
+            # format digest is going to be a disaster in any case as some of
+            # messages will be format="flowed" and some not.  ToDigest creates
+            # its own Content-Type: header for the plain digest which won't
+            # have RFC 3676 parameters. If the message Content-Type: headers
+            # are retained for display in the digest, the parameters will be
+            # there for information, but not for the MUA. This is the best we
+            # can do without having get_payload() process the parameters.
+            if charset is None:
+                charset = part.get_content_charset(lcset)
+                format = part.get_param('format')
+                delsp = part.get_param('delsp')
+            # TK: if part is attached then check charset and scrub if none
+            if part.get('content-disposition') and \
+               not part.get_content_charset():
+                omask = os.umask(002)
+                try:
+                    url = save_attachment(mlist, part)
+                finally:
+                    os.umask(omask)
+                filename = part.get_filename(_('not available'))
+                filename = oneline(filename, lcset)
+                replace_payload_by_text(part, _("""\
+An embedded and charset-unspecified text was scrubbed...
+Name: %(filename)s
+URL: %(url)s
+"""), lcset)
+        elif ctype == 'text/html' and isinstance(sanitize, IntType):
+#            if sanitize == 0:
+#                if outer:
+#                    raise DiscardMessage
+#                replace_payload_by_text(part,
+#                                 _('HTML attachment scrubbed and removed'),
+#                                 # Adding charset arg and removing content-type
+#                                 # sets content-type to text/plain
+#                                 lcset)
+#            elif sanitize == 2:
+#                # By leaving it alone, Pipermail will automatically escape it
+#                pass
+#            elif sanitize == 3:
+#                # Pull it out as an attachment but leave it unescaped.  This
+#                # is dangerous, but perhaps useful for heavily moderated
+#                # lists.
+#                omask = os.umask(002)
+#                try:
+#                    url = save_attachment(mlist, part, filter_html=False)
+#                finally:
+#                    os.umask(omask)
+#                replace_payload_by_text(part, _("""\
+#An HTML attachment was scrubbed...
+#URL: %(url)s
+#"""), lcset)
+#            else:
+            if sanitize == 1:
+                # HTML-escape it and store it as an attachment, but make it
+                # look a /little/ bit prettier. :(
+                payload = websafe(part.get_payload(decode=True))
+                # For whitespace in the margin, change spaces into
+                # non-breaking spaces, and tabs into 8 of those.  Then use a
+                # mono-space font.  Still looks hideous to me, but then I'd
+                # just as soon discard them.
+                def doreplace(s):
+                    return s.expandtabs(8).replace(' ', '&nbsp;')
+                lines = [doreplace(s) for s in payload.split('\n')]
+                payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
+                part.set_payload(payload)
+                # We're replacing the payload with the decoded payload so this
+                # will just get in the way.
+                del part['content-transfer-encoding']
+                omask = os.umask(002)
+                try:
+                    url = save_attachment(mlist, part, filter_html=False)
+                finally:
+                    os.umask(omask)
+                replace_payload_by_text(part, _("""\
+An HTML attachment was scrubbed...
+URL: %(url)s
+"""), lcset)
+        elif ctype == 'message/rfc822':
+            # This part contains a submessage, so it too needs scrubbing
+            submsg = part.get_payload(0)
+            omask = os.umask(002)
+            try:
+                url = save_attachment(mlist, part)
+            finally:
+                os.umask(omask)
+            subject = submsg.get('subject', _('no subject'))
+            subject = oneline(subject, lcset)
+            date = submsg.get('date', _('no date'))
+            who = submsg.get('from', _('unknown sender'))
+            size = len(str(submsg))
+            replace_payload_by_text(part, _("""\
+An embedded message was scrubbed...
+From: %(who)s
+Subject: %(subject)s
+Date: %(date)s
+Size: %(size)s
+URL: %(url)s
+"""), lcset)
+        # If the message isn't a multipart, then we'll strip it out as an
+        # attachment that would have to be separately downloaded.  Pipermail
+        # will transform the url into a hyperlink.
+        elif part.get_payload() and not part.is_multipart():
+            payload = part.get_payload(decode=True)
+            ctype = part.get_content_type()
+            # XXX Under email 2.5, it is possible that payload will be None.
+            # This can happen when you have a Content-Type: multipart/* with
+            # only one part and that part has two blank lines between the
+            # first boundary and the end boundary.  In email 3.0 you end up
+            # with a string in the payload.  I think in this case it's safe to
+            # ignore the part.
+            if payload is None:
+                continue
+            size = len(payload)
+            omask = os.umask(002)
+            try:
+                url = save_attachment(mlist, part)
+            finally:
+                os.umask(omask)
+            desc = part.get('content-description', _('not available'))
+            desc = oneline(desc, lcset)
+            filename = part.get_filename(_('not available'))
+            filename = oneline(filename, lcset)
+            replace_payload_by_text(part, _("""\
+A non-text attachment was scrubbed...
+Name: %(filename)s
+Type: %(ctype)s
+Size: %(size)d bytes
+Desc: %(desc)s
+URL: %(url)s
+"""), lcset)
+        outer = False
+    # We still have to sanitize multipart messages to flat text because
+    # Pipermail can't handle messages with list payloads.  This is a kludge;
+    # def (n) clever hack ;).
+    if msg.is_multipart():
+        # By default we take the charset of the first text/plain part in the
+        # message, but if there was none, we'll use the list's preferred
+        # language's charset.
+        if not charset or charset == 'us-ascii':
+            charset = lcset_out
+        else:
+            # normalize to the output charset if input/output are different
+            charset = Charset(charset).output_charset or charset
+        # We now want to concatenate all the parts which have been scrubbed to
+        # text/plain, into a single text/plain payload.  We need to make sure
+        # all the characters in the concatenated string are in the same
+        # encoding, so we'll use the 'replace' key in the coercion call.
+        # BAW: Martin's original patch suggested we might want to try
+        # generalizing to utf-8, and that's probably a good idea (eventually).
+        text = []
+        for part in msg.walk():
+            # TK: bug-id 1099138 and multipart
+            # MAS test payload - if part may fail if there are no headers.
+            if not part.get_payload() or part.is_multipart():
+                continue
+            # All parts should be scrubbed to text/plain by now, except
+            # if sanitize == 2, there could be text/html parts so keep them
+            # but skip any other parts.
+            partctype = part.get_content_type()
+            if partctype <> 'text/plain' and (partctype <> 'text/html' or
+                                              sanitize <> 2):
+                text.append(_('Skipped content of type %(partctype)s\n'))
+                continue
+            try:
+                t = part.get_payload(decode=True) or ''
+            # MAS: TypeError exception can occur if payload is None. This
+            # was observed with a message that contained an attached
+            # message/delivery-status part. Because of the special parsing
+            # of this type, this resulted in a text/plain sub-part with a
+            # null body. See bug 1430236.
+            except (binascii.Error, TypeError):
+                t = part.get_payload() or ''
+            # TK: get_content_charset() returns 'iso-2022-jp' for internally
+            # crafted (scrubbed) 'euc-jp' text part. So, first try
+            # get_charset(), then get_content_charset() for the parts
+            # which are already embeded in the incoming message.
+            partcharset = part.get_charset()
+            if partcharset:
+                partcharset = str(partcharset)
+            else:
+                partcharset = part.get_content_charset()
+            if partcharset and partcharset <> charset:
+                try:
+                    t = unicode(t, partcharset, 'replace')
+                except (UnicodeError, LookupError, ValueError,
+                        AssertionError):
+                    # We can get here if partcharset is bogus in come way.
+                    # Replace funny characters.  We use errors='replace'
+                    t = unicode(t, 'ascii', 'replace')
+                try:
+                    # Should use HTML-Escape, or try generalizing to UTF-8
+                    t = t.encode(charset, 'replace')
+                except (UnicodeError, LookupError, ValueError,
+                        AssertionError):
+                    # if the message charset is bogus, use the list's.
+                    t = t.encode(lcset, 'replace')
+            # Separation is useful
+            if isinstance(t, StringType):
+                if not t.endswith('\n'):
+                    t += '\n'
+                text.append(t)
+        # Now join the text and set the payload
+        sep = _('-------------- next part --------------\n')
+        # The i18n separator is in the list's charset. Coerce it to the
+        # message charset.
+        try:
+            sep = sep.encode(charset, 'replace')
+        except (UnicodeError, LookupError, ValueError,
+                AssertionError):
+            pass
+        replace_payload_by_text(msg, sep.join(text), charset)
+        if format:
+            msg.set_param('Format', format)
+        if delsp:
+            msg.set_param('DelSp', delsp)
+    return msg
+
+
+#
+#def makedirs(dir):
+#    # Create all the directories to store this attachment in
+#    try:
+#        os.makedirs(dir, 02775)
+#        # Unfortunately, FreeBSD seems to be broken in that it doesn't honor
+#        # the mode arg of mkdir().
+#        def twiddle(arg, dirname, names):
+#            os.chmod(dirname, 02775)
+#        os.path.walk(dir, twiddle, None)
+#    except OSError, e:
+#        if e.errno <> errno.EEXIST: raise
+#
+#
+#
+#def save_attachment(mlist, msg, dir, filter_html=True):
+#    fsdir = os.path.join(mlist.archive_dir(), dir)
+#    makedirs(fsdir)
+#    # Figure out the attachment type and get the decoded data
+#    decodedpayload = msg.get_payload(decode=True)
+#    # BAW: mimetypes ought to handle non-standard, but commonly found types,
+#    # e.g. image/jpg (should be image/jpeg).  For now we just store such
+#    # things as application/octet-streams since that seems the safest.
+#    ctype = msg.get_content_type()
+#    # i18n file name is encoded
+#    lcset = Utils.GetCharSet(mlist.preferred_language)
+#    filename = Utils.oneline(msg.get_filename(''), lcset)
+#    filename, fnext = os.path.splitext(filename)
+#    # For safety, we should confirm this is valid ext for content-type
+#    # but we can use fnext if we introduce fnext filtering
+#    if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
+#        # HTML message doesn't have filename :-(
+#        ext = fnext or guess_extension(ctype, fnext)
+#    else:
+#        ext = guess_extension(ctype, fnext)
+#    if not ext:
+#        # We don't know what it is, so assume it's just a shapeless
+#        # application/octet-stream, unless the Content-Type: is
+#        # message/rfc822, in which case we know we'll coerce the type to
+#        # text/plain below.
+#        if ctype == 'message/rfc822':
+#            ext = '.txt'
+#        else:
+#            ext = '.bin'
+#    # Allow only alphanumerics, dash, underscore, and dot
+#    ext = sre.sub('', ext)
+#    path = None
+#    # We need a lock to calculate the next attachment number
+#    lockfile = os.path.join(fsdir, 'attachments.lock')
+#    lock = LockFile.LockFile(lockfile)
+#    lock.lock()
+#    try:
+#        # Now base the filename on what's in the attachment, uniquifying it if
+#        # necessary.
+#        if not filename or mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME:
+#            filebase = 'attachment'
+#        else:
+#            # Sanitize the filename given in the message headers
+#            parts = pre.split(filename)
+#            filename = parts[-1]
+#            # Strip off leading dots
+#            filename = dre.sub('', filename)
+#            # Allow only alphanumerics, dash, underscore, and dot
+#            filename = sre.sub('', filename)
+#            # If the filename's extension doesn't match the type we guessed,
+#            # which one should we go with?  For now, let's go with the one we
+#            # guessed so attachments can't lie about their type.  Also, if the
+#            # filename /has/ no extension, then tack on the one we guessed.
+#            # The extension was removed from the name above.
+#            filebase = filename
+#        # Now we're looking for a unique name for this file on the file
+#        # system.  If msgdir/filebase.ext isn't unique, we'll add a counter
+#        # after filebase, e.g. msgdir/filebase-cnt.ext
+#        counter = 0
+#        extra = ''
+#        while True:
+#            path = os.path.join(fsdir, filebase + extra + ext)
+#            # Generally it is not a good idea to test for file existance
+#            # before just trying to create it, but the alternatives aren't
+#            # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
+#            # NFS-safe).  Besides, we have an exclusive lock now, so we're
+#            # guaranteed that no other process will be racing with us.
+#            if os.path.exists(path):
+#                counter += 1
+#                extra = '-%04d' % counter
+#            else:
+#                break
+#    finally:
+#        lock.unlock()
+#    # `path' now contains the unique filename for the attachment.  There's
+#    # just one more step we need to do.  If the part is text/html and
+#    # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
+#    # here), then send the attachment through the filter program for
+#    # sanitization
+#    if filter_html and ctype == 'text/html':
+#        base, ext = os.path.splitext(path)
+#        tmppath = base + '-tmp' + ext
+#        fp = open(tmppath, 'w')
+#        try:
+#            fp.write(decodedpayload)
+#            fp.close()
+#            cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
+#            progfp = os.popen(cmd, 'r')
+#            decodedpayload = progfp.read()
+#            status = progfp.close()
+#            if status:
+#                syslog('error',
+#                       'HTML sanitizer exited with non-zero status: %s',
+#                       status)
+#        finally:
+#            os.unlink(tmppath)
+#        # BAW: Since we've now sanitized the document, it should be plain
+#        # text.  Blarg, we really want the sanitizer to tell us what the type
+#        # if the return data is. :(
+#        ext = '.txt'
+#        path = base + '.txt'
+#    # Is it a message/rfc822 attachment?
+#    elif ctype == 'message/rfc822':
+#        submsg = msg.get_payload()
+#        # BAW: I'm sure we can eventually do better than this. :(
+#        decodedpayload = Utils.websafe(str(submsg))
+#    fp = open(path, 'w')
+#    fp.write(decodedpayload)
+#    fp.close()
+#    # Now calculate the url
+#    baseurl = mlist.GetBaseArchiveURL()
+#    # Private archives will likely have a trailing slash.  Normalize.
+#    if baseurl[-1] <> '/':
+#        baseurl += '/'
+#    # A trailing space in url string may save users who are using
+#    # RFC-1738 compliant MUA (Not Mozilla).
+#    # Trailing space will definitely be a problem with format=flowed.
+#    # Bracket the URL instead.
+#    url = '<' + baseurl + '%s/%s%s%s>' % (dir, filebase, extra, ext)
+#    return url
diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py
index 8bb3872..ebec34f 100644
--- a/kittystore/storm/store.py
+++ b/kittystore/storm/store.py
@@ -17,8 +17,9 @@ from __future__ import absolute_import
 import datetime
 
 from kittystore import MessageNotFound
-from kittystore.utils import get_message_id_hash, parseaddr, parsedate
+from kittystore.utils import parseaddr, parsedate
 from kittystore.utils import header_to_unicode, payload_to_unicode
+from kittystore.scrub import scrub_message
 from kittystore.utils import get_ref_and_thread_id
 
 from zope.interface import implements
@@ -104,7 +105,7 @@ class StormStore(object):
         email.sender_name = from_name
         email.sender_email = unicode(from_email)
         email.subject = header_to_unicode(message.get('Subject'))
-        payload = payload_to_unicode(message)
+        payload = payload_to_unicode(scrub_message(list_name, message))
         email.content = payload
         email.date = parsedate(message.get("Date"))
         if email.date is None:
author	Aurélien Bompard <aurelien@bompard.org>	2012-09-19 19:00:08 +0200
committer	Aurélien Bompard <aurelien@bompard.org>	2012-09-24 22:49:51 +0200
commit	ffb9af409c8ebda208803cfdc2c20fe6ec68b76d (patch)
tree	8e8b4576c74bb89945ed6f2ec0bee3ad516bc72f
parent	69b1a92df2a3c35ae872100d6607d782db86df2f (diff)
download	kittystore-ffb9af409c8ebda208803cfdc2c20fe6ec68b76d.tar.gz kittystore-ffb9af409c8ebda208803cfdc2c20fe6ec68b76d.tar.xz kittystore-ffb9af409c8ebda208803cfdc2c20fe6ec68b76d.zip