Refactor the scrubbing function

author: Aurélien Bompard <aurelien@bompard.org> 2012-09-25 19:40:20 +0200
committer: Aurélien Bompard <aurelien@bompard.org> 2012-09-25 19:40:20 +0200
commit: ca1967c915458c7e6b54a43767a8b50dea277fb9 (patch)
tree: fe04ad19824cafeb191e311467abae8a2bee08e4
parent: d723e64bdb3a39a16e9416ba608ad76bc8390e95 (diff)
download: kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.tar.gz
kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.tar.xz
kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.zip
2 files changed, 298 insertions, 449 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
index ac93baa..e7f4dde 100644
--- a/kittystore/scrub.py
+++ b/kittystore/scrub.py
@@ -25,6 +25,7 @@ import binascii
 import tempfile
 from cStringIO import StringIO
 from types import IntType, StringType
+from mimetypes import guess_all_extensions
 
 from email.Utils import parsedate
 from email.Parser import HeaderParser
@@ -39,106 +40,32 @@ from email.Charset import Charset
 #from Mailman.i18n import _
 #from Mailman.Logging.Syslog import syslog
 #from Mailman.Utils import sha_new
-from mailman.core.i18n import _
+
 from mailman.utilities.string import websafe, oneline
+# TODO: don't do translations here, the system locale has no meaning to the
+# web user
+from mailman.core.i18n import _
+
+# Path characters for common platforms
+pre = re.compile(r'[/\\:]')
+# All other characters to strip out of Content-Disposition: filenames
+# (essentially anything that isn't an alphanum, dot, dash, or underscore).
+sre = re.compile(r'[^-\w.]')
+# Regexp to strip out leading dots
+dre = re.compile(r'^\.*')
 
-## Path characters for common platforms
-#pre = re.compile(r'[/\\:]')
-## All other characters to strip out of Content-Disposition: filenames
-## (essentially anything that isn't an alphanum, dot, dash, or underscore).
-#sre = re.compile(r'[^-\w.]')
-## Regexp to strip out leading dots
-#dre = re.compile(r'^\.*')
-#
 BR = '<br>\n'
-#SPACE = ' '
-#
-#try:
-#    True, False
-#except NameError:
-#    True = 1
-#    False = 0
-#
-#
-#try:
-#    from mimetypes import guess_all_extensions
-#except ImportError:
-#    import mimetypes
-#    def guess_all_extensions(ctype, strict=True):
-#        # BAW: sigh, guess_all_extensions() is new in Python 2.3
-#        all = []
-#        def check(map):
-#            for e, t in map.items():
-#                if t == ctype:
-#                    all.append(e)
-#        check(mimetypes.types_map)
-#        # Python 2.1 doesn't have common_types.  Sigh, sigh.
-#        if not strict and hasattr(mimetypes, 'common_types'):
-#            check(mimetypes.common_types)
-#        return all
-#
-#
-#
-#def guess_extension(ctype, ext):
-#    # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
-#    # and .wiz are all mapped to application/msword.  This sucks for finding
-#    # the best reverse mapping.  If the extension is one of the giving
-#    # mappings, we'll trust that, otherwise we'll just guess. :/
-#    all = guess_all_extensions(ctype, strict=False)
-#    if ext in all:
-#        return ext
-#    return all and all[0]
-#
-#
-#def safe_strftime(fmt, t):
-#    try:
-#        return time.strftime(fmt, t)
-#    except (TypeError, ValueError, OverflowError):
-#        return None
-#
-#
-#def calculate_attachments_dir(mlist, msg, msgdata):
-#    # Calculate the directory that attachments for this message will go
-#    # under.  To avoid inode limitations, the scheme will be:
-#    # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
-#    # Start by calculating the date-based and msgid-hash components.
-#    fmt = '%Y%m%d'
-#    datestr = msg.get('Date')
-#    if datestr:
-#        now = parsedate(datestr)
-#    else:
-#        now = time.gmtime(msgdata.get('received_time', time.time()))
-#    datedir = safe_strftime(fmt, now)
-#    if not datedir:
-#        datestr = msgdata.get('X-List-Received-Date')
-#        if datestr:
-#            datedir = safe_strftime(fmt, datestr)
-#    if not datedir:
-#        # What next?  Unixfrom, I guess.
-#        parts = msg.get_unixfrom().split()
-#        try:
-#            month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
-#                     'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
-#                     }.get(parts[3], 0)
-#            day = int(parts[4])
-#            year = int(parts[6])
-#        except (IndexError, ValueError):
-#            # Best we can do I think
-#            month = day = year = 0
-#        datedir = '%04d%02d%02d' % (year, month, day)
-#    assert datedir
-#    # As for the msgid hash, we'll base this part on the Message-ID: so that
-#    # all attachments for the same message end up in the same directory (we'll
-#    # uniquify the filenames in that directory as needed).  We use the first 2
-#    # and last 2 bytes of the SHA1 hash of the message id as the basis of the
-#    # directory name.  Clashes here don't really matter too much, and that
-#    # still gives us a 32-bit space to work with.
-#    msgid = msg['message-id']
-#    if msgid is None:
-#        msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
-#    # We assume that the message id actually /is/ unique!
-#    digest = sha_new(msgid).hexdigest()
-#    return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
+
+
+def guess_extension(ctype, ext):
+    # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
+    # and .wiz are all mapped to application/msword.  This sucks for finding
+    # the best reverse mapping.  If the extension is one of the giving
+    # mappings, we'll trust that, otherwise we'll just guess. :/
+    all = guess_all_extensions(ctype, strict=False)
+    if ext in all:
+        return ext
+    return all and all[0]
 
 
 def replace_payload_by_text(msg, text, charset):
@@ -149,59 +76,64 @@ def replace_payload_by_text(msg, text, charset):
     #if isinstance(charset, unicode):
     #    # email 3.0.1 (python 2.4) doesn't like unicode
     #    charset = charset.encode('us-ascii')
-    msg.set_payload(text, charset)
+    #msg.set_payload(text, charset)
+    msg.set_payload('TODO: display attachment here and remove message subpart')
 
-def save_attachment(mlist, msg, filter_html=True):
-    # Store name, content-type and size
-    return "TODO: handle attachments and return a link here"
 
-def scrub_message(mlist, msg):
-    sanitize = 1 # TODO: implement other options
-    outer = True
-    charset = None
-    #lcset = Utils.GetCharSet(mlist.preferred_language)
-    #lcset_out = Charset(lcset).output_charset or lcset
-    lcset = "utf-8"
-    # Now walk over all subparts of this message and scrub out various types
-    format = delsp = None
-    for part in msg.walk():
-        ctype = part.get_content_type()
-        # If the part is text/plain, we leave it alone
-        if ctype == 'text/plain':
-            # We need to choose a charset for the scrubbed message, so we'll
-            # arbitrarily pick the charset of the first text/plain part in the
-            # message.
-            # MAS: Also get the RFC 3676 stuff from this part. This seems to
-            # work OK for scrub_nondigest.  It will also work as far as
-            # scrubbing messages for the archive is concerned, but pipermail
-            # doesn't pay any attention to the RFC 3676 parameters.  The plain
-            # format digest is going to be a disaster in any case as some of
-            # messages will be format="flowed" and some not.  ToDigest creates
-            # its own Content-Type: header for the plain digest which won't
-            # have RFC 3676 parameters. If the message Content-Type: headers
-            # are retained for display in the digest, the parameters will be
-            # there for information, but not for the MUA. This is the best we
-            # can do without having get_payload() process the parameters.
-            if charset is None:
-                charset = part.get_content_charset(lcset)
-                format = part.get_param('format')
-                delsp = part.get_param('delsp')
-            # TK: if part is attached then check charset and scrub if none
-            if part.get('content-disposition') and \
-               not part.get_content_charset():
-                omask = os.umask(002)
-                try:
-                    url = save_attachment(mlist, part)
-                finally:
-                    os.umask(omask)
-                filename = part.get_filename(_('not available'))
-                filename = oneline(filename, lcset)
-                replace_payload_by_text(part, _("""\
-An embedded and charset-unspecified text was scrubbed...
-Name: %(filename)s
-URL: %(url)s
-"""), lcset)
-        elif ctype == 'text/html' and isinstance(sanitize, IntType):
+
+class Scrubber(object):
+    """
+    Scrubs a single message, extracts attachments, and store them in the
+    database.
+    """
+
+    def __init__(self, mlist, msg, store):
+        self.mlist = mlist
+        self.msg = msg
+        self.store = store
+
+
+    def scrub(self):
+        sanitize = 1 # TODO: implement other options
+        outer = True
+        charset = None
+        #lcset = Utils.GetCharSet(self.mlist.preferred_language)
+        #lcset_out = Charset(lcset).output_charset or lcset
+        lcset = "utf-8"
+        # Now walk over all subparts of this message and scrub out various types
+        format = delsp = None
+        for part_num, part in enumerate(self.msg.walk()):
+            ctype = part.get_content_type()
+            # If the part is text/plain, we leave it alone
+            if ctype == 'text/plain':
+                # We need to choose a charset for the scrubbed message, so we'll
+                # arbitrarily pick the charset of the first text/plain part in the
+                # message.
+                # MAS: Also get the RFC 3676 stuff from this part. This seems to
+                # work OK for scrub_nondigest.  It will also work as far as
+                # scrubbing messages for the archive is concerned, but pipermail
+                # doesn't pay any attention to the RFC 3676 parameters.  The plain
+                # format digest is going to be a disaster in any case as some of
+                # messages will be format="flowed" and some not.  ToDigest creates
+                # its own Content-Type: header for the plain digest which won't
+                # have RFC 3676 parameters. If the message Content-Type: headers
+                # are retained for display in the digest, the parameters will be
+                # there for information, but not for the MUA. This is the best we
+                # can do without having get_payload() process the parameters.
+                if charset is None:
+                    charset = part.get_content_charset(lcset)
+                    format = part.get_param('format')
+                    delsp = part.get_param('delsp')
+                # TK: if part is attached then check charset and scrub if none
+                if part.get('content-disposition') and \
+                   not part.get_content_charset():
+                    self.save_attachment(part, part_num)
+                    replace_payload_by_text(part, _("""\
+    An embedded and charset-unspecified text was scrubbed...
+    Name: %(filename)s
+    URL: %(url)s
+    """), lcset)
+            elif ctype == 'text/html' and isinstance(sanitize, IntType):
 #            if sanitize == 0:
 #                if outer:
 #                    raise DiscardMessage
@@ -217,314 +149,221 @@ URL: %(url)s
 #                # Pull it out as an attachment but leave it unescaped.  This
 #                # is dangerous, but perhaps useful for heavily moderated
 #                # lists.
-#                omask = os.umask(002)
-#                try:
-#                    url = save_attachment(mlist, part, filter_html=False)
-#                finally:
-#                    os.umask(omask)
+#                self.save_attachment(part, part_num, filter_html=False)
 #                replace_payload_by_text(part, _("""\
 #An HTML attachment was scrubbed...
 #URL: %(url)s
 #"""), lcset)
 #            else:
-            if sanitize == 1:
-                # HTML-escape it and store it as an attachment, but make it
-                # look a /little/ bit prettier. :(
-                payload = websafe(part.get_payload(decode=True))
-                # For whitespace in the margin, change spaces into
-                # non-breaking spaces, and tabs into 8 of those.  Then use a
-                # mono-space font.  Still looks hideous to me, but then I'd
-                # just as soon discard them.
-                def doreplace(s):
-                    return s.expandtabs(8).replace(' ', '&nbsp;')
-                lines = [doreplace(s) for s in payload.split('\n')]
-                payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
-                part.set_payload(payload)
-                # We're replacing the payload with the decoded payload so this
-                # will just get in the way.
-                del part['content-transfer-encoding']
-                omask = os.umask(002)
-                try:
-                    url = save_attachment(mlist, part, filter_html=False)
-                finally:
-                    os.umask(omask)
+                if sanitize == 1:
+                    # HTML-escape it and store it as an attachment, but make it
+                    # look a /little/ bit prettier. :(
+                    payload = websafe(part.get_payload(decode=True))
+                    # For whitespace in the margin, change spaces into
+                    # non-breaking spaces, and tabs into 8 of those.  Then use a
+                    # mono-space font.  Still looks hideous to me, but then I'd
+                    # just as soon discard them.
+                    def doreplace(s):
+                        return s.expandtabs(8).replace(' ', '&nbsp;')
+                    lines = [doreplace(s) for s in payload.split('\n')]
+                    payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
+                    part.set_payload(payload)
+                    # We're replacing the payload with the decoded payload so this
+                    # will just get in the way.
+                    del part['content-transfer-encoding']
+                    self.save_attachment(part, part_num, filter_html=False)
+                    replace_payload_by_text(part, _("""\
+    An HTML attachment was scrubbed...
+    URL: %(url)s
+    """), lcset)
+            elif ctype == 'message/rfc822':
+                # This part contains a submessage, so it too needs scrubbing
+                submsg = part.get_payload(0)
+                self.save_attachment(part, part_num)
+                subject = submsg.get('subject', _('no subject'))
+                subject = oneline(subject, lcset)
+                date = submsg.get('date', _('no date'))
+                who = submsg.get('from', _('unknown sender'))
+                size = len(str(submsg))
                 replace_payload_by_text(part, _("""\
-An HTML attachment was scrubbed...
-URL: %(url)s
-"""), lcset)
-        elif ctype == 'message/rfc822':
-            # This part contains a submessage, so it too needs scrubbing
-            submsg = part.get_payload(0)
-            omask = os.umask(002)
-            try:
-                url = save_attachment(mlist, part)
-            finally:
-                os.umask(omask)
-            subject = submsg.get('subject', _('no subject'))
-            subject = oneline(subject, lcset)
-            date = submsg.get('date', _('no date'))
-            who = submsg.get('from', _('unknown sender'))
-            size = len(str(submsg))
-            replace_payload_by_text(part, _("""\
-An embedded message was scrubbed...
-From: %(who)s
-Subject: %(subject)s
-Date: %(date)s
-Size: %(size)s
-URL: %(url)s
-"""), lcset)
-        # If the message isn't a multipart, then we'll strip it out as an
-        # attachment that would have to be separately downloaded.  Pipermail
-        # will transform the url into a hyperlink.
-        elif part.get_payload() and not part.is_multipart():
-            payload = part.get_payload(decode=True)
-            ctype = part.get_content_type()
-            # XXX Under email 2.5, it is possible that payload will be None.
-            # This can happen when you have a Content-Type: multipart/* with
-            # only one part and that part has two blank lines between the
-            # first boundary and the end boundary.  In email 3.0 you end up
-            # with a string in the payload.  I think in this case it's safe to
-            # ignore the part.
-            if payload is None:
-                continue
-            size = len(payload)
-            omask = os.umask(002)
-            try:
-                url = save_attachment(mlist, part)
-            finally:
-                os.umask(omask)
-            desc = part.get('content-description', _('not available'))
-            desc = oneline(desc, lcset)
-            filename = part.get_filename(_('not available'))
-            filename = oneline(filename, lcset)
-            replace_payload_by_text(part, _("""\
-A non-text attachment was scrubbed...
-Name: %(filename)s
-Type: %(ctype)s
-Size: %(size)d bytes
-Desc: %(desc)s
-URL: %(url)s
-"""), lcset)
-        outer = False
-    # We still have to sanitize multipart messages to flat text because
-    # Pipermail can't handle messages with list payloads.  This is a kludge;
-    # def (n) clever hack ;).
-    if msg.is_multipart():
-        # By default we take the charset of the first text/plain part in the
-        # message, but if there was none, we'll use the list's preferred
-        # language's charset.
-        if not charset or charset == 'us-ascii':
-            charset = lcset_out
-        else:
-            # normalize to the output charset if input/output are different
-            charset = Charset(charset).output_charset or charset
-        # We now want to concatenate all the parts which have been scrubbed to
-        # text/plain, into a single text/plain payload.  We need to make sure
-        # all the characters in the concatenated string are in the same
-        # encoding, so we'll use the 'replace' key in the coercion call.
-        # BAW: Martin's original patch suggested we might want to try
-        # generalizing to utf-8, and that's probably a good idea (eventually).
-        text = []
-        for part in msg.walk():
-            # TK: bug-id 1099138 and multipart
-            # MAS test payload - if part may fail if there are no headers.
-            if not part.get_payload() or part.is_multipart():
-                continue
-            # All parts should be scrubbed to text/plain by now, except
-            # if sanitize == 2, there could be text/html parts so keep them
-            # but skip any other parts.
-            partctype = part.get_content_type()
-            if partctype <> 'text/plain' and (partctype <> 'text/html' or
-                                              sanitize <> 2):
-                text.append(_('Skipped content of type %(partctype)s\n'))
-                continue
-            try:
-                t = part.get_payload(decode=True) or ''
-            # MAS: TypeError exception can occur if payload is None. This
-            # was observed with a message that contained an attached
-            # message/delivery-status part. Because of the special parsing
-            # of this type, this resulted in a text/plain sub-part with a
-            # null body. See bug 1430236.
-            except (binascii.Error, TypeError):
-                t = part.get_payload() or ''
-            # TK: get_content_charset() returns 'iso-2022-jp' for internally
-            # crafted (scrubbed) 'euc-jp' text part. So, first try
-            # get_charset(), then get_content_charset() for the parts
-            # which are already embeded in the incoming message.
-            partcharset = part.get_charset()
-            if partcharset:
-                partcharset = str(partcharset)
+    An embedded message was scrubbed...
+    From: %(who)s
+    Subject: %(subject)s
+    Date: %(date)s
+    Size: %(size)s
+    URL: %(url)s
+    """), lcset)
+            # If the message isn't a multipart, then we'll strip it out as an
+            # attachment that would have to be separately downloaded.  Pipermail
+            # will transform the url into a hyperlink.
+            elif part.get_payload() and not part.is_multipart():
+                payload = part.get_payload(decode=True)
+                ctype = part.get_content_type()
+                # XXX Under email 2.5, it is possible that payload will be None.
+                # This can happen when you have a Content-Type: multipart/* with
+                # only one part and that part has two blank lines between the
+                # first boundary and the end boundary.  In email 3.0 you end up
+                # with a string in the payload.  I think in this case it's safe to
+                # ignore the part.
+                if payload is None:
+                    continue
+                size = len(payload)
+                self.save_attachment(part, part_num)
+                desc = part.get('content-description', _('not available'))
+                desc = oneline(desc, lcset)
+                filename = part.get_filename(_('not available'))
+                filename = oneline(filename, lcset)
+                replace_payload_by_text(part, _("""\
+    A non-text attachment was scrubbed...
+    Name: %(filename)s
+    Type: %(ctype)s
+    Size: %(size)d bytes
+    Desc: %(desc)s
+    URL: %(url)s
+    """), lcset)
+            outer = False
+        # We still have to sanitize multipart messages to flat text because
+        # Pipermail can't handle messages with list payloads.  This is a kludge;
+        # def (n) clever hack ;).
+        if self.msg.is_multipart():
+            # By default we take the charset of the first text/plain part in the
+            # message, but if there was none, we'll use the list's preferred
+            # language's charset.
+            if not charset or charset == 'us-ascii':
+                charset = lcset_out
             else:
-                partcharset = part.get_content_charset()
-            if partcharset and partcharset <> charset:
-                try:
-                    t = unicode(t, partcharset, 'replace')
-                except (UnicodeError, LookupError, ValueError,
-                        AssertionError):
-                    # We can get here if partcharset is bogus in come way.
-                    # Replace funny characters.  We use errors='replace'
-                    t = unicode(t, 'ascii', 'replace')
+                # normalize to the output charset if input/output are different
+                charset = Charset(charset).output_charset or charset
+            # We now want to concatenate all the parts which have been scrubbed to
+            # text/plain, into a single text/plain payload.  We need to make sure
+            # all the characters in the concatenated string are in the same
+            # encoding, so we'll use the 'replace' key in the coercion call.
+            # BAW: Martin's original patch suggested we might want to try
+            # generalizing to utf-8, and that's probably a good idea (eventually).
+            text = []
+            for part in self.msg.walk():
+                # TK: bug-id 1099138 and multipart
+                # MAS test payload - if part may fail if there are no headers.
+                if not part.get_payload() or part.is_multipart():
+                    continue
+                # All parts should be scrubbed to text/plain by now, except
+                # if sanitize == 2, there could be text/html parts so keep them
+                # but skip any other parts.
+                partctype = part.get_content_type()
+                if partctype <> 'text/plain' and (partctype <> 'text/html' or
+                                                  sanitize <> 2):
+                    text.append(_('Skipped content of type %(partctype)s\n'))
+                    continue
                 try:
-                    # Should use HTML-Escape, or try generalizing to UTF-8
-                    t = t.encode(charset, 'replace')
-                except (UnicodeError, LookupError, ValueError,
-                        AssertionError):
-                    # if the message charset is bogus, use the list's.
-                    t = t.encode(lcset, 'replace')
-            # Separation is useful
-            if isinstance(t, StringType):
-                if not t.endswith('\n'):
-                    t += '\n'
-                text.append(t)
-        # Now join the text and set the payload
-        sep = _('-------------- next part --------------\n')
-        # The i18n separator is in the list's charset. Coerce it to the
-        # message charset.
-        try:
-            sep = sep.encode(charset, 'replace')
-        except (UnicodeError, LookupError, ValueError,
-                AssertionError):
-            pass
-        replace_payload_by_text(msg, sep.join(text), charset)
-        if format:
-            msg.set_param('Format', format)
-        if delsp:
-            msg.set_param('DelSp', delsp)
-    return msg
+                    t = part.get_payload(decode=True) or ''
+                # MAS: TypeError exception can occur if payload is None. This
+                # was observed with a message that contained an attached
+                # message/delivery-status part. Because of the special parsing
+                # of this type, this resulted in a text/plain sub-part with a
+                # null body. See bug 1430236.
+                except (binascii.Error, TypeError):
+                    t = part.get_payload() or ''
+                # TK: get_content_charset() returns 'iso-2022-jp' for internally
+                # crafted (scrubbed) 'euc-jp' text part. So, first try
+                # get_charset(), then get_content_charset() for the parts
+                # which are already embeded in the incoming message.
+                partcharset = part.get_charset()
+                if partcharset:
+                    partcharset = str(partcharset)
+                else:
+                    partcharset = part.get_content_charset()
+                if partcharset and partcharset <> charset:
+                    try:
+                        t = unicode(t, partcharset, 'replace')
+                    except (UnicodeError, LookupError, ValueError,
+                            AssertionError):
+                        # We can get here if partcharset is bogus in come way.
+                        # Replace funny characters.  We use errors='replace'
+                        t = unicode(t, 'ascii', 'replace')
+                    try:
+                        # Should use HTML-Escape, or try generalizing to UTF-8
+                        t = t.encode(charset, 'replace')
+                    except (UnicodeError, LookupError, ValueError,
+                            AssertionError):
+                        # if the message charset is bogus, use the list's.
+                        t = t.encode(lcset, 'replace')
+                # Separation is useful
+                if isinstance(t, StringType):
+                    if not t.endswith('\n'):
+                        t += '\n'
+                    text.append(t)
+            # Now join the text and set the payload
+            sep = _('-------------- next part --------------\n')
+            # The i18n separator is in the list's charset. Coerce it to the
+            # message charset.
+            try:
+                sep = sep.encode(charset, 'replace')
+            except (UnicodeError, LookupError, ValueError,
+                    AssertionError):
+                pass
+            replace_payload_by_text(self.msg, sep.join(text), charset)
+            if format:
+                self.msg.set_param('Format', format)
+            if delsp:
+                self.msg.set_param('DelSp', delsp)
+        return self.msg
 
 
-#
-#def makedirs(dir):
-#    # Create all the directories to store this attachment in
-#    try:
-#        os.makedirs(dir, 02775)
-#        # Unfortunately, FreeBSD seems to be broken in that it doesn't honor
-#        # the mode arg of mkdir().
-#        def twiddle(arg, dirname, names):
-#            os.chmod(dirname, 02775)
-#        os.path.walk(dir, twiddle, None)
-#    except OSError, e:
-#        if e.errno <> errno.EEXIST: raise
-#
-#
-#
-#def save_attachment(mlist, msg, dir, filter_html=True):
-#    fsdir = os.path.join(mlist.archive_dir(), dir)
-#    makedirs(fsdir)
-#    # Figure out the attachment type and get the decoded data
-#    decodedpayload = msg.get_payload(decode=True)
-#    # BAW: mimetypes ought to handle non-standard, but commonly found types,
-#    # e.g. image/jpg (should be image/jpeg).  For now we just store such
-#    # things as application/octet-streams since that seems the safest.
-#    ctype = msg.get_content_type()
-#    # i18n file name is encoded
-#    lcset = Utils.GetCharSet(mlist.preferred_language)
-#    filename = Utils.oneline(msg.get_filename(''), lcset)
-#    filename, fnext = os.path.splitext(filename)
-#    # For safety, we should confirm this is valid ext for content-type
-#    # but we can use fnext if we introduce fnext filtering
-#    if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
-#        # HTML message doesn't have filename :-(
-#        ext = fnext or guess_extension(ctype, fnext)
-#    else:
-#        ext = guess_extension(ctype, fnext)
-#    if not ext:
-#        # We don't know what it is, so assume it's just a shapeless
-#        # application/octet-stream, unless the Content-Type: is
-#        # message/rfc822, in which case we know we'll coerce the type to
-#        # text/plain below.
-#        if ctype == 'message/rfc822':
-#            ext = '.txt'
-#        else:
-#            ext = '.bin'
-#    # Allow only alphanumerics, dash, underscore, and dot
-#    ext = sre.sub('', ext)
-#    path = None
-#    # We need a lock to calculate the next attachment number
-#    lockfile = os.path.join(fsdir, 'attachments.lock')
-#    lock = LockFile.LockFile(lockfile)
-#    lock.lock()
-#    try:
-#        # Now base the filename on what's in the attachment, uniquifying it if
-#        # necessary.
-#        if not filename or mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME:
-#            filebase = 'attachment'
-#        else:
-#            # Sanitize the filename given in the message headers
-#            parts = pre.split(filename)
-#            filename = parts[-1]
-#            # Strip off leading dots
-#            filename = dre.sub('', filename)
-#            # Allow only alphanumerics, dash, underscore, and dot
-#            filename = sre.sub('', filename)
-#            # If the filename's extension doesn't match the type we guessed,
-#            # which one should we go with?  For now, let's go with the one we
-#            # guessed so attachments can't lie about their type.  Also, if the
-#            # filename /has/ no extension, then tack on the one we guessed.
-#            # The extension was removed from the name above.
-#            filebase = filename
-#        # Now we're looking for a unique name for this file on the file
-#        # system.  If msgdir/filebase.ext isn't unique, we'll add a counter
-#        # after filebase, e.g. msgdir/filebase-cnt.ext
-#        counter = 0
-#        extra = ''
-#        while True:
-#            path = os.path.join(fsdir, filebase + extra + ext)
-#            # Generally it is not a good idea to test for file existance
-#            # before just trying to create it, but the alternatives aren't
-#            # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
-#            # NFS-safe).  Besides, we have an exclusive lock now, so we're
-#            # guaranteed that no other process will be racing with us.
-#            if os.path.exists(path):
-#                counter += 1
-#                extra = '-%04d' % counter
-#            else:
-#                break
-#    finally:
-#        lock.unlock()
-#    # `path' now contains the unique filename for the attachment.  There's
-#    # just one more step we need to do.  If the part is text/html and
-#    # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
-#    # here), then send the attachment through the filter program for
-#    # sanitization
-#    if filter_html and ctype == 'text/html':
-#        base, ext = os.path.splitext(path)
-#        tmppath = base + '-tmp' + ext
-#        fp = open(tmppath, 'w')
-#        try:
-#            fp.write(decodedpayload)
-#            fp.close()
-#            cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
-#            progfp = os.popen(cmd, 'r')
-#            decodedpayload = progfp.read()
-#            status = progfp.close()
-#            if status:
-#                syslog('error',
-#                       'HTML sanitizer exited with non-zero status: %s',
-#                       status)
-#        finally:
-#            os.unlink(tmppath)
-#        # BAW: Since we've now sanitized the document, it should be plain
-#        # text.  Blarg, we really want the sanitizer to tell us what the type
-#        # if the return data is. :(
-#        ext = '.txt'
-#        path = base + '.txt'
-#    # Is it a message/rfc822 attachment?
-#    elif ctype == 'message/rfc822':
-#        submsg = msg.get_payload()
-#        # BAW: I'm sure we can eventually do better than this. :(
-#        decodedpayload = Utils.websafe(str(submsg))
-#    fp = open(path, 'w')
-#    fp.write(decodedpayload)
-#    fp.close()
-#    # Now calculate the url
-#    baseurl = mlist.GetBaseArchiveURL()
-#    # Private archives will likely have a trailing slash.  Normalize.
-#    if baseurl[-1] <> '/':
-#        baseurl += '/'
-#    # A trailing space in url string may save users who are using
-#    # RFC-1738 compliant MUA (Not Mozilla).
-#    # Trailing space will definitely be a problem with format=flowed.
-#    # Bracket the URL instead.
-#    url = '<' + baseurl + '%s/%s%s%s>' % (dir, filebase, extra, ext)
-#    return url
+    def save_attachment(self, part, counter, filter_html=True):
+        """Decode one MIME part and hand its payload to the store.
+
+        part        -- the message part to save; it must provide
+                       get_payload(), get_content_type() and get_filename().
+        counter     -- passed through unchanged to store.add_attachment();
+                       presumably the attachment's index within the message
+                       (TODO confirm against the caller).
+        filter_html -- currently unused: the HTML sanitizer has not been
+                       re-implemented yet (see the TODO below).
+
+        Returns nothing.  The content type, the sanitized file name and the
+        extension are computed here but are not yet passed to the store; only
+        the list, the message id, the counter and the payload are.
+        """
+        # Store name, content-type and size
+        # Figure out the attachment type and get the decoded data
+        decodedpayload = part.get_payload(decode=True)
+        # BAW: mimetypes ought to handle non-standard, but commonly found types,
+        # e.g. image/jpg (should be image/jpeg).  For now we just store such
+        # things as application/octet-streams since that seems the safest.
+        ctype = part.get_content_type()
+        # i18n file name is encoded
+        #lcset = Utils.GetCharSet(self.mlist.preferred_language)
+        lcset = "utf-8"
+        # A missing file name defaults to '' so that splitext() below always
+        # receives a string.
+        filename = oneline(part.get_filename(''), lcset)
+        # Split the extension off; from here on `filename' is the base name
+        # only and `fnext' is the extension (including its leading dot).
+        filename, fnext = os.path.splitext(filename)
+        # For safety, we should confirm this is valid ext for content-type
+        # but we can use fnext if we introduce fnext filtering
+        # TODO: re-implement this
+        #if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
+        #    # HTML message doesn't have filename :-(
+        #    ext = fnext or guess_extension(ctype, fnext)
+        #else:
+        #    ext = guess_extension(ctype, fnext)
+        # Prefer the extension found in the file name and only guess one from
+        # the content type when the name has none.
+        # NOTE(review): guess_extension is presumably a helper defined
+        # elsewhere in this module; the imports visible at the top of this
+        # change only bring in mimetypes.guess_all_extensions -- confirm.
+        ext = fnext or guess_extension(ctype, fnext)
+        if not ext:
+            # We don't know what it is, so assume it's just a shapeless
+            # application/octet-stream, unless the Content-Type: is
+            # message/rfc822, in which case we know we'll coerce the type to
+            # text/plain below.
+            if ctype == 'message/rfc822':
+                ext = '.txt'
+            else:
+                ext = '.bin'
+        # Allow only alphanumerics, dash, underscore, and dot
+        ext = sre.sub('', ext)
+        # Now base the filename on what's in the attachment, uniquifying it if
+        # necessary.
+        if not filename:
+            filebase = 'attachment'
+        else:
+            # Sanitize the filename given in the message headers
+            # Keep only the last path component, so that directory parts in
+            # the header-supplied name are dropped.
+            parts = pre.split(filename)
+            filename = parts[-1]
+            # Strip off leading dots
+            filename = dre.sub('', filename)
+            # Allow only alphanumerics, dash, underscore, and dot
+            filename = sre.sub('', filename)
+            # The extension was removed from the name above.
+            # NOTE(review): the comment inherited from Mailman said the guessed
+            # extension wins "so attachments can't lie about their type", but
+            # the assignment to `ext' above prefers the extension taken from
+            # the file name and only guesses when there is none.
+            filebase = filename
+        # TODO: bring back the HTML sanitizer feature
+        if ctype == 'message/rfc822':
+            # For an attached message, replace the decoded payload by the
+            # string form of the sub-message passed through websafe()
+            # (presumably HTML-escaping -- confirm in mailman.utilities.string).
+            submsg = part.get_payload()
+            # BAW: I'm sure we can eventually do better than this. :(
+            decodedpayload = websafe(str(submsg))
+        # Remove the surrounding angle brackets from the Message-Id header.
+        # NOTE(review): this fails with AttributeError if the header lookup
+        # returns None; presumably callers guarantee a Message-Id -- confirm.
+        msg_id = self.msg['Message-Id'].strip("<>")
+        # NOTE(review): ctype, filebase and ext are not passed on here, although
+        # the first comment of this method mentions storing the name and the
+        # content-type.
+        self.store.add_attachment(self.mlist, msg_id, counter, decodedpayload)
diff --git a/kittystore/storm/schema/__init__.py b/kittystore/storm/schema/__init__.py
index c301e88..daba29f 100644
--- a/kittystore/storm/schema/__init__.py
+++ b/kittystore/storm/schema/__init__.py
@@ -60,6 +60,16 @@ CREATES = {
             "full" BYTEA NOT NULL,
             archived_date TIMESTAMP WITHOUT TIME ZONE DEFAULT CURRENT_TIMESTAMP,
             PRIMARY KEY (list_name, message_id)
+        );""", """
+        CREATE TABLE "attachment" (
+            list_name VARCHAR(255) NOT NULL,
+            message_id VARCHAR(255) NOT NULL,
+            counter INTEGER NOT NULL,
+            content_type VARCHAR(255) NOT NULL,
+            name VARCHAR(255),
+            size INTEGER NOT NULL,
+            content BYTEA NOT NULL,
+            PRIMARY KEY (list_name, message_id, counter)
         );""",
         'CREATE INDEX "ix_email_list_name" ON "email" USING btree (list_name);',
         'CREATE UNIQUE INDEX "ix_email_message_id" ON "email" USING btree (message_id);',
author	Aurélien Bompard <aurelien@bompard.org>	2012-09-25 19:40:20 +0200
committer	Aurélien Bompard <aurelien@bompard.org>	2012-09-25 19:40:20 +0200
commit	ca1967c915458c7e6b54a43767a8b50dea277fb9 (patch)
tree	fe04ad19824cafeb191e311467abae8a2bee08e4
parent	d723e64bdb3a39a16e9416ba608ad76bc8390e95 (diff)
download	kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.tar.gz kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.tar.xz kittystore-ca1967c915458c7e6b54a43767a8b50dea277fb9.zip