summaryrefslogtreecommitdiffstats
path: root/kittystore/scrub.py
blob: 2a54954f5f32ba9d096b306be95faa41eaf9568e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
# Copyright (C) 2011-2012 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""Cleanse a message for archiving."""

import os
import re
import binascii
from types import IntType
from mimetypes import guess_all_extensions

from email.utils import unquote

from mailman.utilities.string import oneline

# Path separator characters for common platforms (slash, backslash, colon).
# parse_attachment() splits a supplied filename on these and keeps only the
# last component.
pre = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
sre = re.compile(r'[^-\w.]')
# Regexp matching any run of leading dots, so they can be stripped from a
# filename.
dre = re.compile(r'^\.*')

# HTML line break followed by a newline.  NOTE(review): only referenced from
# commented-out code in the visible part of this module.
BR = '<br>\n'

# The "next part" separator line.  Scrubber.scrub() truncates the text of a
# single-part message at the first occurrence of this line.
NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')


def guess_extension(ctype, ext):
    """
    Choose a file extension for the MIME type ``ctype``.

    mimetypes maps several extensions to one type (e.g. .doc, .dot and .wiz
    are all application/msword), so there is no single best reverse mapping.
    When the candidate ``ext`` is one of the extensions registered for the
    type it is trusted and returned unchanged; otherwise the first registered
    extension is returned as a guess.

    When no extension at all is registered for ``ctype`` the empty list that
    mimetypes produced is returned, so callers must test the result for
    truthiness rather than assume a string.
    """
    candidates = guess_all_extensions(ctype, strict=False)
    if not candidates:
        # Unknown type: hand back the (falsy) empty list itself.
        return candidates
    return ext if ext in candidates else candidates[0]


def get_charset(message, default="ascii", guess=False):
    """
    Get the message charset.
    From: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/

    The charset is looked up, in order, in:

    1. the ``charset`` parameter of the Content-Type header;
    2. the Charset object attached to the message, if any;
    3. when ``guess`` is true, the decoded payload itself: the first of
       ascii, utf-8 and iso-8859-15 that decodes it without error wins.

    ``message`` is an email message object.  ``default`` is returned when no
    charset is declared and guessing is disabled or impossible (it may be
    None).  The result is always a charset name, never a Charset object.
    """
    charset = message.get_content_charset()
    if charset:
        return charset
    charset = message.get_charset()
    if charset:
        # get_charset() returns an email.charset.Charset instance rather than
        # a string.  Callers feed the result to decoding functions, which
        # need the charset *name*, so convert it here.
        return str(charset)
    if not guess:
        return default
    # Try to guess the encoding (best effort mode)
    text = message.get_payload(decode=True)
    if text is None:
        # Nothing to inspect (get_payload(decode=True) returns None for
        # multipart containers, for instance), so there is nothing to guess.
        return default
    for encoding in ("ascii", "utf-8", "iso-8859-15"):
        try:
            text.decode(encoding)
        except UnicodeDecodeError:
            continue
        return encoding
    return default


class Scrubber(object):
    """
    Scrubs a single message, extracts attachments, and return the text and the
    attachments.
    See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/

    Typical use is ``text, attachments = Scrubber(mlist, msg).scrub()``.
    Note that scrubbing modifies ``msg`` in place: the payload of every part
    extracted as a text, HTML or message/rfc822 attachment is emptied.
    """

    def __init__(self, mlist, msg):
        """
        Remember the mailing list and the message to scrub.

        ``mlist`` is only stored on the instance; ``msg`` is the email
        message object that scrub() will walk and modify.
        """
        self.mlist = mlist
        self.msg = msg


    def scrub(self):
        """
        Extract the attachments and the flat text of the message.

        Returns a ``(text, attachments)`` tuple.  ``text`` is the unicode
        body: for a multipart message, the concatenation of all remaining
        text/plain parts; for a single-part message, its decoded payload cut
        at the first "next part" separator line.  ``attachments`` is a list
        of the tuples produced by parse_attachment(), one per extracted part.
        """
        attachments = []
        sanitize = 1 # TODO: implement other options
        # Now walk over all subparts of this message and scrub out various types
        for part_num, part in enumerate(self.msg.walk()):
            ctype = part.get_content_type()
            if ctype == 'text/plain':
                # Plain text is left alone, unless it was explicitly sent as
                # an attachment.
                disposition = part.get('content-disposition')
                if disposition and disposition.strip().startswith("attachment"):
                    attachments.append(self.parse_attachment(part, part_num))
                    part.set_payload('')
            elif ctype == 'text/html' and isinstance(sanitize, int):
                # Only sanitize mode 1 is implemented.  TODO: implement the
                # remaining modes:
                #   0: replace the part with an "HTML attachment scrubbed and
                #      removed" notice (discarding the whole message when the
                #      HTML part is the outermost one);
                #   2: leave the HTML part in place;
                #   3: extract it as an attachment and replace the payload
                #      with a notice giving the attachment URL.
                if sanitize == 1:
                    # Store the HTML as an attachment.  Don't HTML-escape it,
                    # this is the frontend's job.
                    attachments.append(self.parse_attachment(part, part_num, filter_html=False))
                    part.set_payload('')
            elif ctype == 'message/rfc822':
                # This part contains a submessage, so it too needs scrubbing.
                # Emptying the payload also keeps walk() from descending into
                # the submessage.
                attachments.append(self.parse_attachment(part, part_num))
                part.set_payload('')
            # If the message isn't a multipart, then we'll strip it out as an
            # attachment that would have to be separately downloaded.
            elif part.get_payload() and not part.is_multipart():
                # XXX Under email 2.5, it is possible that payload will be None.
                # This can happen when you have a Content-Type: multipart/* with
                # only one part and that part has two blank lines between the
                # first boundary and the end boundary.  In email 3.0 you end up
                # with a string in the payload.  I think in this case it's safe to
                # ignore the part.
                if part.get_payload(decode=True) is None:
                    continue
                attachments.append(self.parse_attachment(part, part_num))
        # We still have to sanitize multipart messages to flat text because
        # Pipermail can't handle messages with list payloads.  This is a kludge;
        # def (n) clever hack ;).
        if self.msg.is_multipart():
            # Concatenate all the parts which are still text/plain into a
            # single text.  Every part is decoded to unicode with
            # errors='replace' so that the pieces can be joined safely
            # whatever their original encoding was.
            text = []
            for part in self.msg.walk():
                # TK: bug-id 1099138 and multipart
                # MAS test payload - if part may fail if there are no headers.
                if not part.get_payload() or part.is_multipart():
                    continue
                # All parts should be scrubbed to text/plain by now, except
                # if sanitize == 2, there could be text/html parts so keep them
                # but skip any other parts.
                partctype = part.get_content_type()
                if partctype != 'text/plain' and (partctype != 'text/html' or
                                                  sanitize != 2):
                    continue
                try:
                    t = part.get_payload(decode=True) or ''
                # MAS: TypeError exception can occur if payload is None. This
                # was observed with a message that contained an attached
                # message/delivery-status part. Because of the special parsing
                # of this type, this resulted in a text/plain sub-part with a
                # null body. See bug 1430236.
                except (binascii.Error, TypeError):
                    t = part.get_payload() or ''
                partcharset = get_charset(part, guess=True)
                try:
                    t = unicode(t, partcharset, 'replace')
                except (UnicodeError, LookupError, ValueError,
                        AssertionError):
                    # We can get here if partcharset is bogus in some way.
                    # Fall back to ASCII, replacing funny characters.
                    t = unicode(t, 'ascii', 'replace')
                # Separation is useful
                if not t.endswith('\n'):
                    t += '\n'
                text.append(t)

            text = "\n".join(text)
        else:
            raw = self.msg.get_payload(decode=True)
            if not raw:
                # No body at all (the payload may even be None for a message
                # built without one): there is nothing to decode.
                text = u''
            else:
                charset = get_charset(self.msg, guess=True)
                try:
                    text = raw.decode(charset, "replace")
                except (UnicodeError, LookupError, ValueError,
                        AssertionError):
                    # Same fallback as for the parts of a multipart message:
                    # a bogus declared charset must not abort the archiving.
                    text = raw.decode('ascii', "replace")

            # Drop everything from the first "next part" separator onwards.
            next_part_match = NEXT_PART.search(text)
            if next_part_match:
                text = text[:next_part_match.start(0)]

        return (text, attachments)


    def parse_attachment(self, part, counter, filter_html=True):
        """
        Describe one message part as an attachment.

        ``part`` is the message part to extract and ``counter`` its position
        in the message walk.  ``filter_html`` is accepted for compatibility
        but currently unused (TODO: bring back the HTML sanitizer feature).

        Returns a ``(counter, filename, content_type, charset, payload)``
        tuple.  ``filename`` is a sanitised name built from the name given
        in the part's headers (or "attachment") plus an extension;
        ``charset`` is the declared charset or None; ``payload`` is the
        decoded content, or the text of the contained message for
        message/rfc822 parts.
        """
        # Figure out the attachment type and get the decoded data
        decodedpayload = part.get_payload(decode=True)
        # BAW: mimetypes ought to handle non-standard, but commonly found types,
        # e.g. image/jpg (should be image/jpeg).  For now we just store such
        # things as application/octet-streams since that seems the safest.
        ctype = part.get_content_type()
        charset = get_charset(part, default=None, guess=False)
        # i18n file name is encoded
        try:
            filename = oneline(part.get_filename(''), in_unicode=True)
        except TypeError:
            # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
            # (accented filenames)
            filename = "attachment.bin"
        filename, fnext = os.path.splitext(filename)
        # The extension of the supplied filename wins; the content type is
        # only used to guess one when the filename has none.
        # TODO: re-implement the configuration switch that allowed ignoring
        # the supplied extension in favour of the guessed one.
        ext = fnext or guess_extension(ctype, fnext)
        if not ext:
            # We don't know what it is, so assume it's just a shapeless
            # application/octet-stream, unless the Content-Type: is
            # message/rfc822, in which case the payload is stored as text
            # below.
            ext = '.txt' if ctype == 'message/rfc822' else '.bin'
        # Allow only alphanumerics, dash, underscore, and dot
        ext = sre.sub('', ext)
        # Now base the filename on what's in the attachment.
        if filename:
            # Sanitize the filename given in the message headers: keep only
            # the last path component...
            filename = pre.split(filename)[-1]
            # ...strip off leading dots...
            filename = dre.sub('', filename)
            # ...and allow only alphanumerics, dash, underscore, and dot.
            # i18n filenames are not supported yet,
            # see https://bugs.launchpad.net/bugs/1060951
            filename = sre.sub('', filename)
        # Sanitising may have removed every character of the supplied name;
        # fall back to the generic name rather than produce a bare extension.
        filebase = filename or 'attachment'
        if ctype == 'message/rfc822':
            submsg = part.get_payload()
            # Don't HTML-escape it, this is the frontend's job
            if isinstance(submsg, list):
                # The payload of a message/rfc822 part is a list of message
                # objects; str() of the list would only give its repr, so
                # convert each contained message instead.
                decodedpayload = ''.join([str(sub) for sub in submsg])
            else:
                decodedpayload = str(submsg)
        return (counter, filebase + ext, ctype, charset, decodedpayload)