diff options
Diffstat (limited to 'kittystore/scrub.py')
-rw-r--r-- | kittystore/scrub.py | 97 |
1 files changed, 6 insertions, 91 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py index a463ac3..0b11963 100644 --- a/kittystore/scrub.py +++ b/kittystore/scrub.py @@ -96,43 +96,15 @@ class Scrubber(object): def scrub(self): sanitize = 1 # TODO: implement other options outer = True - charset = None - #lcset = Utils.GetCharSet(self.mlist.preferred_language) - #lcset_out = Charset(lcset).output_charset or lcset - lcset = "utf-8" # Now walk over all subparts of this message and scrub out various types - format = delsp = None for part_num, part in enumerate(self.msg.walk()): ctype = part.get_content_type() # If the part is text/plain, we leave it alone if ctype == 'text/plain': - # We need to choose a charset for the scrubbed message, so we'll - # arbitrarily pick the charset of the first text/plain part in the - # message. - # MAS: Also get the RFC 3676 stuff from this part. This seems to - # work OK for scrub_nondigest. It will also work as far as - # scrubbing messages for the archive is concerned, but pipermail - # doesn't pay any attention to the RFC 3676 parameters. The plain - # format digest is going to be a disaster in any case as some of - # messages will be format="flowed" and some not. ToDigest creates - # its own Content-Type: header for the plain digest which won't - # have RFC 3676 parameters. If the message Content-Type: headers - # are retained for display in the digest, the parameters will be - # there for information, but not for the MUA. This is the best we - # can do without having get_payload() process the parameters. - if charset is None: - charset = part.get_content_charset(lcset) - format = part.get_param('format') - delsp = part.get_param('delsp') # TK: if part is attached then check charset and scrub if none if part.get('content-disposition') and \ not part.get_content_charset(): self.save_attachment(part, part_num) - replace_payload_by_text(part, _("""\ - An embedded and charset-unspecified text was scrubbed... - Name: %(filename)s - URL: %(url)s - """), lcset) elif ctype == 'text/html' and isinstance(sanitize, IntType): # if sanitize == 0: # if outer: @@ -172,30 +144,14 @@ class Scrubber(object): # will just get in the way. del part['content-transfer-encoding'] self.save_attachment(part, part_num, filter_html=False) - replace_payload_by_text(part, _("""\ - An HTML attachment was scrubbed... - URL: %(url)s - """), lcset) + part.set_payload('') elif ctype == 'message/rfc822': # This part contains a submessage, so it too needs scrubbing submsg = part.get_payload(0) self.save_attachment(part, part_num) - subject = submsg.get('subject', _('no subject')) - subject = oneline(subject, lcset) - date = submsg.get('date', _('no date')) - who = submsg.get('from', _('unknown sender')) - size = len(str(submsg)) - replace_payload_by_text(part, _("""\ - An embedded message was scrubbed... - From: %(who)s - Subject: %(subject)s - Date: %(date)s - Size: %(size)s - URL: %(url)s - """), lcset) + part.set_payload('') # If the message isn't a multipart, then we'll strip it out as an - # attachment that would have to be separately downloaded. Pipermail - # will transform the url into a hyperlink. + # attachment that would have to be separately downloaded. elif part.get_payload() and not part.is_multipart(): payload = part.get_payload(decode=True) ctype = part.get_content_type() @@ -209,31 +165,11 @@ class Scrubber(object): continue size = len(payload) self.save_attachment(part, part_num) - desc = part.get('content-description', _('not available')) - desc = oneline(desc, lcset) - filename = part.get_filename(_('not available')) - filename = oneline(filename, lcset) - replace_payload_by_text(part, _("""\ - A non-text attachment was scrubbed... - Name: %(filename)s - Type: %(ctype)s - Size: %(size)d bytes - Desc: %(desc)s - URL: %(url)s - """), lcset) outer = False # We still have to sanitize multipart messages to flat text because # Pipermail can't handle messages with list payloads. This is a kludge; # def (n) clever hack ;). if self.msg.is_multipart(): - # By default we take the charset of the first text/plain part in the - # message, but if there was none, we'll use the list's preferred - # language's charset. - if not charset or charset == 'us-ascii': - charset = lcset_out - else: - # normalize to the output charset if input/output are different - charset = Charset(charset).output_charset or charset # We now want to concatenate all the parts which have been scrubbed to # text/plain, into a single text/plain payload. We need to make sure # all the characters in the concatenated string are in the same @@ -272,7 +208,7 @@ class Scrubber(object): partcharset = str(partcharset) else: partcharset = part.get_content_charset() - if partcharset and partcharset <> charset: + if partcharset: try: t = unicode(t, partcharset, 'replace') except (UnicodeError, LookupError, ValueError, @@ -280,36 +216,15 @@ class Scrubber(object): # We can get here if partcharset is bogus in come way. # Replace funny characters. We use errors='replace' t = unicode(t, 'ascii', 'replace') - try: - # Should use HTML-Escape, or try generalizing to UTF-8 - t = t.encode(charset, 'replace') - except (UnicodeError, LookupError, ValueError, - AssertionError): - # if the message charset is bogus, use the list's. - t = t.encode(lcset, 'replace') # Separation is useful - if isinstance(t, StringType): + if isinstance(t, basestring): if not t.endswith('\n'): t += '\n' text.append(t) # Now join the text and set the payload sep = _('-------------- next part --------------\n') - # The i18n separator is in the list's charset. Coerce it to the - # message charset. - try: - sep = sep.encode(charset, 'replace') - except (UnicodeError, LookupError, ValueError, - AssertionError): - pass text = sep.join(text) - del self.msg['content-type'] - del self.msg['content-transfer-encoding'] - self.msg.set_payload(text, charset) - if format: - self.msg.set_param('Format', format) - if delsp: - self.msg.set_param('DelSp', delsp) - return text.decode(charset) + return text def save_attachment(self, part, counter, filter_html=True): |