summary refs log tree commit diff stats
path: root/kittystore/scrub.py
diff options
context:
space:
mode:
Diffstat (limited to 'kittystore/scrub.py')
-rw-r--r-- kittystore/scrub.py 97
1 files changed, 6 insertions, 91 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
index a463ac3..0b11963 100644
--- a/kittystore/scrub.py
+++ b/kittystore/scrub.py
@@ -96,43 +96,15 @@ class Scrubber(object):
def scrub(self):
sanitize = 1 # TODO: implement other options
outer = True
- charset = None
- #lcset = Utils.GetCharSet(self.mlist.preferred_language)
- #lcset_out = Charset(lcset).output_charset or lcset
- lcset = "utf-8"
# Now walk over all subparts of this message and scrub out various types
- format = delsp = None
for part_num, part in enumerate(self.msg.walk()):
ctype = part.get_content_type()
# If the part is text/plain, we leave it alone
if ctype == 'text/plain':
- # We need to choose a charset for the scrubbed message, so we'll
- # arbitrarily pick the charset of the first text/plain part in the
- # message.
- # MAS: Also get the RFC 3676 stuff from this part. This seems to
- # work OK for scrub_nondigest. It will also work as far as
- # scrubbing messages for the archive is concerned, but pipermail
- # doesn't pay any attention to the RFC 3676 parameters. The plain
- # format digest is going to be a disaster in any case as some of
- # messages will be format="flowed" and some not. ToDigest creates
- # its own Content-Type: header for the plain digest which won't
- # have RFC 3676 parameters. If the message Content-Type: headers
- # are retained for display in the digest, the parameters will be
- # there for information, but not for the MUA. This is the best we
- # can do without having get_payload() process the parameters.
- if charset is None:
- charset = part.get_content_charset(lcset)
- format = part.get_param('format')
- delsp = part.get_param('delsp')
# TK: if part is attached then check charset and scrub if none
if part.get('content-disposition') and \
not part.get_content_charset():
self.save_attachment(part, part_num)
- replace_payload_by_text(part, _("""\
- An embedded and charset-unspecified text was scrubbed...
- Name: %(filename)s
- URL: %(url)s
- """), lcset)
elif ctype == 'text/html' and isinstance(sanitize, IntType):
# if sanitize == 0:
# if outer:
@@ -172,30 +144,14 @@ class Scrubber(object):
# will just get in the way.
del part['content-transfer-encoding']
self.save_attachment(part, part_num, filter_html=False)
- replace_payload_by_text(part, _("""\
- An HTML attachment was scrubbed...
- URL: %(url)s
- """), lcset)
+ part.set_payload('')
elif ctype == 'message/rfc822':
# This part contains a submessage, so it too needs scrubbing
submsg = part.get_payload(0)
self.save_attachment(part, part_num)
- subject = submsg.get('subject', _('no subject'))
- subject = oneline(subject, lcset)
- date = submsg.get('date', _('no date'))
- who = submsg.get('from', _('unknown sender'))
- size = len(str(submsg))
- replace_payload_by_text(part, _("""\
- An embedded message was scrubbed...
- From: %(who)s
- Subject: %(subject)s
- Date: %(date)s
- Size: %(size)s
- URL: %(url)s
- """), lcset)
+ part.set_payload('')
# If the message isn't a multipart, then we'll strip it out as an
- # attachment that would have to be separately downloaded. Pipermail
- # will transform the url into a hyperlink.
+ # attachment that would have to be separately downloaded.
elif part.get_payload() and not part.is_multipart():
payload = part.get_payload(decode=True)
ctype = part.get_content_type()
@@ -209,31 +165,11 @@ class Scrubber(object):
continue
size = len(payload)
self.save_attachment(part, part_num)
- desc = part.get('content-description', _('not available'))
- desc = oneline(desc, lcset)
- filename = part.get_filename(_('not available'))
- filename = oneline(filename, lcset)
- replace_payload_by_text(part, _("""\
- A non-text attachment was scrubbed...
- Name: %(filename)s
- Type: %(ctype)s
- Size: %(size)d bytes
- Desc: %(desc)s
- URL: %(url)s
- """), lcset)
outer = False
# We still have to sanitize multipart messages to flat text because
# Pipermail can't handle messages with list payloads. This is a kludge;
# def (n) clever hack ;).
if self.msg.is_multipart():
- # By default we take the charset of the first text/plain part in the
- # message, but if there was none, we'll use the list's preferred
- # language's charset.
- if not charset or charset == 'us-ascii':
- charset = lcset_out
- else:
- # normalize to the output charset if input/output are different
- charset = Charset(charset).output_charset or charset
# We now want to concatenate all the parts which have been scrubbed to
# text/plain, into a single text/plain payload. We need to make sure
# all the characters in the concatenated string are in the same
@@ -272,7 +208,7 @@ class Scrubber(object):
partcharset = str(partcharset)
else:
partcharset = part.get_content_charset()
- if partcharset and partcharset <> charset:
+ if partcharset:
try:
t = unicode(t, partcharset, 'replace')
except (UnicodeError, LookupError, ValueError,
@@ -280,36 +216,15 @@ class Scrubber(object):
# We can get here if partcharset is bogus in some way.
# Replace funny characters. We use errors='replace'
t = unicode(t, 'ascii', 'replace')
- try:
- # Should use HTML-Escape, or try generalizing to UTF-8
- t = t.encode(charset, 'replace')
- except (UnicodeError, LookupError, ValueError,
- AssertionError):
- # if the message charset is bogus, use the list's.
- t = t.encode(lcset, 'replace')
# Separation is useful
- if isinstance(t, StringType):
+ if isinstance(t, basestring):
if not t.endswith('\n'):
t += '\n'
text.append(t)
# Now join the text and set the payload
sep = _('-------------- next part --------------\n')
- # The i18n separator is in the list's charset. Coerce it to the
- # message charset.
- try:
- sep = sep.encode(charset, 'replace')
- except (UnicodeError, LookupError, ValueError,
- AssertionError):
- pass
text = sep.join(text)
- del self.msg['content-type']
- del self.msg['content-transfer-encoding']
- self.msg.set_payload(text, charset)
- if format:
- self.msg.set_param('Format', format)
- if delsp:
- self.msg.set_param('DelSp', delsp)
- return text.decode(charset)
+ return text
def save_attachment(self, part, counter, filter_html=True):