Improve scrubbing

- don't escape HTML contents, it's the frontend's job
- don't insert placeholder text when scrubbing attachments.
author: Aurélien Bompard <aurelien@bompard.org> 2012-10-01 12:28:47 +0200
committer: Aurélien Bompard <aurelien@bompard.org> 2012-10-01 12:28:47 +0200
commit: e012672451eda6293fd6817036d4dea907f63a4c (patch)
tree: 0fbc36853a6a3051fde7172aa535303437e95fb2
parent: 986ed148c065ae626f6f82f2d93a9b2e346a6e3c (diff)
download: kittystore-e012672451eda6293fd6817036d4dea907f63a4c.tar.gz
kittystore-e012672451eda6293fd6817036d4dea907f63a4c.tar.xz
kittystore-e012672451eda6293fd6817036d4dea907f63a4c.zip
2 files changed, 31 insertions, 42 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
index 0b11963..04b30e5 100644
--- a/kittystore/scrub.py
+++ b/kittystore/scrub.py
@@ -128,21 +128,22 @@ class Scrubber(object):
 #"""), lcset)
 #            else:
                 if sanitize == 1:
-                    # HTML-escape it and store it as an attachment, but make it
-                    # look a /little/ bit prettier. :(
-                    payload = websafe(part.get_payload(decode=True))
-                    # For whitespace in the margin, change spaces into
-                    # non-breaking spaces, and tabs into 8 of those.  Then use a
-                    # mono-space font.  Still looks hideous to me, but then I'd
-                    # just as soon discard them.
-                    def doreplace(s):
-                        return s.expandtabs(8).replace(' ', '&nbsp;')
-                    lines = [doreplace(s) for s in payload.split('\n')]
-                    payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
-                    part.set_payload(payload)
-                    # We're replacing the payload with the decoded payload so this
-                    # will just get in the way.
-                    del part['content-transfer-encoding']
+                    # Don't HTML-escape it, this is the frontend's job
+                    ## HTML-escape it and store it as an attachment, but make it
+                    ## look a /little/ bit prettier. :(
+                    #payload = websafe(part.get_payload(decode=True))
+                    ## For whitespace in the margin, change spaces into
+                    ## non-breaking spaces, and tabs into 8 of those.  Then use a
+                    ## mono-space font.  Still looks hideous to me, but then I'd
+                    ## just as soon discard them.
+                    #def doreplace(s):
+                    #    return s.expandtabs(8).replace(' ', '&nbsp;')
+                    #lines = [doreplace(s) for s in payload.split('\n')]
+                    #payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
+                    #part.set_payload(payload)
+                    ## We're replacing the payload with the decoded payload so this
+                    ## will just get in the way.
+                    #del part['content-transfer-encoding']
                     self.save_attachment(part, part_num, filter_html=False)
                     part.set_payload('')
             elif ctype == 'message/rfc822':
@@ -188,7 +189,7 @@ class Scrubber(object):
                 partctype = part.get_content_type()
                 if partctype <> 'text/plain' and (partctype <> 'text/html' or
                                                   sanitize <> 2):
-                    text.append(_('Skipped content of type %(partctype)s\n'))
+                    #text.append(_('Skipped content of type %(partctype)s\n'))
                     continue
                 try:
                     t = part.get_payload(decode=True) or ''
@@ -222,8 +223,9 @@ class Scrubber(object):
                         t += '\n'
                     text.append(t)
             # Now join the text and set the payload
-            sep = _('-------------- next part --------------\n')
-            text = sep.join(text)
+            #sep = _('-------------- next part --------------\n')
+            #text = sep.join(text)
+            text = "\n".join(text)
         return text
 
 
@@ -281,8 +283,10 @@ class Scrubber(object):
         # TODO: bring back the HTML sanitizer feature
         if ctype == 'message/rfc822':
             submsg = part.get_payload()
-            # BAW: I'm sure we can eventually do better than this. :(
-            decodedpayload = websafe(str(submsg))
+            # Don't HTML-escape it, this is the frontend's job
+            ## BAW: I'm sure we can eventually do better than this. :(
+            #decodedpayload = websafe(str(submsg))
+            decodedpayload = str(submsg)
         msg_id = self.msg['Message-Id'].strip("<>")
         self.store.add_attachment(
                 self.mlist, msg_id, counter, filebase+ext,
diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py
index d590709..8637e97 100644
--- a/kittystore/test/test_scrub.py
+++ b/kittystore/test/test_scrub.py
@@ -25,13 +25,9 @@ class TestScrubber(unittest.TestCase):
                 'end:vcard\r\n\r\n')
         self.assertEqual(contents,
                 "This is a test message.\r\n\r\n"
-                "-------------- next part --------------\n"
-                "Skipped content of type %(partctype)s\n"
-                "-------------- next part --------------\n"
-                "-- \ndevel mailing list\ndevel@lists.fedoraproject.org\n"
+                "\n-- \ndevel mailing list\ndevel@lists.fedoraproject.org\n"
                 "https://admin.fedoraproject.org/mailman/listinfo/devel\n"
                 )
-        self.fail() # Fix the expected text above
 
     def test_attachment_2(self):
         with open(get_test_file("attachment-2.txt")) as email_file:
@@ -49,14 +45,10 @@ class TestScrubber(unittest.TestCase):
                 'z394AnmMnQCcC+6tWcqE1dPQmIdRbLXgKGVp\r\nEeUAn2OqtaXaXaQV7rx+'
                 'SmOldmSzcFw4\r\n=OEJv\r\n-----END PGP SIGNATURE-----\r\n')
         self.assertEqual(contents,
-                u"This is a test message\r\nNon-ascii chars: Hofm\xfchlgasse"
-                u"\r\n-------------- next part --------------\n"
-                u"Skipped content of type %(partctype)s\n"
-                u"-------------- next part --------------\n"
-                u"-- \ndevel mailing list\ndevel@lists.fedoraproject.org\n"
+                u"This is a test message\r\nNon-ascii chars: Hofm\xfchlgasse\r\n"
+                u"\n-- \ndevel mailing list\ndevel@lists.fedoraproject.org\n"
                 u"https://admin.fedoraproject.org/mailman/listinfo/devel\n"
                 )
-        self.fail() # Fix the expected text above
 
     def test_attachment_3(self):
         with open(get_test_file("attachment-3.txt")) as email_file:
@@ -70,19 +62,14 @@ class TestScrubber(unittest.TestCase):
         self.assertEqual(args_1[0][0:5], ("testlist@example.com",
                 "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com",
                 3, "attachment.html", "text/html"))
-        self.assertEqual(len(args_1[0][5]), 5812)
+        self.assertEqual(len(args_1[0][5]), 3134)
         # Image attachment
         self.assertEqual(args_2[0][0:5], ("testlist@example.com",
                 "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com",
                 4, "GeoffreyRoucourt.jpg", "image/jpeg"))
         self.assertEqual(len(args_2[0][5]), 282180)
         # Scrubbed content
-        self.assertEqual(contents,
-                u"This is a test message\r\n"
-                u"-------------- next part --------------\n"
-                u"Skipped content of type %(partctype)s\n"
-                )
-        self.fail() # Fix the expected text above
+        self.assertEqual(contents, u"This is a test message\r\n")
 
     def test_html_email_1(self):
         with open(get_test_file("html-email-1.txt")) as email_file:
@@ -96,11 +83,9 @@ class TestScrubber(unittest.TestCase):
         self.assertEqual(args[0:5], ("testlist@example.com",
                 "016001cd9b3b$b71efed0$255cfc70$@fr",
                 2, "attachment.html", "text/html"))
-        self.assertEqual(len(args[5]), 5093)
+        self.assertEqual(len(args[5]), 2723)
         # Scrubbed content
         self.assertEqual(contents,
                 u"This is a test message\r\n"
-                u"Non-ASCII chars: r\xe9ponse fran\xe7ais \n"
-                )
-        self.fail() # Fix the expected text above
+                u"Non-ASCII chars: r\xe9ponse fran\xe7ais \n")
author	Aurélien Bompard <aurelien@bompard.org>	2012-10-01 12:28:47 +0200
committer	Aurélien Bompard <aurelien@bompard.org>	2012-10-01 12:28:47 +0200
commit	e012672451eda6293fd6817036d4dea907f63a4c (patch)
tree	0fbc36853a6a3051fde7172aa535303437e95fb2
parent	986ed148c065ae626f6f82f2d93a9b2e346a6e3c (diff)
download	kittystore-e012672451eda6293fd6817036d4dea907f63a4c.tar.gz kittystore-e012672451eda6293fd6817036d4dea907f63a4c.tar.xz kittystore-e012672451eda6293fd6817036d4dea907f63a4c.zip