From ba422e76d4889dce2effcfcb9455ac8080024a54 Mon Sep 17 00:00:00 2001 From: Aslak Knutsen Date: Thu, 14 Mar 2013 23:40:03 +0100 Subject: Clean up pipermail attachments from email.content When loading an archive from pipermail, attachments are described in the email body as "-- next part --" per attachment. This is noise for the reader and should not be part of the email.content displayed to the user. The complete original message can still be seen in the raw view. --- kittystore/scrub.py | 11 ++++++--- kittystore/test/test_scrub.py | 7 ++++++ kittystore/test/testdata/pipermail_nextpart.txt | 30 +++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 kittystore/test/testdata/pipermail_nextpart.txt (limited to 'kittystore') diff --git a/kittystore/scrub.py b/kittystore/scrub.py index 729f0ba..2a54954 100644 --- a/kittystore/scrub.py +++ b/kittystore/scrub.py @@ -37,6 +37,8 @@ dre = re.compile(r'^\.*') BR = '
\n' +NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n') + def guess_extension(ctype, ext): # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot, @@ -207,13 +209,16 @@ class Scrubber(object): if not t.endswith('\n'): t += '\n' text.append(t) - # Now join the text and set the payload - #sep = _('-------------- next part --------------\n') - #text = sep.join(text) + text = "\n".join(text) else: text = self.msg.get_payload(decode=True) text = text.decode(get_charset(self.msg, guess=True), "replace") + + next_part_match = NEXT_PART.search(text) + if next_part_match: + text = text[0:next_part_match.start(0)] + return (text, attachments) diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py index 2e30cd7..3aef0d2 100644 --- a/kittystore/test/test_scrub.py +++ b/kittystore/test/test_scrub.py @@ -129,3 +129,10 @@ class TestScrubber(unittest.TestCase): u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an ' u'attachment with an accented filename\r\n\r\n\r\n\r\n') + def test_remove_next_part_from_content(self): + with open(get_test_file("pipermail_nextpart.txt")) as email_file: + msg = email.message_from_file(email_file, _class=Message) + scrubber = Scrubber("testlist@example.com", msg) + contents, attachments = scrubber.scrub() + + self.failIf("-------------- next part --------------" in contents) diff --git a/kittystore/test/testdata/pipermail_nextpart.txt b/kittystore/test/testdata/pipermail_nextpart.txt new file mode 100644 index 0000000..a63c65d --- /dev/null +++ b/kittystore/test/testdata/pipermail_nextpart.txt @@ -0,0 +1,30 @@ +From vondruch at redhat.com Tue Jul 10 11:29:44 2012 +From: vondruch at redhat.com (=?ISO-8859-2?Q?V=EDt_Ondruch?=) +Date: Tue, 10 Jul 2012 13:29:44 +0200 +Subject: [Fedora-packaging] RPM macros +Message-ID: <4FFC1228.3060409@redhat.com> + +Hi, + +I noticed that in revised haskell guidelines [1], there is mentioned the +ghc-rpm-macros package, which provides macros.ghc file, which in 
turns +provides some useful macros for packaging of Haskell packages. In Ruby, +we provide similar macro files in ruby-devel and rubygems-devel +subpackages respectively. Perl has their macros directly in the rpm +package itself. + +This seems to be a bit inconsistent to me. So my question is: shouldn't +we standardize some best practices with regards of RPM macros? For +example for Ruby, we placed the macros into -devel subpackages, because +we believe that it is just development dependency. Any opinions? + + +Vit + +-------------- next part -------------- +A non-text attachment was scrubbed... +Name: signature.asc +Type: application/pgp-signature +Size: 190 bytes +Desc: This is a digitally signed message part. +URL: -- cgit