diff options
author | Aurélien Bompard <aurelien@bompard.org> | 2012-11-17 11:52:04 +0100 |
---|---|---|
committer | Aurélien Bompard <aurelien@bompard.org> | 2012-11-17 11:52:04 +0100 |
commit | 45a953af9c77ae16455bc6dc6c6c600835624d49 (patch) | |
tree | 98c753bc3b06417d23410fa22b95e6d321812f21 /kittystore | |
parent | 4d8637021a884a2e7e6debf78cbf71cbca3771a3 (diff) | |
download | kittystore-45a953af9c77ae16455bc6dc6c6c600835624d49.tar.gz kittystore-45a953af9c77ae16455bc6dc6c6c600835624d49.tar.xz kittystore-45a953af9c77ae16455bc6dc6c6c600835624d49.zip |
Don't let the message scrubber add attachments
Or they will be added before the email is in the database, violating
foreign-key constraints.
Diffstat (limited to 'kittystore')
-rw-r--r-- | kittystore/scrub.py | 27 | ||||
-rw-r--r-- | kittystore/storm/__init__.py | 2 | ||||
-rw-r--r-- | kittystore/storm/store.py | 8 | ||||
-rw-r--r-- | kittystore/test/test_scrub.py | 111 | ||||
-rw-r--r-- | kittystore/test/test_storm_store.py | 48 |
5 files changed, 107 insertions, 89 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py index c3c9f52..729f0ba 100644 --- a/kittystore/scrub.py +++ b/kittystore/scrub.py @@ -77,18 +77,18 @@ def get_charset(message, default="ascii", guess=False): class Scrubber(object): """ - Scrubs a single message, extracts attachments, and store them in the - database. + Scrubs a single message, extracts attachments, and return the text and the + attachments. See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/ """ - def __init__(self, mlist, msg, store): + def __init__(self, mlist, msg): self.mlist = mlist self.msg = msg - self.store = store def scrub(self): + attachments = [] sanitize = 1 # TODO: implement other options #outer = True # Now walk over all subparts of this message and scrub out various types @@ -99,7 +99,7 @@ class Scrubber(object): disposition = part.get('content-disposition') if disposition and disposition.strip().startswith("attachment"): # part is attached - self.save_attachment(part, part_num) + attachments.append(self.parse_attachment(part, part_num)) part.set_payload('') elif ctype == 'text/html' and isinstance(sanitize, IntType): # if sanitize == 0: @@ -117,7 +117,7 @@ class Scrubber(object): # # Pull it out as an attachment but leave it unescaped. This # # is dangerous, but perhaps useful for heavily moderated # # lists. -# self.save_attachment(part, part_num, filter_html=False) +# attachments.append(self.parse_attachment(part, part_num, filter_html=False)) # replace_payload_by_text(part, _("""\ #An HTML attachment was scrubbed... #URL: %(url)s @@ -140,11 +140,11 @@ class Scrubber(object): ## We're replacing the payload with the decoded payload so this ## will just get in the way. #del part['content-transfer-encoding'] - self.save_attachment(part, part_num, filter_html=False) + attachments.append(self.parse_attachment(part, part_num, filter_html=False)) part.set_payload('') elif ctype == 'message/rfc822': # This part contains a submessage, so it too needs scrubbing - self.save_attachment(part, part_num) + attachments.append(self.parse_attachment(part, part_num)) part.set_payload('') # If the message isn't a multipart, then we'll strip it out as an # attachment that would have to be separately downloaded. @@ -159,7 +159,7 @@ class Scrubber(object): # ignore the part. if payload is None: continue - self.save_attachment(part, part_num) + attachments.append(self.parse_attachment(part, part_num)) #outer = False # We still have to sanitize multipart messages to flat text because # Pipermail can't handle messages with list payloads. This is a kludge; @@ -214,10 +214,10 @@ class Scrubber(object): else: text = self.msg.get_payload(decode=True) text = text.decode(get_charset(self.msg, guess=True), "replace") - return text + return (text, attachments) - def save_attachment(self, part, counter, filter_html=True): + def parse_attachment(self, part, counter, filter_html=True): # Store name, content-type and size # Figure out the attachment type and get the decoded data decodedpayload = part.get_payload(decode=True) @@ -281,7 +281,4 @@ class Scrubber(object): ## BAW: I'm sure we can eventually do better than this. :( #decodedpayload = websafe(str(submsg)) decodedpayload = str(submsg) - msg_id = unquote(self.msg['Message-Id']) - self.store.add_attachment( - self.mlist, msg_id, counter, filebase+ext, - ctype, charset, decodedpayload) + return (counter, filebase+ext, ctype, charset, decodedpayload) diff --git a/kittystore/storm/__init__.py b/kittystore/storm/__init__.py index da1a56d..b7c7a92 100644 --- a/kittystore/storm/__init__.py +++ b/kittystore/storm/__init__.py @@ -39,8 +39,8 @@ def create_store(url, debug): if debug: storm.tracer.debug(True, stream=sys.stdout) database = create_database(url) - store = Store(database) dbtype = url.partition(":")[0] + store = Store(database) dbschema = Schema(schema.CREATES[dbtype], [], [], schema) dbschema.upgrade(store) return StormStore(store, debug) diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py index 6e6df84..272907b 100644 --- a/kittystore/storm/store.py +++ b/kittystore/storm/store.py @@ -115,8 +115,6 @@ class StormStore(object): email.sender_email = unicode(from_email).strip() email.subject = header_to_unicode(message.get('Subject')) email.full = message.as_string() # Before scrubbing - scrubber = Scrubber(list_name, message, self) - email.content = scrubber.scrub() # warning: modifies the msg in-place msg_date = parsedate(message.get("Date")) if msg_date is None: # Absent or unparseable date @@ -132,6 +130,10 @@ class StormStore(object): email.timezone = ( (utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60 + scrubber = Scrubber(list_name, message) + # warning: scrubbing modifies the msg in-place + email.content, attachments = scrubber.scrub() + #category = 'Question' # TODO: enum + i18n ? #if ('agenda' in message.get('Subject', '').lower() or # 'reminder' in message.get('Subject', '').lower()): @@ -150,6 +152,8 @@ class StormStore(object): self.db.add(email) self.flush() + for attachment in attachments: + self.add_attachment(list_name, msg_id, *attachment) return email.message_id_hash def delete_message(self, message_id): diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py index 35e7d76..2e30cd7 100644 --- a/kittystore/test/test_scrub.py +++ b/kittystore/test/test_scrub.py @@ -18,15 +18,13 @@ class TestScrubber(unittest.TestCase): def test_attachment_1(self): with open(get_test_file("attachment-1.txt")) as email_file: msg = email.message_from_file(email_file, _class=Message) - store = Mock() - scrubber = Scrubber("testlist@example.com", msg, store) - contents = scrubber.scrub() - self.assertEqual(store.add_attachment.call_count, 1) - store.add_attachment.assert_called_with( - 'testlist@example.com', '505E5185.5040208@libero.it', 2, - 'puntogil.vcf', 'text/x-vcard', "utf-8", + scrubber = Scrubber("testlist@example.com", msg) + contents, attachments = scrubber.scrub() + self.assertEqual(len(attachments), 1) + self.assertEqual(attachments[0], ( + 2, 'puntogil.vcf', 'text/x-vcard', "utf-8", 'begin:vcard\r\nfn:gil\r\nn:;gil\r\nversion:2.1\r\n' - 'end:vcard\r\n\r\n') + 'end:vcard\r\n\r\n')) self.assertEqual(contents, "This is a test message.\r\n\r\n" "\n-- \ndevel mailing list\ndevel@lists.fedoraproject.org\n" @@ -36,18 +34,16 @@ class TestScrubber(unittest.TestCase): def test_attachment_2(self): with open(get_test_file("attachment-2.txt")) as email_file: msg = email.message_from_file(email_file, _class=Message) - store = Mock() - scrubber = Scrubber("testlist@example.com", msg, store) - contents = scrubber.scrub() - self.assertEqual(store.add_attachment.call_count, 1) - store.add_attachment.assert_called_with( - 'testlist@example.com', '50619B7A.2030404@thelounge.net', 3, - 'signature.asc', 'application/pgp-signature', None, + scrubber = Scrubber("testlist@example.com", msg) + contents, attachments = scrubber.scrub() + self.assertEqual(len(attachments), 1) + self.assertEqual(attachments[0], ( + 3, 'signature.asc', 'application/pgp-signature', None, '-----BEGIN PGP SIGNATURE-----\r\nVersion: GnuPG v1.4.12 ' '(GNU/Linux)\r\nComment: Using GnuPG with Mozilla - ' 'http://www.enigmail.net/\r\n\r\niEYEARECAAYFAlBhm3oACgkQhmBj' 'z394AnmMnQCcC+6tWcqE1dPQmIdRbLXgKGVp\r\nEeUAn2OqtaXaXaQV7rx+' - 'SmOldmSzcFw4\r\n=OEJv\r\n-----END PGP SIGNATURE-----\r\n') + 'SmOldmSzcFw4\r\n=OEJv\r\n-----END PGP SIGNATURE-----\r\n')) self.assertEqual(contents, u"This is a test message\r\nNon-ascii chars: Hofm\xfchlgasse\r\n" u"\n-- \ndevel mailing list\ndevel@lists.fedoraproject.org\n" @@ -57,37 +53,30 @@ class TestScrubber(unittest.TestCase): def test_attachment_3(self): with open(get_test_file("attachment-3.txt")) as email_file: msg = email.message_from_file(email_file, _class=Message) - store = Mock() - scrubber = Scrubber("testlist@example.com", msg, store) - contents = scrubber.scrub() - self.assertEqual(store.add_attachment.call_count, 2) - args_1, args_2 = store.add_attachment.call_args_list + scrubber = Scrubber("testlist@example.com", msg) + contents, attachments = scrubber.scrub() + self.assertEqual(len(attachments), 2) # HTML part - self.assertEqual(args_1[0][0:6], ("testlist@example.com", - "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com", - 3, "attachment.html", "text/html", "iso-8859-1")) - self.assertEqual(len(args_1[0][6]), 3134) + self.assertEqual(attachments[0][0:4], + (3, "attachment.html", "text/html", "iso-8859-1")) + self.assertEqual(len(attachments[0][4]), 3134) # Image attachment - self.assertEqual(args_2[0][0:6], ("testlist@example.com", - "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com", - 4, "GeoffreyRoucourt.jpg", "image/jpeg", None)) - self.assertEqual(len(args_2[0][6]), 282180) + self.assertEqual(attachments[1][0:4], + (4, "GeoffreyRoucourt.jpg", "image/jpeg", None)) + self.assertEqual(len(attachments[1][4]), 282180) # Scrubbed content self.assertEqual(contents, u"This is a test message\r\n") def test_html_email_1(self): with open(get_test_file("html-email-1.txt")) as email_file: msg = email.message_from_file(email_file, _class=Message) - store = Mock() - scrubber = Scrubber("testlist@example.com", msg, store) - contents = scrubber.scrub() - self.assertEqual(store.add_attachment.call_count, 1) - args = store.add_attachment.call_args[0] + scrubber = Scrubber("testlist@example.com", msg) + contents, attachments = scrubber.scrub() + self.assertEqual(len(attachments), 1) # HTML part - self.assertEqual(args[0:6], ("testlist@example.com", - "016001cd9b3b$b71efed0$255cfc70$@fr", - 2, "attachment.html", "text/html", "iso-8859-1")) - self.assertEqual(len(args[6]), 2723) + self.assertEqual(attachments[0][0:4], + (2, "attachment.html", "text/html", "iso-8859-1")) + self.assertEqual(len(attachments[0][4]), 2723) # Scrubbed content self.assertEqual(contents, u"This is a test message\r\n" @@ -98,9 +87,8 @@ class TestScrubber(unittest.TestCase): for enc in ["utf8", "iso8859"]: with open(get_test_file("payload-%s.txt" % enc)) as email_file: msg = email.message_from_file(email_file, _class=Message) - store = Mock() - scrubber = Scrubber("testlist@example.com", msg, store) - contents = scrubber.scrub() + scrubber = Scrubber("testlist@example.com", msg) + contents, attachments = scrubber.scrub() self.assertTrue(isinstance(contents, unicode)) self.assertEqual(contents, u'This message contains non-ascii ' u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n') @@ -108,22 +96,18 @@ class TestScrubber(unittest.TestCase): def test_attachment_4(self): with open(get_test_file("attachment-4.txt")) as email_file: msg = email.message_from_file(email_file, _class=Message) - store = Mock() - scrubber = Scrubber("testlist@example.com", msg, store) - contents = scrubber.scrub() - self.assertEqual(store.add_attachment.call_count, 2) - args_1, args_2 = store.add_attachment.call_args_list + scrubber = Scrubber("testlist@example.com", msg) + contents, attachments = scrubber.scrub() + self.assertEqual(len(attachments), 2) # HTML part - self.assertEqual(args_1[0][0:6], ("testlist@example.com", - "CAHmoxtXXb3un1C=ZvYNtz-eYghm-GH925gDVHyjhvL2YEsZ-Yw@mail.gmail.com", - 3, "attachment.html", "text/html", "iso-8859-1")) - self.assertEqual(len(args_1[0][6]), 114) + self.assertEqual(attachments[0][0:4], + (3, "attachment.html", "text/html", "iso-8859-1")) + self.assertEqual(len(attachments[0][4]), 114) # text attachment - self.assertEqual(args_2[0][0:6], ("testlist@example.com", - "CAHmoxtXXb3un1C=ZvYNtz-eYghm-GH925gDVHyjhvL2YEsZ-Yw@mail.gmail.com", - #4, u"todo-déjeuner.txt", "text/plain", "utf-8")) - 4, u"todo-djeuner.txt", "text/plain", "utf-8")) - self.assertEqual(len(args_2[0][6]), 112) + self.assertEqual(attachments[1][0:4], + #(4, u"todo-déjeuner.txt", "text/plain", "utf-8")) + (4, u"todo-djeuner.txt", "text/plain", "utf-8")) + self.assertEqual(len(attachments[1][4]), 112) # Scrubbed content self.assertEqual(contents, u'This is a test, HTML message with ' u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an ' @@ -132,17 +116,14 @@ class TestScrubber(unittest.TestCase): def test_attachment_5(self): with open(get_test_file("attachment-5.txt")) as email_file: msg = email.message_from_file(email_file, _class=Message) - store = Mock() - scrubber = Scrubber("testlist@example.com", msg, store) - contents = scrubber.scrub() - self.assertEqual(store.add_attachment.call_count, 1) - args = store.add_attachment.call_args_list[0][0] + scrubber = Scrubber("testlist@example.com", msg) + contents, attachments = scrubber.scrub() + self.assertEqual(len(attachments), 1) # text attachment - self.assertEqual(args[0:6], - ("testlist@example.com", "506C3344.7020501@free.fr", - #2, u"todo-déjeuner.txt", "text/plain", "utf-8")) - 2, u"attachment.bin", "text/plain", "utf-8")) - self.assertEqual(len(args[6]), 112) + self.assertEqual(attachments[0][0:4], + #(2, u"todo-déjeuner.txt", "text/plain", "utf-8")) + (2, u"attachment.bin", "text/plain", "utf-8")) + self.assertEqual(len(attachments[0][4]), 112) # Scrubbed content self.assertEqual(contents, u'This is a test, HTML message with ' u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an ' diff --git a/kittystore/test/test_storm_store.py b/kittystore/test/test_storm_store.py index b96a3ce..f89feb1 100644 --- a/kittystore/test/test_storm_store.py +++ b/kittystore/test/test_storm_store.py @@ -8,8 +8,12 @@ import email import datetime from storm.exceptions import IntegrityError +from mailman.email.message import Message + from kittystore.storm import get_storm_store -from kittystore.storm.model import Email +from kittystore.storm.model import Email, Attachment + +from kittystore.test import get_test_file class FakeList(object): @@ -17,23 +21,24 @@ class FakeList(object): # (Too few public methods) def __init__(self, name): self.fqdn_listname = name + self.display_name = None -class TestSAStore(unittest.TestCase): +class TestStormStore(unittest.TestCase): def setUp(self): self.store = get_storm_store("sqlite:") - #def tearDown(self): - # self.store.close() + def tearDown(self): + self.store.close() def test_no_message_id(self): - msg = email.message.Message() + msg = Message() self.assertRaises(ValueError, self.store.add_to_list, FakeList("example-list"), msg) def test_no_date(self): - msg = email.message.Message() + msg = Message() msg["From"] = "dummy@example.com" msg["Message-ID"] = "<dummy>" msg.set_payload("Dummy message") @@ -45,6 +50,37 @@ class TestSAStore(unittest.TestCase): stored_msg = self.store.db.find(Email).one() self.assertTrue(stored_msg.date >= now) + def test_date_naive(self): + msg = Message() + msg["From"] = "dummy@example.com" + msg["Message-ID"] = "<dummy>" + msg["Date"] = "Fri, 02 Nov 2012 16:07:54" + msg.set_payload("Dummy message") + try: + self.store.add_to_list(FakeList("example-list"), msg) + except IntegrityError, e: + self.fail(e) + stored_msg = self.store.db.find(Email).one() + expected = datetime.datetime(2012, 11, 2, 16, 7, 54) + self.assertEqual(stored_msg.date, expected) + + def test_attachment_insert_order(self): + """Attachments must not be inserted in the DB before the email""" + # Re-activate foreign key support in sqlite + self.store.db._connection._raw_connection.isolation_level = 'IMMEDIATE' + self.store.db.execute("PRAGMA foreign_keys = ON") + self.store.db._connection._raw_connection.execute("PRAGMA foreign_keys = ON") + #print "*"*10, list(self.store.db.execute("PRAGMA foreign_keys")) + #self.store = get_storm_store("postgres://kittystore:kittystore@localhost/kittystore_test") + with open(get_test_file("attachment-1.txt")) as email_file: + msg = email.message_from_file(email_file, _class=Message) + try: + self.store.add_to_list(FakeList("example-list"), msg) + except IntegrityError, e: + self.fail(e) + self.assertEqual(self.store.db.find(Email).count(), 1) + self.assertEqual(self.store.db.find(Attachment).count(), 1) + #def test_non_ascii_payload(self): # """add_to_list must handle non-ascii messages""" # with open(get_test_file("non-ascii-payload.txt")) as email_file: |