summaryrefslogtreecommitdiffstats
path: root/kittystore
diff options
context:
space:
mode:
authorAurélien Bompard <aurelien@bompard.org>2012-11-17 11:52:04 +0100
committerAurélien Bompard <aurelien@bompard.org>2012-11-17 11:52:04 +0100
commit45a953af9c77ae16455bc6dc6c6c600835624d49 (patch)
tree98c753bc3b06417d23410fa22b95e6d321812f21 /kittystore
parent4d8637021a884a2e7e6debf78cbf71cbca3771a3 (diff)
downloadkittystore-45a953af9c77ae16455bc6dc6c6c600835624d49.tar.gz
kittystore-45a953af9c77ae16455bc6dc6c6c600835624d49.tar.xz
kittystore-45a953af9c77ae16455bc6dc6c6c600835624d49.zip
Don't let the message scrubber add attachments
Or they will be added before the email is in the database, violating foreign-key constraints.
Diffstat (limited to 'kittystore')
-rw-r--r--kittystore/scrub.py27
-rw-r--r--kittystore/storm/__init__.py2
-rw-r--r--kittystore/storm/store.py8
-rw-r--r--kittystore/test/test_scrub.py111
-rw-r--r--kittystore/test/test_storm_store.py48
5 files changed, 107 insertions, 89 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
index c3c9f52..729f0ba 100644
--- a/kittystore/scrub.py
+++ b/kittystore/scrub.py
@@ -77,18 +77,18 @@ def get_charset(message, default="ascii", guess=False):
class Scrubber(object):
"""
- Scrubs a single message, extracts attachments, and store them in the
- database.
+ Scrubs a single message, extracts attachments, and return the text and the
+ attachments.
See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
"""
- def __init__(self, mlist, msg, store):
+ def __init__(self, mlist, msg):
self.mlist = mlist
self.msg = msg
- self.store = store
def scrub(self):
+ attachments = []
sanitize = 1 # TODO: implement other options
#outer = True
# Now walk over all subparts of this message and scrub out various types
@@ -99,7 +99,7 @@ class Scrubber(object):
disposition = part.get('content-disposition')
if disposition and disposition.strip().startswith("attachment"):
# part is attached
- self.save_attachment(part, part_num)
+ attachments.append(self.parse_attachment(part, part_num))
part.set_payload('')
elif ctype == 'text/html' and isinstance(sanitize, IntType):
# if sanitize == 0:
@@ -117,7 +117,7 @@ class Scrubber(object):
# # Pull it out as an attachment but leave it unescaped. This
# # is dangerous, but perhaps useful for heavily moderated
# # lists.
-# self.save_attachment(part, part_num, filter_html=False)
+# attachments.append(self.parse_attachment(part, part_num, filter_html=False))
# replace_payload_by_text(part, _("""\
#An HTML attachment was scrubbed...
#URL: %(url)s
@@ -140,11 +140,11 @@ class Scrubber(object):
## We're replacing the payload with the decoded payload so this
## will just get in the way.
#del part['content-transfer-encoding']
- self.save_attachment(part, part_num, filter_html=False)
+ attachments.append(self.parse_attachment(part, part_num, filter_html=False))
part.set_payload('')
elif ctype == 'message/rfc822':
# This part contains a submessage, so it too needs scrubbing
- self.save_attachment(part, part_num)
+ attachments.append(self.parse_attachment(part, part_num))
part.set_payload('')
# If the message isn't a multipart, then we'll strip it out as an
# attachment that would have to be separately downloaded.
@@ -159,7 +159,7 @@ class Scrubber(object):
# ignore the part.
if payload is None:
continue
- self.save_attachment(part, part_num)
+ attachments.append(self.parse_attachment(part, part_num))
#outer = False
# We still have to sanitize multipart messages to flat text because
# Pipermail can't handle messages with list payloads. This is a kludge;
@@ -214,10 +214,10 @@ class Scrubber(object):
else:
text = self.msg.get_payload(decode=True)
text = text.decode(get_charset(self.msg, guess=True), "replace")
- return text
+ return (text, attachments)
- def save_attachment(self, part, counter, filter_html=True):
+ def parse_attachment(self, part, counter, filter_html=True):
# Store name, content-type and size
# Figure out the attachment type and get the decoded data
decodedpayload = part.get_payload(decode=True)
@@ -281,7 +281,4 @@ class Scrubber(object):
## BAW: I'm sure we can eventually do better than this. :(
#decodedpayload = websafe(str(submsg))
decodedpayload = str(submsg)
- msg_id = unquote(self.msg['Message-Id'])
- self.store.add_attachment(
- self.mlist, msg_id, counter, filebase+ext,
- ctype, charset, decodedpayload)
+ return (counter, filebase+ext, ctype, charset, decodedpayload)
diff --git a/kittystore/storm/__init__.py b/kittystore/storm/__init__.py
index da1a56d..b7c7a92 100644
--- a/kittystore/storm/__init__.py
+++ b/kittystore/storm/__init__.py
@@ -39,8 +39,8 @@ def create_store(url, debug):
if debug:
storm.tracer.debug(True, stream=sys.stdout)
database = create_database(url)
- store = Store(database)
dbtype = url.partition(":")[0]
+ store = Store(database)
dbschema = Schema(schema.CREATES[dbtype], [], [], schema)
dbschema.upgrade(store)
return StormStore(store, debug)
diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py
index 6e6df84..272907b 100644
--- a/kittystore/storm/store.py
+++ b/kittystore/storm/store.py
@@ -115,8 +115,6 @@ class StormStore(object):
email.sender_email = unicode(from_email).strip()
email.subject = header_to_unicode(message.get('Subject'))
email.full = message.as_string() # Before scrubbing
- scrubber = Scrubber(list_name, message, self)
- email.content = scrubber.scrub() # warning: modifies the msg in-place
msg_date = parsedate(message.get("Date"))
if msg_date is None:
# Absent or unparseable date
@@ -132,6 +130,10 @@ class StormStore(object):
email.timezone = ( (utcoffset.days * 24 * 60 * 60)
+ utcoffset.seconds) / 60
+ scrubber = Scrubber(list_name, message)
+ # warning: scrubbing modifies the msg in-place
+ email.content, attachments = scrubber.scrub()
+
#category = 'Question' # TODO: enum + i18n ?
#if ('agenda' in message.get('Subject', '').lower() or
# 'reminder' in message.get('Subject', '').lower()):
@@ -150,6 +152,8 @@ class StormStore(object):
self.db.add(email)
self.flush()
+ for attachment in attachments:
+ self.add_attachment(list_name, msg_id, *attachment)
return email.message_id_hash
def delete_message(self, message_id):
diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py
index 35e7d76..2e30cd7 100644
--- a/kittystore/test/test_scrub.py
+++ b/kittystore/test/test_scrub.py
@@ -18,15 +18,13 @@ class TestScrubber(unittest.TestCase):
def test_attachment_1(self):
with open(get_test_file("attachment-1.txt")) as email_file:
msg = email.message_from_file(email_file, _class=Message)
- store = Mock()
- scrubber = Scrubber("testlist@example.com", msg, store)
- contents = scrubber.scrub()
- self.assertEqual(store.add_attachment.call_count, 1)
- store.add_attachment.assert_called_with(
- 'testlist@example.com', '505E5185.5040208@libero.it', 2,
- 'puntogil.vcf', 'text/x-vcard', "utf-8",
+ scrubber = Scrubber("testlist@example.com", msg)
+ contents, attachments = scrubber.scrub()
+ self.assertEqual(len(attachments), 1)
+ self.assertEqual(attachments[0], (
+ 2, 'puntogil.vcf', 'text/x-vcard', "utf-8",
'begin:vcard\r\nfn:gil\r\nn:;gil\r\nversion:2.1\r\n'
- 'end:vcard\r\n\r\n')
+ 'end:vcard\r\n\r\n'))
self.assertEqual(contents,
"This is a test message.\r\n\r\n"
"\n-- \ndevel mailing list\ndevel@lists.fedoraproject.org\n"
@@ -36,18 +34,16 @@ class TestScrubber(unittest.TestCase):
def test_attachment_2(self):
with open(get_test_file("attachment-2.txt")) as email_file:
msg = email.message_from_file(email_file, _class=Message)
- store = Mock()
- scrubber = Scrubber("testlist@example.com", msg, store)
- contents = scrubber.scrub()
- self.assertEqual(store.add_attachment.call_count, 1)
- store.add_attachment.assert_called_with(
- 'testlist@example.com', '50619B7A.2030404@thelounge.net', 3,
- 'signature.asc', 'application/pgp-signature', None,
+ scrubber = Scrubber("testlist@example.com", msg)
+ contents, attachments = scrubber.scrub()
+ self.assertEqual(len(attachments), 1)
+ self.assertEqual(attachments[0], (
+ 3, 'signature.asc', 'application/pgp-signature', None,
'-----BEGIN PGP SIGNATURE-----\r\nVersion: GnuPG v1.4.12 '
'(GNU/Linux)\r\nComment: Using GnuPG with Mozilla - '
'http://www.enigmail.net/\r\n\r\niEYEARECAAYFAlBhm3oACgkQhmBj'
'z394AnmMnQCcC+6tWcqE1dPQmIdRbLXgKGVp\r\nEeUAn2OqtaXaXaQV7rx+'
- 'SmOldmSzcFw4\r\n=OEJv\r\n-----END PGP SIGNATURE-----\r\n')
+ 'SmOldmSzcFw4\r\n=OEJv\r\n-----END PGP SIGNATURE-----\r\n'))
self.assertEqual(contents,
u"This is a test message\r\nNon-ascii chars: Hofm\xfchlgasse\r\n"
u"\n-- \ndevel mailing list\ndevel@lists.fedoraproject.org\n"
@@ -57,37 +53,30 @@ class TestScrubber(unittest.TestCase):
def test_attachment_3(self):
with open(get_test_file("attachment-3.txt")) as email_file:
msg = email.message_from_file(email_file, _class=Message)
- store = Mock()
- scrubber = Scrubber("testlist@example.com", msg, store)
- contents = scrubber.scrub()
- self.assertEqual(store.add_attachment.call_count, 2)
- args_1, args_2 = store.add_attachment.call_args_list
+ scrubber = Scrubber("testlist@example.com", msg)
+ contents, attachments = scrubber.scrub()
+ self.assertEqual(len(attachments), 2)
# HTML part
- self.assertEqual(args_1[0][0:6], ("testlist@example.com",
- "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com",
- 3, "attachment.html", "text/html", "iso-8859-1"))
- self.assertEqual(len(args_1[0][6]), 3134)
+ self.assertEqual(attachments[0][0:4],
+ (3, "attachment.html", "text/html", "iso-8859-1"))
+ self.assertEqual(len(attachments[0][4]), 3134)
# Image attachment
- self.assertEqual(args_2[0][0:6], ("testlist@example.com",
- "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com",
- 4, "GeoffreyRoucourt.jpg", "image/jpeg", None))
- self.assertEqual(len(args_2[0][6]), 282180)
+ self.assertEqual(attachments[1][0:4],
+ (4, "GeoffreyRoucourt.jpg", "image/jpeg", None))
+ self.assertEqual(len(attachments[1][4]), 282180)
# Scrubbed content
self.assertEqual(contents, u"This is a test message\r\n")
def test_html_email_1(self):
with open(get_test_file("html-email-1.txt")) as email_file:
msg = email.message_from_file(email_file, _class=Message)
- store = Mock()
- scrubber = Scrubber("testlist@example.com", msg, store)
- contents = scrubber.scrub()
- self.assertEqual(store.add_attachment.call_count, 1)
- args = store.add_attachment.call_args[0]
+ scrubber = Scrubber("testlist@example.com", msg)
+ contents, attachments = scrubber.scrub()
+ self.assertEqual(len(attachments), 1)
# HTML part
- self.assertEqual(args[0:6], ("testlist@example.com",
- "016001cd9b3b$b71efed0$255cfc70$@fr",
- 2, "attachment.html", "text/html", "iso-8859-1"))
- self.assertEqual(len(args[6]), 2723)
+ self.assertEqual(attachments[0][0:4],
+ (2, "attachment.html", "text/html", "iso-8859-1"))
+ self.assertEqual(len(attachments[0][4]), 2723)
# Scrubbed content
self.assertEqual(contents,
u"This is a test message\r\n"
@@ -98,9 +87,8 @@ class TestScrubber(unittest.TestCase):
for enc in ["utf8", "iso8859"]:
with open(get_test_file("payload-%s.txt" % enc)) as email_file:
msg = email.message_from_file(email_file, _class=Message)
- store = Mock()
- scrubber = Scrubber("testlist@example.com", msg, store)
- contents = scrubber.scrub()
+ scrubber = Scrubber("testlist@example.com", msg)
+ contents, attachments = scrubber.scrub()
self.assertTrue(isinstance(contents, unicode))
self.assertEqual(contents, u'This message contains non-ascii '
u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n')
@@ -108,22 +96,18 @@ class TestScrubber(unittest.TestCase):
def test_attachment_4(self):
with open(get_test_file("attachment-4.txt")) as email_file:
msg = email.message_from_file(email_file, _class=Message)
- store = Mock()
- scrubber = Scrubber("testlist@example.com", msg, store)
- contents = scrubber.scrub()
- self.assertEqual(store.add_attachment.call_count, 2)
- args_1, args_2 = store.add_attachment.call_args_list
+ scrubber = Scrubber("testlist@example.com", msg)
+ contents, attachments = scrubber.scrub()
+ self.assertEqual(len(attachments), 2)
# HTML part
- self.assertEqual(args_1[0][0:6], ("testlist@example.com",
- "CAHmoxtXXb3un1C=ZvYNtz-eYghm-GH925gDVHyjhvL2YEsZ-Yw@mail.gmail.com",
- 3, "attachment.html", "text/html", "iso-8859-1"))
- self.assertEqual(len(args_1[0][6]), 114)
+ self.assertEqual(attachments[0][0:4],
+ (3, "attachment.html", "text/html", "iso-8859-1"))
+ self.assertEqual(len(attachments[0][4]), 114)
# text attachment
- self.assertEqual(args_2[0][0:6], ("testlist@example.com",
- "CAHmoxtXXb3un1C=ZvYNtz-eYghm-GH925gDVHyjhvL2YEsZ-Yw@mail.gmail.com",
- #4, u"todo-déjeuner.txt", "text/plain", "utf-8"))
- 4, u"todo-djeuner.txt", "text/plain", "utf-8"))
- self.assertEqual(len(args_2[0][6]), 112)
+ self.assertEqual(attachments[1][0:4],
+ #(4, u"todo-déjeuner.txt", "text/plain", "utf-8"))
+ (4, u"todo-djeuner.txt", "text/plain", "utf-8"))
+ self.assertEqual(len(attachments[1][4]), 112)
# Scrubbed content
self.assertEqual(contents, u'This is a test, HTML message with '
u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an '
@@ -132,17 +116,14 @@ class TestScrubber(unittest.TestCase):
def test_attachment_5(self):
with open(get_test_file("attachment-5.txt")) as email_file:
msg = email.message_from_file(email_file, _class=Message)
- store = Mock()
- scrubber = Scrubber("testlist@example.com", msg, store)
- contents = scrubber.scrub()
- self.assertEqual(store.add_attachment.call_count, 1)
- args = store.add_attachment.call_args_list[0][0]
+ scrubber = Scrubber("testlist@example.com", msg)
+ contents, attachments = scrubber.scrub()
+ self.assertEqual(len(attachments), 1)
# text attachment
- self.assertEqual(args[0:6],
- ("testlist@example.com", "506C3344.7020501@free.fr",
- #2, u"todo-déjeuner.txt", "text/plain", "utf-8"))
- 2, u"attachment.bin", "text/plain", "utf-8"))
- self.assertEqual(len(args[6]), 112)
+ self.assertEqual(attachments[0][0:4],
+ #(2, u"todo-déjeuner.txt", "text/plain", "utf-8"))
+ (2, u"attachment.bin", "text/plain", "utf-8"))
+ self.assertEqual(len(attachments[0][4]), 112)
# Scrubbed content
self.assertEqual(contents, u'This is a test, HTML message with '
u'accented letters : \xe9 \xe8 \xe7 \xe0.\r\nAnd an '
diff --git a/kittystore/test/test_storm_store.py b/kittystore/test/test_storm_store.py
index b96a3ce..f89feb1 100644
--- a/kittystore/test/test_storm_store.py
+++ b/kittystore/test/test_storm_store.py
@@ -8,8 +8,12 @@ import email
import datetime
from storm.exceptions import IntegrityError
+from mailman.email.message import Message
+
from kittystore.storm import get_storm_store
-from kittystore.storm.model import Email
+from kittystore.storm.model import Email, Attachment
+
+from kittystore.test import get_test_file
class FakeList(object):
@@ -17,23 +21,24 @@ class FakeList(object):
# (Too few public methods)
def __init__(self, name):
self.fqdn_listname = name
+ self.display_name = None
-class TestSAStore(unittest.TestCase):
+class TestStormStore(unittest.TestCase):
def setUp(self):
self.store = get_storm_store("sqlite:")
- #def tearDown(self):
- # self.store.close()
+ def tearDown(self):
+ self.store.close()
def test_no_message_id(self):
- msg = email.message.Message()
+ msg = Message()
self.assertRaises(ValueError, self.store.add_to_list,
FakeList("example-list"), msg)
def test_no_date(self):
- msg = email.message.Message()
+ msg = Message()
msg["From"] = "dummy@example.com"
msg["Message-ID"] = "<dummy>"
msg.set_payload("Dummy message")
@@ -45,6 +50,37 @@ class TestSAStore(unittest.TestCase):
stored_msg = self.store.db.find(Email).one()
self.assertTrue(stored_msg.date >= now)
+ def test_date_naive(self):
+ msg = Message()
+ msg["From"] = "dummy@example.com"
+ msg["Message-ID"] = "<dummy>"
+ msg["Date"] = "Fri, 02 Nov 2012 16:07:54"
+ msg.set_payload("Dummy message")
+ try:
+ self.store.add_to_list(FakeList("example-list"), msg)
+ except IntegrityError, e:
+ self.fail(e)
+ stored_msg = self.store.db.find(Email).one()
+ expected = datetime.datetime(2012, 11, 2, 16, 7, 54)
+ self.assertEqual(stored_msg.date, expected)
+
+ def test_attachment_insert_order(self):
+ """Attachments must not be inserted in the DB before the email"""
+ # Re-activate foreign key support in sqlite
+ self.store.db._connection._raw_connection.isolation_level = 'IMMEDIATE'
+ self.store.db.execute("PRAGMA foreign_keys = ON")
+ self.store.db._connection._raw_connection.execute("PRAGMA foreign_keys = ON")
+ #print "*"*10, list(self.store.db.execute("PRAGMA foreign_keys"))
+ #self.store = get_storm_store("postgres://kittystore:kittystore@localhost/kittystore_test")
+ with open(get_test_file("attachment-1.txt")) as email_file:
+ msg = email.message_from_file(email_file, _class=Message)
+ try:
+ self.store.add_to_list(FakeList("example-list"), msg)
+ except IntegrityError, e:
+ self.fail(e)
+ self.assertEqual(self.store.db.find(Email).count(), 1)
+ self.assertEqual(self.store.db.find(Attachment).count(), 1)
+
#def test_non_ascii_payload(self):
# """add_to_list must handle non-ascii messages"""
# with open(get_test_file("non-ascii-payload.txt")) as email_file: