diff options
author | Aurélien Bompard <aurelien@bompard.org> | 2012-10-02 12:12:41 +0200 |
---|---|---|
committer | Aurélien Bompard <aurelien@bompard.org> | 2012-10-02 12:12:41 +0200 |
commit | 4f396b8956fba81221e9c62d8731134343c561d1 (patch) | |
tree | a7a190d04704596bb43142f7dc16645901d82bfa | |
parent | dc3ade6110c49ed647ee5008fe2fef0e9a6d8522 (diff) | |
download | kittystore-4f396b8956fba81221e9c62d8731134343c561d1.tar.gz kittystore-4f396b8956fba81221e9c62d8731134343c561d1.tar.xz kittystore-4f396b8956fba81221e9c62d8731134343c561d1.zip |
Improve charset handling
And store attachment encoding in the database.
-rw-r--r-- | kittystore/scrub.py | 87 | ||||
-rw-r--r-- | kittystore/storm/model.py | 1 | ||||
-rw-r--r-- | kittystore/storm/schema/__init__.py | 2 | ||||
-rw-r--r-- | kittystore/storm/store.py | 18 | ||||
-rw-r--r-- | kittystore/test/test_scrub.py | 22 | ||||
-rw-r--r-- | kittystore/utils.py | 15 |
6 files changed, 80 insertions, 65 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py index 61ea032..175f56b 100644 --- a/kittystore/scrub.py +++ b/kittystore/scrub.py @@ -68,23 +68,37 @@ def guess_extension(ctype, ext): return all and all[0] -def replace_payload_by_text(msg, text, charset): - # TK: This is a common function in replacing the attachment and the main - # message by a text (scrubbing). - del msg['content-type'] - del msg['content-transfer-encoding'] - #if isinstance(charset, unicode): - # # email 3.0.1 (python 2.4) doesn't like unicode - # charset = charset.encode('us-ascii') - #msg.set_payload(text, charset) - msg.set_payload('TODO: display attachment here and remove message subpart') - +def get_charset(message, default="ascii", guess=False): + """ + Get the message charset. + From: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/ + """ + if message.get_content_charset(): + return message.get_content_charset() + if message.get_charset(): + return message.get_charset() + if not guess: + return default + # Try to guess the encoding (best effort mode) + text = message.get_payload(decode=True) + charset = default + for encoding in ["ascii", "utf-8", "iso-8859-15"]: + try: + text.decode(encoding) + except UnicodeDecodeError: + continue + else: + #print encoding, payload + charset = encoding + break + return charset class Scrubber(object): """ Scrubs a single message, extracts attachments, and store them in the database. + See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/ """ def __init__(self, mlist, msg, store): @@ -101,9 +115,8 @@ class Scrubber(object): ctype = part.get_content_type() # If the part is text/plain, we leave it alone if ctype == 'text/plain': - # TK: if part is attached then check charset and scrub if none - if part.get('content-disposition') and \ - not part.get_content_charset(): + if part.get('content-disposition') == "attachment": + # part is attached self.save_attachment(part, part_num) elif ctype == 'text/html' and isinstance(sanitize, IntType): # if sanitize == 0: @@ -200,23 +213,14 @@ class Scrubber(object): # null body. See bug 1430236. except (binascii.Error, TypeError): t = part.get_payload() or '' - # TK: get_content_charset() returns 'iso-2022-jp' for internally - # crafted (scrubbed) 'euc-jp' text part. So, first try - # get_charset(), then get_content_charset() for the parts - # which are already embeded in the incoming message. - partcharset = part.get_charset() - if partcharset: - partcharset = str(partcharset) - else: - partcharset = part.get_content_charset() - if partcharset: - try: - t = unicode(t, partcharset, 'replace') - except (UnicodeError, LookupError, ValueError, - AssertionError): - # We can get here if partcharset is bogus in come way. - # Replace funny characters. We use errors='replace' - t = unicode(t, 'ascii', 'replace') + partcharset = get_charset(part, guess=True) + try: + t = unicode(t, partcharset, 'replace') + except (UnicodeError, LookupError, ValueError, + AssertionError): + # We can get here if partcharset is bogus in come way. + # Replace funny characters. We use errors='replace' + t = unicode(t, 'ascii', 'replace') # Separation is useful if isinstance(t, basestring): if not t.endswith('\n'): @@ -228,19 +232,7 @@ class Scrubber(object): text = "\n".join(text) else: text = self.msg.get_payload(decode=True) - charset = self.msg.get_content_charset() - if charset is None: - # Try to guess the encoding (best effort mode) - for encoding in ["ascii", "utf-8", "iso-8859-15"]: - try: - text.decode(encoding) - except UnicodeDecodeError: - continue - else: - #print encoding, payload - charset = encoding - break - text = text.decode(charset or "ascii", "replace") + text = text.decode(get_charset(self.msg, guess=True), "replace") return text @@ -252,10 +244,9 @@ class Scrubber(object): # e.g. image/jpg (should be image/jpeg). For now we just store such # things as application/octet-streams since that seems the safest. ctype = part.get_content_type() + charset = get_charset(part, default=None, guess=False) # i18n file name is encoded - #lcset = Utils.GetCharSet(self.mlist.preferred_language) - lcset = "utf-8" - filename = oneline(part.get_filename(''), lcset) + filename = oneline(part.get_filename(''), charset or "ascii") filename, fnext = os.path.splitext(filename) # For safety, we should confirm this is valid ext for content-type # but we can use fnext if we introduce fnext filtering @@ -305,4 +296,4 @@ class Scrubber(object): msg_id = self.msg['Message-Id'].strip("<>") self.store.add_attachment( self.mlist, msg_id, counter, filebase+ext, - ctype, decodedpayload) + ctype, charset, decodedpayload) diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py index 4177fc7..8f74fea 100644 --- a/kittystore/storm/model.py +++ b/kittystore/storm/model.py @@ -83,5 +83,6 @@ class Attachment(object): counter = Int() name = Unicode() content_type = Unicode() + encoding = Unicode() size = Int() content = RawStr() diff --git a/kittystore/storm/schema/__init__.py b/kittystore/storm/schema/__init__.py index daba29f..edb6d7d 100644 --- a/kittystore/storm/schema/__init__.py +++ b/kittystore/storm/schema/__init__.py @@ -28,6 +28,7 @@ CREATES = { message_id VARCHAR(255) NOT NULL, counter INTEGER NOT NULL, content_type VARCHAR(255) NOT NULL, + encoding VARCHAR(50), name VARCHAR(255), size INTEGER NOT NULL, content BLOB NOT NULL, @@ -66,6 +67,7 @@ CREATES = { message_id VARCHAR(255) NOT NULL, counter INTEGER NOT NULL, content_type VARCHAR(255) NOT NULL, + encoding VARCHAR(50), name VARCHAR(255), size INTEGER NOT NULL, content BYTEA NOT NULL, diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py index 2858489..cb0b486 100644 --- a/kittystore/storm/store.py +++ b/kittystore/storm/store.py @@ -377,13 +377,14 @@ class StormStore(object): # Attachments def add_attachment(self, mlist, msg_id, counter, name, content_type, - content): + encoding, content): attachment = Attachment() attachment.list_name = unicode(mlist) attachment.message_id = unicode(msg_id) attachment.counter = counter attachment.name = unicode(name) attachment.content_type = unicode(content_type) + attachment.encoding = unicode(encoding) if encoding is not None else None attachment.content = content attachment.size = len(content) self.db.add(attachment) @@ -403,6 +404,21 @@ class StormStore(object): )).order_by(Attachment.counter) return list(att) + def get_attachment_by_counter(self, list_name, message_id, counter): + """Return the message's attachment at 'counter' position. + + :param list_name: The fully qualified list name to which the + message should be added. + :param message_id: The Message-ID header contents to search for. + :param counter: The position in the MIME-multipart email. + :returns: The corresponding attachment + """ + return self.db.find(Attachment, And( + Attachment.list_name == unicode(list_name), + Attachment.message_id == unicode(message_id), + Attachment.counter == counter + )).one() + # Generic database operations def flush(self): diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py index e694be3..e0e442a 100644 --- a/kittystore/test/test_scrub.py +++ b/kittystore/test/test_scrub.py @@ -20,7 +20,7 @@ class TestScrubber(unittest.TestCase): self.assertEqual(store.add_attachment.call_count, 1) store.add_attachment.assert_called_with( 'testlist@example.com', '505E5185.5040208@libero.it', 2, - 'puntogil.vcf', 'text/x-vcard', + 'puntogil.vcf', 'text/x-vcard', "utf-8", 'begin:vcard\r\nfn:gil\r\nn:;gil\r\nversion:2.1\r\n' 'end:vcard\r\n\r\n') self.assertEqual(contents, @@ -38,7 +38,7 @@ class TestScrubber(unittest.TestCase): self.assertEqual(store.add_attachment.call_count, 1) store.add_attachment.assert_called_with( 'testlist@example.com', '50619B7A.2030404@thelounge.net', 3, - 'signature.asc', 'application/pgp-signature', + 'signature.asc', 'application/pgp-signature', None, '-----BEGIN PGP SIGNATURE-----\r\nVersion: GnuPG v1.4.12 ' '(GNU/Linux)\r\nComment: Using GnuPG with Mozilla - ' 'http://www.enigmail.net/\r\n\r\niEYEARECAAYFAlBhm3oACgkQhmBj' @@ -59,15 +59,15 @@ class TestScrubber(unittest.TestCase): self.assertEqual(store.add_attachment.call_count, 2) args_1, args_2 = store.add_attachment.call_args_list # HTML part - self.assertEqual(args_1[0][0:5], ("testlist@example.com", + self.assertEqual(args_1[0][0:6], ("testlist@example.com", "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com", - 3, "attachment.html", "text/html")) - self.assertEqual(len(args_1[0][5]), 3134) + 3, "attachment.html", "text/html", "iso-8859-1")) + self.assertEqual(len(args_1[0][6]), 3134) # Image attachment - self.assertEqual(args_2[0][0:5], ("testlist@example.com", + self.assertEqual(args_2[0][0:6], ("testlist@example.com", "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com", - 4, "GeoffreyRoucourt.jpg", "image/jpeg")) - self.assertEqual(len(args_2[0][5]), 282180) + 4, "GeoffreyRoucourt.jpg", "image/jpeg", None)) + self.assertEqual(len(args_2[0][6]), 282180) # Scrubbed content self.assertEqual(contents, u"This is a test message\r\n") @@ -80,10 +80,10 @@ class TestScrubber(unittest.TestCase): self.assertEqual(store.add_attachment.call_count, 1) args = store.add_attachment.call_args[0] # HTML part - self.assertEqual(args[0:5], ("testlist@example.com", + self.assertEqual(args[0:6], ("testlist@example.com", "016001cd9b3b$b71efed0$255cfc70$@fr", - 2, "attachment.html", "text/html")) - self.assertEqual(len(args[5]), 2723) + 2, "attachment.html", "text/html", "iso-8859-1")) + self.assertEqual(len(args[6]), 2723) # Scrubbed content self.assertEqual(contents, u"This is a test message\r\n" diff --git a/kittystore/utils.py b/kittystore/utils.py index c860ce5..ddb4d64 100644 --- a/kittystore/utils.py +++ b/kittystore/utils.py @@ -60,18 +60,23 @@ def parseaddr(address): from_name = from_email return from_name, from_email + def header_to_unicode(header): + """ + See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/ + """ h_decoded = [] - for decoded, charset in decode_header(header): + for text, charset in decode_header(header): if charset is None: - h_decoded.append(unicode(decoded)) + h_decoded.append(unicode(text)) else: try: - h_decoded.append(decoded.decode(charset)) + h_decoded.append(text.decode(charset)) except LookupError: # Unknown encoding - h_decoded.append(decoded.decode("ascii", "replace")) - return " ".join(h_decoded) + h_decoded.append(text.decode("ascii", "replace")) + return u" ".join(h_decoded) + def parsedate(datestring): if datestring is None: |