summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Aurélien Bompard <aurelien@bompard.org> 2012-10-02 12:12:41 +0200
committer Aurélien Bompard <aurelien@bompard.org> 2012-10-02 12:12:41 +0200
commit 4f396b8956fba81221e9c62d8731134343c561d1 (patch)
tree a7a190d04704596bb43142f7dc16645901d82bfa
parent dc3ade6110c49ed647ee5008fe2fef0e9a6d8522 (diff)
download kittystore-4f396b8956fba81221e9c62d8731134343c561d1.tar.gz
kittystore-4f396b8956fba81221e9c62d8731134343c561d1.tar.xz
kittystore-4f396b8956fba81221e9c62d8731134343c561d1.zip
Improve charset handling
And store attachment encoding in the database.
-rw-r--r-- kittystore/scrub.py 87
-rw-r--r-- kittystore/storm/model.py 1
-rw-r--r-- kittystore/storm/schema/__init__.py 2
-rw-r--r-- kittystore/storm/store.py 18
-rw-r--r-- kittystore/test/test_scrub.py 22
-rw-r--r-- kittystore/utils.py 15
6 files changed, 80 insertions, 65 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
index 61ea032..175f56b 100644
--- a/kittystore/scrub.py
+++ b/kittystore/scrub.py
@@ -68,23 +68,37 @@ def guess_extension(ctype, ext):
return all and all[0]
-def replace_payload_by_text(msg, text, charset):
- # TK: This is a common function in replacing the attachment and the main
- # message by a text (scrubbing).
- del msg['content-type']
- del msg['content-transfer-encoding']
- #if isinstance(charset, unicode):
- # # email 3.0.1 (python 2.4) doesn't like unicode
- # charset = charset.encode('us-ascii')
- #msg.set_payload(text, charset)
- msg.set_payload('TODO: display attachment here and remove message subpart')
-
+def get_charset(message, default="ascii", guess=False):
+ """
+ Get the message charset.
+ From: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
+ """
+ if message.get_content_charset():
+ return message.get_content_charset()
+ if message.get_charset():
+ return message.get_charset()
+ if not guess:
+ return default
+ # Try to guess the encoding (best effort mode)
+ text = message.get_payload(decode=True)
+ charset = default
+ for encoding in ["ascii", "utf-8", "iso-8859-15"]:
+ try:
+ text.decode(encoding)
+ except UnicodeDecodeError:
+ continue
+ else:
+ #print encoding, payload
+ charset = encoding
+ break
+ return charset
class Scrubber(object):
"""
Scrubs a single message, extracts attachments, and store them in the
database.
+ See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
"""
def __init__(self, mlist, msg, store):
@@ -101,9 +115,8 @@ class Scrubber(object):
ctype = part.get_content_type()
# If the part is text/plain, we leave it alone
if ctype == 'text/plain':
- # TK: if part is attached then check charset and scrub if none
- if part.get('content-disposition') and \
- not part.get_content_charset():
+ if part.get('content-disposition') == "attachment":
+ # part is attached
self.save_attachment(part, part_num)
elif ctype == 'text/html' and isinstance(sanitize, IntType):
# if sanitize == 0:
@@ -200,23 +213,14 @@ class Scrubber(object):
# null body. See bug 1430236.
except (binascii.Error, TypeError):
t = part.get_payload() or ''
- # TK: get_content_charset() returns 'iso-2022-jp' for internally
- # crafted (scrubbed) 'euc-jp' text part. So, first try
- # get_charset(), then get_content_charset() for the parts
- # which are already embeded in the incoming message.
- partcharset = part.get_charset()
- if partcharset:
- partcharset = str(partcharset)
- else:
- partcharset = part.get_content_charset()
- if partcharset:
- try:
- t = unicode(t, partcharset, 'replace')
- except (UnicodeError, LookupError, ValueError,
- AssertionError):
- # We can get here if partcharset is bogus in come way.
- # Replace funny characters. We use errors='replace'
- t = unicode(t, 'ascii', 'replace')
+ partcharset = get_charset(part, guess=True)
+ try:
+ t = unicode(t, partcharset, 'replace')
+ except (UnicodeError, LookupError, ValueError,
+ AssertionError):
+ # We can get here if partcharset is bogus in some way.
+ # Replace funny characters. We use errors='replace'
+ t = unicode(t, 'ascii', 'replace')
# Separation is useful
if isinstance(t, basestring):
if not t.endswith('\n'):
@@ -228,19 +232,7 @@ class Scrubber(object):
text = "\n".join(text)
else:
text = self.msg.get_payload(decode=True)
- charset = self.msg.get_content_charset()
- if charset is None:
- # Try to guess the encoding (best effort mode)
- for encoding in ["ascii", "utf-8", "iso-8859-15"]:
- try:
- text.decode(encoding)
- except UnicodeDecodeError:
- continue
- else:
- #print encoding, payload
- charset = encoding
- break
- text = text.decode(charset or "ascii", "replace")
+ text = text.decode(get_charset(self.msg, guess=True), "replace")
return text
@@ -252,10 +244,9 @@ class Scrubber(object):
# e.g. image/jpg (should be image/jpeg). For now we just store such
# things as application/octet-streams since that seems the safest.
ctype = part.get_content_type()
+ charset = get_charset(part, default=None, guess=False)
# i18n file name is encoded
- #lcset = Utils.GetCharSet(self.mlist.preferred_language)
- lcset = "utf-8"
- filename = oneline(part.get_filename(''), lcset)
+ filename = oneline(part.get_filename(''), charset or "ascii")
filename, fnext = os.path.splitext(filename)
# For safety, we should confirm this is valid ext for content-type
# but we can use fnext if we introduce fnext filtering
@@ -305,4 +296,4 @@ class Scrubber(object):
msg_id = self.msg['Message-Id'].strip("<>")
self.store.add_attachment(
self.mlist, msg_id, counter, filebase+ext,
- ctype, decodedpayload)
+ ctype, charset, decodedpayload)
diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py
index 4177fc7..8f74fea 100644
--- a/kittystore/storm/model.py
+++ b/kittystore/storm/model.py
@@ -83,5 +83,6 @@ class Attachment(object):
counter = Int()
name = Unicode()
content_type = Unicode()
+ encoding = Unicode()
size = Int()
content = RawStr()
diff --git a/kittystore/storm/schema/__init__.py b/kittystore/storm/schema/__init__.py
index daba29f..edb6d7d 100644
--- a/kittystore/storm/schema/__init__.py
+++ b/kittystore/storm/schema/__init__.py
@@ -28,6 +28,7 @@ CREATES = {
message_id VARCHAR(255) NOT NULL,
counter INTEGER NOT NULL,
content_type VARCHAR(255) NOT NULL,
+ encoding VARCHAR(50),
name VARCHAR(255),
size INTEGER NOT NULL,
content BLOB NOT NULL,
@@ -66,6 +67,7 @@ CREATES = {
message_id VARCHAR(255) NOT NULL,
counter INTEGER NOT NULL,
content_type VARCHAR(255) NOT NULL,
+ encoding VARCHAR(50),
name VARCHAR(255),
size INTEGER NOT NULL,
content BYTEA NOT NULL,
diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py
index 2858489..cb0b486 100644
--- a/kittystore/storm/store.py
+++ b/kittystore/storm/store.py
@@ -377,13 +377,14 @@ class StormStore(object):
# Attachments
def add_attachment(self, mlist, msg_id, counter, name, content_type,
- content):
+ encoding, content):
attachment = Attachment()
attachment.list_name = unicode(mlist)
attachment.message_id = unicode(msg_id)
attachment.counter = counter
attachment.name = unicode(name)
attachment.content_type = unicode(content_type)
+ attachment.encoding = unicode(encoding) if encoding is not None else None
attachment.content = content
attachment.size = len(content)
self.db.add(attachment)
@@ -403,6 +404,21 @@ class StormStore(object):
)).order_by(Attachment.counter)
return list(att)
+ def get_attachment_by_counter(self, list_name, message_id, counter):
+ """Return the message's attachment at 'counter' position.
+
+ :param list_name: The fully qualified list name to which the
+ message belongs.
+ :param message_id: The Message-ID header contents to search for.
+ :param counter: The position in the MIME-multipart email.
+ :returns: The corresponding attachment
+ """
+ return self.db.find(Attachment, And(
+ Attachment.list_name == unicode(list_name),
+ Attachment.message_id == unicode(message_id),
+ Attachment.counter == counter
+ )).one()
+
# Generic database operations
def flush(self):
diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py
index e694be3..e0e442a 100644
--- a/kittystore/test/test_scrub.py
+++ b/kittystore/test/test_scrub.py
@@ -20,7 +20,7 @@ class TestScrubber(unittest.TestCase):
self.assertEqual(store.add_attachment.call_count, 1)
store.add_attachment.assert_called_with(
'testlist@example.com', '505E5185.5040208@libero.it', 2,
- 'puntogil.vcf', 'text/x-vcard',
+ 'puntogil.vcf', 'text/x-vcard', "utf-8",
'begin:vcard\r\nfn:gil\r\nn:;gil\r\nversion:2.1\r\n'
'end:vcard\r\n\r\n')
self.assertEqual(contents,
@@ -38,7 +38,7 @@ class TestScrubber(unittest.TestCase):
self.assertEqual(store.add_attachment.call_count, 1)
store.add_attachment.assert_called_with(
'testlist@example.com', '50619B7A.2030404@thelounge.net', 3,
- 'signature.asc', 'application/pgp-signature',
+ 'signature.asc', 'application/pgp-signature', None,
'-----BEGIN PGP SIGNATURE-----\r\nVersion: GnuPG v1.4.12 '
'(GNU/Linux)\r\nComment: Using GnuPG with Mozilla - '
'http://www.enigmail.net/\r\n\r\niEYEARECAAYFAlBhm3oACgkQhmBj'
@@ -59,15 +59,15 @@ class TestScrubber(unittest.TestCase):
self.assertEqual(store.add_attachment.call_count, 2)
args_1, args_2 = store.add_attachment.call_args_list
# HTML part
- self.assertEqual(args_1[0][0:5], ("testlist@example.com",
+ self.assertEqual(args_1[0][0:6], ("testlist@example.com",
"CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com",
- 3, "attachment.html", "text/html"))
- self.assertEqual(len(args_1[0][5]), 3134)
+ 3, "attachment.html", "text/html", "iso-8859-1"))
+ self.assertEqual(len(args_1[0][6]), 3134)
# Image attachment
- self.assertEqual(args_2[0][0:5], ("testlist@example.com",
+ self.assertEqual(args_2[0][0:6], ("testlist@example.com",
"CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com",
- 4, "GeoffreyRoucourt.jpg", "image/jpeg"))
- self.assertEqual(len(args_2[0][5]), 282180)
+ 4, "GeoffreyRoucourt.jpg", "image/jpeg", None))
+ self.assertEqual(len(args_2[0][6]), 282180)
# Scrubbed content
self.assertEqual(contents, u"This is a test message\r\n")
@@ -80,10 +80,10 @@ class TestScrubber(unittest.TestCase):
self.assertEqual(store.add_attachment.call_count, 1)
args = store.add_attachment.call_args[0]
# HTML part
- self.assertEqual(args[0:5], ("testlist@example.com",
+ self.assertEqual(args[0:6], ("testlist@example.com",
"016001cd9b3b$b71efed0$255cfc70$@fr",
- 2, "attachment.html", "text/html"))
- self.assertEqual(len(args[5]), 2723)
+ 2, "attachment.html", "text/html", "iso-8859-1"))
+ self.assertEqual(len(args[6]), 2723)
# Scrubbed content
self.assertEqual(contents,
u"This is a test message\r\n"
diff --git a/kittystore/utils.py b/kittystore/utils.py
index c860ce5..ddb4d64 100644
--- a/kittystore/utils.py
+++ b/kittystore/utils.py
@@ -60,18 +60,23 @@ def parseaddr(address):
from_name = from_email
return from_name, from_email
+
def header_to_unicode(header):
+ """
+ See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
+ """
h_decoded = []
- for decoded, charset in decode_header(header):
+ for text, charset in decode_header(header):
if charset is None:
- h_decoded.append(unicode(decoded))
+ h_decoded.append(unicode(text))
else:
try:
- h_decoded.append(decoded.decode(charset))
+ h_decoded.append(text.decode(charset))
except LookupError:
# Unknown encoding
- h_decoded.append(decoded.decode("ascii", "replace"))
- return " ".join(h_decoded)
+ h_decoded.append(text.decode("ascii", "replace"))
+ return u" ".join(h_decoded)
+
def parsedate(datestring):
if datestring is None: