summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Aurélien Bompard <aurelien@bompard.org> 2012-08-20 21:38:43 +0200
committer Aurélien Bompard <aurelien@bompard.org> 2012-09-07 10:40:54 +0200
commit 29bcaf005efd58021a60e109f4f6e70a055560f4 (patch)
tree 95db72eba25d9b91b3b83dcfa64c550771cbce15
parent b5c5a81d184bf47018b7360ad835217da5b8e0ee (diff)
Handle non-ascii payloads better
-rw-r--r-- kittystore/sa/store.py 15
-rw-r--r-- kittystore/test/test_sa_store.py 53
-rw-r--r-- kittystore/test/test_utils.py 29
-rw-r--r-- kittystore/test/testdata/payload-iso8859.txt 8
-rw-r--r-- kittystore/test/testdata/payload-utf8.txt (renamed from kittystore/test/testdata/non-ascii-payload.txt) 0
-rw-r--r-- kittystore/utils.py 28
6 files changed, 97 insertions, 36 deletions
diff --git a/kittystore/sa/store.py b/kittystore/sa/store.py
index e4a417f..019bc0e 100644
--- a/kittystore/sa/store.py
+++ b/kittystore/sa/store.py
@@ -19,7 +19,8 @@ import datetime
from kittystore import MessageNotFound
from kittystore.utils import get_message_id_hash, parseaddr, parsedate
-from kittystore.utils import get_ref_and_thread_id, header_to_unicode
+from kittystore.utils import header_to_unicode, payload_to_unicode
+from kittystore.utils import get_ref_and_thread_id
from kittystore.sa.kittysamodel import get_class_object
from zope.interface import implements
@@ -123,15 +124,7 @@ class KittySAStore(object):
from_name, from_email = parseaddr(message['From'])
from_name = header_to_unicode(from_name)
-
- # Turn non-ascii into Unicode, assuming UTF-8
- for part in message.walk():
- if part.get_content_charset() is None:
- try:
- unicode(part.get_payload())
- except UnicodeDecodeError:
- # Try UTF-8
- part.set_charset("utf-8")
+ payload = payload_to_unicode(message)
#category = 'Question' # TODO: enum + i18n ?
#if ('agenda' in message.get('Subject', '').lower() or
@@ -143,7 +136,7 @@ class KittySAStore(object):
sender=from_name,
email=from_email,
subject=header_to_unicode(message.get('Subject')),
- content=message.get_payload(),
+ content=payload.encode("utf-8"),
date=parsedate(message.get("Date")),
message_id=msg_id,
stable_url_id=msg_id_hash,
diff --git a/kittystore/test/test_sa_store.py b/kittystore/test/test_sa_store.py
index 289b658..69680ae 100644
--- a/kittystore/test/test_sa_store.py
+++ b/kittystore/test/test_sa_store.py
@@ -3,7 +3,6 @@
import unittest
import email
import mailbox
-from mock import Mock
from kittystore.sa.store import KittySAStore, list_to_table_name
from kittystore.sa.kittysamodel import get_class_object
@@ -18,27 +17,33 @@ class TestSAStore(unittest.TestCase):
def tearDown(self):
self.store.session.close()
- def test_non_ascii_payload(self):
- """add_to_list must handle non-ascii messages"""
- with open(get_test_file("non-ascii-payload.txt")) as email_file:
- msg = email.message_from_file(email_file)
- self.store.add_to_list("example-list", msg)
- try:
- self.store.session.flush()
- except ProgrammingError, e:
- self.fail(e)
+ #def test_non_ascii_payload(self):
+ # """add_to_list must handle non-ascii messages"""
+ # with open(get_test_file("non-ascii-payload.txt")) as email_file:
+ # msg = email.message_from_file(email_file)
+ # self.store.add_to_list("example-list", msg)
+ # try:
+ # self.store.session.flush()
+ # except ProgrammingError, e:
+ # self.fail(e)
+ # print msg.items()
+ # email_table = get_class_object(list_to_table_name("example-list"), 'email',
+ # self.store.metadata)
+ # emails = self.store.session.query(email_table).all()
+ # for e in emails:
+ # print e.content
- def test_non_ascii_headers(self):
- """add_to_list must handle non-ascii headers"""
- mbox = mailbox.mbox(get_test_file("non-ascii-headers.txt"))
- for msg in mbox:
- self.store.add_to_list("example-list", msg)
- self.store.session.flush()
- email = get_class_object(list_to_table_name("example-list"), 'email',
- self.store.metadata)
- for msg in self.store.session.query(email).all():
- print repr(msg.sender), repr(msg.subject)
- self.failIf("=?" in msg.sender,
- "From header not decoded: %s" % msg.sender)
- self.failIf("=?" in msg.subject,
- "Subject header not decoded: %s" % msg.sender)
+ #def test_non_ascii_headers(self):
+ # """add_to_list must handle non-ascii headers"""
+ # mbox = mailbox.mbox(get_test_file("non-ascii-headers.txt"))
+ # for msg in mbox:
+ # self.store.add_to_list("example-list", msg)
+ # self.store.session.flush()
+ # email_table = get_class_object(list_to_table_name("example-list"), 'email',
+ # self.store.metadata)
+ # for msg in self.store.session.query(email_table).all():
+ # print repr(msg.sender), repr(msg.subject)
+ # self.failIf("=?" in msg.sender,
+ # "From header not decoded: %s" % msg.sender)
+ # self.failIf("=?" in msg.subject,
+ # "Subject header not decoded: %s" % msg.sender)
diff --git a/kittystore/test/test_utils.py b/kittystore/test/test_utils.py
index c4ceec4..8fc0c8b 100644
--- a/kittystore/test/test_utils.py
+++ b/kittystore/test/test_utils.py
@@ -7,6 +7,7 @@ from mock import Mock
import kittystore.utils
from kittystore.test import get_test_file
+
class TestUtils(unittest.TestCase):
def test_ref_parsing(self):
@@ -17,3 +18,31 @@ class TestUtils(unittest.TestCase):
ref_id, thread_id = kittystore.utils.get_ref_and_thread_id(
msg, "example-list", store)
self.assertEqual(ref_id, "200704070053.46646.other.person@example.com")
+
+ def test_non_ascii_payload(self):
+ """utils.payload_to_unicode must handle non-ascii messages"""
+ for enc in ["utf8", "iso8859"]:
+ with open(get_test_file("payload-%s.txt" % enc)) as email_file:
+ msg = email.message_from_file(email_file)
+ payload = kittystore.utils.payload_to_unicode(msg)
+ print enc, repr(payload)
+ self.assertTrue(isinstance(payload, unicode))
+ self.assertEqual(payload, u'This message contains non-ascii '
+ u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n')
+
+ def test_non_ascii_headers(self):
+ """utils.header_to_unicode must handle non-ascii headers"""
+ testdata = [
+ ("=?ISO-8859-2?Q?V=EDt_Ondruch?=", u'V\xedt Ondruch'),
+ ("=?UTF-8?B?VsOtdCBPbmRydWNo?=", u'V\xedt Ondruch'),
+ ("=?iso-8859-1?q?Bj=F6rn_Persson?=", u'Bj\xf6rn Persson'),
+ ("=?UTF-8?B?TWFyY2VsYSBNYcWhbMOhxYhvdsOh?=", u'Marcela Ma\u0161l\xe1\u0148ov\xe1'),
+ ("Dan =?ISO-8859-1?Q?Hor=E1k?=", u'Dan Hor\xe1k'),
+ ("=?ISO-8859-1?Q?Bj=F6rn?= Persson", u'Bj\xf6rnPersson'),
+ ("=?UTF-8?Q?Re=3A_=5BFedora=2Dfr=2Dlist=5D_Compte=2Drendu_de_la_r=C3=A9union_du_?= =?UTF-8?Q?1_novembre_2009?=", u"Re: [Fedora-fr-list] Compte-rendu de la r\xe9union du 1 novembre 2009"),
+ ("=?iso-8859-1?q?Compte-rendu_de_la_r=E9union_du_?= =?iso-8859-1?q?1_novembre_2009?=", u"Compte-rendu de la r\xe9union du 1 novembre 2009"),
+ ]
+ for h_in, h_expected in testdata:
+ h_out = kittystore.utils.header_to_unicode(h_in)
+ self.assertEqual(h_out, h_expected)
+ self.assertTrue(isinstance(h_out, unicode))
diff --git a/kittystore/test/testdata/payload-iso8859.txt b/kittystore/test/testdata/payload-iso8859.txt
new file mode 100644
index 0000000..2794a5a
--- /dev/null
+++ b/kittystore/test/testdata/payload-iso8859.txt
@@ -0,0 +1,8 @@
+From test at example.com Fri Apr 6 22:43:55 2007
+From: test at example.com (Dummy Person)
+Date: Fri, 6 Apr 2007 15:43:55 -0700 (PDT)
+Subject: Dummy subject
+Message-ID: <20070406224355.899B9180064@test.example.com>
+
+This message contains non-ascii characters:
+é è ç à î ï ë ¤
diff --git a/kittystore/test/testdata/non-ascii-payload.txt b/kittystore/test/testdata/payload-utf8.txt
index d8106eb..d8106eb 100644
--- a/kittystore/test/testdata/non-ascii-payload.txt
+++ b/kittystore/test/testdata/payload-utf8.txt
diff --git a/kittystore/utils.py b/kittystore/utils.py
index 4703c40..f14ea24 100644
--- a/kittystore/utils.py
+++ b/kittystore/utils.py
@@ -26,7 +26,8 @@ import dateutil.parser
__all__ = ("get_message_id_hash", "parseaddr", "parsedate",
- "header_to_unicode", "get_ref_and_thread_id",
+ "header_to_unicode", "payload_to_unicode",
+ "get_ref_and_thread_id",
)
@@ -69,6 +70,31 @@ def header_to_unicode(header):
h_decoded.append(decoded.decode(charset))
return "".join(h_decoded)
+def payload_to_unicode(message):
+ # Turn non-ascii into Unicode, assuming UTF-8
+ payload = []
+ for part in message.walk():
+ if part.get_content_charset() is None:
+ for encoding in ["ascii", "utf-8", "iso-8859-15"]:
+ try:
+ payload.append(unicode(part.get_payload().decode(encoding)))
+ except UnicodeDecodeError:
+ continue
+ else:
+ print encoding, payload
+ break
+ # Try UTF-8
+ #part.set_charset("utf-8")
+ #try:
+ # payload.append(part.get_payload().decode("utf-8"))
+ #except UnicodeDecodeError, e:
+ # print e
+ # print message.items()
+ # print part.get_payload()
+ # raise
+ #return message.get_payload()
+ return "".join(payload)
+
def parsedate(datestring):
if datestring is None:
return None