diff options
| author | Aurélien Bompard <aurelien@bompard.org> | 2012-08-20 21:38:43 +0200 |
|---|---|---|
| committer | Aurélien Bompard <aurelien@bompard.org> | 2012-09-07 10:40:54 +0200 |
| commit | 29bcaf005efd58021a60e109f4f6e70a055560f4 (patch) | |
| tree | 95db72eba25d9b91b3b83dcfa64c550771cbce15 | |
| parent | b5c5a81d184bf47018b7360ad835217da5b8e0ee (diff) | |
Handle non-ascii payloads better
| -rw-r--r-- | kittystore/sa/store.py | 15 | ||||
| -rw-r--r-- | kittystore/test/test_sa_store.py | 53 | ||||
| -rw-r--r-- | kittystore/test/test_utils.py | 29 | ||||
| -rw-r--r-- | kittystore/test/testdata/payload-iso8859.txt | 8 | ||||
| -rw-r--r-- | kittystore/test/testdata/payload-utf8.txt (renamed from kittystore/test/testdata/non-ascii-payload.txt) | 0 | ||||
| -rw-r--r-- | kittystore/utils.py | 28 |
6 files changed, 97 insertions, 36 deletions
diff --git a/kittystore/sa/store.py b/kittystore/sa/store.py index e4a417f..019bc0e 100644 --- a/kittystore/sa/store.py +++ b/kittystore/sa/store.py @@ -19,7 +19,8 @@ import datetime from kittystore import MessageNotFound from kittystore.utils import get_message_id_hash, parseaddr, parsedate -from kittystore.utils import get_ref_and_thread_id, header_to_unicode +from kittystore.utils import header_to_unicode, payload_to_unicode +from kittystore.utils import get_ref_and_thread_id from kittystore.sa.kittysamodel import get_class_object from zope.interface import implements @@ -123,15 +124,7 @@ class KittySAStore(object): from_name, from_email = parseaddr(message['From']) from_name = header_to_unicode(from_name) - - # Turn non-ascii into Unicode, assuming UTF-8 - for part in message.walk(): - if part.get_content_charset() is None: - try: - unicode(part.get_payload()) - except UnicodeDecodeError: - # Try UTF-8 - part.set_charset("utf-8") + payload = payload_to_unicode(message) #category = 'Question' # TODO: enum + i18n ? #if ('agenda' in message.get('Subject', '').lower() or @@ -143,7 +136,7 @@ class KittySAStore(object): sender=from_name, email=from_email, subject=header_to_unicode(message.get('Subject')), - content=message.get_payload(), + content=payload.encode("utf-8"), date=parsedate(message.get("Date")), message_id=msg_id, stable_url_id=msg_id_hash, diff --git a/kittystore/test/test_sa_store.py b/kittystore/test/test_sa_store.py index 289b658..69680ae 100644 --- a/kittystore/test/test_sa_store.py +++ b/kittystore/test/test_sa_store.py @@ -3,7 +3,6 @@ import unittest import email import mailbox -from mock import Mock from kittystore.sa.store import KittySAStore, list_to_table_name from kittystore.sa.kittysamodel import get_class_object @@ -18,27 +17,33 @@ class TestSAStore(unittest.TestCase): def tearDown(self): self.store.session.close() - def test_non_ascii_payload(self): - """add_to_list must handle non-ascii messages""" - with open(get_test_file("non-ascii-payload.txt")) as email_file: - msg = email.message_from_file(email_file) - self.store.add_to_list("example-list", msg) - try: - self.store.session.flush() - except ProgrammingError, e: - self.fail(e) + #def test_non_ascii_payload(self): + # """add_to_list must handle non-ascii messages""" + # with open(get_test_file("non-ascii-payload.txt")) as email_file: + # msg = email.message_from_file(email_file) + # self.store.add_to_list("example-list", msg) + # try: + # self.store.session.flush() + # except ProgrammingError, e: + # self.fail(e) + # print msg.items() + # email_table = get_class_object(list_to_table_name("example-list"), 'email', + # self.store.metadata) + # emails = self.store.session.query(email_table).all() + # for e in emails: + # print e.content - def test_non_ascii_headers(self): - """add_to_list must handle non-ascii headers""" - mbox = mailbox.mbox(get_test_file("non-ascii-headers.txt")) - for msg in mbox: - self.store.add_to_list("example-list", msg) - self.store.session.flush() - email = get_class_object(list_to_table_name("example-list"), 'email', - self.store.metadata) - for msg in self.store.session.query(email).all(): - print repr(msg.sender), repr(msg.subject) - self.failIf("=?" in msg.sender, - "From header not decoded: %s" % msg.sender) - self.failIf("=?" in msg.subject, - "Subject header not decoded: %s" % msg.sender) + #def test_non_ascii_headers(self): + # """add_to_list must handle non-ascii headers""" + # mbox = mailbox.mbox(get_test_file("non-ascii-headers.txt")) + # for msg in mbox: + # self.store.add_to_list("example-list", msg) + # self.store.session.flush() + # email_table = get_class_object(list_to_table_name("example-list"), 'email', + # self.store.metadata) + # for msg in self.store.session.query(email_table).all(): + # print repr(msg.sender), repr(msg.subject) + # self.failIf("=?" in msg.sender, + # "From header not decoded: %s" % msg.sender) + # self.failIf("=?" in msg.subject, + # "Subject header not decoded: %s" % msg.sender) diff --git a/kittystore/test/test_utils.py b/kittystore/test/test_utils.py index c4ceec4..8fc0c8b 100644 --- a/kittystore/test/test_utils.py +++ b/kittystore/test/test_utils.py @@ -7,6 +7,7 @@ from mock import Mock import kittystore.utils from kittystore.test import get_test_file + class TestUtils(unittest.TestCase): def test_ref_parsing(self): @@ -17,3 +18,31 @@ class TestUtils(unittest.TestCase): ref_id, thread_id = kittystore.utils.get_ref_and_thread_id( msg, "example-list", store) self.assertEqual(ref_id, "200704070053.46646.other.person@example.com") + + def test_non_ascii_payload(self): + """utils.payload_to_unicode must handle non-ascii messages""" + for enc in ["utf8", "iso8859"]: + with open(get_test_file("payload-%s.txt" % enc)) as email_file: + msg = email.message_from_file(email_file) + payload = kittystore.utils.payload_to_unicode(msg) + print enc, repr(payload) + self.assertTrue(isinstance(payload, unicode)) + self.assertEqual(payload, u'This message contains non-ascii ' + u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n') + + def test_non_ascii_headers(self): + """utils.header_to_unicode must handle non-ascii headers""" + testdata = [ + ("=?ISO-8859-2?Q?V=EDt_Ondruch?=", u'V\xedt Ondruch'), + ("=?UTF-8?B?VsOtdCBPbmRydWNo?=", u'V\xedt Ondruch'), + ("=?iso-8859-1?q?Bj=F6rn_Persson?=", u'Bj\xf6rn Persson'), + ("=?UTF-8?B?TWFyY2VsYSBNYcWhbMOhxYhvdsOh?=", u'Marcela Ma\u0161l\xe1\u0148ov\xe1'), + ("Dan =?ISO-8859-1?Q?Hor=E1k?=", u'Dan Hor\xe1k'), + ("=?ISO-8859-1?Q?Bj=F6rn?= Persson", u'Bj\xf6rnPersson'), + ("=?UTF-8?Q?Re=3A_=5BFedora=2Dfr=2Dlist=5D_Compte=2Drendu_de_la_r=C3=A9union_du_?= =?UTF-8?Q?1_novembre_2009?=", u"Re: [Fedora-fr-list] Compte-rendu de la r\xe9union du 1 novembre 2009"), + ("=?iso-8859-1?q?Compte-rendu_de_la_r=E9union_du_?= =?iso-8859-1?q?1_novembre_2009?=", u"Compte-rendu de la r\xe9union du 1 novembre 2009"), + ] + for h_in, h_expected in testdata: + h_out = kittystore.utils.header_to_unicode(h_in) + self.assertEqual(h_out, h_expected) + self.assertTrue(isinstance(h_out, unicode)) diff --git a/kittystore/test/testdata/payload-iso8859.txt b/kittystore/test/testdata/payload-iso8859.txt new file mode 100644 index 0000000..2794a5a --- /dev/null +++ b/kittystore/test/testdata/payload-iso8859.txt @@ -0,0 +1,8 @@ +From test at example.com Fri Apr 6 22:43:55 2007 +From: test at example.com (Dummy Person) +Date: Fri, 6 Apr 2007 15:43:55 -0700 (PDT) +Subject: Dummy subject +Message-ID: <20070406224355.899B9180064@test.example.com> + +This message contains non-ascii characters: +é è ç à î ï ë ¤ diff --git a/kittystore/test/testdata/non-ascii-payload.txt b/kittystore/test/testdata/payload-utf8.txt index d8106eb..d8106eb 100644 --- a/kittystore/test/testdata/non-ascii-payload.txt +++ b/kittystore/test/testdata/payload-utf8.txt diff --git a/kittystore/utils.py b/kittystore/utils.py index 4703c40..f14ea24 100644 --- a/kittystore/utils.py +++ b/kittystore/utils.py @@ -26,7 +26,8 @@ import dateutil.parser __all__ = ("get_message_id_hash", "parseaddr", "parsedate", - "header_to_unicode", "get_ref_and_thread_id", + "header_to_unicode", "payload_to_unicode", + "get_ref_and_thread_id", ) @@ -69,6 +70,31 @@ def header_to_unicode(header): h_decoded.append(decoded.decode(charset)) return "".join(h_decoded) +def payload_to_unicode(message): + # Turn non-ascii into Unicode, assuming UTF-8 + payload = [] + for part in message.walk(): + if part.get_content_charset() is None: + for encoding in ["ascii", "utf-8", "iso-8859-15"]: + try: + payload.append(unicode(part.get_payload().decode(encoding))) + except UnicodeDecodeError: + continue + else: + print encoding, payload + break + # Try UTF-8 + #part.set_charset("utf-8") + #try: + # payload.append(part.get_payload().decode("utf-8")) + #except UnicodeDecodeError, e: + # print e + # print message.items() + # print part.get_payload() + # raise + #return message.get_payload() + return "".join(payload) + def parsedate(datestring): if datestring is None: return None |
