diff options
author | Aurélien Bompard <aurelien@bompard.org> | 2012-10-02 10:11:58 +0200 |
---|---|---|
committer | Aurélien Bompard <aurelien@bompard.org> | 2012-10-02 10:11:58 +0200 |
commit | dc3ade6110c49ed647ee5008fe2fef0e9a6d8522 (patch) | |
tree | 948f847a2462391bc68f4ae4c7171ebb841c92f5 | |
parent | e012672451eda6293fd6817036d4dea907f63a4c (diff) | |
download | kittystore-dc3ade6110c49ed647ee5008fe2fef0e9a6d8522.tar.gz kittystore-dc3ade6110c49ed647ee5008fe2fef0e9a6d8522.tar.xz kittystore-dc3ade6110c49ed647ee5008fe2fef0e9a6d8522.zip |
Fix header decoding and associated tests
Mode | File | Lines changed |
---|---|---|
-rw-r--r-- | kittystore/mongo/__init__.py | 2 |
-rw-r--r-- | kittystore/mongo/store.py | 6 |
-rw-r--r-- | kittystore/sa/store.py | 9 |
-rw-r--r-- | kittystore/scrub.py | 15 |
-rw-r--r-- | kittystore/storm/store.py | 2 |
-rw-r--r-- | kittystore/test/test_scrub.py | 12 |
-rw-r--r-- | kittystore/test/test_utils.py | 13 |
-rw-r--r-- | kittystore/utils.py | 7 |
8 files changed, 43 insertions, 23 deletions
diff --git a/kittystore/mongo/__init__.py b/kittystore/mongo/__init__.py index 5231c6a..50dc1df 100644 --- a/kittystore/mongo/__init__.py +++ b/kittystore/mongo/__init__.py @@ -1 +1 @@ -__test__ = False +__test__ = {} diff --git a/kittystore/mongo/store.py b/kittystore/mongo/store.py index 79780ba..dbaefb6 100644 --- a/kittystore/mongo/store.py +++ b/kittystore/mongo/store.py @@ -16,7 +16,11 @@ license. """ -import pymongo +__test__ = {} +try: + import pymongo +except ImportError: + pass import re from datetime import datetime diff --git a/kittystore/sa/store.py b/kittystore/sa/store.py index 03632ae..efad3c6 100644 --- a/kittystore/sa/store.py +++ b/kittystore/sa/store.py @@ -19,7 +19,8 @@ import datetime from kittystore import MessageNotFound from kittystore.utils import get_message_id_hash, parseaddr, parsedate -from kittystore.utils import header_to_unicode, payload_to_unicode +from kittystore.utils import header_to_unicode +from kittystore.scrub import Scrubber from kittystore.utils import get_ref_and_thread_id from kittystore.sa.kittysamodel import get_class_object @@ -124,7 +125,9 @@ class KittySAStore(object): from_name, from_email = parseaddr(message['From']) from_name = header_to_unicode(from_name) - payload = payload_to_unicode(message) + full = message.as_string() + scrubber = Scrubber(list_name, message, self) + payload = scrubber.scrub() # modifies the message in-place #category = 'Question' # TODO: enum + i18n ? 
#if ('agenda' in message.get('Subject', '').lower() or @@ -142,7 +145,7 @@ class KittySAStore(object): stable_url_id=msg_id_hash, thread_id=thread_id, references=ref, - full=message.as_string(), + full=full, ) self.session.add(mail) return msg_id_hash diff --git a/kittystore/scrub.py b/kittystore/scrub.py index 04b30e5..61ea032 100644 --- a/kittystore/scrub.py +++ b/kittystore/scrub.py @@ -226,6 +226,21 @@ class Scrubber(object): #sep = _('-------------- next part --------------\n') #text = sep.join(text) text = "\n".join(text) + else: + text = self.msg.get_payload(decode=True) + charset = self.msg.get_content_charset() + if charset is None: + # Try to guess the encoding (best effort mode) + for encoding in ["ascii", "utf-8", "iso-8859-15"]: + try: + text.decode(encoding) + except UnicodeDecodeError: + continue + else: + #print encoding, payload + charset = encoding + break + text = text.decode(charset or "ascii", "replace") return text diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py index 7c3f403..2858489 100644 --- a/kittystore/storm/store.py +++ b/kittystore/storm/store.py @@ -109,7 +109,7 @@ class StormStore(object): email.subject = header_to_unicode(message.get('Subject')) email.full = message.as_string() # Before scrubbing scrubber = Scrubber(list_name, message, self) - email.content = scrubber.scrub() + email.content = scrubber.scrub() # warning: modifies the msg in-place email.date = parsedate(message.get("Date")) if email.date is None: # Absent or unparseable date diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py index 8637e97..e694be3 100644 --- a/kittystore/test/test_scrub.py +++ b/kittystore/test/test_scrub.py @@ -89,3 +89,15 @@ class TestScrubber(unittest.TestCase): u"This is a test message\r\n" u"Non-ASCII chars: r\xe9ponse fran\xe7ais \n") + def test_non_ascii_payload(self): + """Scrubber must handle non-ascii messages""" + for enc in ["utf8", "iso8859"]: + with open(get_test_file("payload-%s.txt" % enc)) 
as email_file: + msg = email.message_from_file(email_file) + store = Mock() + scrubber = Scrubber("testlist@example.com", msg, store) + contents = scrubber.scrub() + self.assertTrue(isinstance(contents, unicode)) + self.assertEqual(contents, u'This message contains non-ascii ' + u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n') + diff --git a/kittystore/test/test_utils.py b/kittystore/test/test_utils.py index 46d7e72..64466b8 100644 --- a/kittystore/test/test_utils.py +++ b/kittystore/test/test_utils.py @@ -30,17 +30,6 @@ class TestUtils(unittest.TestCase): msg, "example-list", store) self.assertEqual(ref_id, None) - def test_non_ascii_payload(self): - """utils.payload_to_unicode must handle non-ascii messages""" - for enc in ["utf8", "iso8859"]: - with open(get_test_file("payload-%s.txt" % enc)) as email_file: - msg = email.message_from_file(email_file) - payload = kittystore.utils.payload_to_unicode(msg) - #print enc, repr(payload) - self.assertTrue(isinstance(payload, unicode)) - self.assertEqual(payload, u'This message contains non-ascii ' - u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n') - def test_non_ascii_headers(self): """utils.header_to_unicode must handle non-ascii headers""" testdata = [ @@ -49,7 +38,7 @@ class TestUtils(unittest.TestCase): ("=?iso-8859-1?q?Bj=F6rn_Persson?=", u'Bj\xf6rn Persson'), ("=?UTF-8?B?TWFyY2VsYSBNYcWhbMOhxYhvdsOh?=", u'Marcela Ma\u0161l\xe1\u0148ov\xe1'), ("Dan =?ISO-8859-1?Q?Hor=E1k?=", u'Dan Hor\xe1k'), - ("=?ISO-8859-1?Q?Bj=F6rn?= Persson", u'Bj\xf6rnPersson'), + ("=?ISO-8859-1?Q?Bj=F6rn?= Persson", u'Bj\xf6rn Persson'), ("=?UTF-8?Q?Re=3A_=5BFedora=2Dfr=2Dlist=5D_Compte=2Drendu_de_la_r=C3=A9union_du_?= =?UTF-8?Q?1_novembre_2009?=", u"Re: [Fedora-fr-list] Compte-rendu de la r\xe9union du 1 novembre 2009"), ("=?iso-8859-1?q?Compte-rendu_de_la_r=E9union_du_?= =?iso-8859-1?q?1_novembre_2009?=", u"Compte-rendu de la r\xe9union du 1 novembre 2009"), ] diff --git a/kittystore/utils.py b/kittystore/utils.py 
index bdc2270..c860ce5 100644 --- a/kittystore/utils.py +++ b/kittystore/utils.py @@ -42,7 +42,7 @@ def get_message_id_hash(msg_id): details. Example: >>> get_message_id_hash('<87myycy5eh.fsf@uwakimon.sk.tsukuba.ac.jp>') - 'AGDWSNXXKCWEILKKNYTBOHRDQGOX3Y35' + 'JJIGKPKB6CVDX6B2CUG4IHAJRIQIOUTP' """ msg_id = msg_id.strip("<>") @@ -66,15 +66,12 @@ def header_to_unicode(header): if charset is None: h_decoded.append(unicode(decoded)) else: - if h_decoded: - # not so sure why... - h_decoded.append(" ") try: h_decoded.append(decoded.decode(charset)) except LookupError: # Unknown encoding h_decoded.append(decoded.decode("ascii", "replace")) - return "".join(h_decoded) + return " ".join(h_decoded) def parsedate(datestring): if datestring is None: |