summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Aurélien Bompard <aurelien@bompard.org> 2012-10-02 10:11:58 +0200
committer Aurélien Bompard <aurelien@bompard.org> 2012-10-02 10:11:58 +0200
commit dc3ade6110c49ed647ee5008fe2fef0e9a6d8522 (patch)
tree 948f847a2462391bc68f4ae4c7171ebb841c92f5
parent e012672451eda6293fd6817036d4dea907f63a4c (diff)
download kittystore-dc3ade6110c49ed647ee5008fe2fef0e9a6d8522.tar.gz
kittystore-dc3ade6110c49ed647ee5008fe2fef0e9a6d8522.tar.xz
kittystore-dc3ade6110c49ed647ee5008fe2fef0e9a6d8522.zip
Fix header decoding and associated tests
-rw-r--r-- kittystore/mongo/__init__.py 2
-rw-r--r-- kittystore/mongo/store.py 6
-rw-r--r-- kittystore/sa/store.py 9
-rw-r--r-- kittystore/scrub.py 15
-rw-r--r-- kittystore/storm/store.py 2
-rw-r--r-- kittystore/test/test_scrub.py 12
-rw-r--r-- kittystore/test/test_utils.py 13
-rw-r--r-- kittystore/utils.py 7
8 files changed, 43 insertions, 23 deletions
diff --git a/kittystore/mongo/__init__.py b/kittystore/mongo/__init__.py
index 5231c6a..50dc1df 100644
--- a/kittystore/mongo/__init__.py
+++ b/kittystore/mongo/__init__.py
@@ -1 +1 @@
-__test__ = False
+__test__ = {}
diff --git a/kittystore/mongo/store.py b/kittystore/mongo/store.py
index 79780ba..dbaefb6 100644
--- a/kittystore/mongo/store.py
+++ b/kittystore/mongo/store.py
@@ -16,7 +16,11 @@ license.
"""
-import pymongo
+__test__ = {}
+try:
+ import pymongo
+except ImportError:
+ pass
import re
from datetime import datetime
diff --git a/kittystore/sa/store.py b/kittystore/sa/store.py
index 03632ae..efad3c6 100644
--- a/kittystore/sa/store.py
+++ b/kittystore/sa/store.py
@@ -19,7 +19,8 @@ import datetime
from kittystore import MessageNotFound
from kittystore.utils import get_message_id_hash, parseaddr, parsedate
-from kittystore.utils import header_to_unicode, payload_to_unicode
+from kittystore.utils import header_to_unicode
+from kittystore.scrub import Scrubber
from kittystore.utils import get_ref_and_thread_id
from kittystore.sa.kittysamodel import get_class_object
@@ -124,7 +125,9 @@ class KittySAStore(object):
from_name, from_email = parseaddr(message['From'])
from_name = header_to_unicode(from_name)
- payload = payload_to_unicode(message)
+ full = message.as_string()
+ scrubber = Scrubber(list_name, message, self)
+ payload = scrubber.scrub() # modifies the message in-place
#category = 'Question' # TODO: enum + i18n ?
#if ('agenda' in message.get('Subject', '').lower() or
@@ -142,7 +145,7 @@ class KittySAStore(object):
stable_url_id=msg_id_hash,
thread_id=thread_id,
references=ref,
- full=message.as_string(),
+ full=full,
)
self.session.add(mail)
return msg_id_hash
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
index 04b30e5..61ea032 100644
--- a/kittystore/scrub.py
+++ b/kittystore/scrub.py
@@ -226,6 +226,21 @@ class Scrubber(object):
#sep = _('-------------- next part --------------\n')
#text = sep.join(text)
text = "\n".join(text)
+ else:
+ text = self.msg.get_payload(decode=True)
+ charset = self.msg.get_content_charset()
+ if charset is None:
+ # Try to guess the encoding (best effort mode)
+ for encoding in ["ascii", "utf-8", "iso-8859-15"]:
+ try:
+ text.decode(encoding)
+ except UnicodeDecodeError:
+ continue
+ else:
+ #print encoding, payload
+ charset = encoding
+ break
+ text = text.decode(charset or "ascii", "replace")
return text
diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py
index 7c3f403..2858489 100644
--- a/kittystore/storm/store.py
+++ b/kittystore/storm/store.py
@@ -109,7 +109,7 @@ class StormStore(object):
email.subject = header_to_unicode(message.get('Subject'))
email.full = message.as_string() # Before scrubbing
scrubber = Scrubber(list_name, message, self)
- email.content = scrubber.scrub()
+ email.content = scrubber.scrub() # warning: modifies the msg in-place
email.date = parsedate(message.get("Date"))
if email.date is None:
# Absent or unparseable date
diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py
index 8637e97..e694be3 100644
--- a/kittystore/test/test_scrub.py
+++ b/kittystore/test/test_scrub.py
@@ -89,3 +89,15 @@ class TestScrubber(unittest.TestCase):
u"This is a test message\r\n"
u"Non-ASCII chars: r\xe9ponse fran\xe7ais \n")
+ def test_non_ascii_payload(self):
+ """Scrubber must handle non-ascii messages"""
+ for enc in ["utf8", "iso8859"]:
+ with open(get_test_file("payload-%s.txt" % enc)) as email_file:
+ msg = email.message_from_file(email_file)
+ store = Mock()
+ scrubber = Scrubber("testlist@example.com", msg, store)
+ contents = scrubber.scrub()
+ self.assertTrue(isinstance(contents, unicode))
+ self.assertEqual(contents, u'This message contains non-ascii '
+ u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n')
+
diff --git a/kittystore/test/test_utils.py b/kittystore/test/test_utils.py
index 46d7e72..64466b8 100644
--- a/kittystore/test/test_utils.py
+++ b/kittystore/test/test_utils.py
@@ -30,17 +30,6 @@ class TestUtils(unittest.TestCase):
msg, "example-list", store)
self.assertEqual(ref_id, None)
- def test_non_ascii_payload(self):
- """utils.payload_to_unicode must handle non-ascii messages"""
- for enc in ["utf8", "iso8859"]:
- with open(get_test_file("payload-%s.txt" % enc)) as email_file:
- msg = email.message_from_file(email_file)
- payload = kittystore.utils.payload_to_unicode(msg)
- #print enc, repr(payload)
- self.assertTrue(isinstance(payload, unicode))
- self.assertEqual(payload, u'This message contains non-ascii '
- u'characters:\n\xe9 \xe8 \xe7 \xe0 \xee \xef \xeb \u20ac\n')
-
def test_non_ascii_headers(self):
"""utils.header_to_unicode must handle non-ascii headers"""
testdata = [
@@ -49,7 +38,7 @@ class TestUtils(unittest.TestCase):
("=?iso-8859-1?q?Bj=F6rn_Persson?=", u'Bj\xf6rn Persson'),
("=?UTF-8?B?TWFyY2VsYSBNYcWhbMOhxYhvdsOh?=", u'Marcela Ma\u0161l\xe1\u0148ov\xe1'),
("Dan =?ISO-8859-1?Q?Hor=E1k?=", u'Dan Hor\xe1k'),
- ("=?ISO-8859-1?Q?Bj=F6rn?= Persson", u'Bj\xf6rnPersson'),
+ ("=?ISO-8859-1?Q?Bj=F6rn?= Persson", u'Bj\xf6rn Persson'),
("=?UTF-8?Q?Re=3A_=5BFedora=2Dfr=2Dlist=5D_Compte=2Drendu_de_la_r=C3=A9union_du_?= =?UTF-8?Q?1_novembre_2009?=", u"Re: [Fedora-fr-list] Compte-rendu de la r\xe9union du 1 novembre 2009"),
("=?iso-8859-1?q?Compte-rendu_de_la_r=E9union_du_?= =?iso-8859-1?q?1_novembre_2009?=", u"Compte-rendu de la r\xe9union du 1 novembre 2009"),
]
diff --git a/kittystore/utils.py b/kittystore/utils.py
index bdc2270..c860ce5 100644
--- a/kittystore/utils.py
+++ b/kittystore/utils.py
@@ -42,7 +42,7 @@ def get_message_id_hash(msg_id):
details. Example:
>>> get_message_id_hash('<87myycy5eh.fsf@uwakimon.sk.tsukuba.ac.jp>')
- 'AGDWSNXXKCWEILKKNYTBOHRDQGOX3Y35'
+ 'JJIGKPKB6CVDX6B2CUG4IHAJRIQIOUTP'
"""
msg_id = msg_id.strip("<>")
@@ -66,15 +66,12 @@ def header_to_unicode(header):
if charset is None:
h_decoded.append(unicode(decoded))
else:
- if h_decoded:
- # not so sure why...
- h_decoded.append(" ")
try:
h_decoded.append(decoded.decode(charset))
except LookupError:
# Unknown encoding
h_decoded.append(decoded.decode("ascii", "replace"))
- return "".join(h_decoded)
+ return " ".join(h_decoded)
def parsedate(datestring):
if datestring is None: