Improve charset handling

And store attachment encoding in the database.
author: Aurélien Bompard <aurelien@bompard.org> 2012-10-02 12:12:41 +0200
committer: Aurélien Bompard <aurelien@bompard.org> 2012-10-02 12:12:41 +0200
commit: 4f396b8956fba81221e9c62d8731134343c561d1 (patch)
tree: a7a190d04704596bb43142f7dc16645901d82bfa
parent: dc3ade6110c49ed647ee5008fe2fef0e9a6d8522 (diff)
download: kittystore-4f396b8956fba81221e9c62d8731134343c561d1.tar.gz
kittystore-4f396b8956fba81221e9c62d8731134343c561d1.tar.xz
kittystore-4f396b8956fba81221e9c62d8731134343c561d1.zip
6 files changed, 80 insertions, 65 deletions
diff --git a/kittystore/scrub.py b/kittystore/scrub.py
index 61ea032..175f56b 100644
--- a/kittystore/scrub.py
+++ b/kittystore/scrub.py
@@ -68,23 +68,37 @@ def guess_extension(ctype, ext):
     return all and all[0]
 
 
-def replace_payload_by_text(msg, text, charset):
-    # TK: This is a common function in replacing the attachment and the main
-    # message by a text (scrubbing).
-    del msg['content-type']
-    del msg['content-transfer-encoding']
-    #if isinstance(charset, unicode):
-    #    # email 3.0.1 (python 2.4) doesn't like unicode
-    #    charset = charset.encode('us-ascii')
-    #msg.set_payload(text, charset)
-    msg.set_payload('TODO: display attachment here and remove message subpart')
-
+def get_charset(message, default="ascii", guess=False):
+    """
+    Get the message charset.
+    From: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
+    """
+    if message.get_content_charset():
+        return message.get_content_charset()
+    if message.get_charset():
+        return message.get_charset()
+    if not guess:
+        return default
+    # Try to guess the encoding (best effort mode)
+    text = message.get_payload(decode=True)
+    charset = default
+    for encoding in ["ascii", "utf-8", "iso-8859-15"]:
+        try:
+            text.decode(encoding)
+        except UnicodeDecodeError:
+            continue
+        else:
+            #print encoding, payload
+            charset = encoding
+            break
+    return charset
 
 
 class Scrubber(object):
     """
     Scrubs a single message, extracts attachments, and store them in the
     database.
+    See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
     """
 
     def __init__(self, mlist, msg, store):
@@ -101,9 +115,8 @@ class Scrubber(object):
             ctype = part.get_content_type()
             # If the part is text/plain, we leave it alone
             if ctype == 'text/plain':
-                # TK: if part is attached then check charset and scrub if none
-                if part.get('content-disposition') and \
-                   not part.get_content_charset():
+                if part.get('content-disposition') == "attachment":
+                    # part is attached
                     self.save_attachment(part, part_num)
             elif ctype == 'text/html' and isinstance(sanitize, IntType):
 #            if sanitize == 0:
@@ -200,23 +213,14 @@ class Scrubber(object):
                 # null body. See bug 1430236.
                 except (binascii.Error, TypeError):
                     t = part.get_payload() or ''
-                # TK: get_content_charset() returns 'iso-2022-jp' for internally
-                # crafted (scrubbed) 'euc-jp' text part. So, first try
-                # get_charset(), then get_content_charset() for the parts
-                # which are already embeded in the incoming message.
-                partcharset = part.get_charset()
-                if partcharset:
-                    partcharset = str(partcharset)
-                else:
-                    partcharset = part.get_content_charset()
-                if partcharset:
-                    try:
-                        t = unicode(t, partcharset, 'replace')
-                    except (UnicodeError, LookupError, ValueError,
-                            AssertionError):
-                        # We can get here if partcharset is bogus in come way.
-                        # Replace funny characters.  We use errors='replace'
-                        t = unicode(t, 'ascii', 'replace')
+                partcharset = get_charset(part, guess=True)
+                try:
+                    t = unicode(t, partcharset, 'replace')
+                except (UnicodeError, LookupError, ValueError,
+                        AssertionError):
+                    # We can get here if partcharset is bogus in some way.
+                    # Replace funny characters.  We use errors='replace'
+                    t = unicode(t, 'ascii', 'replace')
                 # Separation is useful
                 if isinstance(t, basestring):
                     if not t.endswith('\n'):
@@ -228,19 +232,7 @@ class Scrubber(object):
             text = "\n".join(text)
         else:
             text = self.msg.get_payload(decode=True)
-            charset = self.msg.get_content_charset()
-            if charset is None:
-                # Try to guess the encoding (best effort mode)
-                for encoding in ["ascii", "utf-8", "iso-8859-15"]:
-                    try:
-                        text.decode(encoding)
-                    except UnicodeDecodeError:
-                        continue
-                    else:
-                        #print encoding, payload
-                        charset = encoding
-                        break
-            text = text.decode(charset or "ascii", "replace")
+            text = text.decode(get_charset(self.msg, guess=True), "replace")
         return text
 
 
@@ -252,10 +244,9 @@ class Scrubber(object):
         # e.g. image/jpg (should be image/jpeg).  For now we just store such
         # things as application/octet-streams since that seems the safest.
         ctype = part.get_content_type()
+        charset = get_charset(part, default=None, guess=False)
         # i18n file name is encoded
-        #lcset = Utils.GetCharSet(self.mlist.preferred_language)
-        lcset = "utf-8"
-        filename = oneline(part.get_filename(''), lcset)
+        filename = oneline(part.get_filename(''), charset or "ascii")
         filename, fnext = os.path.splitext(filename)
         # For safety, we should confirm this is valid ext for content-type
         # but we can use fnext if we introduce fnext filtering
@@ -305,4 +296,4 @@ class Scrubber(object):
         msg_id = self.msg['Message-Id'].strip("<>")
         self.store.add_attachment(
                 self.mlist, msg_id, counter, filebase+ext,
-                ctype, decodedpayload)
+                ctype, charset, decodedpayload)
diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py
index 4177fc7..8f74fea 100644
--- a/kittystore/storm/model.py
+++ b/kittystore/storm/model.py
@@ -83,5 +83,6 @@ class Attachment(object):
     counter = Int()
     name = Unicode()
     content_type = Unicode()
+    encoding = Unicode()
     size = Int()
     content = RawStr()
diff --git a/kittystore/storm/schema/__init__.py b/kittystore/storm/schema/__init__.py
index daba29f..edb6d7d 100644
--- a/kittystore/storm/schema/__init__.py
+++ b/kittystore/storm/schema/__init__.py
@@ -28,6 +28,7 @@ CREATES = {
             message_id VARCHAR(255) NOT NULL,
             counter INTEGER NOT NULL,
             content_type VARCHAR(255) NOT NULL,
+            encoding VARCHAR(50),
             name VARCHAR(255),
             size INTEGER NOT NULL,
             content BLOB NOT NULL,
@@ -66,6 +67,7 @@ CREATES = {
             message_id VARCHAR(255) NOT NULL,
             counter INTEGER NOT NULL,
             content_type VARCHAR(255) NOT NULL,
+            encoding VARCHAR(50),
             name VARCHAR(255),
             size INTEGER NOT NULL,
             content BYTEA NOT NULL,
diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py
index 2858489..cb0b486 100644
--- a/kittystore/storm/store.py
+++ b/kittystore/storm/store.py
@@ -377,13 +377,14 @@ class StormStore(object):
     # Attachments
 
     def add_attachment(self, mlist, msg_id, counter, name, content_type,
-                       content):
+                       encoding, content):
         attachment = Attachment()
         attachment.list_name = unicode(mlist)
         attachment.message_id = unicode(msg_id)
         attachment.counter = counter
         attachment.name = unicode(name)
         attachment.content_type = unicode(content_type)
+        attachment.encoding = unicode(encoding) if encoding is not None else None
         attachment.content = content
         attachment.size = len(content)
         self.db.add(attachment)
@@ -403,6 +404,21 @@ class StormStore(object):
                 )).order_by(Attachment.counter)
         return list(att)
 
+    def get_attachment_by_counter(self, list_name, message_id, counter):
+        """Return the message's attachment at 'counter' position.
+
+        :param list_name: The fully qualified name of the list in which
+            to look up the message.
+        :param message_id: The Message-ID header contents to search for.
+        :param counter: The position in the MIME-multipart email.
+        :returns: The corresponding attachment
+        """
+        return self.db.find(Attachment, And(
+                    Attachment.list_name == unicode(list_name),
+                    Attachment.message_id == unicode(message_id),
+                    Attachment.counter == counter
+                )).one()
+
     # Generic database operations
 
     def flush(self):
diff --git a/kittystore/test/test_scrub.py b/kittystore/test/test_scrub.py
index e694be3..e0e442a 100644
--- a/kittystore/test/test_scrub.py
+++ b/kittystore/test/test_scrub.py
@@ -20,7 +20,7 @@ class TestScrubber(unittest.TestCase):
         self.assertEqual(store.add_attachment.call_count, 1)
         store.add_attachment.assert_called_with(
                 'testlist@example.com', '505E5185.5040208@libero.it', 2,
-                'puntogil.vcf', 'text/x-vcard',
+                'puntogil.vcf', 'text/x-vcard', "utf-8",
                 'begin:vcard\r\nfn:gil\r\nn:;gil\r\nversion:2.1\r\n'
                 'end:vcard\r\n\r\n')
         self.assertEqual(contents,
@@ -38,7 +38,7 @@ class TestScrubber(unittest.TestCase):
         self.assertEqual(store.add_attachment.call_count, 1)
         store.add_attachment.assert_called_with(
                 'testlist@example.com', '50619B7A.2030404@thelounge.net', 3,
-                'signature.asc', 'application/pgp-signature',
+                'signature.asc', 'application/pgp-signature', None,
                 '-----BEGIN PGP SIGNATURE-----\r\nVersion: GnuPG v1.4.12 '
                 '(GNU/Linux)\r\nComment: Using GnuPG with Mozilla - '
                 'http://www.enigmail.net/\r\n\r\niEYEARECAAYFAlBhm3oACgkQhmBj'
@@ -59,15 +59,15 @@ class TestScrubber(unittest.TestCase):
         self.assertEqual(store.add_attachment.call_count, 2)
         args_1, args_2 = store.add_attachment.call_args_list
         # HTML part
-        self.assertEqual(args_1[0][0:5], ("testlist@example.com",
+        self.assertEqual(args_1[0][0:6], ("testlist@example.com",
                 "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com",
-                3, "attachment.html", "text/html"))
-        self.assertEqual(len(args_1[0][5]), 3134)
+                3, "attachment.html", "text/html", "iso-8859-1"))
+        self.assertEqual(len(args_1[0][6]), 3134)
         # Image attachment
-        self.assertEqual(args_2[0][0:5], ("testlist@example.com",
+        self.assertEqual(args_2[0][0:6], ("testlist@example.com",
                 "CACec3Lup8apbhUMcm_Ktn1dPxx4eWr2y1RV7ZSYhy0tzmjSrgQ@mail.gmail.com",
-                4, "GeoffreyRoucourt.jpg", "image/jpeg"))
-        self.assertEqual(len(args_2[0][5]), 282180)
+                4, "GeoffreyRoucourt.jpg", "image/jpeg", None))
+        self.assertEqual(len(args_2[0][6]), 282180)
         # Scrubbed content
         self.assertEqual(contents, u"This is a test message\r\n")
 
@@ -80,10 +80,10 @@ class TestScrubber(unittest.TestCase):
         self.assertEqual(store.add_attachment.call_count, 1)
         args = store.add_attachment.call_args[0]
         # HTML part
-        self.assertEqual(args[0:5], ("testlist@example.com",
+        self.assertEqual(args[0:6], ("testlist@example.com",
                 "016001cd9b3b$b71efed0$255cfc70$@fr",
-                2, "attachment.html", "text/html"))
-        self.assertEqual(len(args[5]), 2723)
+                2, "attachment.html", "text/html", "iso-8859-1"))
+        self.assertEqual(len(args[6]), 2723)
         # Scrubbed content
         self.assertEqual(contents,
                 u"This is a test message\r\n"
diff --git a/kittystore/utils.py b/kittystore/utils.py
index c860ce5..ddb4d64 100644
--- a/kittystore/utils.py
+++ b/kittystore/utils.py
@@ -60,18 +60,23 @@ def parseaddr(address):
         from_name = from_email
     return from_name, from_email
 
+
 def header_to_unicode(header):
+    """
+    See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
+    """
     h_decoded = []
-    for decoded, charset in decode_header(header):
+    for text, charset in decode_header(header):
         if charset is None:
-            h_decoded.append(unicode(decoded))
+            h_decoded.append(unicode(text))
         else:
             try:
-                h_decoded.append(decoded.decode(charset))
+                h_decoded.append(text.decode(charset))
             except LookupError:
                 # Unknown encoding
-                h_decoded.append(decoded.decode("ascii", "replace"))
-    return " ".join(h_decoded)
+                h_decoded.append(text.decode("ascii", "replace"))
+    return u" ".join(h_decoded)
+
 
 def parsedate(datestring):
     if datestring is None:
author	Aurélien Bompard <aurelien@bompard.org>	2012-10-02 12:12:41 +0200
committer	Aurélien Bompard <aurelien@bompard.org>	2012-10-02 12:12:41 +0200
commit	4f396b8956fba81221e9c62d8731134343c561d1 (patch)
tree	a7a190d04704596bb43142f7dc16645901d82bfa
parent	dc3ade6110c49ed647ee5008fe2fef0e9a6d8522 (diff)
download	kittystore-4f396b8956fba81221e9c62d8731134343c561d1.tar.gz kittystore-4f396b8956fba81221e9c62d8731134343c561d1.tar.xz kittystore-4f396b8956fba81221e9c62d8731134343c561d1.zip