summaryrefslogtreecommitdiffstats
path: root/to_sqldb.py
blob: 186c90386ae60d3b0aebc885df59f4a2cac9eed1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/python -tt

# Import the content of a mbox file into SQL

import datetime
import mailbox
import os
import re
import sys
import time
import urllib
from base64 import b32encode
from dateutil.parser import parse
from dateutil import tz
from kitchen.text.converters import to_bytes
from hashlib import sha1
from sqlalchemy.exc import OperationalError

from kittystore import get_store

TOTALCNT = 0
#KITTYSTORE_URL = 'postgres://mm3:mm3@localhost/mm3'
#KITTYSTORE_URL = 'postgres://kittystore:kittystore@localhost/kittystore'
KITTYSTORE_URL = 'sqlite:///' + os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "kittystore.sqlite"))


PREFIX_RE = re.compile("^\[([\w\s_-]+)\] ")

ATTACHMENT_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
A[ ]non-text[ ]attachment[ ]was[ ]scrubbed\.\.\.\n
Name:[ ]([^\n]+)\n
Type:[ ]([^\n]+)\n
Size:[ ]\d+[ ]bytes\n
Desc:[ ].+?\n
Url[ ]:[ ]([^\s]+)\s*\n
""", re.X | re.S)

EMBEDDED_MSG_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
An[ ]embedded[ ]message[ ]was[ ]scrubbed\.\.\.\n
From:[ ].+?\n
Subject:[ ](.+?)\n
Date:[ ][^\n]+\n
Size:[ ]\d+\n
Url:[ ]([^\s]+)\s*\n
""", re.X | re.S)

HTML_ATTACH_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
An[ ]HTML[ ]attachment[ ]was[ ]scrubbed\.\.\.\n
URL:[ ]([^\s]+)\s*\n
""", re.X)

TEXT_NO_CHARSET_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
An[ ]embedded[ ]and[ ]charset-unspecified[ ]text[ ]was[ ]scrubbed\.\.\.\n
Name:[ ]([^\n]+)\n
Url:[ ]([^\s]+)\s*\n
""", re.X | re.S)


class DummyMailingList(object):
    def __init__(self, address):
        self.fqdn_listname = unicode(address)
        self.display_name = None


def convert_date(date_string):
    """ Convert the string of the date to a datetime object. """
    #print date_string
    date_string = date_string.split('(')[0].strip()
    dt = parse(date_string)
    return dt.astimezone(tz.tzutc())


def extract_attachments(store, mlist, message):
    """Parse message to search for attachments"""
    has_attach = False
    message_text = message.as_string()
    if "-------------- next part --------------" in message_text:
        has_attach = True
    # Regular attachments
    attachments = ATTACHMENT_RE.findall(message_text)
    for counter, att in enumerate(attachments):
        download_attachment(store, mlist, message["Message-Id"], counter,
                            att[0], att[1], att[2])
    # Embedded messages
    embedded = EMBEDDED_MSG_RE.findall(message_text)
    for counter, att in enumerate(embedded):
        download_attachment(store, mlist, message["Message-Id"], counter,
                            att[0], 'message/rfc822', att[1])
    # HTML attachments
    html_attachments = HTML_ATTACH_RE.findall(message_text)
    for counter, att in enumerate(html_attachments):
        download_attachment(store, mlist, message["Message-Id"], counter,
                            os.path.basename(att), 'text/html', att)
    # Text without charset
    text_no_charset = TEXT_NO_CHARSET_RE.findall(message_text)
    for counter, att in enumerate(text_no_charset):
        download_attachment(store, mlist, message["Message-Id"], counter,
                            att[0], 'text/plain', att[1])
    ## Other, probably inline text/plain
    #if has_attach and not (attachments or embedded
    #                       or html_attachments or text_no_charset):
    #    print message_text


def download_attachment(store, mlist, message_id, counter, name, content_type, url):
    #print "Downloading attachment from", url
    content = urllib.urlopen(url).read()
    store.add_attachment(mlist, message_id, counter, name, content_type,
                         None, content)

def to_db(mbfile, list_name, store):
    """ Upload all the emails in a mbox file into the database using
    kittystore API.

    :arg mbfile, a mailbox file from which the emails are extracted and
    upload to the database.
    :arg list_name, the fully qualified list name.
    """
    global TOTALCNT
    cnt = 0
    cnt_read = 0
    mlist = DummyMailingList(list_name)
    for message in mailbox.mbox(mbfile):
        cnt_read = cnt_read + 1
        #print cnt_read
        TOTALCNT = TOTALCNT + 1
        # Try to find the mailing-list subject prefix in the first email
        if cnt_read == 1:
            subject_prefix = PREFIX_RE.search(message["subject"])
            if subject_prefix:
                mlist.display_name = unicode(subject_prefix.group(1))
        try:
            msg_id_hash = store.add_to_list(mlist, message)
        except ValueError, e:
            if len(e.args) != 2:
                raise # Regular ValueError exception
            print "%s from %s about %s" % (e.args[0],
                    e.args[1].get("From"), e.args[1].get("Subject"))
            continue
        except OperationalError, e:
            print message["From"], message["Subject"], e
            # Database is locked
            time.sleep(1)
            msg_id_hash = store.add_to_list(mlist, message)
        # Parse message to search for attachments
        extract_attachments(store, mlist, message)

        store.flush()
        cnt = cnt + 1
    store.commit()
    print '  %s email read' % cnt_read
    print '  %s email added to the database' % cnt


if __name__ == '__main__':
    #sys.argv.extend(['devel', 'lists/devel-2012-03-March.txt'])
    if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv:
        print '''USAGE:
python to_sqldb.py list_name mbox_file [mbox_file]'''
    else:
        print 'Adding to database list: %s' % sys.argv[1]

        store = get_store(KITTYSTORE_URL, debug=False)
        for mbfile in sys.argv[2:]:
            print mbfile
            if os.path.exists(mbfile):
                to_db(mbfile, sys.argv[1], store)
                print '  %s emails are stored into the database' % store.get_list_size(sys.argv[1])