summaryrefslogtreecommitdiffstats
path: root/to_sqldb.py
blob: a208ecb6fb313b0386c84f4f22abff59612b96db (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/python -tt

# Import the content of a mbox file into mongodb

import bson
import datetime
import mailbox
import os
import re
import sys
import time
from base64 import b32encode
from dateutil.parser import parse
from dateutil import tz
from kitchen.text.converters import to_bytes
from hashlib import sha1

from kittystore.kittysamodel import Email, get_class_object
from kittystore.kittysastore import list_to_table_name, KittySAStore
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.exc import OperationalError, InvalidRequestError


TOTALCNT = 0
DB_URL = 'postgres://mm3:mm3@localhost/mm3'
engine = create_engine(DB_URL, echo=False,
    pool_recycle=3600)
store = KittySAStore(DB_URL)
Session = sessionmaker(bind=engine)
session = Session()


def convert_date(date_string):
    """ Convert the string of the date to a datetime object. """
    #print date_string
    date_string = date_string.split('(')[0].strip()
    dt = parse(date_string)
    return dt.astimezone(tz.tzutc())


def to_db(mbfile, list_name):
    """ Upload all the emails in a mbox file into the database using
    kittystore API.

    :arg mbfile, a mailbox file from which the emails are extracted and
    upload to the database.
    :arg list_name, the fully qualified list name.
    """
    global TOTALCNT
    cnt = 0
    cnt_read = 0
    email = get_class_object(list_to_table_name(list_name), 'email',
                    MetaData(engine), create=True)
    for message in mailbox.mbox(mbfile):
        cnt_read = cnt_read + 1
        #print cnt_read
        TOTALCNT = TOTALCNT + 1
        infos = {}
        ## TODO: We need to catch-up Subjects/From which are of a specific
        ## encoding.
        for it in message.keys():
            it2 = it.replace('-', '')
            infos[it2] = message[it]
        keys = infos.keys()
        ## There seem to be a problem to parse some messages
        if not keys:
            print '  Failed: %s keys: "%s"' % (mbfile, keys)
            #print message
            continue
        if 'MessageID' in infos:
            infos['MessageID'] = infos['MessageID'].replace('<', ''
                ).replace('>', '')
        if 'From' in infos:
            regex = '(.*)\((.*)\)'
            match = re.match(regex, infos['From'])
            if match:
                email_add, name = match.groups()
                infos['From'] = name
                email_add = email_add.replace(' at ', '@')
                infos['Email'] = email_add.strip()
        try:
            if not 'MessageID' in infos:
                print '  Failed: No Message-ID for email:'
                print '   Content:', message['Subject'], message['Date'], message['From']
                continue
            if not store.get_email(list_name, infos['MessageID']):
                infos['Date'] = convert_date(infos['Date'])
                infos['Content'] = message.get_payload()
                thread_id = 0
                if not 'References' in infos and not 'InReplyTo' in infos:
                    infos['ThreadID'] = b32encode(sha1(infos['MessageID']).digest())
                else:
                    ref = None
                    if 'References' in infos:
                        ref= infos['References'].split()[0].strip()
                    else:
                        ref= infos['InReplyTo']
                        infos['References'] = infos['InReplyTo']
                        del(infos['InReplyTo'])
                    ref = ref.replace('<', '').replace('>', '')
                    res = store.get_email(list_name, ref)
                    if res and res.thread_id:
                        infos['ThreadID'] = res.thread_id
                    else:
                        infos['ThreadID'] = b32encode(sha1(infos['MessageID']).digest())
                infos['Category'] = 'Question'
                if 'agenda' in infos['Subject'].lower():
                    infos['Category'] = 'Agenda'
                if 'reminder' in infos['Subject'].lower():
                    infos['Category'] = 'Agenda'
                infos['Full'] = message.as_string()

                ## TODO: I'm not sure the TOTALCNT approach is the right one
                ## we should discuss this with the pipermail guys
                infos['LegacyID'] = TOTALCNT
                if not 'References' in infos:
                    infos['References'] = None
                #print infos.keys()
                mail = email(
                    sender=infos['From'],
                    email=infos['Email'],
                    subject=infos['Subject'],
                    content=infos['Content'],
                    date=infos['Date'],
                    message_id=infos['MessageID'],
                    stable_url_id=infos['MessageID'],
                    thread_id=infos['ThreadID'],
                    references=infos['References'],
                    full=infos['Full'],
                    )
                mail.save(session)
                cnt = cnt + 1
                session.commit()
        except Exception, err:
            print ' Error: "%s"' % err
            print 'File:',mbfile , 'Content:', message['Subject'], message['Date'], message['From']
            pass
        #else:
            #print '  Failed: %s ID: "%s" ' % (mbfile, infos['MessageID'])
            #print '   Content:', message['Subject'], message['Date'], message['From']
    session.commit()
    print '  %s email read' % cnt_read
    print '  %s email added to the database' % cnt

def get_table_size(list_name):
    """ Return the size of the document in mongodb. """
    email = get_class_object(list_to_table_name(list_name), 'email',
                    MetaData(engine))
    print '  %s emails are stored into the database' % session.query(email).count()


if __name__ == '__main__':
    #sys.argv.extend(['devel', 'lists/devel-2012-03-March.txt'])
    if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv:
        print '''USAGE:
python to_sqldb.py list_name mbox_file [mbox_file]'''
    else:
        print 'Adding to database list: %s' % sys.argv[1]
        for mbfile in sys.argv[2:]:
            print mbfile
            if os.path.exists(mbfile):
                to_db(mbfile, sys.argv[1])
                get_table_size(sys.argv[1])
    session.close()
    store.session.close()