to_mongo.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139

#!/usr/bin/python -tt

# Import the content of an mbox file into MongoDB.

import bson
from bson.errors import InvalidStringData
import datetime
import mailbox
import os
import pymongo
import re
import sys
import time
from base64 import b32encode
from dateutil.parser import parse
from kitchen.text.converters import to_bytes
from hashlib import sha1

# Module-wide handle on the MongoDB server running on localhost:27017; the
# functions below pick a database from it by name.
# NOTE(review): ``pymongo.Connection`` is the legacy client class; newer
# PyMongo releases expose ``MongoClient`` instead -- confirm which PyMongo
# version this script is expected to run against.
connection = pymongo.Connection('localhost', 27017)

# Number of messages read so far across every to_mongo() call in this process.
TOTALCNT = 0

def convert_date(date_string):
    """ Convert the string of the date to a datetime object in UTC.

    :arg date_string: textual date, e.g. the value of an email ``Date``
        header. Everything from the first ``(`` onwards (such as a trailing
        ``(CEST)`` comment) is dropped before parsing.
    :return: the datetime produced by ``dateutil.parser.parse``, converted
        to the UTC timezone.

    NOTE(review): ``astimezone`` is applied to whatever ``parse`` returns;
    a date string carrying no UTC offset presumably yields a naive datetime,
    which Python 2 rejects with ValueError -- confirm whether such dates
    need a default timezone.
    """
    # Imported here because the module-level imports only bring in
    # ``dateutil.parser.parse``; without this line ``tz`` is undefined and
    # the function fails with NameError.
    from dateutil import tz

    date_string = date_string.split('(')[0].strip()
    dt = parse(date_string)
    return dt.astimezone(tz.tzutc())


def to_mongo(mbfile, database):
    """ Upload all the emails in a mbox file into a mongo database. """
    global TOTALCNT
    db = connection[database]
    cnt = 0
    cnt_read = 0
    for message in mailbox.mbox(mbfile):
        cnt_read = cnt_read + 1
        TOTALCNT = TOTALCNT + 1
        infos = {}
        ## TODO: We need to catch-up Subjects/From which are of a specific
        ## encoding.
        for it in message.keys():
            it2 = it.replace('-', '')
            infos[it2] = message[it]
        keys = infos.keys()
        ## There seem to be a problem to parse some messages
        if not keys:
            print '  Failed: %s keys: "%s"' % (mbfile, keys)
            #print message
            continue
        if 'MessageID' in infos:
            infos['MessageID'] = infos['MessageID'].replace('<', ''
                ).replace('>', '')
        if 'From' in infos:
            regex = '(.*)\((.*)\)'
            match = re.match(regex, infos['From'])
            if match:
                email, name = match.groups()
                infos['From'] = name
                email = email.replace(' at ', '@')
                infos['Email'] = email.strip()
        try:
            if db.mails.find({'MessageID': infos['MessageID']}).count() == 0:
                infos['Date'] = convert_date(infos['Date'])
                infos['Content'] = message.get_payload()
                try:
                    bson.BSON.encode({'content' : infos['Content']})
                except InvalidStringData:
                    ## TODO: Do something about this encoding issue
                    raise InvalidStringData('Email has invalid content')
                thread_id = 0
                db.mails.create_index('MessageID')
                db.mails.ensure_index('MessageID')
                db.mails.create_index('ThreadID')
                db.mails.ensure_index('ThreadID')
                if not 'References' in infos and not 'InReplyTo' in infos:
                    infos['ThreadID'] = b32encode(sha1(infos['MessageID']).digest())
                else:
                    ref = None
                    if 'References' in infos:
                        ref= infos['References'].split()[0].strip()
                    else:
                        ref= infos['InReplyTo']
                    ref = ref.replace('<', '').replace('>', '')
                    res = db.mails.find_one({'MessageID': ref})
                    if res and 'ThreadID' in res:
                        infos['ThreadID'] = res['ThreadID']
                    else:
                        infos['ThreadID'] = b32encode(sha1(infos['MessageID']).digest())
#                infos['Category'] = 'Question'
#                if 'agenda' in infos['Subject'].lower():
#                    infos['Category'] = 'Agenda'
#                if 'reminder' in infos['Subject'].lower():
#                    infos['Category'] = 'Agenda'
#                infos['Full'] = message.as_string()
#                try:
#                    bson.BSON.encode({'content' : infos['Full']})
#                except InvalidStringData:
#                    ## TODO: Do something about this encoding issue
#                    raise InvalidStringData('Email has invalid full version')

                ## TODO: I'm not sure the TOTALCNT approach is the right one
                ## we should discuss this with the pipermail guys
#                infos['LegacyID'] = TOTALCNT
                db.mails.insert(infos)
                cnt = cnt + 1
        except Exception, err:
            print '  Failed: %s error: "%s"' % (mbfile, err)
            print '  Failed:', message['Subject'], message['Date'], message['From']
    print '  %s email read' % cnt_read
    print '  %s email added to the database' % cnt

def get_document_size(database):
    """ Return the size of the document in mongodb. """
    db = connection[database]
    print '  %s emails are stored into the database' % db.mails.count()


if __name__ == '__main__':
    #sys.argv.extend(['devel', 'lists/devel-2012-03-March.txt'])
    if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv:
        print '''USAGE:
python mbox_to_mongo.py db_name mbox_file [mbox_file]'''
    else:
        print 'Adding to database: %s' % sys.argv[1]
        for mbfile in sys.argv[2:]:
            print mbfile
            if os.path.exists(mbfile):
                print mbfile
                to_mongo(mbfile, sys.argv[1])
                get_document_size(sys.argv[1])

"""
## Test command-line:
$ mongo
use fedora-devel
db.mails.find()
db.mails.count()
"""