1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
#!/usr/bin/python -tt
# Import the content of a mbox file into SQL
import datetime
import mailbox
import os
import re
import sys
import time
import urllib
from base64 import b32encode
from dateutil.parser import parse
from dateutil import tz
from kitchen.text.converters import to_bytes
from hashlib import sha1
from sqlalchemy.exc import OperationalError
from kittystore import get_store
TOTALCNT = 0
#KITTYSTORE_URL = 'postgres://mm3:mm3@localhost/mm3'
#KITTYSTORE_URL = 'postgres://kittystore:kittystore@localhost/kittystore'
KITTYSTORE_URL = 'sqlite:///' + os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "kittystore.sqlite"))
PREFIX_RE = re.compile("^\[([\w\s_-]+)\] ")
ATTACHMENT_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
A[ ]non-text[ ]attachment[ ]was[ ]scrubbed\.\.\.\n
Name:[ ]([^\n]+)\n
Type:[ ]([^\n]+)\n
Size:[ ]\d+[ ]bytes\n
Desc:[ ].+?\n
Url[ ]:[ ]([^\s]+)\s*\n
""", re.X | re.S)
EMBEDDED_MSG_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
An[ ]embedded[ ]message[ ]was[ ]scrubbed\.\.\.\n
From:[ ].+?\n
Subject:[ ](.+?)\n
Date:[ ][^\n]+\n
Size:[ ]\d+\n
Url:[ ]([^\s]+)\s*\n
""", re.X | re.S)
HTML_ATTACH_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
An[ ]HTML[ ]attachment[ ]was[ ]scrubbed\.\.\.\n
URL:[ ]([^\s]+)\s*\n
""", re.X)
TEXT_NO_CHARSET_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
An[ ]embedded[ ]and[ ]charset-unspecified[ ]text[ ]was[ ]scrubbed\.\.\.\n
Name:[ ]([^\n]+)\n
Url:[ ]([^\s]+)\s*\n
""", re.X | re.S)
class DummyMailingList(object):
def __init__(self, address):
self.fqdn_listname = unicode(address)
self.display_name = None
def convert_date(date_string):
""" Convert the string of the date to a datetime object. """
#print date_string
date_string = date_string.split('(')[0].strip()
dt = parse(date_string)
return dt.astimezone(tz.tzutc())
def extract_attachments(store, mlist, message):
"""Parse message to search for attachments"""
has_attach = False
message_text = message.as_string()
if "-------------- next part --------------" in message_text:
has_attach = True
# Regular attachments
attachments = ATTACHMENT_RE.findall(message_text)
for counter, att in enumerate(attachments):
download_attachment(store, mlist, message["Message-Id"], counter,
att[0], att[1], att[2])
# Embedded messages
embedded = EMBEDDED_MSG_RE.findall(message_text)
for counter, att in enumerate(embedded):
download_attachment(store, mlist, message["Message-Id"], counter,
att[0], 'message/rfc822', att[1])
# HTML attachments
html_attachments = HTML_ATTACH_RE.findall(message_text)
for counter, att in enumerate(html_attachments):
download_attachment(store, mlist, message["Message-Id"], counter,
os.path.basename(att), 'text/html', att)
# Text without charset
text_no_charset = TEXT_NO_CHARSET_RE.findall(message_text)
for counter, att in enumerate(text_no_charset):
download_attachment(store, mlist, message["Message-Id"], counter,
att[0], 'text/plain', att[1])
## Other, probably inline text/plain
#if has_attach and not (attachments or embedded
# or html_attachments or text_no_charset):
# print message_text
def download_attachment(store, mlist, message_id, counter, name, content_type, url):
#print "Downloading attachment from", url
content = urllib.urlopen(url).read()
store.add_attachment(mlist, message_id, counter, name, content_type,
None, content)
def to_db(mbfile, list_name, store):
""" Upload all the emails in a mbox file into the database using
kittystore API.
:arg mbfile, a mailbox file from which the emails are extracted and
upload to the database.
:arg list_name, the fully qualified list name.
"""
global TOTALCNT
cnt = 0
cnt_read = 0
mlist = DummyMailingList(list_name)
for message in mailbox.mbox(mbfile):
cnt_read = cnt_read + 1
#print cnt_read
TOTALCNT = TOTALCNT + 1
# Try to find the mailing-list subject prefix in the first email
if cnt_read == 1:
subject_prefix = PREFIX_RE.search(message["subject"])
if subject_prefix:
mlist.display_name = unicode(subject_prefix.group(1))
try:
msg_id_hash = store.add_to_list(mlist, message)
except ValueError, e:
if len(e.args) != 2:
raise # Regular ValueError exception
print "%s from %s about %s" % (e.args[0],
e.args[1].get("From"), e.args[1].get("Subject"))
continue
except OperationalError, e:
print message["From"], message["Subject"], e
# Database is locked
time.sleep(1)
msg_id_hash = store.add_to_list(mlist, message)
# Parse message to search for attachments
extract_attachments(store, mlist, message)
store.flush()
cnt = cnt + 1
store.commit()
print ' %s email read' % cnt_read
print ' %s email added to the database' % cnt
if __name__ == '__main__':
#sys.argv.extend(['devel', 'lists/devel-2012-03-March.txt'])
if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv:
print '''USAGE:
python to_sqldb.py list_name mbox_file [mbox_file]'''
else:
print 'Adding to database list: %s' % sys.argv[1]
store = get_store(KITTYSTORE_URL, debug=False)
for mbfile in sys.argv[2:]:
print mbfile
if os.path.exists(mbfile):
to_db(mbfile, sys.argv[1], store)
print ' %s emails are stored into the database' % store.get_list_size(sys.argv[1])
|