diff options
author | Pierre-Yves Chibon <pingou@pingoured.fr> | 2012-05-20 16:29:41 +0200 |
---|---|---|
committer | Pierre-Yves Chibon <pingou@pingoured.fr> | 2012-05-20 16:29:41 +0200 |
commit | cbb1998a160b51caacc8d63d433c126e0fbc5b7e (patch) | |
tree | c99688e01c492d60603d01fc40043aaa82f16efe | |
parent | 3ffe46ccc7fe3d62bfd0bad1b26086ddc62fdb1e (diff) | |
download | kittystore-cbb1998a160b51caacc8d63d433c126e0fbc5b7e.tar.gz kittystore-cbb1998a160b51caacc8d63d433c126e0fbc5b7e.tar.xz kittystore-cbb1998a160b51caacc8d63d433c126e0fbc5b7e.zip |
Add the to_mongo.py script
This script comes from the mongomail project, but it deserves to live here alongside to_sqldb.py.
-rwxr-xr-x | to_mongo.py | 139 |
1 files changed, 139 insertions, 0 deletions
#!/usr/bin/python -tt

# Import the content of a mbox file into mongodb
#
# Every message of the given mbox file(s) is stored as one document in the
# ``mails`` collection of the requested database.  Header names are stored
# with their dashes removed (e.g. ``Message-ID`` becomes ``MessageID``) and a
# ``ThreadID`` is computed so that replies share the id of the mail they
# answer to.

import bson
from bson.errors import InvalidStringData
import datetime
import mailbox
import os
import pymongo
import re
import sys
import time
from base64 import b32encode
from dateutil.parser import parse
from kitchen.text.converters import to_bytes
from hashlib import sha1

connection = pymongo.Connection('localhost', 27017)

# Number of messages read so far, across all the mbox files processed.
TOTALCNT = 0

# Matches From headers of the form ``address (Name)``.
FROM_REGEX = re.compile(r'(.*)\((.*)\)')


def convert_date(date_string):
    """ Convert the string of the date to a datetime object.

    :arg date_string: the date as found in the ``Date`` header.
    :return: the object returned by ``dateutil.parser.parse``.
    """
    return parse(date_string.strip())


def _strip_brackets(message_id):
    """ Return the message id without its surrounding ``<`` and ``>``. """
    return message_id.replace('<', '').replace('>', '')


def _new_thread_id(message_id):
    """ Return the thread id of a thread started by ``message_id``. """
    return b32encode(sha1(message_id).digest())


def _parent_message_id(infos):
    """ Return the id of the mail ``infos`` replies to, or None.

    The first entry of ``References`` wins; ``InReplyTo`` is only used when
    there is no usable ``References`` header.
    """
    # An empty References header yields an empty list here instead of the
    # IndexError a blind ``split()[0]`` would raise.
    references = infos.get('References', '').split()
    if references:
        return _strip_brackets(references[0].strip())
    if 'InReplyTo' in infos:
        return _strip_brackets(infos['InReplyTo'])
    return None


def _extract_headers(message):
    """ Return the headers of ``message`` as a dict ready to be stored.

    Dashes are removed from the header names, the brackets are removed from
    the ``MessageID`` and a ``address (Name)`` From header is split into
    ``From`` (the name) and ``Email`` (the address).
    """
    ## TODO: We need to catch-up Subjects/From which are of a specific
    ## encoding.
    infos = {}
    for header in message.keys():
        infos[header.replace('-', '')] = message[header]
    # NOTE(review): the lookups below are case-sensitive on the header name
    # as it was written in the mbox file (e.g. ``Message-Id`` would not
    # match) -- confirm this is acceptable for the archives imported.
    if 'MessageID' in infos:
        infos['MessageID'] = _strip_brackets(infos['MessageID'])
    if 'From' in infos:
        match = FROM_REGEX.match(infos['From'])
        if match:
            email, name = match.groups()
            infos['From'] = name
            infos['Email'] = email.replace(' at ', '@').strip()
    return infos


def _store_message(db, message, infos):
    """ Insert the email into ``db.mails`` unless it is already there.

    :arg db: the mongodb database to insert into.
    :arg message: the message as returned by ``mailbox.mbox``.
    :arg infos: the headers of the message, see ``_extract_headers``.
    :return: True if the email was inserted, False if an email with the
        same ``MessageID`` was already stored.
    :raises ValueError: if the email has no ``MessageID``.
    :raises InvalidStringData: if the content cannot be encoded to BSON.
    """
    if 'MessageID' not in infos:
        raise ValueError('Email has no MessageID header')
    if db.mails.find_one({'MessageID': infos['MessageID']}) is not None:
        return False

    infos['Date'] = convert_date(infos['Date'])
    infos['Content'] = message.get_payload()
    try:
        # Encode up-front so that an invalid content is reported with a
        # meaningful message rather than failing inside the insert.
        bson.BSON.encode({'content': infos['Content']})
    except InvalidStringData:
        ## TODO: Do something about this encoding issue
        raise InvalidStringData('Email has invalid content')

    parent_id = _parent_message_id(infos)
    parent = None
    if parent_id is not None:
        parent = db.mails.find_one({'MessageID': parent_id})
    if parent and 'ThreadID' in parent:
        infos['ThreadID'] = parent['ThreadID']
    else:
        # Either a new thread or a reply to a mail we do not know about.
        infos['ThreadID'] = _new_thread_id(infos['MessageID'])

    ## TODO: I'm not sure the TOTALCNT approach is the right one
    ## we should discuss this with the pipermail guys before storing
    ## TOTALCNT as a ``LegacyID`` field.
    db.mails.insert(infos)
    return True


def to_mongo(mbfile, database):
    """ Upload all the emails in a mbox file into a mongo database.

    Emails which fail to be processed are reported on stdout and skipped,
    the import then carries on with the next email.

    :arg mbfile: the path to the mbox file to import.
    :arg database: the name of the mongodb database to import into.
    """
    global TOTALCNT
    db = connection[database]
    # The indexes do not depend on the message, request them once per file
    # rather than once per inserted email.
    db.mails.create_index('MessageID')
    db.mails.create_index('ThreadID')
    cnt = 0
    cnt_read = 0
    for message in mailbox.mbox(mbfile):
        cnt_read += 1
        TOTALCNT += 1
        infos = _extract_headers(message)
        ## There seem to be a problem to parse some messages
        if not infos:
            print(' Failed: %s keys: "%s"' % (mbfile, list(infos)))
            continue
        try:
            if _store_message(db, message, infos):
                cnt += 1
        except Exception as err:
            # Best effort: report the email which failed and keep going.
            print(' Failed: %s error: "%s"' % (mbfile, err))
            print(' Failed: %s %s %s' % (
                message['Subject'], message['Date'], message['From']))
    print(' %s email read' % cnt_read)
    print(' %s email added to the database' % cnt)


def get_document_size(database):
    """ Print the number of emails stored in the database.

    :arg database: the name of the mongodb database to inspect.
    """
    db = connection[database]
    print(' %s emails are stored into the database' % db.mails.count())


if __name__ == '__main__':
    #sys.argv.extend(['devel', 'lists/devel-2012-03-March.txt'])
    if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv:
        print('''USAGE:
python to_mongo.py db_name mbox_file [mbox_file]''')
    else:
        print('Adding to database: %s' % sys.argv[1])
        for mbfile in sys.argv[2:]:
            print(mbfile)
            if os.path.exists(mbfile):
                to_mongo(mbfile, sys.argv[1])
            else:
                print(' File not found, skipped: %s' % mbfile)
        # NOTE(review): the indentation of this call was lost in the source
        # this was recovered from; the total is reported once, after all
        # the files have been processed -- confirm against the repository.
        get_document_size(sys.argv[1])

"""
## Test command-line:
$ mongo
use fedora-devel
db.mails.find()
db.mails.count()
"""