#!/usr/bin/python -tt

# Import the content of an mbox file into MongoDB

import bson
from bson.errors import InvalidStringData
import datetime
import mailbox
import os
import pymongo
import re
import sys
import time
from base64 import b32encode
from dateutil.parser import parse
from kitchen.text.converters import to_bytes
from hashlib import sha1

# Connection to the MongoDB server on localhost:27017. It is opened when
# this module is loaded and shared by every function below.
connection = pymongo.Connection('localhost', 27017)

# Number of messages read so far; to_mongo() increments it once for every
# message it iterates over, across all the mbox files processed.
TOTALCNT = 0

def convert_date(date_string):
    """ Convert the string of the date to a datetime object in UTC.

    Everything from the first '(' onwards (for example a trailing
    "(UTC)" comment) is dropped before the string is handed to
    dateutil's parser.

    date_string -- the textual date, e.g. the value of a Date header.

    Returns a timezone-aware datetime expressed in UTC.
    Raises whatever dateutil's parse() raises for an unparseable string.
    """
    # `tz` is not imported at the top of the file; importing it here keeps
    # this function self-contained.
    from dateutil import tz

    utc = tz.tzutc()
    date_string = date_string.split('(')[0].strip()
    dt = parse(date_string)
    if dt.tzinfo is None:
        # No UTC offset in the string: astimezone() refuses naive datetimes
        # on Python 2 (and silently assumes local time on Python 3), so the
        # date is assumed to already be expressed in UTC.
        dt = dt.replace(tzinfo=utc)
    return dt.astimezone(utc)


def _strip_angle_brackets(value):
    """ Remove every '<' and '>' character from a message-id string. """
    return value.replace('<', '').replace('>', '')


def _new_thread_id(message_id):
    """ Return the base32-encoded SHA-1 digest of a message id. """
    return b32encode(sha1(message_id).digest())


def _parent_message_id(infos):
    """ Return the id of the message `infos` replies to, or None.

    The first entry of the References header is preferred; In-Reply-To is
    used when References is missing or empty.
    """
    references = infos.get('References', '').split()
    if references:
        parent = references[0]
    else:
        parent = infos.get('InReplyTo', '').strip()
    return _strip_angle_brackets(parent) or None


def _thread_id_for(db, infos):
    """ Return the ThreadID to store for the message described by `infos`.

    The ThreadID of the referenced message is reused when that message is
    already stored with one; otherwise the message starts its own thread.
    """
    parent = _parent_message_id(infos)
    if parent is not None:
        res = db.mails.find_one({'MessageID': parent})
        if res and 'ThreadID' in res:
            return res['ThreadID']
    return _new_thread_id(infos['MessageID'])


def to_mongo(mbfile, database):
    """ Upload all the emails in a mbox file into a mongo database.

    Every header of a message is stored under its name with the dashes
    removed ('Message-ID' becomes 'MessageID'), the payload is stored as
    'Content' and a 'ThreadID' is computed. Messages whose MessageID is
    already present in the `mails` collection are not inserted again.
    Any error raised while handling one message is printed and the import
    carries on with the next message.

    mbfile -- path of the mbox file to read.
    database -- name of the mongo database holding the `mails` collection.

    Nothing is returned; the number of emails read and added is printed.
    """
    global TOTALCNT
    db = connection[database]
    # The indexes do not depend on the message being processed, so they are
    # set up once per file rather than once per inserted email.
    db.mails.create_index('MessageID')
    db.mails.create_index('ThreadID')
    cnt = 0
    cnt_read = 0
    for message in mailbox.mbox(mbfile):
        cnt_read += 1
        TOTALCNT += 1
        infos = {}
        ## TODO: We need to catch-up Subjects/From which are of a specific
        ## encoding.
        for it in message.keys():
            infos[it.replace('-', '')] = message[it]
        keys = list(infos)
        ## There seem to be a problem to parse some messages
        if not keys:
            print('  Failed: %s keys: "%s"' % (mbfile, keys))
            continue
        if 'MessageID' in infos:
            infos['MessageID'] = _strip_angle_brackets(infos['MessageID'])
        if 'From' in infos:
            # Split a 'user at example.com (Real Name)' sender into the
            # name (kept in 'From') and the address (stored in 'Email').
            match = re.match(r'(.*)\((.*)\)', infos['From'])
            if match:
                email, name = match.groups()
                infos['From'] = name
                infos['Email'] = email.replace(' at ', '@').strip()
        try:
            # A message without a Message-ID header raises KeyError here and
            # is reported through the generic failure path below.
            if db.mails.find({'MessageID': infos['MessageID']}).count() != 0:
                # Already in the database, nothing to do.
                continue
            infos['Date'] = convert_date(infos['Date'])
            infos['Content'] = message.get_payload()
            try:
                # Dry-run the encoding so that a bad payload is reported
                # with an explicit message.
                bson.BSON.encode({'content': infos['Content']})
            except InvalidStringData:
                ## TODO: Do something about this encoding issue
                raise InvalidStringData('Email has invalid content')
            infos['ThreadID'] = _thread_id_for(db, infos)
            ## TODO: storing a 'Category' (Question/Agenda, guessed from the
            ## Subject) and the full message ('Full') is currently disabled.
            ## TODO: I'm not sure the TOTALCNT approach is the right one
            ## we should discuss this with the pipermail guys
            ## (a 'LegacyID' set to TOTALCNT is currently disabled).
            db.mails.insert(infos)
            cnt += 1
        except Exception as err:
            print('  Failed: %s error: "%s"' % (mbfile, err))
            print('  Failed: %s %s %s' % (
                message['Subject'], message['Date'], message['From']))
    print('  %s email read' % cnt_read)
    print('  %s email added to the database' % cnt)

def get_document_size(database):
    """ Print the number of emails stored in the `mails` collection.

    database -- name of the mongo database to inspect.
    """
    stored = connection[database].mails.count()
    print('  %s emails are stored into the database' % stored)


if __name__ == '__main__':
    #sys.argv.extend(['devel', 'lists/devel-2012-03-March.txt'])
    # Both a database name and at least one mbox file are required, i.e.
    # at least three entries in sys.argv including the script name.
    if len(sys.argv) < 3 or '-h' in sys.argv or '--help' in sys.argv:
        print('''USAGE:
python mbox_to_mongo.py db_name mbox_file [mbox_file]''')
    else:
        db_name = sys.argv[1]
        print('Adding to database: %s' % db_name)
        for mbfile in sys.argv[2:]:
            print(mbfile)
            if not os.path.exists(mbfile):
                # Report the problem instead of skipping the file silently.
                print('  Skipped: %s does not exist' % mbfile)
                continue
            to_mongo(mbfile, db_name)
            get_document_size(db_name)

"""
## Test command-line:
$ mongo
use fedora-devel
db.mails.find()
db.mails.count()
"""