diff options
author | Aurélien Bompard <aurelien@bompard.org> | 2012-10-25 10:30:54 +0200 |
---|---|---|
committer | Aurélien Bompard <aurelien@bompard.org> | 2012-10-25 10:30:54 +0200 |
commit | 72852d0bd2731897833abef47d82547257c95851 (patch) | |
tree | 38a0ec825476dfa9cbeb37cb37f90c88e36d001a | |
parent | 9764744e107556b008af0c782a23368488f4892c (diff) | |
download | kittystore-72852d0bd2731897833abef47d82547257c95851.tar.gz kittystore-72852d0bd2731897833abef47d82547257c95851.tar.xz kittystore-72852d0bd2731897833abef47d82547257c95851.zip |
Remove obsolete mongodb code
-rw-r--r-- | README.rst | 7 | ||||
-rw-r--r-- | kittystore/mongo/__init__.py | 1 | ||||
-rw-r--r-- | kittystore/mongo/store.py | 232 | ||||
-rwxr-xr-x | to_mongo.py | 139 |
4 files changed, 2 insertions, 377 deletions
@@ -17,9 +17,6 @@ Source: https://github.com/pypingou/kittystore Dependencies: ------------- - SQLAlchemy -- bson -- pymongo (for mongodb) - License: @@ -33,8 +30,8 @@ Load the database: ------------------ - Retrieve the archives using the get_mbox.py script -- Configure the to_sqldb.py or to_mongo.py script (adjust user/password/database name/host/port) -- Load the archives by calling the to_sqldb.py or to_mongo.py script +- Configure the to_sqldb.py script (adjust user/password/database name/host/port) +- Load the archives by calling the to_sqldb.py script (this might be memory intensive, so you may want to do 2 or 3 years per run and split the runs) diff --git a/kittystore/mongo/__init__.py b/kittystore/mongo/__init__.py deleted file mode 100644 index 50dc1df..0000000 --- a/kittystore/mongo/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__test__ = {} diff --git a/kittystore/mongo/store.py b/kittystore/mongo/store.py deleted file mode 100644 index dbaefb6..0000000 --- a/kittystore/mongo/store.py +++ /dev/null @@ -1,232 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -KittyMGStore - an object mapper and interface to the mongo database - representation of emails for mailman 3. - -Copyright (C) 2012 Pierre-Yves Chibon -Author: Pierre-Yves Chibon <pingou@pingoured.fr> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at -your option) any later version. -See http://www.gnu.org/copyleft/gpl.html for the full text of the -license. -""" - - -__test__ = {} -try: - import pymongo -except ImportError: - pass -import re -from datetime import datetime - -from zope.interface import implements -from mailman.interfaces.messages import IMessageStore - - -class KittyMGStore(object): - """ Implementation of the store for a MongoDB backend. """ - - implements(IMessageStore) - - def __init__(self, host='localhost', port=27017): - """ Constructor. - Create the session using the engine defined in the url. - - :arg host, hostname or IP of the database server. Defaults to - 'localhost' - :arg port, port of the database server. Defaults to '27017' - :kwarg debug, a boolean to set the debug mode on or off. - """ - self.connection = pymongo.Connection(host, port) - - def get_archives(self, list_name, start, end): - """ Return all the thread started emails between two given dates. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg start, a datetime object representing the starting date of - the interval to query. - :arg end, a datetime object representing the ending date of - the interval to query. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('Date') - mongodb.mails.ensure_index('Date') - mongodb.mails.create_index('References') - mongodb.mails.ensure_index('References') - # Beginning of thread == No 'References' header - archives = [] - for email in mongodb.mails.find( - {'References': {'$exists':False}, - 'InReplyTo': {'$exists':False}, - "Date": {"$gt": start, "$lt": end}}, - sort=[('Date', pymongo.DESCENDING)]): - archives.append(email) - return archives - - def get_archives_length(self, list_name): - """ Return a dictionnary of years, months for which there are - potentially archives available for a given list (based on the - oldest post on the list). - - :arg list_name, name of the mailing list in which this email - should be searched. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('Date') - mongodb.mails.ensure_index('Date') - archives = {} - entry = mongodb.mails.find_one(sort=[('Date', pymongo.ASCENDING)]) - date = entry['Date'] - now = datetime.now() - year = date.year - month = date.month - while year < now.year: - archives[year] = range(1, 13)[(month -1):] - year = year + 1 - month = 1 - archives[now.year] = range(1, 13)[:now.month] - return archives - - def get_email(self, list_name, message_id): - """ Return an Email object found in the database corresponding - to the Message-ID provided. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg message_id, Message-ID as found in the headers of the email. - Used here to uniquely identify the email present in the database. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('MessageID') - mongodb.mails.ensure_index('MessageID') - return mongodb.mails.find_one({'MessageID': message_id}) - - def get_list_size(self, list_name): - """ Return the number of emails stored for a given mailing list. - - :arg list_name, name of the mailing list in which this email - should be searched. - """ - mongodb = self.connection[list_name] - return mongodb.mails.count() - - def get_thread_length(self, list_name, thread_id): - """ Return the number of email present in a thread. This thread - is uniquely identified by its thread_id. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg thread_id, unique identifier of the thread as specified in - the database. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('ThreadID') - mongodb.mails.ensure_index('ThreadID') - return mongodb.mails.find({'ThreadID': thread_id}).count() - - def get_thread_participants(self, list_name, thread_id): - """ Return the list of participant in a thread. This thread - is uniquely identified by its thread_id. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg thread_id, unique identifier of the thread as specified in - the database. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('ThreadID') - mongodb.mails.ensure_index('ThreadID') - authors = set() - for mail in mongodb.mails.find({'ThreadID': thread_id}): - authors.add(mail['From']) - return authors - - def search_content(self, list_name, keyword): - """ Returns a list of email containing the specified keyword in - their content. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg keyword, keyword to search in the content of the emails. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('Date') - mongodb.mails.ensure_index('Date') - mongodb.mails.create_index('Content') - mongodb.mails.ensure_index('Content') - regex = '.*%s.*' % keyword - query_string = {'Content': re.compile(regex, re.IGNORECASE)} - return list(mongodb.mails.find(query_string, sort=[('Date', - pymongo.DESCENDING)])) - - def search_content_subject(self, list_name, keyword): - """ Returns a list of email containing the specified keyword in - their content or their subject. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg keyword, keyword to search in the content or subject of - the emails. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('Date') - mongodb.mails.ensure_index('Date') - mongodb.mails.create_index('Content') - mongodb.mails.ensure_index('Content') - mongodb.mails.create_index('Subject') - mongodb.mails.ensure_index('Subject') - regex = '.*%s.*' % keyword - query_string = {'$or' : [ - {'Content': re.compile(regex, re.IGNORECASE)}, - {'Subject': re.compile(regex, re.IGNORECASE)} - ]} - return list(mongodb.mails.find(query_string, sort=[('Date', - pymongo.DESCENDING)])) - - def search_sender(self, list_name, keyword): - """ Returns a list of email containing the specified keyword in - the name or email address of the sender of the email. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg keyword, keyword to search in the database. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('Date') - mongodb.mails.ensure_index('Date') - mongodb.mails.create_index('From') - mongodb.mails.ensure_index('From') - mongodb.mails.create_index('Email') - mongodb.mails.ensure_index('Email') - regex = '.*%s.*' % keyword - query_string = {'$or' : [ - {'From': re.compile(regex, re.IGNORECASE)}, - {'Email': re.compile(regex, re.IGNORECASE)} - ]} - return list(mongodb.mails.find(query_string, sort=[('Date', - pymongo.DESCENDING)])) - - - def search_subject(self, list_name, keyword): - """ Returns a list of email containing the specified keyword in - their subject. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg keyword, keyword to search in the subject of the emails. - """ - mongodb = self.connection[list_name] - mongodb.mails.create_index('Date') - mongodb.mails.ensure_index('Date') - mongodb.mails.create_index('Subject') - mongodb.mails.ensure_index('Subject') - regex = '.*%s.*' % keyword - query_string = {'Subject': re.compile(regex, re.IGNORECASE)} - return list(mongodb.mails.find(query_string, sort=[('Date', - pymongo.DESCENDING)])) diff --git a/to_mongo.py b/to_mongo.py deleted file mode 100755 index 379dfaf..0000000 --- a/to_mongo.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/python -tt - -# Import the content of a mbox file into mongodb - -import bson -from bson.errors import InvalidStringData -import datetime -import mailbox -import os -import pymongo -import re -import sys -import time -from base64 import b32encode -from dateutil.parser import parse -from kitchen.text.converters import to_bytes -from hashlib import sha1 - -connection = pymongo.Connection('localhost', 27017) - -TOTALCNT = 0 - -def convert_date(date_string): - """ Convert the string of the date to a datetime object. """ - date_string = date_string.split('(')[0].strip() - dt = parse(date_string) - return dt.astimezone(tz.tzutc()) - - -def to_mongo(mbfile, database): - """ Upload all the emails in a mbox file into a mongo database. """ - global TOTALCNT - db = connection[database] - cnt = 0 - cnt_read = 0 - for message in mailbox.mbox(mbfile): - cnt_read = cnt_read + 1 - TOTALCNT = TOTALCNT + 1 - infos = {} - ## TODO: We need to catch-up Subjects/From which are of a specific - ## encoding. - for it in message.keys(): - it2 = it.replace('-', '') - infos[it2] = message[it] - keys = infos.keys() - ## There seem to be a problem to parse some messages - if not keys: - print ' Failed: %s keys: "%s"' % (mbfile, keys) - #print message - continue - if 'MessageID' in infos: - infos['MessageID'] = infos['MessageID'].replace('<', '' - ).replace('>', '') - if 'From' in infos: - regex = '(.*)\((.*)\)' - match = re.match(regex, infos['From']) - if match: - email, name = match.groups() - infos['From'] = name - email = email.replace(' at ', '@') - infos['Email'] = email.strip() - try: - if db.mails.find({'MessageID': infos['MessageID']}).count() == 0: - infos['Date'] = convert_date(infos['Date']) - infos['Content'] = message.get_payload() - try: - bson.BSON.encode({'content' : infos['Content']}) - except InvalidStringData: - ## TODO: Do something about this encoding issue - raise InvalidStringData('Email has invalid content') - thread_id = 0 - db.mails.create_index('MessageID') - db.mails.ensure_index('MessageID') - db.mails.create_index('ThreadID') - db.mails.ensure_index('ThreadID') - if not 'References' in infos and not 'InReplyTo' in infos: - infos['ThreadID'] = b32encode(sha1(infos['MessageID']).digest()) - else: - ref = None - if 'References' in infos: - ref= infos['References'].split()[0].strip() - else: - ref= infos['InReplyTo'] - ref = ref.replace('<', '').replace('>', '') - res = db.mails.find_one({'MessageID': ref}) - if res and 'ThreadID' in res: - infos['ThreadID'] = res['ThreadID'] - else: - infos['ThreadID'] = b32encode(sha1(infos['MessageID']).digest()) -# infos['Category'] = 'Question' -# if 'agenda' in infos['Subject'].lower(): -# infos['Category'] = 'Agenda' -# if 'reminder' in infos['Subject'].lower(): -# infos['Category'] = 'Agenda' -# infos['Full'] = message.as_string() -# try: -# bson.BSON.encode({'content' : infos['Full']}) -# except InvalidStringData: -# ## TODO: Do something about this encoding issue -# raise InvalidStringData('Email has invalid full version') - - ## TODO: I'm not sure the TOTALCNT approach is the right one - ## we should discuss this with the pipermail guys -# infos['LegacyID'] = TOTALCNT - db.mails.insert(infos) - cnt = cnt + 1 - except Exception, err: - print ' Failed: %s error: "%s"' % (mbfile, err) - print ' Failed:', message['Subject'], message['Date'], message['From'] - print ' %s email read' % cnt_read - print ' %s email added to the database' % cnt - -def get_document_size(database): - """ Return the size of the document in mongodb. """ - db = connection[database] - print ' %s emails are stored into the database' % db.mails.count() - - -if __name__ == '__main__': - #sys.argv.extend(['devel', 'lists/devel-2012-03-March.txt']) - if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv: - print '''USAGE: -python mbox_to_mongo.py db_name mbox_file [mbox_file]''' - else: - print 'Adding to database: %s' % sys.argv[1] - for mbfile in sys.argv[2:]: - print mbfile - if os.path.exists(mbfile): - print mbfile - to_mongo(mbfile, sys.argv[1]) - get_document_size(sys.argv[1]) - -""" -## Test command-line: -$ mongo -use fedora-devel -db.mails.find() -db.mails.count() -""" |