summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Aurélien Bompard <aurelien@bompard.org> 2012-10-25 10:30:54 +0200
committer Aurélien Bompard <aurelien@bompard.org> 2012-10-25 10:30:54 +0200
commit 72852d0bd2731897833abef47d82547257c95851 (patch)
tree 38a0ec825476dfa9cbeb37cb37f90c88e36d001a
parent 9764744e107556b008af0c782a23368488f4892c (diff)
download kittystore-72852d0bd2731897833abef47d82547257c95851.tar.gz
kittystore-72852d0bd2731897833abef47d82547257c95851.tar.xz
kittystore-72852d0bd2731897833abef47d82547257c95851.zip
Remove obsolete mongodb code
-rw-r--r-- README.rst 7
-rw-r--r-- kittystore/mongo/__init__.py 1
-rw-r--r-- kittystore/mongo/store.py 232
-rwxr-xr-x to_mongo.py 139
4 files changed, 2 insertions, 377 deletions
diff --git a/README.rst b/README.rst
index 9b3a6b4..912ef7c 100644
--- a/README.rst
+++ b/README.rst
@@ -17,9 +17,6 @@ Source: https://github.com/pypingou/kittystore
Dependencies:
-------------
- SQLAlchemy
-- bson
-- pymongo (for mongodb)
-
License:
@@ -33,8 +30,8 @@ Load the database:
------------------
- Retrieve the archives using the get_mbox.py script
-- Configure the to_sqldb.py or to_mongo.py script (adjust user/password/database name/host/port)
-- Load the archives by calling the to_sqldb.py or to_mongo.py script
+- Configure the to_sqldb.py script (adjust user/password/database name/host/port)
+- Load the archives by calling the to_sqldb.py script
(this might be memory intensive, so you may want to do 2 or 3 years per run and split
the runs)
diff --git a/kittystore/mongo/__init__.py b/kittystore/mongo/__init__.py
deleted file mode 100644
index 50dc1df..0000000
--- a/kittystore/mongo/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__test__ = {}
diff --git a/kittystore/mongo/store.py b/kittystore/mongo/store.py
deleted file mode 100644
index dbaefb6..0000000
--- a/kittystore/mongo/store.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
-KittyMGStore - an object mapper and interface to the mongo database
- representation of emails for mailman 3.
-
-Copyright (C) 2012 Pierre-Yves Chibon
-Author: Pierre-Yves Chibon <pingou@pingoured.fr>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or (at
-your option) any later version.
-See http://www.gnu.org/copyleft/gpl.html for the full text of the
-license.
-"""
-
-
-__test__ = {}
-try:
- import pymongo
-except ImportError:
- pass
-import re
-from datetime import datetime
-
-from zope.interface import implements
-from mailman.interfaces.messages import IMessageStore
-
-
-class KittyMGStore(object):
- """ Implementation of the store for a MongoDB backend. """
-
- implements(IMessageStore)
-
- def __init__(self, host='localhost', port=27017):
- """ Constructor.
- Create the session using the engine defined in the url.
-
- :arg host, hostname or IP of the database server. Defaults to
- 'localhost'
- :arg port, port of the database server. Defaults to '27017'
- :kwarg debug, a boolean to set the debug mode on or off.
- """
- self.connection = pymongo.Connection(host, port)
-
- def get_archives(self, list_name, start, end):
- """ Return all the thread started emails between two given dates.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg start, a datetime object representing the starting date of
- the interval to query.
- :arg end, a datetime object representing the ending date of
- the interval to query.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('Date')
- mongodb.mails.ensure_index('Date')
- mongodb.mails.create_index('References')
- mongodb.mails.ensure_index('References')
- # Beginning of thread == No 'References' header
- archives = []
- for email in mongodb.mails.find(
- {'References': {'$exists':False},
- 'InReplyTo': {'$exists':False},
- "Date": {"$gt": start, "$lt": end}},
- sort=[('Date', pymongo.DESCENDING)]):
- archives.append(email)
- return archives
-
- def get_archives_length(self, list_name):
- """ Return a dictionnary of years, months for which there are
- potentially archives available for a given list (based on the
- oldest post on the list).
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('Date')
- mongodb.mails.ensure_index('Date')
- archives = {}
- entry = mongodb.mails.find_one(sort=[('Date', pymongo.ASCENDING)])
- date = entry['Date']
- now = datetime.now()
- year = date.year
- month = date.month
- while year < now.year:
- archives[year] = range(1, 13)[(month -1):]
- year = year + 1
- month = 1
- archives[now.year] = range(1, 13)[:now.month]
- return archives
-
- def get_email(self, list_name, message_id):
- """ Return an Email object found in the database corresponding
- to the Message-ID provided.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg message_id, Message-ID as found in the headers of the email.
- Used here to uniquely identify the email present in the database.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('MessageID')
- mongodb.mails.ensure_index('MessageID')
- return mongodb.mails.find_one({'MessageID': message_id})
-
- def get_list_size(self, list_name):
- """ Return the number of emails stored for a given mailing list.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- """
- mongodb = self.connection[list_name]
- return mongodb.mails.count()
-
- def get_thread_length(self, list_name, thread_id):
- """ Return the number of email present in a thread. This thread
- is uniquely identified by its thread_id.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg thread_id, unique identifier of the thread as specified in
- the database.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('ThreadID')
- mongodb.mails.ensure_index('ThreadID')
- return mongodb.mails.find({'ThreadID': thread_id}).count()
-
- def get_thread_participants(self, list_name, thread_id):
- """ Return the list of participant in a thread. This thread
- is uniquely identified by its thread_id.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg thread_id, unique identifier of the thread as specified in
- the database.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('ThreadID')
- mongodb.mails.ensure_index('ThreadID')
- authors = set()
- for mail in mongodb.mails.find({'ThreadID': thread_id}):
- authors.add(mail['From'])
- return authors
-
- def search_content(self, list_name, keyword):
- """ Returns a list of email containing the specified keyword in
- their content.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg keyword, keyword to search in the content of the emails.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('Date')
- mongodb.mails.ensure_index('Date')
- mongodb.mails.create_index('Content')
- mongodb.mails.ensure_index('Content')
- regex = '.*%s.*' % keyword
- query_string = {'Content': re.compile(regex, re.IGNORECASE)}
- return list(mongodb.mails.find(query_string, sort=[('Date',
- pymongo.DESCENDING)]))
-
- def search_content_subject(self, list_name, keyword):
- """ Returns a list of email containing the specified keyword in
- their content or their subject.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg keyword, keyword to search in the content or subject of
- the emails.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('Date')
- mongodb.mails.ensure_index('Date')
- mongodb.mails.create_index('Content')
- mongodb.mails.ensure_index('Content')
- mongodb.mails.create_index('Subject')
- mongodb.mails.ensure_index('Subject')
- regex = '.*%s.*' % keyword
- query_string = {'$or' : [
- {'Content': re.compile(regex, re.IGNORECASE)},
- {'Subject': re.compile(regex, re.IGNORECASE)}
- ]}
- return list(mongodb.mails.find(query_string, sort=[('Date',
- pymongo.DESCENDING)]))
-
- def search_sender(self, list_name, keyword):
- """ Returns a list of email containing the specified keyword in
- the name or email address of the sender of the email.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg keyword, keyword to search in the database.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('Date')
- mongodb.mails.ensure_index('Date')
- mongodb.mails.create_index('From')
- mongodb.mails.ensure_index('From')
- mongodb.mails.create_index('Email')
- mongodb.mails.ensure_index('Email')
- regex = '.*%s.*' % keyword
- query_string = {'$or' : [
- {'From': re.compile(regex, re.IGNORECASE)},
- {'Email': re.compile(regex, re.IGNORECASE)}
- ]}
- return list(mongodb.mails.find(query_string, sort=[('Date',
- pymongo.DESCENDING)]))
-
-
- def search_subject(self, list_name, keyword):
- """ Returns a list of email containing the specified keyword in
- their subject.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg keyword, keyword to search in the subject of the emails.
- """
- mongodb = self.connection[list_name]
- mongodb.mails.create_index('Date')
- mongodb.mails.ensure_index('Date')
- mongodb.mails.create_index('Subject')
- mongodb.mails.ensure_index('Subject')
- regex = '.*%s.*' % keyword
- query_string = {'Subject': re.compile(regex, re.IGNORECASE)}
- return list(mongodb.mails.find(query_string, sort=[('Date',
- pymongo.DESCENDING)]))
diff --git a/to_mongo.py b/to_mongo.py
deleted file mode 100755
index 379dfaf..0000000
--- a/to_mongo.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/python -tt
-
-# Import the content of a mbox file into mongodb
-
-import bson
-from bson.errors import InvalidStringData
-import datetime
-import mailbox
-import os
-import pymongo
-import re
-import sys
-import time
-from base64 import b32encode
-from dateutil.parser import parse
-from kitchen.text.converters import to_bytes
-from hashlib import sha1
-
-connection = pymongo.Connection('localhost', 27017)
-
-TOTALCNT = 0
-
-def convert_date(date_string):
- """ Convert the string of the date to a datetime object. """
- date_string = date_string.split('(')[0].strip()
- dt = parse(date_string)
- return dt.astimezone(tz.tzutc())
-
-
-def to_mongo(mbfile, database):
- """ Upload all the emails in a mbox file into a mongo database. """
- global TOTALCNT
- db = connection[database]
- cnt = 0
- cnt_read = 0
- for message in mailbox.mbox(mbfile):
- cnt_read = cnt_read + 1
- TOTALCNT = TOTALCNT + 1
- infos = {}
- ## TODO: We need to catch-up Subjects/From which are of a specific
- ## encoding.
- for it in message.keys():
- it2 = it.replace('-', '')
- infos[it2] = message[it]
- keys = infos.keys()
- ## There seem to be a problem to parse some messages
- if not keys:
- print ' Failed: %s keys: "%s"' % (mbfile, keys)
- #print message
- continue
- if 'MessageID' in infos:
- infos['MessageID'] = infos['MessageID'].replace('<', ''
- ).replace('>', '')
- if 'From' in infos:
- regex = '(.*)\((.*)\)'
- match = re.match(regex, infos['From'])
- if match:
- email, name = match.groups()
- infos['From'] = name
- email = email.replace(' at ', '@')
- infos['Email'] = email.strip()
- try:
- if db.mails.find({'MessageID': infos['MessageID']}).count() == 0:
- infos['Date'] = convert_date(infos['Date'])
- infos['Content'] = message.get_payload()
- try:
- bson.BSON.encode({'content' : infos['Content']})
- except InvalidStringData:
- ## TODO: Do something about this encoding issue
- raise InvalidStringData('Email has invalid content')
- thread_id = 0
- db.mails.create_index('MessageID')
- db.mails.ensure_index('MessageID')
- db.mails.create_index('ThreadID')
- db.mails.ensure_index('ThreadID')
- if not 'References' in infos and not 'InReplyTo' in infos:
- infos['ThreadID'] = b32encode(sha1(infos['MessageID']).digest())
- else:
- ref = None
- if 'References' in infos:
- ref= infos['References'].split()[0].strip()
- else:
- ref= infos['InReplyTo']
- ref = ref.replace('<', '').replace('>', '')
- res = db.mails.find_one({'MessageID': ref})
- if res and 'ThreadID' in res:
- infos['ThreadID'] = res['ThreadID']
- else:
- infos['ThreadID'] = b32encode(sha1(infos['MessageID']).digest())
-# infos['Category'] = 'Question'
-# if 'agenda' in infos['Subject'].lower():
-# infos['Category'] = 'Agenda'
-# if 'reminder' in infos['Subject'].lower():
-# infos['Category'] = 'Agenda'
-# infos['Full'] = message.as_string()
-# try:
-# bson.BSON.encode({'content' : infos['Full']})
-# except InvalidStringData:
-# ## TODO: Do something about this encoding issue
-# raise InvalidStringData('Email has invalid full version')
-
- ## TODO: I'm not sure the TOTALCNT approach is the right one
- ## we should discuss this with the pipermail guys
-# infos['LegacyID'] = TOTALCNT
- db.mails.insert(infos)
- cnt = cnt + 1
- except Exception, err:
- print ' Failed: %s error: "%s"' % (mbfile, err)
- print ' Failed:', message['Subject'], message['Date'], message['From']
- print ' %s email read' % cnt_read
- print ' %s email added to the database' % cnt
-
-def get_document_size(database):
- """ Return the size of the document in mongodb. """
- db = connection[database]
- print ' %s emails are stored into the database' % db.mails.count()
-
-
-if __name__ == '__main__':
- #sys.argv.extend(['devel', 'lists/devel-2012-03-March.txt'])
- if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv:
- print '''USAGE:
-python mbox_to_mongo.py db_name mbox_file [mbox_file]'''
- else:
- print 'Adding to database: %s' % sys.argv[1]
- for mbfile in sys.argv[2:]:
- print mbfile
- if os.path.exists(mbfile):
- print mbfile
- to_mongo(mbfile, sys.argv[1])
- get_document_size(sys.argv[1])
-
-"""
-## Test command-line:
-$ mongo
-use fedora-devel
-db.mails.find()
-db.mails.count()
-"""