summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Aurélien Bompard <aurelien@bompard.org> 2012-08-17 15:57:41 +0200
committer Aurélien Bompard <aurelien@bompard.org> 2012-09-07 10:40:54 +0200
commit 4051981f32379b673c5ef2c50962d05570b91204 (patch)
tree 50beecf8edc41d6814fe3ba5f6345a87c4143ce5
parent f2c53282e819fe935911466dd873034ebcde9d02 (diff)
download kittystore-4051981f32379b673c5ef2c50962d05570b91204.tar.gz
kittystore-4051981f32379b673c5ef2c50962d05570b91204.tar.xz
kittystore-4051981f32379b673c5ef2c50962d05570b91204.zip
Fix some bugs in the import code
and add unit tests for these cases.
-rw-r--r-- kittystore/sa/__init__.py 366
-rw-r--r-- kittystore/sa/store.py 391
-rw-r--r-- kittystore/test/__init__.py 8
-rw-r--r-- kittystore/test/test_sa_store.py 28
-rw-r--r-- kittystore/test/test_utils.py 19
-rw-r--r-- kittystore/test/testdata/non-ascii-payload.txt 8
-rw-r--r-- kittystore/test/testdata/strange-in-reply-to-header.txt 10
-rw-r--r-- kittystore/utils.py 16
-rw-r--r-- setup.py 1
-rw-r--r-- to_sqldb.py 7
10 files changed, 482 insertions, 372 deletions
diff --git a/kittystore/sa/__init__.py b/kittystore/sa/__init__.py
index bf79467..3d228d0 100644
--- a/kittystore/sa/__init__.py
+++ b/kittystore/sa/__init__.py
@@ -15,368 +15,4 @@ See http://www.gnu.org/copyleft/gpl.html for the full text of the
license.
"""
-import datetime
-
-from kittystore.utils import get_message_id_hash, parseaddr, parsedate
-from kittystore.utils import get_ref_and_thread_id
-from kittystore.sa.kittysamodel import get_class_object
-
-from zope.interface import implements
-from mailman.interfaces.messages import IMessageStore
-
-from sqlalchemy import create_engine, distinct, MetaData, and_, desc, or_
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.orm.exc import NoResultFound
-
-
-def list_to_table_name(list_name):
- """ For a given fully qualified list name, return the table name.
- What the method does is to transform the special characters from the
- list name to underscore ('_') and append the 'KS_' prefix in front.
- (KS stands for KittyStore).
-
- Characters replaced: -.@
-
- :arg list_name, the fully qualified list name to be transformed to
- the table name.
- """
- for char in ['-', '.', '@']:
- list_name = list_name.replace(char, '_')
- return 'HK_%s' % list_name
-
-
-class KittySAStore(object):
- """
- SQL-Alchemy powered interface to query emails from the database.
- """
-
- implements(IMessageStore)
-
- def __init__(self, url, debug=False):
- """ Constructor.
- Create the session using the engine defined in the url.
-
- :arg url, URL used to connect to the database. The URL contains
- information with regards to the database engine, the host to connect
- to, the user and password and the database name.
- ie: <engine>://<user>:<password>@<host>/<dbname>
- ie: mysql://mm3_user:mm3_password@localhost/mm3
- :kwarg debug, a boolean to set the debug mode on or off.
- """
- connect_args = {}
- if url.startswith('sqlite://'):
- connect_args["check_same_thread"] = False
- self.engine = create_engine(url, echo=debug, connect_args=connect_args)
- self.metadata = MetaData(self.engine)
- session = sessionmaker(bind=self.engine)
- self.session = session()
-
- def add(self, message):
- """Add the message to the store.
-
- :param message: An email.message.Message instance containing at
- least a unique Message-ID header. The message will be given
- an X-Message-ID-Hash header, overriding any existing such
- header.
- :returns: The calculated X-Message-ID-Hash header.
- :raises ValueError: if the message is missing a Message-ID
- header.
- The storage service is also allowed to raise this exception
- if it find, but disallows collisions.
- """
-
- def add_to_list(self, list_name, message):
- """Add the message to a specific list of the store.
-
- :param list_name: The fully qualified list name to which the
- message should be added.
- :param message: An email.message.Message instance containing at
- least a unique Message-ID header. The message will be given
- an X-Message-ID-Hash header, overriding any existing such
- header.
- :returns: The calculated X-Message-ID-Hash header.
- :raises ValueError: if the message is missing a Message-ID
- header.
- The storage service is also allowed to raise this exception
- if it find, but disallows collisions.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- MetaData(self.engine), create=True)
- if not message.has_key("Message-Id"):
- raise ValueError("No 'Message-Id' header in email", message)
- msg_id = message['Message-Id'].strip("<>")
- msg_id_hash = get_message_id_hash(msg_id)
- if self.get_message_by_id_from_list(list_name, msg_id) is not None:
- print ("Duplicate email from %s: %s" %
- (message['From'], message.get('Subject', '""')))
- return msg_id_hash
-
- # Find thread id
- ref, thread_id = get_ref_and_thread_id(message, list_name, self)
- if thread_id is None:
- # make up the thread_id if not found
- thread_id = msg_id_hash
-
- from_name, from_email = parseaddr(message['From'])
-
- #category = 'Question' # TODO: enum + i18n ?
- #if ('agenda' in message.get('Subject', '').lower() or
- # 'reminder' in message.get('Subject', '').lower()):
- # # i18n!
- # category = 'Agenda'
-
- mail = email(
- sender=from_name,
- email=from_email,
- subject=message.get('Subject'),
- content=message.get_payload(),
- date=parsedate(message.get("Date")),
- message_id=msg_id,
- stable_url_id=msg_id_hash,
- thread_id=thread_id,
- references=ref,
- full=message.as_string(),
- )
- mail.save(self.session)
- return msg_id_hash
-
- def delete_message(self, message_id):
- """Remove the given message from the store.
-
- :param message: The Message-ID of the mesage to delete from the
- store.
- :raises LookupError: if there is no such message.
- """
-
- def delete_message_from_list(self, list_name, message_id):
- """Remove the given message for a specific list from the store.
-
- :param list_name: The fully qualified list name to which the
- message should be added.
- :param message: The Message-ID of the mesage to delete from the
- store.
- :raises LookupError: if there is no such message.
- """
-
- def get_list_size(self, list_name):
- """ Return the number of emails stored for a given mailing list.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- return self.session.query(email).count()
-
-
- def get_message_by_hash(self, message_id_hash):
- """Return the message with the matching X-Message-ID-Hash.
-
- :param message_id_hash: The X-Message-ID-Hash header contents to
- search for.
- :returns: The message, or None if no matching message was found.
- """
-
- def get_message_by_hash_from_list(self, list_name, message_id_hash):
- """Return the message with the matching X-Message-ID-Hash.
-
- :param message_id_hash: The X-Message-ID-Hash header contents to
- search for.
- :returns: The message, or None if no matching message was found.
- """
-
- def get_message_by_id(self, message_id):
- """Return the message with a matching Message-ID.
-
- :param message_id: The Message-ID header contents to search for.
- :returns: The message, or None if no matching message was found.
- """
-
- def get_message_by_id_from_list(self, list_name, message_id):
- """Return the message with a matching Message-ID.
-
- :param list_name: The fully qualified list name to which the
- message should be added.
- :param message_id: The Message-ID header contents to search for.
- :returns: The message, or None if no matching message was found.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- mail = None
- try:
- mail = self.session.query(email).filter_by(
- message_id=message_id).one()
- except NoResultFound:
- pass
- return mail
-
- def search_list_for_content(self, list_name, keyword):
- """ Returns a list of email containing the specified keyword in
- their content.
-
- :param list_name: name of the mailing list in which this email
- should be searched.
- :param keyword: keyword to search in the content of the emails.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- mails = self.session.query(email).filter(
- email.content.ilike('%{0}%'.format(keyword))
- ).order_by(email.date).all()
- mails.reverse() # TODO: change the SQL order above
- return mails
-
- def search_list_for_content_subject(self, list_name, keyword):
- """ Returns a list of email containing the specified keyword in
- their content or their subject.
-
- :param list_name: name of the mailing list in which this email
- should be searched.
- :param keyword: keyword to search in the content or subject of
- the emails.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- mails = self.session.query(email).filter(or_(
- email.content.ilike('%{0}%'.format(keyword)),
- email.subject.ilike('%{0}%'.format(keyword))
- )).order_by(email.date).all()
- mails.reverse() # TODO: change the SQL order above
- return mails
-
- def search_list_for_sender(self, list_name, keyword):
- """ Returns a list of email containing the specified keyword in
- the name or email address of the sender of the email.
-
- :param list_name: name of the mailing list in which this email
- should be searched.
- :param keyword: keyword to search in the database.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- mails = self.session.query(email).filter(or_(
- email.sender.ilike('%{0}%'.format(keyword)),
- email.email.ilike('%{0}%'.format(keyword))
- )).order_by(email.date).all()
- mails.reverse() # TODO: change the SQL order above
- return mails
-
-
- def search_list_for_subject(self, list_name, keyword):
- """ Returns a list of email containing the specified keyword in
- their subject.
-
- :param list_name: name of the mailing list in which this email
- should be searched.
- :param keyword: keyword to search in the subject of the emails.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- mails = self.session.query(email).filter(
- email.subject.ilike('%{0}%'.format(keyword))
- ).order_by(email.date).all()
- mails.reverse() # TODO: change the SQL order above
- return mails
-
- @property
- def messages(self):
- """An iterator over all messages in this message store."""
- raise NotImplementedError
-
-
-
-
-
-
- def get_archives(self, list_name, start, end):
- """ Return all the thread started emails between two given dates.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg start, a datetime object representing the starting date of
- the interval to query.
- :arg end, a datetime object representing the ending date of
- the interval to query.
- """
- # Beginning of thread == No 'References' header
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- mails = self.session.query(email).filter(
- and_(
- email.date >= start,
- email.date <= end,
- email.references == None)
- ).order_by(email.date).all()
- mails.reverse()
- return mails
-
- def get_archives_length(self, list_name):
- """ Return a dictionnary of years, months for which there are
- potentially archives available for a given list (based on the
- oldest post on the list).
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- """
- archives = {}
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- entry = self.session.query(email).order_by(
- email.date).limit(1).all()[0]
- now = datetime.datetime.now()
- year = entry.date.year
- month = entry.date.month
- while year < now.year:
- archives[year] = range(1, 13)[(month -1):]
- year = year + 1
- month = 1
- archives[now.year] = range(1, 13)[:now.month]
- return archives
-
- def get_thread(self, list_name, thread_id):
- """ Return all the emails present in a thread. This thread
- is uniquely identified by its thread_id.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg thread_id, thread_id as used in the web-pages.
- Used here to uniquely identify the thread in the database.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- mail = None
- try:
- mail = self.session.query(email).filter_by(
- thread_id=thread_id).order_by(email.date).all()
- except NoResultFound:
- pass
- return mail
-
- def get_thread_length(self, list_name, thread_id):
- """ Return the number of email present in a thread. This thread
- is uniquely identified by its thread_id.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg thread_id, unique identifier of the thread as specified in
- the database.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- return self.session.query(email).filter_by(
- thread_id=thread_id).count()
-
- def get_thread_participants(self, list_name, thread_id):
- """ Return the list of participant in a thread. This thread
- is uniquely identified by its thread_id.
-
- :arg list_name, name of the mailing list in which this email
- should be searched.
- :arg thread_id, unique identifier of the thread as specified in
- the database.
- """
- email = get_class_object(list_to_table_name(list_name), 'email',
- self.metadata)
- return self.session.query(distinct(email.sender)).filter(
- email.thread_id == thread_id).all()
+from kittystore.sa.store import KittySAStore
diff --git a/kittystore/sa/store.py b/kittystore/sa/store.py
new file mode 100644
index 0000000..c0755df
--- /dev/null
+++ b/kittystore/sa/store.py
@@ -0,0 +1,391 @@
+# -*- coding: utf-8 -*-
+
+"""
+KittySAStore - an object mapper and interface to a SQL database
+ representation of emails for mailman 3.
+
+Copyright (C) 2012 Pierre-Yves Chibon
+Author: Pierre-Yves Chibon <pingou@pingoured.fr>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+See http://www.gnu.org/copyleft/gpl.html for the full text of the
+license.
+"""
+
+import datetime
+
+from kittystore.utils import get_message_id_hash, parseaddr, parsedate
+from kittystore.utils import get_ref_and_thread_id
+from kittystore.sa.kittysamodel import get_class_object
+
+from zope.interface import implements
+from mailman.interfaces.messages import IMessageStore
+
+from sqlalchemy import create_engine, distinct, MetaData, and_, desc, or_
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.orm.exc import NoResultFound
+
+
+def list_to_table_name(list_name):
+ """ For a given fully qualified list name, return the table name.
+ What the method does is to transform the special characters from the
+ list name to underscore ('_') and prepend the 'HK_' prefix.
+ (Earlier documentation named a 'KS_' prefix, KS standing for KittyStore.)
+
+ Characters replaced: -.@
+
+ :arg list_name, the fully qualified list name to be transformed to
+ the table name.
+ """
+ for char in ['-', '.', '@']:
+ list_name = list_name.replace(char, '_')
+ return 'HK_%s' % list_name
+
+
+class KittySAStore(object):
+ """
+ SQL-Alchemy powered interface to query emails from the database.
+ """
+
+ implements(IMessageStore)
+
+ def __init__(self, url, debug=False):
+ """ Constructor.
+ Create the session using the engine defined in the url.
+
+ :arg url, URL used to connect to the database. The URL contains
+ information with regards to the database engine, the host to connect
+ to, the user and password and the database name.
+ ie: <engine>://<user>:<password>@<host>/<dbname>
+ ie: mysql://mm3_user:mm3_password@localhost/mm3
+ :kwarg debug, a boolean to set the debug mode on or off.
+ """
+ connect_args = {}
+ if url.startswith('sqlite://'):
+ connect_args["check_same_thread"] = False
+ self.engine = create_engine(url, echo=debug, connect_args=connect_args)
+ self.metadata = MetaData(self.engine)
+ session = sessionmaker(bind=self.engine)
+ self.session = session()
+
+ def add(self, message):
+ """Add the message to the store.
+
+ :param message: An email.message.Message instance containing at
+ least a unique Message-ID header. The message will be given
+ an X-Message-ID-Hash header, overriding any existing such
+ header.
+ :returns: The calculated X-Message-ID-Hash header.
+ :raises ValueError: if the message is missing a Message-ID
+ header.
+ The storage service is also allowed to raise this exception
+ if it finds, but disallows, collisions.
+ """
+
+ def add_to_list(self, list_name, message):
+ """Add the message to a specific list of the store.
+
+ :param list_name: The fully qualified list name to which the
+ message should be added.
+ :param message: An email.message.Message instance containing at
+ least a unique Message-ID header. The message will be given
+ an X-Message-ID-Hash header, overriding any existing such
+ header.
+ :returns: The calculated X-Message-ID-Hash header.
+ :raises ValueError: if the message is missing a Message-ID
+ header.
+ The storage service is also allowed to raise this exception
+ if it finds, but disallows, collisions.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ MetaData(self.engine), create=True)
+ if not message.has_key("Message-Id"):
+ raise ValueError("No 'Message-Id' header in email", message)
+ msg_id = message['Message-Id'].strip("<>")
+ msg_id_hash = get_message_id_hash(msg_id)
+ if self.get_message_by_id_from_list(list_name, msg_id) is not None:
+ print ("Duplicate email from %s: %s" %
+ (message['From'], message.get('Subject', '""')))
+ return msg_id_hash
+
+ # Find thread id
+ ref, thread_id = get_ref_and_thread_id(message, list_name, self)
+ if thread_id is None:
+ # make up the thread_id if not found
+ thread_id = msg_id_hash
+
+ from_name, from_email = parseaddr(message['From'])
+
+ # Turn non-ascii into Unicode, assuming UTF-8
+ for part in message.walk():
+ if part.get_content_charset() is None:
+ try:
+ unicode(part.get_payload())
+ except UnicodeDecodeError:
+ # Try UTF-8
+ part.set_charset("utf-8")
+
+ #category = 'Question' # TODO: enum + i18n ?
+ #if ('agenda' in message.get('Subject', '').lower() or
+ # 'reminder' in message.get('Subject', '').lower()):
+ # # i18n!
+ # category = 'Agenda'
+
+ mail = email(
+ sender=from_name,
+ email=from_email,
+ subject=message.get('Subject'),
+ content=message.get_payload(),
+ date=parsedate(message.get("Date")),
+ message_id=msg_id,
+ stable_url_id=msg_id_hash,
+ thread_id=thread_id,
+ references=ref,
+ full=message.as_string(),
+ )
+ mail.save(self.session)
+ return msg_id_hash
+
+ def delete_message(self, message_id):
+ """Remove the given message from the store.
+
+ :param message_id: The Message-ID of the message to delete from the
+ store.
+ :raises LookupError: if there is no such message.
+ """
+
+ def delete_message_from_list(self, list_name, message_id):
+ """Remove the given message for a specific list from the store.
+
+ :param list_name: The fully qualified list name from which the
+ message should be removed.
+ :param message_id: The Message-ID of the message to delete from the
+ store.
+ :raises LookupError: if there is no such message.
+ """
+
+ def get_list_size(self, list_name):
+ """ Return the number of emails stored for a given mailing list.
+
+ :arg list_name, name of the mailing list in which this email
+ should be searched.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ return self.session.query(email).count()
+
+
+ def get_message_by_hash(self, message_id_hash):
+ """Return the message with the matching X-Message-ID-Hash.
+
+ :param message_id_hash: The X-Message-ID-Hash header contents to
+ search for.
+ :returns: The message, or None if no matching message was found.
+ """
+
+ def get_message_by_hash_from_list(self, list_name, message_id_hash):
+ """Return the message with the matching X-Message-ID-Hash.
+
+ :param message_id_hash: The X-Message-ID-Hash header contents to
+ search for.
+ :returns: The message, or None if no matching message was found.
+ """
+
+ def get_message_by_id(self, message_id):
+ """Return the message with a matching Message-ID.
+
+ :param message_id: The Message-ID header contents to search for.
+ :returns: The message, or None if no matching message was found.
+ """
+
+ def get_message_by_id_from_list(self, list_name, message_id):
+ """Return the message with a matching Message-ID.
+
+ :param list_name: The fully qualified list name in which the
+ message should be searched.
+ :param message_id: The Message-ID header contents to search for.
+ :returns: The message, or None if no matching message was found.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ mail = None
+ try:
+ mail = self.session.query(email).filter_by(
+ message_id=message_id).one()
+ except NoResultFound:
+ pass
+ return mail
+
+ def search_list_for_content(self, list_name, keyword):
+ """ Returns a list of emails containing the specified keyword in
+ their content.
+
+ :param list_name: name of the mailing list in which this email
+ should be searched.
+ :param keyword: keyword to search in the content of the emails.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ mails = self.session.query(email).filter(
+ email.content.ilike('%{0}%'.format(keyword))
+ ).order_by(email.date).all()
+ mails.reverse() # TODO: change the SQL order above
+ return mails
+
+ def search_list_for_content_subject(self, list_name, keyword):
+ """ Returns a list of emails containing the specified keyword in
+ their content or their subject.
+
+ :param list_name: name of the mailing list in which this email
+ should be searched.
+ :param keyword: keyword to search in the content or subject of
+ the emails.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ mails = self.session.query(email).filter(or_(
+ email.content.ilike('%{0}%'.format(keyword)),
+ email.subject.ilike('%{0}%'.format(keyword))
+ )).order_by(email.date).all()
+ mails.reverse() # TODO: change the SQL order above
+ return mails
+
+ def search_list_for_sender(self, list_name, keyword):
+ """ Returns a list of emails containing the specified keyword in
+ the name or email address of the sender of the email.
+
+ :param list_name: name of the mailing list in which this email
+ should be searched.
+ :param keyword: keyword to search in the database.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ mails = self.session.query(email).filter(or_(
+ email.sender.ilike('%{0}%'.format(keyword)),
+ email.email.ilike('%{0}%'.format(keyword))
+ )).order_by(email.date).all()
+ mails.reverse() # TODO: change the SQL order above
+ return mails
+
+
+ def search_list_for_subject(self, list_name, keyword):
+ """ Returns a list of emails containing the specified keyword in
+ their subject.
+
+ :param list_name: name of the mailing list in which this email
+ should be searched.
+ :param keyword: keyword to search in the subject of the emails.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ mails = self.session.query(email).filter(
+ email.subject.ilike('%{0}%'.format(keyword))
+ ).order_by(email.date).all()
+ mails.reverse() # TODO: change the SQL order above
+ return mails
+
+ @property
+ def messages(self):
+ """An iterator over all messages in this message store."""
+ raise NotImplementedError
+
+
+
+
+
+
+ def get_archives(self, list_name, start, end):
+ """ Return all the thread-starting emails between two given dates.
+
+ :arg list_name, name of the mailing list in which this email
+ should be searched.
+ :arg start, a datetime object representing the starting date of
+ the interval to query.
+ :arg end, a datetime object representing the ending date of
+ the interval to query.
+ """
+ # Beginning of thread == No 'References' header
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ mails = self.session.query(email).filter(
+ and_(
+ email.date >= start,
+ email.date <= end,
+ email.references == None)
+ ).order_by(email.date).all()
+ mails.reverse()
+ return mails
+
+ def get_archives_length(self, list_name):
+ """ Return a dictionary mapping years to the lists of months for which
+ there are potentially archives available for a given list (based on
+ the oldest post on the list).
+
+ :arg list_name, name of the mailing list in which this email
+ should be searched.
+ """
+ archives = {}
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ entry = self.session.query(email).order_by(
+ email.date).limit(1).all()[0]
+ now = datetime.datetime.now()
+ year = entry.date.year
+ month = entry.date.month
+ while year < now.year:
+ archives[year] = range(1, 13)[(month -1):]
+ year = year + 1
+ month = 1
+ archives[now.year] = range(1, 13)[:now.month]
+ return archives
+
+ def get_thread(self, list_name, thread_id):
+ """ Return all the emails present in a thread. This thread
+ is uniquely identified by its thread_id.
+
+ :arg list_name, name of the mailing list in which this email
+ should be searched.
+ :arg thread_id, thread_id as used in the web-pages.
+ Used here to uniquely identify the thread in the database.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ mail = None
+ try:
+ mail = self.session.query(email).filter_by(
+ thread_id=thread_id).order_by(email.date).all()
+ except NoResultFound:
+ pass
+ return mail
+
+ def get_thread_length(self, list_name, thread_id):
+ """ Return the number of emails present in a thread. This thread
+ is uniquely identified by its thread_id.
+
+ :arg list_name, name of the mailing list in which this email
+ should be searched.
+ :arg thread_id, unique identifier of the thread as specified in
+ the database.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ return self.session.query(email).filter_by(
+ thread_id=thread_id).count()
+
+ def get_thread_participants(self, list_name, thread_id):
+ """ Return the list of participants in a thread. This thread
+ is uniquely identified by its thread_id.
+
+ :arg list_name, name of the mailing list in which this email
+ should be searched.
+ :arg thread_id, unique identifier of the thread as specified in
+ the database.
+ """
+ email = get_class_object(list_to_table_name(list_name), 'email',
+ self.metadata)
+ return self.session.query(distinct(email.sender)).filter(
+ email.thread_id == thread_id).all()
diff --git a/kittystore/test/__init__.py b/kittystore/test/__init__.py
new file mode 100644
index 0000000..9a10e1f
--- /dev/null
+++ b/kittystore/test/__init__.py
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+
+import os
+
+
+def get_test_file(*fileparts):
+ return os.path.join(os.path.dirname(__file__), "testdata", *fileparts)
+get_test_file.__test__ = False
diff --git a/kittystore/test/test_sa_store.py b/kittystore/test/test_sa_store.py
new file mode 100644
index 0000000..01a1a4a
--- /dev/null
+++ b/kittystore/test/test_sa_store.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import email
+from mock import Mock
+
+from kittystore.sa import KittySAStore
+from sqlalchemy.exc import ProgrammingError
+from kittystore.test import get_test_file
+
+class TestSAStore(unittest.TestCase):
+
+ def setUp(self):
+ self.store = KittySAStore("sqlite:///:memory:")
+
+ def tearDown(self):
+ self.store.session.close()
+
+ def test_non_ascii_payload(self):
+ """add_to_list must handle non-ascii messages"""
+ with open(get_test_file("non-ascii-payload.txt")) as email_file:
+ msg = email.message_from_file(email_file)
+ self.store.add_to_list("example-list", msg)
+ try:
+ self.store.session.flush()
+ except ProgrammingError, e:
+ self.fail(e)
+
diff --git a/kittystore/test/test_utils.py b/kittystore/test/test_utils.py
new file mode 100644
index 0000000..c4ceec4
--- /dev/null
+++ b/kittystore/test/test_utils.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import email
+from mock import Mock
+
+import kittystore.utils
+from kittystore.test import get_test_file
+
+class TestUtils(unittest.TestCase):
+
+ def test_ref_parsing(self):
+ with open(get_test_file("strange-in-reply-to-header.txt")) as email_file:
+ msg = email.message_from_file(email_file)
+ store = Mock()
+ store.get_message_by_id_from_list.return_value = None
+ ref_id, thread_id = kittystore.utils.get_ref_and_thread_id(
+ msg, "example-list", store)
+ self.assertEqual(ref_id, "200704070053.46646.other.person@example.com")
diff --git a/kittystore/test/testdata/non-ascii-payload.txt b/kittystore/test/testdata/non-ascii-payload.txt
new file mode 100644
index 0000000..d8106eb
--- /dev/null
+++ b/kittystore/test/testdata/non-ascii-payload.txt
@@ -0,0 +1,8 @@
+From test at example.com Fri Apr 6 22:43:55 2007
+From: test at example.com (Dummy Person)
+Date: Fri, 6 Apr 2007 15:43:55 -0700 (PDT)
+Subject: Dummy subject
+Message-ID: <20070406224355.899B9180064@test.example.com>
+
+This message contains non-ascii characters:
+é è ç à î ï ë €
diff --git a/kittystore/test/testdata/strange-in-reply-to-header.txt b/kittystore/test/testdata/strange-in-reply-to-header.txt
new file mode 100644
index 0000000..ead532b
--- /dev/null
+++ b/kittystore/test/testdata/strange-in-reply-to-header.txt
@@ -0,0 +1,10 @@
+From test at example.com Fri Apr 6 22:43:55 2007
+From: test at example.com (Dummy Person)
+Date: Fri, 6 Apr 2007 15:43:55 -0700 (PDT)
+Subject: Dummy subject
+In-Reply-To: Other person's message of Saturday, 7 April 2007 00:53:46 +0300 <200704070053.46646.other.person@example.com>
+Message-ID: <20070406224355.899B9180064@test.example.com>
+
+> Other person's message
+
+Dummy person's reply
diff --git a/kittystore/utils.py b/kittystore/utils.py
index 05247b8..e1d7fd0 100644
--- a/kittystore/utils.py
+++ b/kittystore/utils.py
@@ -16,6 +16,7 @@ license.
import email.utils
import time
+import re
from datetime import datetime, tzinfo
from base64 import b32encode
from hashlib import sha1
@@ -28,6 +29,9 @@ __all__ = ("get_message_id_hash", "parseaddr", "parsedate",
)
+IN_BRACKETS_RE = re.compile("[^<]*<([^>]+)>.*")
+
+
def get_message_id_hash(msg_id):
"""
Returns the X-Message-ID-Hash header for the provided Message-ID header.
@@ -68,13 +72,13 @@ def get_ref_and_thread_id(message, list_name, store):
and not message.has_key("In-Reply-To")):
return None, None
# It's a reply, use the thread_id from the parent email
- ref = message.get("References")
- if ref is not None:
+ ref_id = message.get("References")
+ if ref_id is not None:
# There can be multiple references, use the first one
- ref = ref.split()[0].strip()
+ ref_id = ref_id.split()[0].strip()
else:
- ref = message.get("In-Reply-To")
- ref = ref.strip("<>")
+ ref_id = message.get("In-Reply-To")
+ ref_id = IN_BRACKETS_RE.match(ref_id).group(1)
# It's a reply, use the thread_id from the parent email
ref_msg = store.get_message_by_id_from_list(list_name, ref_id)
if ref_msg is None:
@@ -82,5 +86,5 @@ def get_ref_and_thread_id(message, list_name, store):
else:
# re-use parent's thread-id
thread_id = ref_msg.thread_id
- return ref, thread_id
+ return ref_id, thread_id
diff --git a/setup.py b/setup.py
index 076fb33..1a051e0 100644
--- a/setup.py
+++ b/setup.py
@@ -19,5 +19,6 @@ setup(
'zope.interface',
'SQLAlchemy==0.7.8',
'python-dateutil < 2.0' # 2.0+ is for Python 3
+ 'mock',
],
)
diff --git a/to_sqldb.py b/to_sqldb.py
index 1d6576a..4afa8af 100644
--- a/to_sqldb.py
+++ b/to_sqldb.py
@@ -44,7 +44,12 @@ def to_db(mbfile, list_name, store):
cnt_read = cnt_read + 1
#print cnt_read
TOTALCNT = TOTALCNT + 1
- msg_id_hash = store.add_to_list(list_name, message)
+ try:
+ msg_id_hash = store.add_to_list(list_name, message)
+ except ValueError, e:
+ print "%s from %s about %s" % (e.args[0],
+ e.args[1].get("From"), e.args[1].get("Subject"))
+ continue
store.session.flush()
cnt = cnt + 1
store.session.commit()