diff options
author | Aurélien Bompard <aurelien@bompard.org> | 2012-08-17 15:57:41 +0200 |
---|---|---|
committer | Aurélien Bompard <aurelien@bompard.org> | 2012-09-07 10:40:54 +0200 |
commit | 4051981f32379b673c5ef2c50962d05570b91204 (patch) | |
tree | 50beecf8edc41d6814fe3ba5f6345a87c4143ce5 | |
parent | f2c53282e819fe935911466dd873034ebcde9d02 (diff) | |
download | kittystore-4051981f32379b673c5ef2c50962d05570b91204.tar.gz kittystore-4051981f32379b673c5ef2c50962d05570b91204.tar.xz kittystore-4051981f32379b673c5ef2c50962d05570b91204.zip |
Fix some bugs in the import code
and add unit tests for these cases.
-rw-r--r-- | kittystore/sa/__init__.py | 366 | ||||
-rw-r--r-- | kittystore/sa/store.py | 391 | ||||
-rw-r--r-- | kittystore/test/__init__.py | 8 | ||||
-rw-r--r-- | kittystore/test/test_sa_store.py | 28 | ||||
-rw-r--r-- | kittystore/test/test_utils.py | 19 | ||||
-rw-r--r-- | kittystore/test/testdata/non-ascii-payload.txt | 8 | ||||
-rw-r--r-- | kittystore/test/testdata/strange-in-reply-to-header.txt | 10 | ||||
-rw-r--r-- | kittystore/utils.py | 16 | ||||
-rw-r--r-- | setup.py | 1 | ||||
-rw-r--r-- | to_sqldb.py | 7 |
10 files changed, 482 insertions, 372 deletions
diff --git a/kittystore/sa/__init__.py b/kittystore/sa/__init__.py index bf79467..3d228d0 100644 --- a/kittystore/sa/__init__.py +++ b/kittystore/sa/__init__.py @@ -15,368 +15,4 @@ See http://www.gnu.org/copyleft/gpl.html for the full text of the license. """ -import datetime - -from kittystore.utils import get_message_id_hash, parseaddr, parsedate -from kittystore.utils import get_ref_and_thread_id -from kittystore.sa.kittysamodel import get_class_object - -from zope.interface import implements -from mailman.interfaces.messages import IMessageStore - -from sqlalchemy import create_engine, distinct, MetaData, and_, desc, or_ -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker -from sqlalchemy.orm.exc import NoResultFound - - -def list_to_table_name(list_name): - """ For a given fully qualified list name, return the table name. - What the method does is to transform the special characters from the - list name to underscore ('_') and append the 'KS_' prefix in front. - (KS stands for KittyStore). - - Characters replaced: -.@ - - :arg list_name, the fully qualified list name to be transformed to - the table name. - """ - for char in ['-', '.', '@']: - list_name = list_name.replace(char, '_') - return 'HK_%s' % list_name - - -class KittySAStore(object): - """ - SQL-Alchemy powered interface to query emails from the database. - """ - - implements(IMessageStore) - - def __init__(self, url, debug=False): - """ Constructor. - Create the session using the engine defined in the url. - - :arg url, URL used to connect to the database. The URL contains - information with regards to the database engine, the host to connect - to, the user and password and the database name. - ie: <engine>://<user>:<password>@<host>/<dbname> - ie: mysql://mm3_user:mm3_password@localhost/mm3 - :kwarg debug, a boolean to set the debug mode on or off. 
- """ - connect_args = {} - if url.startswith('sqlite://'): - connect_args["check_same_thread"] = False - self.engine = create_engine(url, echo=debug, connect_args=connect_args) - self.metadata = MetaData(self.engine) - session = sessionmaker(bind=self.engine) - self.session = session() - - def add(self, message): - """Add the message to the store. - - :param message: An email.message.Message instance containing at - least a unique Message-ID header. The message will be given - an X-Message-ID-Hash header, overriding any existing such - header. - :returns: The calculated X-Message-ID-Hash header. - :raises ValueError: if the message is missing a Message-ID - header. - The storage service is also allowed to raise this exception - if it find, but disallows collisions. - """ - - def add_to_list(self, list_name, message): - """Add the message to a specific list of the store. - - :param list_name: The fully qualified list name to which the - message should be added. - :param message: An email.message.Message instance containing at - least a unique Message-ID header. The message will be given - an X-Message-ID-Hash header, overriding any existing such - header. - :returns: The calculated X-Message-ID-Hash header. - :raises ValueError: if the message is missing a Message-ID - header. - The storage service is also allowed to raise this exception - if it find, but disallows collisions. 
- """ - email = get_class_object(list_to_table_name(list_name), 'email', - MetaData(self.engine), create=True) - if not message.has_key("Message-Id"): - raise ValueError("No 'Message-Id' header in email", message) - msg_id = message['Message-Id'].strip("<>") - msg_id_hash = get_message_id_hash(msg_id) - if self.get_message_by_id_from_list(list_name, msg_id) is not None: - print ("Duplicate email from %s: %s" % - (message['From'], message.get('Subject', '""'))) - return msg_id_hash - - # Find thread id - ref, thread_id = get_ref_and_thread_id(message, list_name, self) - if thread_id is None: - # make up the thread_id if not found - thread_id = msg_id_hash - - from_name, from_email = parseaddr(message['From']) - - #category = 'Question' # TODO: enum + i18n ? - #if ('agenda' in message.get('Subject', '').lower() or - # 'reminder' in message.get('Subject', '').lower()): - # # i18n! - # category = 'Agenda' - - mail = email( - sender=from_name, - email=from_email, - subject=message.get('Subject'), - content=message.get_payload(), - date=parsedate(message.get("Date")), - message_id=msg_id, - stable_url_id=msg_id_hash, - thread_id=thread_id, - references=ref, - full=message.as_string(), - ) - mail.save(self.session) - return msg_id_hash - - def delete_message(self, message_id): - """Remove the given message from the store. - - :param message: The Message-ID of the mesage to delete from the - store. - :raises LookupError: if there is no such message. - """ - - def delete_message_from_list(self, list_name, message_id): - """Remove the given message for a specific list from the store. - - :param list_name: The fully qualified list name to which the - message should be added. - :param message: The Message-ID of the mesage to delete from the - store. - :raises LookupError: if there is no such message. - """ - - def get_list_size(self, list_name): - """ Return the number of emails stored for a given mailing list. 
- - :arg list_name, name of the mailing list in which this email - should be searched. - """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - return self.session.query(email).count() - - - def get_message_by_hash(self, message_id_hash): - """Return the message with the matching X-Message-ID-Hash. - - :param message_id_hash: The X-Message-ID-Hash header contents to - search for. - :returns: The message, or None if no matching message was found. - """ - - def get_message_by_hash_from_list(self, list_name, message_id_hash): - """Return the message with the matching X-Message-ID-Hash. - - :param message_id_hash: The X-Message-ID-Hash header contents to - search for. - :returns: The message, or None if no matching message was found. - """ - - def get_message_by_id(self, message_id): - """Return the message with a matching Message-ID. - - :param message_id: The Message-ID header contents to search for. - :returns: The message, or None if no matching message was found. - """ - - def get_message_by_id_from_list(self, list_name, message_id): - """Return the message with a matching Message-ID. - - :param list_name: The fully qualified list name to which the - message should be added. - :param message_id: The Message-ID header contents to search for. - :returns: The message, or None if no matching message was found. - """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - mail = None - try: - mail = self.session.query(email).filter_by( - message_id=message_id).one() - except NoResultFound: - pass - return mail - - def search_list_for_content(self, list_name, keyword): - """ Returns a list of email containing the specified keyword in - their content. - - :param list_name: name of the mailing list in which this email - should be searched. - :param keyword: keyword to search in the content of the emails. 
- """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - mails = self.session.query(email).filter( - email.content.ilike('%{0}%'.format(keyword)) - ).order_by(email.date).all() - mails.reverse() # TODO: change the SQL order above - return mails - - def search_list_for_content_subject(self, list_name, keyword): - """ Returns a list of email containing the specified keyword in - their content or their subject. - - :param list_name: name of the mailing list in which this email - should be searched. - :param keyword: keyword to search in the content or subject of - the emails. - """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - mails = self.session.query(email).filter(or_( - email.content.ilike('%{0}%'.format(keyword)), - email.subject.ilike('%{0}%'.format(keyword)) - )).order_by(email.date).all() - mails.reverse() # TODO: change the SQL order above - return mails - - def search_list_for_sender(self, list_name, keyword): - """ Returns a list of email containing the specified keyword in - the name or email address of the sender of the email. - - :param list_name: name of the mailing list in which this email - should be searched. - :param keyword: keyword to search in the database. - """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - mails = self.session.query(email).filter(or_( - email.sender.ilike('%{0}%'.format(keyword)), - email.email.ilike('%{0}%'.format(keyword)) - )).order_by(email.date).all() - mails.reverse() # TODO: change the SQL order above - return mails - - - def search_list_for_subject(self, list_name, keyword): - """ Returns a list of email containing the specified keyword in - their subject. - - :param list_name: name of the mailing list in which this email - should be searched. - :param keyword: keyword to search in the subject of the emails. 
- """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - mails = self.session.query(email).filter( - email.subject.ilike('%{0}%'.format(keyword)) - ).order_by(email.date).all() - mails.reverse() # TODO: change the SQL order above - return mails - - @property - def messages(self): - """An iterator over all messages in this message store.""" - raise NotImplementedError - - - - - - - def get_archives(self, list_name, start, end): - """ Return all the thread started emails between two given dates. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg start, a datetime object representing the starting date of - the interval to query. - :arg end, a datetime object representing the ending date of - the interval to query. - """ - # Beginning of thread == No 'References' header - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - mails = self.session.query(email).filter( - and_( - email.date >= start, - email.date <= end, - email.references == None) - ).order_by(email.date).all() - mails.reverse() - return mails - - def get_archives_length(self, list_name): - """ Return a dictionnary of years, months for which there are - potentially archives available for a given list (based on the - oldest post on the list). - - :arg list_name, name of the mailing list in which this email - should be searched. - """ - archives = {} - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - entry = self.session.query(email).order_by( - email.date).limit(1).all()[0] - now = datetime.datetime.now() - year = entry.date.year - month = entry.date.month - while year < now.year: - archives[year] = range(1, 13)[(month -1):] - year = year + 1 - month = 1 - archives[now.year] = range(1, 13)[:now.month] - return archives - - def get_thread(self, list_name, thread_id): - """ Return all the emails present in a thread. 
This thread - is uniquely identified by its thread_id. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg thread_id, thread_id as used in the web-pages. - Used here to uniquely identify the thread in the database. - """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - mail = None - try: - mail = self.session.query(email).filter_by( - thread_id=thread_id).order_by(email.date).all() - except NoResultFound: - pass - return mail - - def get_thread_length(self, list_name, thread_id): - """ Return the number of email present in a thread. This thread - is uniquely identified by its thread_id. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg thread_id, unique identifier of the thread as specified in - the database. - """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - return self.session.query(email).filter_by( - thread_id=thread_id).count() - - def get_thread_participants(self, list_name, thread_id): - """ Return the list of participant in a thread. This thread - is uniquely identified by its thread_id. - - :arg list_name, name of the mailing list in which this email - should be searched. - :arg thread_id, unique identifier of the thread as specified in - the database. - """ - email = get_class_object(list_to_table_name(list_name), 'email', - self.metadata) - return self.session.query(distinct(email.sender)).filter( - email.thread_id == thread_id).all() +from kittystore.sa.store import KittySAStore diff --git a/kittystore/sa/store.py b/kittystore/sa/store.py new file mode 100644 index 0000000..c0755df --- /dev/null +++ b/kittystore/sa/store.py @@ -0,0 +1,391 @@ +# -*- coding: utf-8 -*- + +""" +KittySAStore - an object mapper and interface to a SQL database + representation of emails for mailman 3. 
+ +Copyright (C) 2012 Pierre-Yves Chibon +Author: Pierre-Yves Chibon <pingou@pingoured.fr> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. +See http://www.gnu.org/copyleft/gpl.html for the full text of the +license. +""" + +import datetime + +from kittystore.utils import get_message_id_hash, parseaddr, parsedate +from kittystore.utils import get_ref_and_thread_id +from kittystore.sa.kittysamodel import get_class_object + +from zope.interface import implements +from mailman.interfaces.messages import IMessageStore + +from sqlalchemy import create_engine, distinct, MetaData, and_, desc, or_ +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from sqlalchemy.orm.exc import NoResultFound + + +def list_to_table_name(list_name): + """ For a given fully qualified list name, return the table name. + What the method does is to transform the special characters from the + list name to underscore ('_') and append the 'KS_' prefix in front. + (KS stands for KittyStore). + + Characters replaced: -.@ + + :arg list_name, the fully qualified list name to be transformed to + the table name. + """ + for char in ['-', '.', '@']: + list_name = list_name.replace(char, '_') + return 'HK_%s' % list_name + + +class KittySAStore(object): + """ + SQL-Alchemy powered interface to query emails from the database. + """ + + implements(IMessageStore) + + def __init__(self, url, debug=False): + """ Constructor. + Create the session using the engine defined in the url. + + :arg url, URL used to connect to the database. The URL contains + information with regards to the database engine, the host to connect + to, the user and password and the database name. 
+ ie: <engine>://<user>:<password>@<host>/<dbname> + ie: mysql://mm3_user:mm3_password@localhost/mm3 + :kwarg debug, a boolean to set the debug mode on or off. + """ + connect_args = {} + if url.startswith('sqlite://'): + connect_args["check_same_thread"] = False + self.engine = create_engine(url, echo=debug, connect_args=connect_args) + self.metadata = MetaData(self.engine) + session = sessionmaker(bind=self.engine) + self.session = session() + + def add(self, message): + """Add the message to the store. + + :param message: An email.message.Message instance containing at + least a unique Message-ID header. The message will be given + an X-Message-ID-Hash header, overriding any existing such + header. + :returns: The calculated X-Message-ID-Hash header. + :raises ValueError: if the message is missing a Message-ID + header. + The storage service is also allowed to raise this exception + if it find, but disallows collisions. + """ + + def add_to_list(self, list_name, message): + """Add the message to a specific list of the store. + + :param list_name: The fully qualified list name to which the + message should be added. + :param message: An email.message.Message instance containing at + least a unique Message-ID header. The message will be given + an X-Message-ID-Hash header, overriding any existing such + header. + :returns: The calculated X-Message-ID-Hash header. + :raises ValueError: if the message is missing a Message-ID + header. + The storage service is also allowed to raise this exception + if it find, but disallows collisions. 
+ """ + email = get_class_object(list_to_table_name(list_name), 'email', + MetaData(self.engine), create=True) + if not message.has_key("Message-Id"): + raise ValueError("No 'Message-Id' header in email", message) + msg_id = message['Message-Id'].strip("<>") + msg_id_hash = get_message_id_hash(msg_id) + if self.get_message_by_id_from_list(list_name, msg_id) is not None: + print ("Duplicate email from %s: %s" % + (message['From'], message.get('Subject', '""'))) + return msg_id_hash + + # Find thread id + ref, thread_id = get_ref_and_thread_id(message, list_name, self) + if thread_id is None: + # make up the thread_id if not found + thread_id = msg_id_hash + + from_name, from_email = parseaddr(message['From']) + + # Turn non-ascii into Unicode, assuming UTF-8 + for part in message.walk(): + if part.get_content_charset() is None: + try: + unicode(part.get_payload()) + except UnicodeDecodeError: + # Try UTF-8 + part.set_charset("utf-8") + + #category = 'Question' # TODO: enum + i18n ? + #if ('agenda' in message.get('Subject', '').lower() or + # 'reminder' in message.get('Subject', '').lower()): + # # i18n! + # category = 'Agenda' + + mail = email( + sender=from_name, + email=from_email, + subject=message.get('Subject'), + content=message.get_payload(), + date=parsedate(message.get("Date")), + message_id=msg_id, + stable_url_id=msg_id_hash, + thread_id=thread_id, + references=ref, + full=message.as_string(), + ) + mail.save(self.session) + return msg_id_hash + + def delete_message(self, message_id): + """Remove the given message from the store. + + :param message: The Message-ID of the mesage to delete from the + store. + :raises LookupError: if there is no such message. + """ + + def delete_message_from_list(self, list_name, message_id): + """Remove the given message for a specific list from the store. + + :param list_name: The fully qualified list name to which the + message should be added. + :param message: The Message-ID of the mesage to delete from the + store. 
+ :raises LookupError: if there is no such message. + """ + + def get_list_size(self, list_name): + """ Return the number of emails stored for a given mailing list. + + :arg list_name, name of the mailing list in which this email + should be searched. + """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + return self.session.query(email).count() + + + def get_message_by_hash(self, message_id_hash): + """Return the message with the matching X-Message-ID-Hash. + + :param message_id_hash: The X-Message-ID-Hash header contents to + search for. + :returns: The message, or None if no matching message was found. + """ + + def get_message_by_hash_from_list(self, list_name, message_id_hash): + """Return the message with the matching X-Message-ID-Hash. + + :param message_id_hash: The X-Message-ID-Hash header contents to + search for. + :returns: The message, or None if no matching message was found. + """ + + def get_message_by_id(self, message_id): + """Return the message with a matching Message-ID. + + :param message_id: The Message-ID header contents to search for. + :returns: The message, or None if no matching message was found. + """ + + def get_message_by_id_from_list(self, list_name, message_id): + """Return the message with a matching Message-ID. + + :param list_name: The fully qualified list name to which the + message should be added. + :param message_id: The Message-ID header contents to search for. + :returns: The message, or None if no matching message was found. + """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + mail = None + try: + mail = self.session.query(email).filter_by( + message_id=message_id).one() + except NoResultFound: + pass + return mail + + def search_list_for_content(self, list_name, keyword): + """ Returns a list of email containing the specified keyword in + their content. + + :param list_name: name of the mailing list in which this email + should be searched. 
+ :param keyword: keyword to search in the content of the emails. + """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + mails = self.session.query(email).filter( + email.content.ilike('%{0}%'.format(keyword)) + ).order_by(email.date).all() + mails.reverse() # TODO: change the SQL order above + return mails + + def search_list_for_content_subject(self, list_name, keyword): + """ Returns a list of email containing the specified keyword in + their content or their subject. + + :param list_name: name of the mailing list in which this email + should be searched. + :param keyword: keyword to search in the content or subject of + the emails. + """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + mails = self.session.query(email).filter(or_( + email.content.ilike('%{0}%'.format(keyword)), + email.subject.ilike('%{0}%'.format(keyword)) + )).order_by(email.date).all() + mails.reverse() # TODO: change the SQL order above + return mails + + def search_list_for_sender(self, list_name, keyword): + """ Returns a list of email containing the specified keyword in + the name or email address of the sender of the email. + + :param list_name: name of the mailing list in which this email + should be searched. + :param keyword: keyword to search in the database. + """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + mails = self.session.query(email).filter(or_( + email.sender.ilike('%{0}%'.format(keyword)), + email.email.ilike('%{0}%'.format(keyword)) + )).order_by(email.date).all() + mails.reverse() # TODO: change the SQL order above + return mails + + + def search_list_for_subject(self, list_name, keyword): + """ Returns a list of email containing the specified keyword in + their subject. + + :param list_name: name of the mailing list in which this email + should be searched. + :param keyword: keyword to search in the subject of the emails. 
+ """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + mails = self.session.query(email).filter( + email.subject.ilike('%{0}%'.format(keyword)) + ).order_by(email.date).all() + mails.reverse() # TODO: change the SQL order above + return mails + + @property + def messages(self): + """An iterator over all messages in this message store.""" + raise NotImplementedError + + + + + + + def get_archives(self, list_name, start, end): + """ Return all the thread started emails between two given dates. + + :arg list_name, name of the mailing list in which this email + should be searched. + :arg start, a datetime object representing the starting date of + the interval to query. + :arg end, a datetime object representing the ending date of + the interval to query. + """ + # Beginning of thread == No 'References' header + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + mails = self.session.query(email).filter( + and_( + email.date >= start, + email.date <= end, + email.references == None) + ).order_by(email.date).all() + mails.reverse() + return mails + + def get_archives_length(self, list_name): + """ Return a dictionnary of years, months for which there are + potentially archives available for a given list (based on the + oldest post on the list). + + :arg list_name, name of the mailing list in which this email + should be searched. + """ + archives = {} + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + entry = self.session.query(email).order_by( + email.date).limit(1).all()[0] + now = datetime.datetime.now() + year = entry.date.year + month = entry.date.month + while year < now.year: + archives[year] = range(1, 13)[(month -1):] + year = year + 1 + month = 1 + archives[now.year] = range(1, 13)[:now.month] + return archives + + def get_thread(self, list_name, thread_id): + """ Return all the emails present in a thread. 
This thread + is uniquely identified by its thread_id. + + :arg list_name, name of the mailing list in which this email + should be searched. + :arg thread_id, thread_id as used in the web-pages. + Used here to uniquely identify the thread in the database. + """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + mail = None + try: + mail = self.session.query(email).filter_by( + thread_id=thread_id).order_by(email.date).all() + except NoResultFound: + pass + return mail + + def get_thread_length(self, list_name, thread_id): + """ Return the number of email present in a thread. This thread + is uniquely identified by its thread_id. + + :arg list_name, name of the mailing list in which this email + should be searched. + :arg thread_id, unique identifier of the thread as specified in + the database. + """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + return self.session.query(email).filter_by( + thread_id=thread_id).count() + + def get_thread_participants(self, list_name, thread_id): + """ Return the list of participant in a thread. This thread + is uniquely identified by its thread_id. + + :arg list_name, name of the mailing list in which this email + should be searched. + :arg thread_id, unique identifier of the thread as specified in + the database. 
+ """ + email = get_class_object(list_to_table_name(list_name), 'email', + self.metadata) + return self.session.query(distinct(email.sender)).filter( + email.thread_id == thread_id).all() diff --git a/kittystore/test/__init__.py b/kittystore/test/__init__.py new file mode 100644 index 0000000..9a10e1f --- /dev/null +++ b/kittystore/test/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- + +import os + + +def get_test_file(*fileparts): + return os.path.join(os.path.dirname(__file__), "testdata", *fileparts) +get_test_file.__test__ = False diff --git a/kittystore/test/test_sa_store.py b/kittystore/test/test_sa_store.py new file mode 100644 index 0000000..01a1a4a --- /dev/null +++ b/kittystore/test/test_sa_store.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- + +import unittest +import email +from mock import Mock + +from kittystore.sa import KittySAStore +from sqlalchemy.exc import ProgrammingError +from kittystore.test import get_test_file + +class TestSAStore(unittest.TestCase): + + def setUp(self): + self.store = KittySAStore("sqlite:///:memory:") + + def tearDown(self): + self.store.session.close() + + def test_non_ascii_payload(self): + """add_to_list must handle non-ascii messages""" + with open(get_test_file("non-ascii-payload.txt")) as email_file: + msg = email.message_from_file(email_file) + self.store.add_to_list("example-list", msg) + try: + self.store.session.flush() + except ProgrammingError, e: + self.fail(e) + diff --git a/kittystore/test/test_utils.py b/kittystore/test/test_utils.py new file mode 100644 index 0000000..c4ceec4 --- /dev/null +++ b/kittystore/test/test_utils.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +import unittest +import email +from mock import Mock + +import kittystore.utils +from kittystore.test import get_test_file + +class TestUtils(unittest.TestCase): + + def test_ref_parsing(self): + with open(get_test_file("strange-in-reply-to-header.txt")) as email_file: + msg = email.message_from_file(email_file) + store = Mock() + 
store.get_message_by_id_from_list.return_value = None + ref_id, thread_id = kittystore.utils.get_ref_and_thread_id( + msg, "example-list", store) + self.assertEqual(ref_id, "200704070053.46646.other.person@example.com") diff --git a/kittystore/test/testdata/non-ascii-payload.txt b/kittystore/test/testdata/non-ascii-payload.txt new file mode 100644 index 0000000..d8106eb --- /dev/null +++ b/kittystore/test/testdata/non-ascii-payload.txt @@ -0,0 +1,8 @@ +From test at example.com Fri Apr 6 22:43:55 2007 +From: test at example.com (Dummy Person) +Date: Fri, 6 Apr 2007 15:43:55 -0700 (PDT) +Subject: Dummy subject +Message-ID: <20070406224355.899B9180064@test.example.com> + +This message contains non-ascii characters: +é è ç à î ï ë € diff --git a/kittystore/test/testdata/strange-in-reply-to-header.txt b/kittystore/test/testdata/strange-in-reply-to-header.txt new file mode 100644 index 0000000..ead532b --- /dev/null +++ b/kittystore/test/testdata/strange-in-reply-to-header.txt @@ -0,0 +1,10 @@ +From test at example.com Fri Apr 6 22:43:55 2007 +From: test at example.com (Dummy Person) +Date: Fri, 6 Apr 2007 15:43:55 -0700 (PDT) +Subject: Dummy subject +In-Reply-To: Other person's message of Saturday, 7 April 2007 00:53:46 +0300 <200704070053.46646.other.person@example.com> +Message-ID: <20070406224355.899B9180064@test.example.com> + +> Other person's message + +Dummy person's reply diff --git a/kittystore/utils.py b/kittystore/utils.py index 05247b8..e1d7fd0 100644 --- a/kittystore/utils.py +++ b/kittystore/utils.py @@ -16,6 +16,7 @@ license. import email.utils import time +import re from datetime import datetime, tzinfo from base64 import b32encode from hashlib import sha1 @@ -28,6 +29,9 @@ __all__ = ("get_message_id_hash", "parseaddr", "parsedate", ) +IN_BRACKETS_RE = re.compile("[^<]*<([^>]+)>.*") + + def get_message_id_hash(msg_id): """ Returns the X-Message-ID-Hash header for the provided Message-ID header. 
@@ -68,13 +72,13 @@ def get_ref_and_thread_id(message, list_name, store): and not message.has_key("In-Reply-To")): return None, None # It's a reply, use the thread_id from the parent email - ref = message.get("References") - if ref is not None: + ref_id = message.get("References") + if ref_id is not None: # There can be multiple references, use the first one - ref = ref.split()[0].strip() + ref_id = ref_id.split()[0].strip() else: - ref = message.get("In-Reply-To") - ref = ref.strip("<>") + ref_id = message.get("In-Reply-To") + ref_id = IN_BRACKETS_RE.match(ref_id).group(1) # It's a reply, use the thread_id from the parent email ref_msg = store.get_message_by_id_from_list(list_name, ref_id) if ref_msg is None: @@ -82,5 +86,5 @@ def get_ref_and_thread_id(message, list_name, store): else: # re-use parent's thread-id thread_id = ref_msg.thread_id - return ref, thread_id + return ref_id, thread_id diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -19,5 +19,6 @@ setup( 'zope.interface', 'SQLAlchemy==0.7.8', 'python-dateutil < 2.0' # 2.0+ is for Python 3 + 'mock', ], ) diff --git a/to_sqldb.py b/to_sqldb.py index 1d6576a..4afa8af 100644 --- a/to_sqldb.py +++ b/to_sqldb.py @@ -44,7 +44,12 @@ def to_db(mbfile, list_name, store): cnt_read = cnt_read + 1 #print cnt_read TOTALCNT = TOTALCNT + 1 - msg_id_hash = store.add_to_list(list_name, message) + try: + msg_id_hash = store.add_to_list(list_name, message) + except ValueError, e: + print "%s from %s about %s" % (e.args[0], + e.args[1].get("From"), e.args[1].get("Subject")) + continue store.session.flush() cnt = cnt + 1 store.session.commit() |