From 6a1eaf953aedfb2de06ef43119741d612a6f779b Mon Sep 17 00:00:00 2001 From: Aurélien Bompard Date: Fri, 9 Nov 2012 13:51:18 +0100 Subject: Add a thread object in the model New object in the model: the Thread. It starts by having an active_date, corresponding to the date of the last email in the thread. Later on, it will also hold the category, tags, maybe ratings, etc. A schema migration is provided, upgrade should be automatic. --- kittystore/storm/model.py | 71 +++++++++++++++++++++- kittystore/storm/schema/__init__.py | 30 ++++++++-- kittystore/storm/schema/patch_2.py | 62 ++++++++++++++++++++ kittystore/storm/store.py | 114 ++++++++++++++++-------------------- 4 files changed, 208 insertions(+), 69 deletions(-) create mode 100644 kittystore/storm/schema/patch_2.py diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py index e87a1fd..d87be6d 100644 --- a/kittystore/storm/model.py +++ b/kittystore/storm/model.py @@ -15,7 +15,8 @@ license. import datetime from zope.interface import implements -from storm.locals import Unicode, RawStr, Int, ReferenceSet +from storm.locals import Unicode, RawStr, Int, ReferenceSet, Reference +from storm.expr import Desc from mailman.interfaces.messages import IMessage from kittystore.utils import get_message_id_hash @@ -94,6 +95,71 @@ class Attachment(object): encoding = Unicode() size = Int() content = RawStr() + # reference to the email + email = Reference((list_name, message_id), + (Email.list_name, Email.message_id)) + + +class Thread(object): + """ + A thread of archived email, from a mailing-list. It is identified by both + the list name and the thread id. + """ + + __storm_table__ = "thread" + __storm_primary__ = "list_name", "thread_id" + + list_name = Unicode() + thread_id = Unicode() + date_active = DateTime() + emails = ReferenceSet( + (list_name, thread_id), + (Email.list_name, Email.thread_id), + order_by=Email.date + ) + _starting_email = None + + def __init__(self, list_name, thread_id, date_active=None): + self.list_name = unicode(list_name) + self.thread_id = unicode(thread_id) + self.date_active = date_active + + @property + def starting_email(self): + """Return (and cache) the email starting this thread""" + if self._starting_email is None: + self._starting_email = self.emails.find(Email.in_reply_to == None).one() + if self._starting_email is None: + # probably a partial import, we don't have the real first email + self._starting_email = self.emails.order_by(Email.date).first() + return self._starting_email + + @property + def last_email(self): + return self.emails.order_by(Desc(Email.date)).first() + + @property + def participants(self): + """Set of email senders in this thread""" + p = [] + for sender in self.emails.find().config(distinct=True + ).values(Email.sender_name): + p.append(sender) + return p + + @property + def email_ids(self): + return list(self.emails.find().values(Email.message_id)) + + def __len__(self): + return self.emails.count() + + def __storm_pre_flush__(self): + """Auto-set the active date from the last email in thread""" + if self.date_active is not None: + return + self.date_active = list(self.emails.order_by(Desc(Email.date) + ).config(limit=1).values(Email.date))[0] # References @@ -103,4 +169,7 @@ Email.attachments = ReferenceSet( Email.message_id), (Attachment.list_name, Attachment.message_id), + order_by=Attachment.counter ) +Email.thread = Reference((Email.list_name, Email.thread_id), + (Thread.list_name, Thread.thread_id)) diff --git a/kittystore/storm/schema/__init__.py b/kittystore/storm/schema/__init__.py index e32085e..50fda8d 100644 --- a/kittystore/storm/schema/__init__.py +++ b/kittystore/storm/schema/__init__.py @@ -9,6 +9,12 @@ CREATES = { display_name TEXT, PRIMARY KEY (name) );""", """ + CREATE TABLE "thread" ( + list_name VARCHAR(255) NOT NULL, + thread_id VARCHAR(255) NOT NULL, + date_active DATETIME NOT NULL, + PRIMARY KEY (list_name, thread_id) + );""", """ CREATE TABLE "email" ( list_name VARCHAR(255) NOT NULL, message_id VARCHAR(255) NOT NULL, @@ -22,7 +28,9 @@ CREATES = { thread_id VARCHAR(255) NOT NULL, "full" BLOB NOT NULL, archived_date DATETIME DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (list_name, message_id) + PRIMARY KEY (list_name, message_id), + FOREIGN KEY (list_name, thread_id) + REFERENCES thread(list_name, thread_id) ON DELETE CASCADE );""", """ CREATE TABLE "attachment" ( list_name VARCHAR(255) NOT NULL, @@ -33,13 +41,16 @@ CREATES = { name VARCHAR(255), size INTEGER NOT NULL, content BLOB NOT NULL, - PRIMARY KEY (list_name, message_id, counter) + PRIMARY KEY (list_name, message_id, counter), + FOREIGN KEY (list_name, message_id) + REFERENCES email(list_name, message_id) ON DELETE CASCADE );""", 'CREATE INDEX "ix_email_list_name" ON "email" (list_name);', 'CREATE INDEX "ix_email_date" ON "email" (date);', 'CREATE UNIQUE INDEX "ix_email_list_name_message_id_hash" ON "email" (list_name, message_id_hash);', 'CREATE INDEX "ix_email_subject" ON "email" (subject);', 'CREATE INDEX "ix_email_thread_id" ON "email" (thread_id);', + 'CREATE INDEX "ix_thread_date_active" ON "thread" (date_active);', ], "postgres": [ """ @@ -48,6 +59,12 @@ CREATES = { display_name TEXT, PRIMARY KEY (name) );""", """ + CREATE TABLE "thread" ( + list_name VARCHAR(255) NOT NULL, + thread_id VARCHAR(255) NOT NULL, + date_active TIMESTAMP WITH TIME ZONE NOT NULL, + PRIMARY KEY (list_name, thread_id) + );""", """ CREATE TABLE "email" ( list_name VARCHAR(255) NOT NULL, message_id VARCHAR(255) NOT NULL, @@ -61,7 +78,9 @@ CREATES = { thread_id VARCHAR(255) NOT NULL, "full" BYTEA NOT NULL, archived_date TIMESTAMP WITHOUT TIME ZONE DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (list_name, message_id) + PRIMARY KEY (list_name, message_id), + FOREIGN KEY (list_name, thread_id) + REFERENCES thread(list_name, thread_id) ON DELETE CASCADE );""", """ CREATE TABLE "attachment" ( list_name VARCHAR(255) NOT NULL, @@ -72,13 +91,16 @@ CREATES = { name VARCHAR(255), size INTEGER NOT NULL, content BYTEA NOT NULL, - PRIMARY KEY (list_name, message_id, counter) + PRIMARY KEY (list_name, message_id, counter), + FOREIGN KEY (list_name, message_id) + REFERENCES email(list_name, message_id) ON DELETE CASCADE );""", 'CREATE INDEX "ix_email_list_name" ON "email" USING btree (list_name);', 'CREATE INDEX "ix_email_date" ON "email" USING btree (date);', 'CREATE UNIQUE INDEX "ix_email_list_name_message_id_hash" ON "email" USING btree (list_name, message_id_hash);', 'CREATE INDEX "ix_email_subject" ON "email" USING btree (subject);', 'CREATE INDEX "ix_email_thread_id" ON "email" USING btree (thread_id);', + 'CREATE INDEX "ix_thread_date_active" ON "thread" USING btree (date_active);', ], } diff --git a/kittystore/storm/schema/patch_2.py b/kittystore/storm/schema/patch_2.py new file mode 100644 index 0000000..8a96df1 --- /dev/null +++ b/kittystore/storm/schema/patch_2.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +from storm.expr import And +from storm.locals import Desc + +from . import get_db_type +from kittystore.storm.model import Thread, Email + + +SQL = { + "sqlite": [ """ + CREATE TABLE "thread" ( + list_name VARCHAR(255) NOT NULL, + thread_id VARCHAR(255) NOT NULL, + date_active DATETIME NOT NULL, + PRIMARY KEY (list_name, thread_id) + );""", + 'CREATE INDEX "ix_thread_date_active" ON "thread" (date_active);', + ], + "postgres": [ """ + CREATE TABLE "thread" ( + list_name VARCHAR(255) NOT NULL, + thread_id VARCHAR(255) NOT NULL, + date_active TIMESTAMP WITH TIME ZONE NOT NULL, + PRIMARY KEY (list_name, thread_id) + );""", + 'CREATE INDEX "ix_thread_date_active" ON "thread" USING btree (date_active);', + ], + } + + +def apply(store): + """Add the thread table""" + dbtype = get_db_type(store) + for statement in SQL[dbtype]: + store.execute(statement) + for email in store.find(Email, Email.in_reply_to == None): + thread = Thread(email.list_name, email.thread_id) + store.add(thread) + store.flush() + for email in store.find(Email): + # in case of partial imports, some threads are missing their original + # email (the one without an in-reply-to header) + thread_count = store.find(Thread, And( + Thread.list_name == email.list_name, + Thread.thread_id == email.thread_id, + )).count() + if thread_count == 0: + # this email has no associated thread, create it + thread = Thread(email.list_name, email.thread_id) + store.add(thread) + store.flush() + if dbtype == "postgres": + store.execute('ALTER TABLE email ' + 'ADD FOREIGN KEY (list_name, thread_id) ' + 'REFERENCES thread(list_name, thread_id) ON DELETE CASCADE;') + store.execute('ALTER TABLE attachment ' + 'ADD FOREIGN KEY (list_name, message_id) ' + 'REFERENCES email(list_name, message_id) ON DELETE CASCADE;') + store.commit() diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py index b51e2c9..4e91346 100644 --- a/kittystore/storm/store.py +++ b/kittystore/storm/store.py @@ -28,7 +28,7 @@ from mailman.interfaces.messages import IMessageStore from storm.locals import Desc from storm.expr import And, Or -from .model import List, Email, Attachment +from .model import List, Email, Attachment, Thread class StormStore(object): @@ -99,8 +99,10 @@ class StormStore(object): return email.message_id_hash # Find thread id + new_thread = False ref, thread_id = get_ref_and_thread_id(message, list_name, self) if thread_id is None: + new_thread = True # make up the thread_id if not found thread_id = email.message_id_hash email.thread_id = thread_id @@ -108,8 +110,8 @@ class StormStore(object): from_name, from_email = parseaddr(message['From']) from_name = header_to_unicode(from_name) - email.sender_name = from_name - email.sender_email = unicode(from_email) + email.sender_name = from_name.strip() + email.sender_email = unicode(from_email).strip() email.subject = header_to_unicode(message.get('Subject')) email.full = message.as_string() # Before scrubbing scrubber = Scrubber(list_name, message, self) @@ -125,6 +127,16 @@ class StormStore(object): # # i18n! # category = 'Agenda' + if new_thread: + thread = Thread(list_name, thread_id, email.date) + else: + thread = self.db.find(Thread, And( + Thread.list_name == list_name, + Thread.thread_id == thread_id, + )).one() + thread.date_active = email.date + self.db.add(thread) + self.db.add(email) self.flush() return email.message_id_hash @@ -152,6 +164,13 @@ class StormStore(object): if msg is None: raise MessageNotFound(list_name, message_id) self.db.delete(msg) + # Remove the thread if necessary + thread = self.db.find(Thread, And( + Thread.list_name == msg.list_name, + Thread.thread_id == msg.thread_id, + )).one() + if len(thread.emails) == 0: + self.db.delete(thread) self.flush() def get_list_size(self, list_name): @@ -297,9 +316,8 @@ class StormStore(object): """ return list(self.db.find(List.name).order_by(List.name)) - def get_messages(self, list_name, start, end, threads=False): - """ Return all emails between two given dates, optionnaly selecting - only the thread-starting ones. + def get_messages(self, list_name, start, end): + """ Return all emails between two given dates. :param list_name: The name of the mailing list in which these emails should be searched. @@ -309,20 +327,29 @@ class StormStore(object): the interval to query. :returns: The list of messages. """ - conditions = [ - Email.list_name == unicode(list_name), - Email.date >= start, - Email.date < end, - ] - if threads: - # Beginning of thread == No 'References' header - conditions.append(Email.in_reply_to == None) - emails = self.db.find(Email, And(*conditions) - ).order_by(Desc(Email.date)) + emails = self.db.find(Email, And( + Email.list_name == unicode(list_name), + Email.date >= start, + Email.date < end, + )).order_by(Desc(Email.date)) return list(emails) + def get_thread(self, list_name, thread_id): + """ Return the specified thread. + + :param list_name: The name of the mailing list in which this email + should be searched. + :param thread_id: The thread_id as used in the web-pages. Used here to + uniquely identify the thread in the database. + :returns: The thread object. + """ + return self.db.find(Thread, And( + Thread.list_name == unicode(list_name), + Thread.thread_id == unicode(thread_id) + )).one() + def get_threads(self, list_name, start, end): - """ Return all the thread-starting emails between two given dates. + """ Return all the threads active between two given dates. :param list_name: The name of the mailing list in which this email should be searched. @@ -332,7 +359,12 @@ class StormStore(object): the interval to query. :returns: The list of thread-starting messages. """ - return self.get_messages(list_name, start, end, threads=True) + threads = self.db.find(Thread, And( + Thread.list_name == unicode(list_name), + Thread.date_active >= start, + Thread.date_active < end, + )).order_by(Desc(Thread.date_active)) + return list(threads) def get_start_date(self, list_name): """ Get the date of the first archived email in a list. @@ -349,52 +381,6 @@ class StormStore(object): else: return None - def get_messages_in_thread(self, list_name, thread_id): - """ Return all the emails present in a thread. This thread - is uniquely identified by its thread_id. - - :param list_name: The name of the mailing list in which this email - should be searched. - :param thread_id: The thread_id as used in the web-pages. Used here to - uniquely identify the thread in the database. - :returns: The list of messages in the thread. - """ - emails = self.db.find(Email, And( - Email.list_name == unicode(list_name), - Email.thread_id == unicode(thread_id), - )).order_by(Email.date) - return list(emails) - - def get_thread_length(self, list_name, thread_id): - """ Return the number of email present in a thread. This thread - is uniquely identified by its thread_id. - - :param list_name: The name of the mailing list to query. - :param thread_id: The unique identifier of the thread as specified in - the database. - :returns: The number of messages in the thread. - :rtype: int - """ - return self.db.find(Email, And( - Email.list_name == unicode(list_name), - Email.thread_id == unicode(thread_id), - )).count() - - def get_thread_participants(self, list_name, thread_id): - """ Return the list of participant in a thread. This thread - is uniquely identified by its thread_id. - - :param list_name: The name of the mailing list to query. - :param thread_id: The unique identifier of the thread as specified in - the database. - :return: The list of message sender names in the thread. - """ - participants = self.db.find(Email.sender_name, And( - Email.list_name == unicode(list_name), - Email.thread_id == unicode(thread_id), - )).config(distinct=True) - return list(participants) - def get_thread_neighbors(self, list_name, thread_id): """ Return the previous and the next threads of the specified thread, in date order. The returned objects are the emails starting the -- cgit