Add a Whoosh full-text search index

author: Aurélien Bompard <aurelien@bompard.org> 2013-06-06 20:48:00 +0200
committer: Aurélien Bompard <aurelien@bompard.org> 2013-06-06 20:48:00 +0200
commit: ed64d1d1db150ae99618b69fd008dbc82a92d2cd (patch)
tree: c9dcc75aa7303d073ef027ce3d2c87e2bcf47fdc /kittystore/storm
parent: d0c38e1a55e60699416470cb38c4e6675a9383c6 (diff)
download: kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.gz
kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.xz
kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.zip
4 files changed, 175 insertions, 5 deletions
diff --git a/kittystore/storm/__init__.py b/kittystore/storm/__init__.py
index b7c7a92..8e1c744 100644
--- a/kittystore/storm/__init__.py
+++ b/kittystore/storm/__init__.py
@@ -12,6 +12,7 @@ from storm.schema.schema import Schema
 from .model import List, Email
 from . import schema
 from .store import StormStore
+from .search import SearchEngine
 
 
 class ThreadSafeStorePool(object):
@@ -35,7 +36,7 @@ class ThreadSafeStorePool(object):
             return self._local.store
 
 
-def create_store(url, debug):
+def create_store(url, search, debug):
     if debug:
         storm.tracer.debug(True, stream=sys.stdout)
     database = create_database(url)
@@ -43,11 +44,15 @@ def create_store(url, debug):
     store = Store(database)
     dbschema = Schema(schema.CREATES[dbtype], [], [], schema)
     dbschema.upgrade(store)
-    return StormStore(store, debug)
+    if search is not None:
+        search_index = SearchEngine(search)
+    else:
+        search_index = None
+    return StormStore(store, search_index, debug)
 
 
-def get_storm_store(url, debug=False):
+def get_storm_store(url, search=None, debug=False):
     # Thread safety is managed by the middleware
     #store_pool = ThreadSafeStorePool(url, debug)
     #return store_pool.get()
-    return create_store(url, debug)
+    return create_store(url, search, debug)
diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py
index 084a4e0..86dbe89 100644
--- a/kittystore/storm/model.py
+++ b/kittystore/storm/model.py
@@ -92,6 +92,7 @@ class Email(Storm):
     full_email = Reference((list_name, message_id),
                      ("EmailFull.list_name", "EmailFull.message_id"))
     full = Proxy(full_email, "EmailFull.full")
+    mlist = Reference(list_name, "List.name")
 
     def __init__(self, list_name, message_id):
         self.list_name = unicode(list_name)
diff --git a/kittystore/storm/search.py b/kittystore/storm/search.py
new file mode 100644
index 0000000..8c37747
--- /dev/null
+++ b/kittystore/storm/search.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+"""
+Copyright (C) 2012 Aurélien Bompard <abompard@fedoraproject.org>
+Author: Aurélien Bompard <abompard@fedoraproject.org>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+See http://www.gnu.org/copyleft/gpl.html  for the full text of the
+license.
+"""
+
+from __future__ import absolute_import
+
+import os
+
+from whoosh.index import create_in, exists_in, open_dir
+from whoosh.fields import Schema, ID, TEXT, DATETIME, KEYWORD
+from whoosh.analysis import StemmingAnalyzer
+from whoosh.qparser import MultifieldParser
+
+from .model import Email
+
+
+def email_to_search_doc(email):
+    if not isinstance(email, Email):
+        raise ValueError("not an instance of the Email class")
+    search_doc = {
+            "list_name": email.list_name,
+            "message_id": email.message_id,
+            "sender": u"%s %s" % (email.sender_name, email.sender_email),
+            "subject": email.subject,
+            "content": email.content,
+            "date": email.date, # UTC
+    }
+    attachments = [a.name for a in email.attachments]
+    if attachments:
+        search_doc["attachments"] = " ".join(attachments)
+    return search_doc
+
+
+class SearchEngine(object):
+
+    def __init__(self, location):
+        self.location = location
+        self._index = None
+
+    def _get_schema(self):
+        stem_ana = StemmingAnalyzer()
+        return Schema(
+                list_name=ID(stored=True),
+                message_id=ID(stored=True),
+                sender=TEXT(field_boost=1.5),
+                subject=TEXT(field_boost=2.0, analyzer=stem_ana),
+                content=TEXT(analyzer=stem_ana),
+                date=DATETIME(),
+                attachments=TEXT,
+                tags=KEYWORD(commas=True, scorable=True),
+            )
+
+    @property
+    def index(self):
+        if self._index is None:
+            if not os.path.isdir(self.location):
+                os.makedirs(self.location)
+            if exists_in(self.location):
+                self._index = open_dir(self.location)
+            else:
+                self._index = create_in(self.location, self._get_schema())
+        return self._index
+
+    def add(self, doc):
+        writer = self.index.writer()
+        if isinstance(doc, Email):
+            doc = email_to_search_doc(doc)
+        try:
+            writer.add_document(**doc)
+        except Exception:
+            writer.cancel()
+            raise
+        else:
+            writer.commit()
+
+    def search(self, query, page=None, limit=10):
+        """
+        TODO: Should the searcher be shared?
+        http://pythonhosted.org/Whoosh/threads.html#concurrency
+        """
+        query = MultifieldParser(
+                ["sender", "subject", "content", "attachments"],
+                self.index.schema).parse(query)
+        return_value = {"total": 0, "results": []}
+        with self.index.searcher() as searcher:
+            if page:
+                results = searcher.search_page(query, page, pagelen=limit)
+                return_value["total"] = results.total
+            else:
+                results = searcher.search(query, limit=limit)
+                # http://pythonhosted.org/Whoosh/searching.html#results-object
+                if results.has_exact_length():
+                    return_value["total"] = len(results)
+                else:
+                    return_value["total"] = results.estimated_length()
+            return_value["results"] = [ dict(r) for r in results ]
+        return return_value
+
+    def optimize(self):
+        return self.index.optimize()
+
+    def add_batch(self, documents):
+        """
+        See http://pythonhosted.org/Whoosh/batch.html
+        """
+        writer = self.index.writer(limitmb=256, procs=4, multisegment=True)
+        # remove the LRU cache limit from the stemanalyzer
+        for component in writer.schema["content"].analyzer:
+            try:
+                component.cachesize = -1
+                component.clear()
+            except AttributeError:
+                continue
+        try:
+            for doc in documents:
+                if isinstance(doc, Email):
+                    doc = email_to_search_doc(doc)
+                writer.add_document(**doc)
+        except Exception:
+            writer.cancel()
+            raise
+        else:
+            writer.commit()
+
+    def initialize_with(self, store):
+        """Create and populate the index with the contents of a Store"""
+        if exists_in(self.location):
+            return # index already exists
+        messages = store.db.find(Email).order_by(Email.archived_date)
+        self.add_batch(messages)
diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py
index 9c39c70..07770e8 100644
--- a/kittystore/storm/store.py
+++ b/kittystore/storm/store.py
@@ -40,7 +40,7 @@ class StormStore(object):
 
     implements(IMessageStore)
 
-    def __init__(self, db, debug=False):
+    def __init__(self, db, search_index=None, debug=False):
         """ Constructor.
         Create the session using the engine defined in the url.
 
@@ -49,6 +49,8 @@ class StormStore(object):
         """
         self.db = db
         self.debug = debug
+        self.search_index = search_index
+
 
     # IMessageStore methods
 
@@ -158,6 +160,9 @@ class StormStore(object):
         for attachment in attachments:
             self.add_attachment(list_name, msg_id, *attachment)
         self.flush()
+        # search indexing
+        if self.search_index is not None:
+            self.search_index.add(email)
         return email.message_id_hash
 
     def delete_message(self, message_id):
@@ -247,6 +252,25 @@ class StormStore(object):
                 )).one()
         return msg
 
+    def search(self, query, list_name=None, page=None, limit=10):
+        """ Returns a list of email containing the specified keyword in
+        their content.
+
+        :param query: the query string to execute.
+        :param list_name: name of the mailing list in which this email
+            should be searched. If None or not specified, all lists are
+            searched.
+        :param page: the page number to return. If None, don't paginate.
+        :param limit: the number of results per page.
+        """
+        if list_name is not None:
+            query += " list_name:%s" % list_name
+        results = self.search_index.search(query, page, limit)
+        results["results"] = [ self.get_message_by_id_from_list(
+                                    r["list_name"], r["message_id"])
+                               for r in results["results"] ]
+        return results
+
     def search_list_for_content(self, list_name, keyword):
         """ Returns a list of email containing the specified keyword in
         their content.
author	Aurélien Bompard <aurelien@bompard.org>	2013-06-06 20:48:00 +0200
committer	Aurélien Bompard <aurelien@bompard.org>	2013-06-06 20:48:00 +0200
commit	ed64d1d1db150ae99618b69fd008dbc82a92d2cd (patch)
tree	c9dcc75aa7303d073ef027ce3d2c87e2bcf47fdc /kittystore/storm
parent	d0c38e1a55e60699416470cb38c4e6675a9383c6 (diff)
download	kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.gz kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.xz kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.zip