summaryrefslogtreecommitdiffstats
path: root/kittystore/storm
diff options
context:
space:
mode:
authorAurélien Bompard <aurelien@bompard.org>2013-06-06 20:48:00 +0200
committerAurélien Bompard <aurelien@bompard.org>2013-06-06 20:48:00 +0200
commited64d1d1db150ae99618b69fd008dbc82a92d2cd (patch)
treec9dcc75aa7303d073ef027ce3d2c87e2bcf47fdc /kittystore/storm
parentd0c38e1a55e60699416470cb38c4e6675a9383c6 (diff)
downloadkittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.gz
kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.xz
kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.zip
Add a Whoosh full-text search index
Diffstat (limited to 'kittystore/storm')
-rw-r--r--kittystore/storm/__init__.py13
-rw-r--r--kittystore/storm/model.py1
-rw-r--r--kittystore/storm/search.py140
-rw-r--r--kittystore/storm/store.py26
4 files changed, 175 insertions, 5 deletions
diff --git a/kittystore/storm/__init__.py b/kittystore/storm/__init__.py
index b7c7a92..8e1c744 100644
--- a/kittystore/storm/__init__.py
+++ b/kittystore/storm/__init__.py
@@ -12,6 +12,7 @@ from storm.schema.schema import Schema
from .model import List, Email
from . import schema
from .store import StormStore
+from .search import SearchEngine
class ThreadSafeStorePool(object):
@@ -35,7 +36,7 @@ class ThreadSafeStorePool(object):
return self._local.store
-def create_store(url, debug):
+def create_store(url, search, debug):
if debug:
storm.tracer.debug(True, stream=sys.stdout)
database = create_database(url)
@@ -43,11 +44,15 @@ def create_store(url, debug):
store = Store(database)
dbschema = Schema(schema.CREATES[dbtype], [], [], schema)
dbschema.upgrade(store)
- return StormStore(store, debug)
+ if search is not None:
+ search_index = SearchEngine(search)
+ else:
+ search_index = None
+ return StormStore(store, search_index, debug)
-def get_storm_store(url, debug=False):
+def get_storm_store(url, search=None, debug=False):
# Thread safety is managed by the middleware
#store_pool = ThreadSafeStorePool(url, debug)
#return store_pool.get()
- return create_store(url, debug)
+ return create_store(url, search, debug)
diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py
index 084a4e0..86dbe89 100644
--- a/kittystore/storm/model.py
+++ b/kittystore/storm/model.py
@@ -92,6 +92,7 @@ class Email(Storm):
full_email = Reference((list_name, message_id),
("EmailFull.list_name", "EmailFull.message_id"))
full = Proxy(full_email, "EmailFull.full")
+ mlist = Reference(list_name, "List.name")
def __init__(self, list_name, message_id):
self.list_name = unicode(list_name)
diff --git a/kittystore/storm/search.py b/kittystore/storm/search.py
new file mode 100644
index 0000000..8c37747
--- /dev/null
+++ b/kittystore/storm/search.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+"""
+Copyright (C) 2012 Aurélien Bompard <abompard@fedoraproject.org>
+Author: Aurélien Bompard <abompard@fedoraproject.org>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+See http://www.gnu.org/copyleft/gpl.html for the full text of the
+license.
+"""
+
+from __future__ import absolute_import
+
+import os
+
+from whoosh.index import create_in, exists_in, open_dir
+from whoosh.fields import Schema, ID, TEXT, DATETIME, KEYWORD
+from whoosh.analysis import StemmingAnalyzer
+from whoosh.qparser import MultifieldParser
+
+from .model import Email
+
+
+def email_to_search_doc(email):
+ if not isinstance(email, Email):
+ raise ValueError("not an instance of the Email class")
+ search_doc = {
+ "list_name": email.list_name,
+ "message_id": email.message_id,
+ "sender": u"%s %s" % (email.sender_name, email.sender_email),
+ "subject": email.subject,
+ "content": email.content,
+ "date": email.date, # UTC
+ }
+ attachments = [a.name for a in email.attachments]
+ if attachments:
+ search_doc["attachments"] = " ".join(attachments)
+ return search_doc
+
+
+class SearchEngine(object):
+
+ def __init__(self, location):
+ self.location = location
+ self._index = None
+
+ def _get_schema(self):
+ stem_ana = StemmingAnalyzer()
+ return Schema(
+ list_name=ID(stored=True),
+ message_id=ID(stored=True),
+ sender=TEXT(field_boost=1.5),
+ subject=TEXT(field_boost=2.0, analyzer=stem_ana),
+ content=TEXT(analyzer=stem_ana),
+ date=DATETIME(),
+ attachments=TEXT,
+ tags=KEYWORD(commas=True, scorable=True),
+ )
+
+ @property
+ def index(self):
+ if self._index is None:
+ if not os.path.isdir(self.location):
+ os.makedirs(self.location)
+ if exists_in(self.location):
+ self._index = open_dir(self.location)
+ else:
+ self._index = create_in(self.location, self._get_schema())
+ return self._index
+
+ def add(self, doc):
+ writer = self.index.writer()
+ if isinstance(doc, Email):
+ doc = email_to_search_doc(doc)
+ try:
+ writer.add_document(**doc)
+ except Exception:
+ writer.cancel()
+ raise
+ else:
+ writer.commit()
+
+ def search(self, query, page=None, limit=10):
+ """
+ TODO: Should the searcher be shared?
+ http://pythonhosted.org/Whoosh/threads.html#concurrency
+ """
+ query = MultifieldParser(
+ ["sender", "subject", "content", "attachments"],
+ self.index.schema).parse(query)
+ return_value = {"total": 0, "results": []}
+ with self.index.searcher() as searcher:
+ if page:
+ results = searcher.search_page(query, page, pagelen=limit)
+ return_value["total"] = results.total
+ else:
+ results = searcher.search(query, limit=limit)
+ # http://pythonhosted.org/Whoosh/searching.html#results-object
+ if results.has_exact_length():
+ return_value["total"] = len(results)
+ else:
+ return_value["total"] = results.estimated_length()
+ return_value["results"] = [ dict(r) for r in results ]
+ return return_value
+
+ def optimize(self):
+ return self.index.optimize()
+
+ def add_batch(self, documents):
+ """
+ See http://pythonhosted.org/Whoosh/batch.html
+ """
+ writer = self.index.writer(limitmb=256, procs=4, multisegment=True)
+ # remove the LRU cache limit from the stemanalyzer
+ for component in writer.schema["content"].analyzer:
+ try:
+ component.cachesize = -1
+ component.clear()
+ except AttributeError:
+ continue
+ try:
+ for doc in documents:
+ if isinstance(doc, Email):
+ doc = email_to_search_doc(doc)
+ writer.add_document(**doc)
+ except Exception:
+ writer.cancel()
+ raise
+ else:
+ writer.commit()
+
+ def initialize_with(self, store):
+ """Create and populate the index with the contents of a Store"""
+ if exists_in(self.location):
+ return # index already exists
+ messages = store.db.find(Email).order_by(Email.archived_date)
+ self.add_batch(messages)
diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py
index 9c39c70..07770e8 100644
--- a/kittystore/storm/store.py
+++ b/kittystore/storm/store.py
@@ -40,7 +40,7 @@ class StormStore(object):
implements(IMessageStore)
- def __init__(self, db, debug=False):
+ def __init__(self, db, search_index=None, debug=False):
""" Constructor.
Create the session using the engine defined in the url.
@@ -49,6 +49,8 @@ class StormStore(object):
"""
self.db = db
self.debug = debug
+ self.search_index = search_index
+
# IMessageStore methods
@@ -158,6 +160,9 @@ class StormStore(object):
for attachment in attachments:
self.add_attachment(list_name, msg_id, *attachment)
self.flush()
+ # search indexing
+ if self.search_index is not None:
+ self.search_index.add(email)
return email.message_id_hash
def delete_message(self, message_id):
@@ -247,6 +252,25 @@ class StormStore(object):
)).one()
return msg
+ def search(self, query, list_name=None, page=None, limit=10):
+ """ Returns a list of email containing the specified keyword in
+ their content.
+
+ :param query: the query string to execute.
+ :param list_name: name of the mailing list in which this email
+ should be searched. If None or not specified, all lists are
+ searched.
+ :param page: the page number to return. If None, don't paginate.
+ :param limit: the number of results per page.
+ """
+ if list_name is not None:
+ query += " list_name:%s" % list_name
+ results = self.search_index.search(query, page, limit)
+ results["results"] = [ self.get_message_by_id_from_list(
+ r["list_name"], r["message_id"])
+ for r in results["results"] ]
+ return results
+
def search_list_for_content(self, list_name, keyword):
""" Returns a list of email containing the specified keyword in
their content.