summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAurélien Bompard <aurelien@bompard.org>2013-06-06 20:48:00 +0200
committerAurélien Bompard <aurelien@bompard.org>2013-06-06 20:48:00 +0200
commited64d1d1db150ae99618b69fd008dbc82a92d2cd (patch)
treec9dcc75aa7303d073ef027ce3d2c87e2bcf47fdc
parentd0c38e1a55e60699416470cb38c4e6675a9383c6 (diff)
downloadkittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.gz
kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.xz
kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.zip
Add a Whoosh full-text search index
-rw-r--r--kittystore/__init__.py7
-rw-r--r--kittystore/scripts.py21
-rw-r--r--kittystore/storm/__init__.py13
-rw-r--r--kittystore/storm/model.py1
-rw-r--r--kittystore/storm/search.py140
-rw-r--r--kittystore/storm/store.py26
-rw-r--r--requirements.txt1
7 files changed, 195 insertions, 14 deletions
diff --git a/kittystore/__init__.py b/kittystore/__init__.py
index db00b1b..6626a70 100644
--- a/kittystore/__init__.py
+++ b/kittystore/__init__.py
@@ -18,7 +18,7 @@ license.
__all__ = ("get_store", "MessageNotFound", )
-def get_store(url, debug=False):
+def get_store(url, search=None, debug=False):
"""Factory for a KittyStore subclass"""
if url.startswith("mongo://"):
raise NotImplementedError
@@ -27,7 +27,10 @@ def get_store(url, debug=False):
# return KittySAStore(url, debug)
else:
from kittystore.storm import get_storm_store
- return get_storm_store(url, debug)
+ store = get_storm_store(url, search, debug)
+ if search is not None:
+ store.search_index.initialize_with(store)
+ return store
class MessageNotFound(Exception):
diff --git a/kittystore/scripts.py b/kittystore/scripts.py
index 67d7b4c..fa40d20 100644
--- a/kittystore/scripts.py
+++ b/kittystore/scripts.py
@@ -40,6 +40,8 @@ def updatedb():
parser = OptionParser(usage="%prog -s store_url")
parser.add_option("-s", "--store", metavar="URL",
help="the URL to the store database")
+ parser.add_option("-i", "--search-index", metavar="PATH",
+ help="the path to the search index")
parser.add_option("--settings",
help="the Python path to a settings module")
parser.add_option("--pythonpath",
@@ -47,25 +49,30 @@ def updatedb():
parser.add_option("-d", "--debug", action="store_true",
help="show SQL queries")
opts, args = parser.parse_args()
- if opts.store is not None:
- store_url = opts.store
- elif opts.settings is not None:
+ django_settings = None
+ if opts.settings is not None:
if opts.pythonpath is not None:
sys.path.append(opts.pythonpath)
try:
- mod = importlib.import_module(opts.settings)
+ django_settings = importlib.import_module(opts.settings)
except ImportError as e:
parser.error("could not import settings '%s' (Is it on "
"sys.path?): %s" % (opts.settings, e))
- store_url = mod.KITTYSTORE_URL
+ if opts.store is not None:
+ store_url = opts.store
+ elif getattr(django_settings, "KITTYSTORE_URL", None) is not None:
+ store_url = django_settings.KITTYSTORE_URL
else:
parser.error("you must either specify a store URL (eg: "
"sqlite:///kittystore.sqlite) or a Django configuration "
"module (Python path to the settings module)")
+ if opts.search_index is None:
+ opts.search_index = getattr(django_settings, "KITTYSTORE_SEARCH_INDEX", None)
if args:
parser.error("no arguments allowed.")
- print 'Upgrading the database schema if necessary...'
- store = get_store(store_url, debug=opts.debug)
+ print 'Upgrading the database schema and populating ' \
+ 'the search index if necessary...'
+ store = get_store(store_url, search=opts.search_index, debug=opts.debug)
version = list(store.db.execute(
"SELECT patch.version FROM patch "
"ORDER BY version DESC LIMIT 1"
diff --git a/kittystore/storm/__init__.py b/kittystore/storm/__init__.py
index b7c7a92..8e1c744 100644
--- a/kittystore/storm/__init__.py
+++ b/kittystore/storm/__init__.py
@@ -12,6 +12,7 @@ from storm.schema.schema import Schema
from .model import List, Email
from . import schema
from .store import StormStore
+from .search import SearchEngine
class ThreadSafeStorePool(object):
@@ -35,7 +36,7 @@ class ThreadSafeStorePool(object):
return self._local.store
-def create_store(url, debug):
+def create_store(url, search, debug):
if debug:
storm.tracer.debug(True, stream=sys.stdout)
database = create_database(url)
@@ -43,11 +44,15 @@ def create_store(url, debug):
store = Store(database)
dbschema = Schema(schema.CREATES[dbtype], [], [], schema)
dbschema.upgrade(store)
- return StormStore(store, debug)
+ if search is not None:
+ search_index = SearchEngine(search)
+ else:
+ search_index = None
+ return StormStore(store, search_index, debug)
-def get_storm_store(url, debug=False):
+def get_storm_store(url, search=None, debug=False):
# Thread safety is managed by the middleware
#store_pool = ThreadSafeStorePool(url, debug)
#return store_pool.get()
- return create_store(url, debug)
+ return create_store(url, search, debug)
diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py
index 084a4e0..86dbe89 100644
--- a/kittystore/storm/model.py
+++ b/kittystore/storm/model.py
@@ -92,6 +92,7 @@ class Email(Storm):
full_email = Reference((list_name, message_id),
("EmailFull.list_name", "EmailFull.message_id"))
full = Proxy(full_email, "EmailFull.full")
+ mlist = Reference(list_name, "List.name")
def __init__(self, list_name, message_id):
self.list_name = unicode(list_name)
diff --git a/kittystore/storm/search.py b/kittystore/storm/search.py
new file mode 100644
index 0000000..8c37747
--- /dev/null
+++ b/kittystore/storm/search.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+"""
+Copyright (C) 2012 Aurélien Bompard <abompard@fedoraproject.org>
+Author: Aurélien Bompard <abompard@fedoraproject.org>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+See http://www.gnu.org/copyleft/gpl.html for the full text of the
+license.
+"""
+
+from __future__ import absolute_import
+
+import os
+
+from whoosh.index import create_in, exists_in, open_dir
+from whoosh.fields import Schema, ID, TEXT, DATETIME, KEYWORD
+from whoosh.analysis import StemmingAnalyzer
+from whoosh.qparser import MultifieldParser
+
+from .model import Email
+
+
+def email_to_search_doc(email):
+ if not isinstance(email, Email):
+ raise ValueError("not an instance of the Email class")
+ search_doc = {
+ "list_name": email.list_name,
+ "message_id": email.message_id,
+ "sender": u"%s %s" % (email.sender_name, email.sender_email),
+ "subject": email.subject,
+ "content": email.content,
+ "date": email.date, # UTC
+ }
+ attachments = [a.name for a in email.attachments]
+ if attachments:
+ search_doc["attachments"] = " ".join(attachments)
+ return search_doc
+
+
+class SearchEngine(object):
+
+ def __init__(self, location):
+ self.location = location
+ self._index = None
+
+ def _get_schema(self):
+ stem_ana = StemmingAnalyzer()
+ return Schema(
+ list_name=ID(stored=True),
+ message_id=ID(stored=True),
+ sender=TEXT(field_boost=1.5),
+ subject=TEXT(field_boost=2.0, analyzer=stem_ana),
+ content=TEXT(analyzer=stem_ana),
+ date=DATETIME(),
+ attachments=TEXT,
+ tags=KEYWORD(commas=True, scorable=True),
+ )
+
+ @property
+ def index(self):
+ if self._index is None:
+ if not os.path.isdir(self.location):
+ os.makedirs(self.location)
+ if exists_in(self.location):
+ self._index = open_dir(self.location)
+ else:
+ self._index = create_in(self.location, self._get_schema())
+ return self._index
+
+ def add(self, doc):
+ writer = self.index.writer()
+ if isinstance(doc, Email):
+ doc = email_to_search_doc(doc)
+ try:
+ writer.add_document(**doc)
+ except Exception:
+ writer.cancel()
+ raise
+ else:
+ writer.commit()
+
+ def search(self, query, page=None, limit=10):
+ """
+ TODO: Should the searcher be shared?
+ http://pythonhosted.org/Whoosh/threads.html#concurrency
+ """
+ query = MultifieldParser(
+ ["sender", "subject", "content", "attachments"],
+ self.index.schema).parse(query)
+ return_value = {"total": 0, "results": []}
+ with self.index.searcher() as searcher:
+ if page:
+ results = searcher.search_page(query, page, pagelen=limit)
+ return_value["total"] = results.total
+ else:
+ results = searcher.search(query, limit=limit)
+ # http://pythonhosted.org/Whoosh/searching.html#results-object
+ if results.has_exact_length():
+ return_value["total"] = len(results)
+ else:
+ return_value["total"] = results.estimated_length()
+ return_value["results"] = [ dict(r) for r in results ]
+ return return_value
+
+ def optimize(self):
+ return self.index.optimize()
+
+ def add_batch(self, documents):
+ """
+ See http://pythonhosted.org/Whoosh/batch.html
+ """
+ writer = self.index.writer(limitmb=256, procs=4, multisegment=True)
+ # remove the LRU cache limit from the stemanalyzer
+ for component in writer.schema["content"].analyzer:
+ try:
+ component.cachesize = -1
+ component.clear()
+ except AttributeError:
+ continue
+ try:
+ for doc in documents:
+ if isinstance(doc, Email):
+ doc = email_to_search_doc(doc)
+ writer.add_document(**doc)
+ except Exception:
+ writer.cancel()
+ raise
+ else:
+ writer.commit()
+
+ def initialize_with(self, store):
+ """Create and populate the index with the contents of a Store"""
+ if exists_in(self.location):
+ return # index already exists
+ messages = store.db.find(Email).order_by(Email.archived_date)
+ self.add_batch(messages)
diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py
index 9c39c70..07770e8 100644
--- a/kittystore/storm/store.py
+++ b/kittystore/storm/store.py
@@ -40,7 +40,7 @@ class StormStore(object):
implements(IMessageStore)
- def __init__(self, db, debug=False):
+ def __init__(self, db, search_index=None, debug=False):
""" Constructor.
Create the session using the engine defined in the url.
@@ -49,6 +49,8 @@ class StormStore(object):
"""
self.db = db
self.debug = debug
+ self.search_index = search_index
+
# IMessageStore methods
@@ -158,6 +160,9 @@ class StormStore(object):
for attachment in attachments:
self.add_attachment(list_name, msg_id, *attachment)
self.flush()
+ # search indexing
+ if self.search_index is not None:
+ self.search_index.add(email)
return email.message_id_hash
def delete_message(self, message_id):
@@ -247,6 +252,25 @@ class StormStore(object):
)).one()
return msg
+ def search(self, query, list_name=None, page=None, limit=10):
+ """ Returns a list of email containing the specified keyword in
+ their content.
+
+ :param query: the query string to execute.
+ :param list_name: name of the mailing list in which this email
+ should be searched. If None or not specified, all lists are
+ searched.
+ :param page: the page number to return. If None, don't paginate.
+ :param limit: the number of results per page.
+ """
+ if list_name is not None:
+ query += " list_name:%s" % list_name
+ results = self.search_index.search(query, page, limit)
+ results["results"] = [ self.get_message_by_id_from_list(
+ r["list_name"], r["message_id"])
+ for r in results["results"] ]
+ return results
+
def search_list_for_content(self, list_name, keyword):
""" Returns a list of email containing the specified keyword in
their content.
diff --git a/requirements.txt b/requirements.txt
index f0f0be8..7ea03fa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ storm
python-dateutil < 2.0 # 2.0+ is for Python 3
mock
networkx
+Whoosh