diff options
| author | Aurélien Bompard <aurelien@bompard.org> | 2013-06-06 20:48:00 +0200 |
|---|---|---|
| committer | Aurélien Bompard <aurelien@bompard.org> | 2013-06-06 20:48:00 +0200 |
| commit | ed64d1d1db150ae99618b69fd008dbc82a92d2cd (patch) | |
| tree | c9dcc75aa7303d073ef027ce3d2c87e2bcf47fdc | |
| parent | d0c38e1a55e60699416470cb38c4e6675a9383c6 (diff) | |
| download | kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.gz kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.xz kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.zip | |
Add a Whoosh full-text search index
| -rw-r--r-- | kittystore/__init__.py | 7 | ||||
| -rw-r--r-- | kittystore/scripts.py | 21 | ||||
| -rw-r--r-- | kittystore/storm/__init__.py | 13 | ||||
| -rw-r--r-- | kittystore/storm/model.py | 1 | ||||
| -rw-r--r-- | kittystore/storm/search.py | 140 | ||||
| -rw-r--r-- | kittystore/storm/store.py | 26 | ||||
| -rw-r--r-- | requirements.txt | 1 |
7 files changed, 195 insertions, 14 deletions
diff --git a/kittystore/__init__.py b/kittystore/__init__.py index db00b1b..6626a70 100644 --- a/kittystore/__init__.py +++ b/kittystore/__init__.py @@ -18,7 +18,7 @@ license. __all__ = ("get_store", "MessageNotFound", ) -def get_store(url, debug=False): +def get_store(url, search=None, debug=False): """Factory for a KittyStore subclass""" if url.startswith("mongo://"): raise NotImplementedError @@ -27,7 +27,10 @@ def get_store(url, debug=False): # return KittySAStore(url, debug) else: from kittystore.storm import get_storm_store - return get_storm_store(url, debug) + store = get_storm_store(url, search, debug) + if search is not None: + store.search_index.initialize_with(store) + return store class MessageNotFound(Exception): diff --git a/kittystore/scripts.py b/kittystore/scripts.py index 67d7b4c..fa40d20 100644 --- a/kittystore/scripts.py +++ b/kittystore/scripts.py @@ -40,6 +40,8 @@ def updatedb(): parser = OptionParser(usage="%prog -s store_url") parser.add_option("-s", "--store", metavar="URL", help="the URL to the store database") + parser.add_option("-i", "--search-index", metavar="PATH", + help="the path to the search index") parser.add_option("--settings", help="the Python path to a settings module") parser.add_option("--pythonpath", @@ -47,25 +49,30 @@ def updatedb(): parser.add_option("-d", "--debug", action="store_true", help="show SQL queries") opts, args = parser.parse_args() - if opts.store is not None: - store_url = opts.store - elif opts.settings is not None: + django_settings = None + if opts.settings is not None: if opts.pythonpath is not None: sys.path.append(opts.pythonpath) try: - mod = importlib.import_module(opts.settings) + django_settings = importlib.import_module(opts.settings) except ImportError as e: parser.error("could not import settings '%s' (Is it on " "sys.path?): %s" % (opts.settings, e)) - store_url = mod.KITTYSTORE_URL + if opts.store is not None: + store_url = opts.store + elif getattr(django_settings, "KITTYSTORE_URL", None) is not None: + store_url = django_settings.KITTYSTORE_URL else: parser.error("you must either specify a store URL (eg: " "sqlite:///kittystore.sqlite) or a Django configuration " "module (Python path to the settings module)") + if opts.search_index is None: + opts.search_index = getattr(django_settings, "KITTYSTORE_SEARCH_INDEX", None) if args: parser.error("no arguments allowed.") - print 'Upgrading the database schema if necessary...' - store = get_store(store_url, debug=opts.debug) + print 'Upgrading the database schema and populating ' \ + 'the search index if necessary...' + store = get_store(store_url, search=opts.search_index, debug=opts.debug) version = list(store.db.execute( "SELECT patch.version FROM patch " "ORDER BY version DESC LIMIT 1" diff --git a/kittystore/storm/__init__.py b/kittystore/storm/__init__.py index b7c7a92..8e1c744 100644 --- a/kittystore/storm/__init__.py +++ b/kittystore/storm/__init__.py @@ -12,6 +12,7 @@ from storm.schema.schema import Schema from .model import List, Email from . import schema from .store import StormStore +from .search import SearchEngine class ThreadSafeStorePool(object): @@ -35,7 +36,7 @@ class ThreadSafeStorePool(object): return self._local.store -def create_store(url, debug): +def create_store(url, search, debug): if debug: storm.tracer.debug(True, stream=sys.stdout) database = create_database(url) @@ -43,11 +44,15 @@ def create_store(url, debug): store = Store(database) dbschema = Schema(schema.CREATES[dbtype], [], [], schema) dbschema.upgrade(store) - return StormStore(store, debug) + if search is not None: + search_index = SearchEngine(search) + else: + search_index = None + return StormStore(store, search_index, debug) -def get_storm_store(url, debug=False): +def get_storm_store(url, search=None, debug=False): # Thread safety is managed by the middleware #store_pool = ThreadSafeStorePool(url, debug) #return store_pool.get() - return create_store(url, debug) + return create_store(url, search, debug) diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py index 084a4e0..86dbe89 100644 --- a/kittystore/storm/model.py +++ b/kittystore/storm/model.py @@ -92,6 +92,7 @@ class Email(Storm): full_email = Reference((list_name, message_id), ("EmailFull.list_name", "EmailFull.message_id")) full = Proxy(full_email, "EmailFull.full") + mlist = Reference(list_name, "List.name") def __init__(self, list_name, message_id): self.list_name = unicode(list_name) diff --git a/kittystore/storm/search.py b/kittystore/storm/search.py new file mode 100644 index 0000000..8c37747 --- /dev/null +++ b/kittystore/storm/search.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +""" +Copyright (C) 2012 Aurélien Bompard <abompard@fedoraproject.org> +Author: Aurélien Bompard <abompard@fedoraproject.org> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. +See http://www.gnu.org/copyleft/gpl.html for the full text of the +license. +""" + +from __future__ import absolute_import + +import os + +from whoosh.index import create_in, exists_in, open_dir +from whoosh.fields import Schema, ID, TEXT, DATETIME, KEYWORD +from whoosh.analysis import StemmingAnalyzer +from whoosh.qparser import MultifieldParser + +from .model import Email + + +def email_to_search_doc(email): + if not isinstance(email, Email): + raise ValueError("not an instance of the Email class") + search_doc = { + "list_name": email.list_name, + "message_id": email.message_id, + "sender": u"%s %s" % (email.sender_name, email.sender_email), + "subject": email.subject, + "content": email.content, + "date": email.date, # UTC + } + attachments = [a.name for a in email.attachments] + if attachments: + search_doc["attachments"] = " ".join(attachments) + return search_doc + + +class SearchEngine(object): + + def __init__(self, location): + self.location = location + self._index = None + + def _get_schema(self): + stem_ana = StemmingAnalyzer() + return Schema( + list_name=ID(stored=True), + message_id=ID(stored=True), + sender=TEXT(field_boost=1.5), + subject=TEXT(field_boost=2.0, analyzer=stem_ana), + content=TEXT(analyzer=stem_ana), + date=DATETIME(), + attachments=TEXT, + tags=KEYWORD(commas=True, scorable=True), + ) + + @property + def index(self): + if self._index is None: + if not os.path.isdir(self.location): + os.makedirs(self.location) + if exists_in(self.location): + self._index = open_dir(self.location) + else: + self._index = create_in(self.location, self._get_schema()) + return self._index + + def add(self, doc): + writer = self.index.writer() + if isinstance(doc, Email): + doc = email_to_search_doc(doc) + try: + writer.add_document(**doc) + except Exception: + writer.cancel() + raise + else: + writer.commit() + + def search(self, query, page=None, limit=10): + """ + TODO: Should the searcher be shared? + http://pythonhosted.org/Whoosh/threads.html#concurrency + """ + query = MultifieldParser( + ["sender", "subject", "content", "attachments"], + self.index.schema).parse(query) + return_value = {"total": 0, "results": []} + with self.index.searcher() as searcher: + if page: + results = searcher.search_page(query, page, pagelen=limit) + return_value["total"] = results.total + else: + results = searcher.search(query, limit=limit) + # http://pythonhosted.org/Whoosh/searching.html#results-object + if results.has_exact_length(): + return_value["total"] = len(results) + else: + return_value["total"] = results.estimated_length() + return_value["results"] = [ dict(r) for r in results ] + return return_value + + def optimize(self): + return self.index.optimize() + + def add_batch(self, documents): + """ + See http://pythonhosted.org/Whoosh/batch.html + """ + writer = self.index.writer(limitmb=256, procs=4, multisegment=True) + # remove the LRU cache limit from the stemanalyzer + for component in writer.schema["content"].analyzer: + try: + component.cachesize = -1 + component.clear() + except AttributeError: + continue + try: + for doc in documents: + if isinstance(doc, Email): + doc = email_to_search_doc(doc) + writer.add_document(**doc) + except Exception: + writer.cancel() + raise + else: + writer.commit() + + def initialize_with(self, store): + """Create and populate the index with the contents of a Store""" + if exists_in(self.location): + return # index already exists + messages = store.db.find(Email).order_by(Email.archived_date) + self.add_batch(messages) diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py index 9c39c70..07770e8 100644 --- a/kittystore/storm/store.py +++ b/kittystore/storm/store.py @@ -40,7 +40,7 @@ class StormStore(object): implements(IMessageStore) - def __init__(self, db, debug=False): + def __init__(self, db, search_index=None, debug=False): """ Constructor. Create the session using the engine defined in the url. @@ -49,6 +49,8 @@ class StormStore(object): """ self.db = db self.debug = debug + self.search_index = search_index + # IMessageStore methods @@ -158,6 +160,9 @@ class StormStore(object): for attachment in attachments: self.add_attachment(list_name, msg_id, *attachment) self.flush() + # search indexing + if self.search_index is not None: + self.search_index.add(email) return email.message_id_hash def delete_message(self, message_id): @@ -247,6 +252,25 @@ class StormStore(object): )).one() return msg + def search(self, query, list_name=None, page=None, limit=10): + """ Returns a list of email containing the specified keyword in + their content. + + :param query: the query string to execute. + :param list_name: name of the mailing list in which this email + should be searched. If None or not specified, all lists are + searched. + :param page: the page number to return. If None, don't paginate. + :param limit: the number of results per page. + """ + if list_name is not None: + query += " list_name:%s" % list_name + results = self.search_index.search(query, page, limit) + results["results"] = [ self.get_message_by_id_from_list( + r["list_name"], r["message_id"]) + for r in results["results"] ] + return results + def search_list_for_content(self, list_name, keyword): """ Returns a list of email containing the specified keyword in their content. diff --git a/requirements.txt b/requirements.txt index f0f0be8..7ea03fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ storm python-dateutil < 2.0 # 2.0+ is for Python 3 mock networkx +Whoosh |
