diff options
| author | Aurélien Bompard <aurelien@bompard.org> | 2013-06-06 20:48:00 +0200 |
|---|---|---|
| committer | Aurélien Bompard <aurelien@bompard.org> | 2013-06-06 20:48:00 +0200 |
| commit | ed64d1d1db150ae99618b69fd008dbc82a92d2cd (patch) | |
| tree | c9dcc75aa7303d073ef027ce3d2c87e2bcf47fdc /kittystore/storm | |
| parent | d0c38e1a55e60699416470cb38c4e6675a9383c6 (diff) | |
| download | kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.gz kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.tar.xz kittystore-ed64d1d1db150ae99618b69fd008dbc82a92d2cd.zip | |
Add a Whoosh full-text search index
Diffstat (limited to 'kittystore/storm')
| -rw-r--r-- | kittystore/storm/__init__.py | 13 | ||||
| -rw-r--r-- | kittystore/storm/model.py | 1 | ||||
| -rw-r--r-- | kittystore/storm/search.py | 140 | ||||
| -rw-r--r-- | kittystore/storm/store.py | 26 |
4 files changed, 175 insertions, 5 deletions
diff --git a/kittystore/storm/__init__.py b/kittystore/storm/__init__.py index b7c7a92..8e1c744 100644 --- a/kittystore/storm/__init__.py +++ b/kittystore/storm/__init__.py @@ -12,6 +12,7 @@ from storm.schema.schema import Schema from .model import List, Email from . import schema from .store import StormStore +from .search import SearchEngine class ThreadSafeStorePool(object): @@ -35,7 +36,7 @@ class ThreadSafeStorePool(object): return self._local.store -def create_store(url, debug): +def create_store(url, search, debug): if debug: storm.tracer.debug(True, stream=sys.stdout) database = create_database(url) @@ -43,11 +44,15 @@ def create_store(url, debug): store = Store(database) dbschema = Schema(schema.CREATES[dbtype], [], [], schema) dbschema.upgrade(store) - return StormStore(store, debug) + if search is not None: + search_index = SearchEngine(search) + else: + search_index = None + return StormStore(store, search_index, debug) -def get_storm_store(url, debug=False): +def get_storm_store(url, search=None, debug=False): # Thread safety is managed by the middleware #store_pool = ThreadSafeStorePool(url, debug) #return store_pool.get() - return create_store(url, debug) + return create_store(url, search, debug) diff --git a/kittystore/storm/model.py b/kittystore/storm/model.py index 084a4e0..86dbe89 100644 --- a/kittystore/storm/model.py +++ b/kittystore/storm/model.py @@ -92,6 +92,7 @@ class Email(Storm): full_email = Reference((list_name, message_id), ("EmailFull.list_name", "EmailFull.message_id")) full = Proxy(full_email, "EmailFull.full") + mlist = Reference(list_name, "List.name") def __init__(self, list_name, message_id): self.list_name = unicode(list_name) diff --git a/kittystore/storm/search.py b/kittystore/storm/search.py new file mode 100644 index 0000000..8c37747 --- /dev/null +++ b/kittystore/storm/search.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +""" +Copyright (C) 2012 Aurélien Bompard <abompard@fedoraproject.org> +Author: Aurélien Bompard <abompard@fedoraproject.org> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. +See http://www.gnu.org/copyleft/gpl.html for the full text of the +license. +""" + +from __future__ import absolute_import + +import os + +from whoosh.index import create_in, exists_in, open_dir +from whoosh.fields import Schema, ID, TEXT, DATETIME, KEYWORD +from whoosh.analysis import StemmingAnalyzer +from whoosh.qparser import MultifieldParser + +from .model import Email + + +def email_to_search_doc(email): + if not isinstance(email, Email): + raise ValueError("not an instance of the Email class") + search_doc = { + "list_name": email.list_name, + "message_id": email.message_id, + "sender": u"%s %s" % (email.sender_name, email.sender_email), + "subject": email.subject, + "content": email.content, + "date": email.date, # UTC + } + attachments = [a.name for a in email.attachments] + if attachments: + search_doc["attachments"] = " ".join(attachments) + return search_doc + + +class SearchEngine(object): + + def __init__(self, location): + self.location = location + self._index = None + + def _get_schema(self): + stem_ana = StemmingAnalyzer() + return Schema( + list_name=ID(stored=True), + message_id=ID(stored=True), + sender=TEXT(field_boost=1.5), + subject=TEXT(field_boost=2.0, analyzer=stem_ana), + content=TEXT(analyzer=stem_ana), + date=DATETIME(), + attachments=TEXT, + tags=KEYWORD(commas=True, scorable=True), + ) + + @property + def index(self): + if self._index is None: + if not os.path.isdir(self.location): + os.makedirs(self.location) + if exists_in(self.location): + self._index = open_dir(self.location) + else: + self._index = create_in(self.location, self._get_schema()) + return self._index + + def add(self, doc): + writer = self.index.writer() + if isinstance(doc, Email): + doc = email_to_search_doc(doc) + try: + writer.add_document(**doc) + except Exception: + writer.cancel() + raise + else: + writer.commit() + + def search(self, query, page=None, limit=10): + """ + TODO: Should the searcher be shared? + http://pythonhosted.org/Whoosh/threads.html#concurrency + """ + query = MultifieldParser( + ["sender", "subject", "content", "attachments"], + self.index.schema).parse(query) + return_value = {"total": 0, "results": []} + with self.index.searcher() as searcher: + if page: + results = searcher.search_page(query, page, pagelen=limit) + return_value["total"] = results.total + else: + results = searcher.search(query, limit=limit) + # http://pythonhosted.org/Whoosh/searching.html#results-object + if results.has_exact_length(): + return_value["total"] = len(results) + else: + return_value["total"] = results.estimated_length() + return_value["results"] = [ dict(r) for r in results ] + return return_value + + def optimize(self): + return self.index.optimize() + + def add_batch(self, documents): + """ + See http://pythonhosted.org/Whoosh/batch.html + """ + writer = self.index.writer(limitmb=256, procs=4, multisegment=True) + # remove the LRU cache limit from the stemanalyzer + for component in writer.schema["content"].analyzer: + try: + component.cachesize = -1 + component.clear() + except AttributeError: + continue + try: + for doc in documents: + if isinstance(doc, Email): + doc = email_to_search_doc(doc) + writer.add_document(**doc) + except Exception: + writer.cancel() + raise + else: + writer.commit() + + def initialize_with(self, store): + """Create and populate the index with the contents of a Store""" + if exists_in(self.location): + return # index already exists + messages = store.db.find(Email).order_by(Email.archived_date) + self.add_batch(messages) diff --git a/kittystore/storm/store.py b/kittystore/storm/store.py index 9c39c70..07770e8 100644 --- a/kittystore/storm/store.py +++ b/kittystore/storm/store.py @@ -40,7 +40,7 @@ class StormStore(object): implements(IMessageStore) - def __init__(self, db, debug=False): + def __init__(self, db, search_index=None, debug=False): """ Constructor. Create the session using the engine defined in the url. @@ -49,6 +49,8 @@ class StormStore(object): """ self.db = db self.debug = debug + self.search_index = search_index + # IMessageStore methods @@ -158,6 +160,9 @@ class StormStore(object): for attachment in attachments: self.add_attachment(list_name, msg_id, *attachment) self.flush() + # search indexing + if self.search_index is not None: + self.search_index.add(email) return email.message_id_hash def delete_message(self, message_id): @@ -247,6 +252,25 @@ class StormStore(object): )).one() return msg + def search(self, query, list_name=None, page=None, limit=10): + """ Returns a list of email containing the specified keyword in + their content. + + :param query: the query string to execute. + :param list_name: name of the mailing list in which this email + should be searched. If None or not specified, all lists are + searched. + :param page: the page number to return. If None, don't paginate. + :param limit: the number of results per page. + """ + if list_name is not None: + query += " list_name:%s" % list_name + results = self.search_index.search(query, page, limit) + results["results"] = [ self.get_message_by_id_from_list( + r["list_name"], r["message_id"]) + for r in results["results"] ] + return results + def search_list_for_content(self, list_name, keyword): """ Returns a list of email containing the specified keyword in their content. |
