From 38136fd2fdda860915dc8573f14229bd4b42ab94 Mon Sep 17 00:00:00 2001 From: Aurélien Bompard Date: Thu, 28 Nov 2013 10:59:58 +0100 Subject: Improvements in the caching system --- kittystore/caching/email.py | 30 ++++++++++++++++++++++++------ kittystore/caching/thread.py | 1 + kittystore/import.py | 5 ++++- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/kittystore/caching/email.py b/kittystore/caching/email.py index 527d1e3..697afea 100644 --- a/kittystore/caching/email.py +++ b/kittystore/caching/email.py @@ -25,8 +25,15 @@ class MailmanUser(CachedValue): address = message.sender_email if address not in self._user_id_cache: mm_client = self._get_mailman_client(store.settings) - mm_user = mm_client.get_user(address) - self._user_id_cache[address] = unicode(mm_user.user_id) + try: + mm_user = mm_client.get_user(address) + except HTTPError, e: + if e.code == 404: + self._user_id_cache[address] = None + else: + raise + else: + self._user_id_cache[address] = unicode(mm_user.user_id) return self._user_id_cache[address] def on_new_message(self, store, mlist, message): @@ -50,12 +57,23 @@ class MailmanUser(CachedValue): return # Can't update at this time def refresh(self, store): + # There can be millions of emails, break into smaller chunks to avoid + # hogging up the memory + print "Getting missing email user ids from Mailman" # XXX: Storm-specific from kittystore.storm.model import Email + buffer_size = 50000 + prev_count = store.db.find(Email, Email.user_id == None).count() try: - for num, message in enumerate(store.db.find(Email, Email.user_id == None)): - message.user_id = self._get_user_id(store, message) - if num % 1000 == 0: - store.commit() # otherwise we'll blow up the memory + while True: + for message in store.db.find(Email, + Email.user_id == None)[:buffer_size]: + message.user_id = self._get_user_id(store, message) + store.commit() + count = store.db.find(Email, Email.user_id == None).count() + if count == 0 or count == prev_count: + break 
# done, or no improvement (former members) + prev_count = count + print "%d emails left to refresh" % count except (HTTPError, mailmanclient.MailmanConnectionError): return # Can't refresh at this time diff --git a/kittystore/caching/thread.py b/kittystore/caching/thread.py index 62edb5d..6fdb712 100644 --- a/kittystore/caching/thread.py +++ b/kittystore/caching/thread.py @@ -18,6 +18,7 @@ class ThreadStats(CachedValue): len(message.thread.participants) def refresh(self, store): + print "Refreshing thread statistics" # XXX: Storm-specific from kittystore.storm.model import Thread for num, thread in enumerate(store.db.find(Thread)): diff --git a/kittystore/import.py b/kittystore/import.py index a184ebe..3605446 100644 --- a/kittystore/import.py +++ b/kittystore/import.py @@ -283,6 +283,8 @@ def parse_args(): help="show a whole lot more of output") parser.add_option("--no-download", action="store_true", help="don't download attachments") + parser.add_option("--no-refresh", action="store_true", + help="don't refresh the cache after importing") parser.add_option("-D", "--duplicates", action="store_true", help="do not skip duplicate emails (same Message-ID header), " "import them with a different Message-ID") @@ -324,5 +326,6 @@ def main(): if opts.verbose: print ' %s emails are stored into the database' \ % store.get_list_size(opts.list_name) - store.refresh_cache(full=True) + if not opts.no_refresh: + store.refresh_cache(full=True) store.commit() -- cgit