From 2fdd73816c56b578a65466db4e5a86b9b191e1c1 Mon Sep 17 00:00:00 2001 From: Monsyne Dragon Date: Fri, 6 Jul 2012 18:28:21 +0000 Subject: Refactor instance_usage_audit. Add audit tasklog. The instance usage audit cronjob that generates periodic compute.instance.exists notifications is not particularly scalable. It is run on one server and takes longer as the number of instances grows. This change moves the generation of those events to a periodic task in the compute manager. It also adds an api extension that can be used by administrators to check for errors generating these events. Change-Id: I856d3d0c73c34e570112f1345d306308ef20a9ae --- nova/compute/manager.py | 49 ++++++++++++++++++++++++++++++ nova/compute/utils.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 128 insertions(+), 1 deletion(-) (limited to 'nova/compute') diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 05818b27c..541766265 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -143,6 +143,9 @@ compute_opts = [ 'this functionality will be replaced when HostAggregates ' 'become more funtional for general grouping in Folsom. 
(see: ' 'http://etherpad.openstack.org/FolsomNovaHostAggregates-v2)'), + cfg.BoolOpt('instance_usage_audit', + default=False, + help="Generate periodic compute.instance.exists notifications"), ] @@ -2365,6 +2368,52 @@ class ComputeManager(manager.SchedulerDependentManager): "Will retry later.") LOG.error(msg % locals(), instance=instance) + @manager.periodic_task + def _instance_usage_audit(self, context): + if FLAGS.instance_usage_audit: + if not compute_utils.has_audit_been_run(context, self.host): + begin, end = utils.last_completed_audit_period() + instances = self.db.instance_get_active_by_window_joined( + context, + begin, + end, + host=self.host) + num_instances = len(instances) + errors = 0 + successes = 0 + LOG.info(_("Running instance usage audit for" + " host %(host)s from %(begin_time)s to " + "%(end_time)s. %(number_instances)s" + " instances.") % dict(host=self.host, + begin_time=begin, + end_time=end, + number_instances=num_instances)) + start_time = time.time() + compute_utils.start_instance_usage_audit(context, + begin, end, + self.host, num_instances) + for instance_ref in instances: + try: + compute_utils.notify_usage_exists( + context, instance_ref, + ignore_missing_network_data=False) + successes += 1 + except Exception: + LOG.exception(_('Failed to generate usage ' + 'audit for instance ' + 'on host %s') % self.host, + instance=instance_ref) + errors += 1 + compute_utils.finish_instance_usage_audit(context, + begin, end, + self.host, errors, + "Instance usage audit ran " + "for host %s, %s instances " + "in %s seconds." 
% ( + self.host, + num_instances, + time.time() - start_time)) + @manager.periodic_task def _poll_bandwidth_usage(self, context, start_time=None, stop_time=None): if not start_time: diff --git a/nova/compute/utils.py b/nova/compute/utils.py index 04d8a842c..65a3b2d90 100644 --- a/nova/compute/utils.py +++ b/nova/compute/utils.py @@ -23,7 +23,7 @@ from nova.network import model as network_model from nova import notifications from nova.openstack.common import log from nova.openstack.common.notifier import api as notifier_api - +from nova import utils FLAGS = flags.FLAGS LOG = log.getLogger(__name__) @@ -108,3 +108,81 @@ def get_nw_info_for_instance(instance): info_cache = instance['info_cache'] or {} cached_nwinfo = info_cache.get('network_info') or [] return network_model.NetworkInfo.hydrate(cached_nwinfo) + + +def has_audit_been_run(context, host, timestamp=None): + begin, end = utils.last_completed_audit_period(before=timestamp) + task_log = db.task_log_get(context, "instance_usage_audit", + begin, end, host) + if task_log: + return True + else: + return False + + +def start_instance_usage_audit(context, begin, end, host, num_instances): + db.task_log_begin_task(context, "instance_usage_audit", begin, end, host, + num_instances, "Instance usage audit started...") + + +def finish_instance_usage_audit(context, begin, end, host, errors, message): + db.task_log_end_task(context, "instance_usage_audit", begin, end, host, + errors, message) + + +def get_audit_task_logs(context, begin=None, end=None, before=None): + """Returns a full log for all instance usage audit tasks on all computes. + + :param begin: datetime beginning of audit period to get logs for, + Defaults to the beginning of the most recently completed + audit period prior to the 'before' date. + :param end: datetime ending of audit period to get logs for, + Defaults to the ending of the most recently completed + audit period prior to the 'before' date. 
+ :param before: By default we look for the audit period most recently + completed before this datetime. Has no effect if both begin and end + are specified. + """ + defbegin, defend = utils.last_completed_audit_period(before=before) + if begin is None: + begin = defbegin + if end is None: + end = defend + task_logs = db.task_log_get_all(context, "instance_usage_audit", + begin, end) + services = db.service_get_all_by_topic(context, "compute") + hosts = set(serv['host'] for serv in services) + seen_hosts = set() + done_hosts = set() + running_hosts = set() + total_errors = 0 + total_items = 0 + for tlog in task_logs: + seen_hosts.add(tlog['host']) + if tlog['state'] == "DONE": + done_hosts.add(tlog['host']) + if tlog['state'] == "RUNNING": + running_hosts.add(tlog['host']) + total_errors += tlog['errors'] + total_items += tlog['task_items'] + log = dict((tl['host'], dict(state=tl['state'], + instances=tl['task_items'], + errors=tl['errors'], + message=tl['message'])) + for tl in task_logs) + missing_hosts = hosts - seen_hosts + overall_status = "%s hosts done. %s errors." % ( + 'ALL' if len(done_hosts) == len(hosts) + else "%s of %s" % (len(done_hosts), len(hosts)), + total_errors) + return dict(period_beginning=str(begin), + period_ending=str(end), + num_hosts=len(hosts), + num_hosts_done=len(done_hosts), + num_hosts_running=len(running_hosts), + num_hosts_not_run=len(missing_hosts), + hosts_not_run=list(missing_hosts), + total_instances=total_items, + total_errors=total_errors, + overall_status=overall_status, + log=log) -- cgit