From e695b8498c486d5b664d8e551e7182a102826cd2 Mon Sep 17 00:00:00 2001
From: Rick Harris
Date: Wed, 21 Dec 2011 22:40:23 +0000
Subject: Adds running_deleted_instance_reaper task.

This adds a periodic task to cleanup erroneously running instances.

The impetus of the patch was a XenServer specific issue bug #911366, where
deleted instances would remain running on the host machine. The patch however
is hypervisor agnostic and is generally useful as a housekeeping task to make
sure these 'zombied' instances are detected.

Change-Id: Iddc6a88920a537a3a115f8b9bc0039ec0e24a194
---
 nova/compute/manager.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 nova/utils.py           | 28 +++++++++++++++++++
 2 files changed, 101 insertions(+)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index ab582114b..44d35b4a6 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -85,6 +85,17 @@ flags.DEFINE_integer("resize_confirm_window", 0,
                      " Set to 0 to disable.")
 flags.DEFINE_integer('host_state_interval', 120,
                      'Interval in seconds for querying the host status')
+flags.DEFINE_integer("running_deleted_instance_timeout", 0,
+                     "Number of seconds after being deleted when a"
+                     " still-running instance should be considered"
+                     " eligible for cleanup.")
+flags.DEFINE_integer("running_deleted_instance_poll_interval", 30,
+                     "Number of periodic scheduler ticks to wait between"
+                     " runs of the cleanup task.")
+flags.DEFINE_string("running_deleted_instance_action", "noop",
+                    "Action to take if a running deleted instance is"
+                    " detected. Valid options are 'noop', 'log', and"
+                    " 'reap'. Set to 'noop' to disable.")
 
 LOG = logging.getLogger('nova.compute.manager')
 
@@ -2046,3 +2057,65 @@ class ComputeManager(manager.SchedulerDependentManager):
                 'details': fault.message,
             }
         self.db.instance_fault_create(context, values)
+
+    @manager.periodic_task(
+        ticks_between_runs=FLAGS.running_deleted_instance_poll_interval)
+    def _cleanup_running_deleted_instances(self, context):
+        """Clean up any instances which are erroneously still running after
+        having been deleted.
+
+        Valid actions to take are:
+
+            1. noop - do nothing
+            2. log - log which instances are erroneously running
+            3. reap - shut down and clean up any erroneously running instances
+
+        The use-case for this cleanup task is: for various reasons, it may be
+        possible for the database to show an instance as deleted but for that
+        instance to still be running on a host machine (see bug
+        https://bugs.launchpad.net/nova/+bug/911366).
+
+        This cleanup task is a cross-hypervisor utility for finding these
+        zombied instances and either logging the discrepancy (likely what you
+        should do in production), or automatically reaping the instances (more
+        appropriate for dev environments).
+        """
+        action = FLAGS.running_deleted_instance_action
+
+        if action == "noop":
+            return
+
+        present_name_labels = set(self.driver.list_instances())
+
+        # NOTE(sirp): admin contexts don't ordinarily return deleted records
+        with utils.temporary_mutation(context, read_deleted="yes"):
+            instances = self.db.instance_get_all_by_host(context, self.host)
+            for instance in instances:
+                present = instance.name in present_name_labels
+                erroneously_running = instance.deleted and present
+                old_enough = (not instance.deleted_at or utils.is_older_than(
+                        instance.deleted_at,
+                        FLAGS.running_deleted_instance_timeout))
+
+                if erroneously_running and old_enough:
+                    instance_id = instance.id
+                    name_label = instance.name
+
+                    if action == "log":
+                        LOG.warning(_("Detected instance %(instance_id)s with"
+                                      " name label '%(name_label)s' which is"
+                                      " marked as DELETED but still present on"
+                                      " host."), locals())
+
+                    elif action == 'reap':
+                        LOG.info(_("Destroying instance %(instance_id)s with"
+                                   " name label '%(name_label)s' which is"
+                                   " marked as DELETED but still present on"
+                                   " host."), locals())
+                        self._shutdown_instance(
+                                context, instance, 'Terminating', True)
+                        self._cleanup_volumes(context, instance_id)
+                    else:
+                        raise Exception(_("Unrecognized value '%(action)s'"
+                                          " for FLAGS.running_deleted_"
+                                          "instance_action") % locals())
diff --git a/nova/utils.py b/nova/utils.py
index 7f37cc801..bceee974d 100644
--- a/nova/utils.py
+++ b/nova/utils.py
@@ -1181,3 +1181,31 @@ def read_cached_file(filename, cache_info):
         cache_info['data'] = data
         cache_info['mtime'] = mtime
     return data
+
+
+@contextlib.contextmanager
+def temporary_mutation(obj, **kwargs):
+    """Temporarily set the attr on a particular object to a given value then
+    revert when finished.
+
+    One use of this is to temporarily set the read_deleted flag on a context
+    object:
+
+        with temporary_mutation(context, read_deleted="yes"):
+            do_something_that_needed_deleted_objects()
+    """
+    NOT_PRESENT = object()
+
+    old_values = {}
+    for attr, new_value in kwargs.items():
+        old_values[attr] = getattr(obj, attr, NOT_PRESENT)
+        setattr(obj, attr, new_value)
+
+    try:
+        yield
+    finally:
+        for attr, old_value in old_values.items():
+            if old_value is NOT_PRESENT:
+                delattr(obj, attr)
+            else:
+                setattr(obj, attr, old_value)
-- 
cgit