diff options
| author | Brian Elliott <brian.elliott@rackspace.com> | 2012-06-21 00:44:24 +0000 |
|---|---|---|
| committer | Brian Elliott <brian.elliott@rackspace.com> | 2012-07-17 22:15:41 +0000 |
| commit | 83fece1aa7e124a2adae05d95c8c6cecc12d5d41 (patch) | |
| tree | 2fcaa6ff5838fe6714b372e9cb5d3c40d080d605 /nova/compute | |
| parent | acb158714c562d3142bf2f3f560dc374daa2df7d (diff) | |
Adds generic retries for build failures.
Add a generic scheduler retry for build failures. Failed
build requests get cast back to scheduler for retry until
success or the maximum number of attempts is reached. The
number of attempts to make is configurable or can be
simply set to 1 to disable retries altogether.
Partially implements blueprint: scheduler-resource-race
DocImpact:
Adds a new capability to filter scheduler to enable retries of
scheduling requests.
1) New flag: scheduler_max_attempts (int) - Number of attempts to make
to schedule an instance before giving up and setting the instance to
error.
2) New RetryFilter. Avoids re-scheduling to the same host multiple
times. (nova.scheduler.filters.RetryFilter)
Change-Id: I1127caeed4418c75372a42ca7fafacb4f061ffe3
Diffstat (limited to 'nova/compute')
| -rw-r--r-- | nova/compute/manager.py | 84 |
1 files changed, 77 insertions, 7 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 061d596b9..4eef39793 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -70,6 +70,7 @@ from nova.openstack.common.notifier import api as notifier from nova.openstack.common import rpc from nova.openstack.common.rpc import common as rpc_common from nova.openstack.common import timeutils +from nova.scheduler import rpcapi as scheduler_rpcapi from nova import utils from nova.virt import driver from nova import volume @@ -259,6 +260,7 @@ class ComputeManager(manager.SchedulerDependentManager): self._last_info_cache_heal = 0 self.compute_api = compute.API() self.compute_rpcapi = compute_rpcapi.ComputeAPI() + self.scheduler_rpcapi = scheduler_rpcapi.SchedulerAPI() super(ComputeManager, self).__init__(service_name="compute", *args, **kwargs) @@ -470,22 +472,90 @@ class ComputeManager(manager.SchedulerDependentManager): instance = self._spawn(context, instance, image_meta, network_info, block_device_info, injected_files, admin_password) + except exception.InstanceNotFound: + raise # the instance got deleted during the spawn except Exception: - with excutils.save_and_reraise_exception(): - self._deallocate_network(context, instance) + # try to re-schedule instance: + self._reschedule_or_reraise(context, instance, + requested_networks, admin_password, injected_files, + is_first_time, **kwargs) + else: + # Spawn success: + if (is_first_time and not instance['access_ip_v4'] + and not instance['access_ip_v6']): + self._update_access_ip(context, instance, network_info) - if (is_first_time and not instance['access_ip_v4'] - and not instance['access_ip_v6']): - self._update_access_ip(context, instance, network_info) + self._notify_about_instance_usage(context, instance, + "create.end", network_info=network_info) - self._notify_about_instance_usage( - context, instance, "create.end", network_info=network_info) except exception.InstanceNotFound: LOG.warn(_("Instance not found."), 
instance_uuid=instance_uuid) except Exception as e: with excutils.save_and_reraise_exception(): self._set_instance_error_state(context, instance_uuid) + def _reschedule_or_reraise(self, context, instance, *args, **kwargs): + """Try to re-schedule the build or re-raise the original build error to + error out the instance. + """ + type_, value, tb = sys.exc_info() # save original exception + rescheduled = False + instance_uuid = instance['uuid'] + + def _log_original_error(): + LOG.error(_('Build error: %s') % + traceback.format_exception(type_, value, tb), + instance_uuid=instance_uuid) + + try: + self._deallocate_network(context, instance) + except Exception: + # do not attempt retry if network de-allocation occurs: + _log_original_error() + raise + + try: + rescheduled = self._reschedule(context, instance_uuid, *args, + **kwargs) + except Exception: + rescheduled = False + LOG.exception(_("Error trying to reschedule"), + instance_uuid=instance_uuid) + + if rescheduled: + # log the original build error + _log_original_error() + else: + # not re-scheduling + raise type_, value, tb + + def _reschedule(self, context, instance_uuid, requested_networks, + admin_password, injected_files, is_first_time, **kwargs): + + filter_properties = kwargs.get('filter_properties', {}) + retry = filter_properties.get('retry', None) + if not retry: + # no retry information, do not reschedule. 
+ LOG.debug(_("Retry info not present, will not reschedule"), + instance_uuid=instance_uuid) + return + + request_spec = kwargs.get('request_spec', None) + if not request_spec: + LOG.debug(_("No request spec, will not reschedule"), + instance_uuid=instance_uuid) + return + + request_spec['num_instances'] = 1 + + LOG.debug(_("Re-scheduling instance: attempt %d"), + retry['num_attempts'], instance_uuid=instance_uuid) + self.scheduler_rpcapi.run_instance(context, FLAGS.compute_topic, + request_spec, admin_password, injected_files, + requested_networks, is_first_time, filter_properties, + reservations=None, call=False) + return True + @manager.periodic_task def _check_instance_build_time(self, context): """Ensure that instances are not stuck in build.""" |
