summaryrefslogtreecommitdiffstats
path: root/nova/compute
diff options
context:
space:
mode:
authorBrian Elliott <brian.elliott@rackspace.com>2012-06-21 00:44:24 +0000
committerBrian Elliott <brian.elliott@rackspace.com>2012-07-17 22:15:41 +0000
commit83fece1aa7e124a2adae05d95c8c6cecc12d5d41 (patch)
tree2fcaa6ff5838fe6714b372e9cb5d3c40d080d605 /nova/compute
parentacb158714c562d3142bf2f3f560dc374daa2df7d (diff)
Adds generic retries for build failures.
Add a generic scheduler retry for build failures. Failed build requests get casted back to scheduler for retry until success or the maximum number of attempts is reached. The number of attempts to make is configurable or can be simply set to 1 to disable retries altogether. Partially implements blueprint: scheduler-resource-race DocImpact: Adds a new capability to filter scheduler to enable retries of scheduling requests. 1) New flag: scheduler_max_attempts (int) - Number of attempts to make to schedule an instance before giving up and setting the instance to error. 2) New RetryFilter. Avoids re-scheduling to the same host multiple times. (nova.scheduler.filters.RetryFilter) Change-Id: I1127caeed4418c75372a42ca7fafacb4f061ffe3
Diffstat (limited to 'nova/compute')
-rw-r--r--nova/compute/manager.py84
1 file changed, 77 insertions, 7 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 061d596b9..4eef39793 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -70,6 +70,7 @@ from nova.openstack.common.notifier import api as notifier
from nova.openstack.common import rpc
from nova.openstack.common.rpc import common as rpc_common
from nova.openstack.common import timeutils
+from nova.scheduler import rpcapi as scheduler_rpcapi
from nova import utils
from nova.virt import driver
from nova import volume
@@ -259,6 +260,7 @@ class ComputeManager(manager.SchedulerDependentManager):
self._last_info_cache_heal = 0
self.compute_api = compute.API()
self.compute_rpcapi = compute_rpcapi.ComputeAPI()
+ self.scheduler_rpcapi = scheduler_rpcapi.SchedulerAPI()
super(ComputeManager, self).__init__(service_name="compute",
*args, **kwargs)
@@ -470,22 +472,90 @@ class ComputeManager(manager.SchedulerDependentManager):
instance = self._spawn(context, instance, image_meta,
network_info, block_device_info,
injected_files, admin_password)
+ except exception.InstanceNotFound:
+ raise # the instance got deleted during the spawn
except Exception:
- with excutils.save_and_reraise_exception():
- self._deallocate_network(context, instance)
+ # try to re-schedule instance:
+ self._reschedule_or_reraise(context, instance,
+ requested_networks, admin_password, injected_files,
+ is_first_time, **kwargs)
+ else:
+ # Spawn success:
+ if (is_first_time and not instance['access_ip_v4']
+ and not instance['access_ip_v6']):
+ self._update_access_ip(context, instance, network_info)
- if (is_first_time and not instance['access_ip_v4']
- and not instance['access_ip_v6']):
- self._update_access_ip(context, instance, network_info)
+ self._notify_about_instance_usage(context, instance,
+ "create.end", network_info=network_info)
- self._notify_about_instance_usage(
- context, instance, "create.end", network_info=network_info)
except exception.InstanceNotFound:
LOG.warn(_("Instance not found."), instance_uuid=instance_uuid)
except Exception as e:
with excutils.save_and_reraise_exception():
self._set_instance_error_state(context, instance_uuid)
+ def _reschedule_or_reraise(self, context, instance, *args, **kwargs):
+ """Try to re-schedule the build or re-raise the original build error to
+ error out the instance.
+ """
+ type_, value, tb = sys.exc_info() # save original exception
+ rescheduled = False
+ instance_uuid = instance['uuid']
+
+ def _log_original_error():
+ LOG.error(_('Build error: %s') %
+ traceback.format_exception(type_, value, tb),
+ instance_uuid=instance_uuid)
+
+ try:
+ self._deallocate_network(context, instance)
+ except Exception:
+            # do not attempt retry if network de-allocation fails:
+ _log_original_error()
+ raise
+
+ try:
+ rescheduled = self._reschedule(context, instance_uuid, *args,
+ **kwargs)
+ except Exception:
+ rescheduled = False
+ LOG.exception(_("Error trying to reschedule"),
+ instance_uuid=instance_uuid)
+
+ if rescheduled:
+ # log the original build error
+ _log_original_error()
+ else:
+ # not re-scheduling
+ raise type_, value, tb
+
+ def _reschedule(self, context, instance_uuid, requested_networks,
+ admin_password, injected_files, is_first_time, **kwargs):
+
+ filter_properties = kwargs.get('filter_properties', {})
+ retry = filter_properties.get('retry', None)
+ if not retry:
+ # no retry information, do not reschedule.
+ LOG.debug(_("Retry info not present, will not reschedule"),
+ instance_uuid=instance_uuid)
+ return
+
+ request_spec = kwargs.get('request_spec', None)
+ if not request_spec:
+ LOG.debug(_("No request spec, will not reschedule"),
+ instance_uuid=instance_uuid)
+ return
+
+ request_spec['num_instances'] = 1
+
+ LOG.debug(_("Re-scheduling instance: attempt %d"),
+ retry['num_attempts'], instance_uuid=instance_uuid)
+ self.scheduler_rpcapi.run_instance(context, FLAGS.compute_topic,
+ request_spec, admin_password, injected_files,
+ requested_networks, is_first_time, filter_properties,
+ reservations=None, call=False)
+ return True
+
@manager.periodic_task
def _check_instance_build_time(self, context):
"""Ensure that instances are not stuck in build."""