From 433e6885fd276aa2b78688745aec3ef50a06452a Mon Sep 17 00:00:00 2001 From: Brian Elliott Date: Tue, 16 Apr 2013 21:42:23 +0000 Subject: Refactor _run_instance() to unify control flow _run_instance() had multiple points where various classes of errors were handled. This patch classifies the possible error/exit conditions into the following categories: * success * rescheduled - an error occurred, but the build will be returned to scheduled for another attempt. * aborted - build was aborted for a non-error. (e.x. instace was deleted during spawn) * fatal - general case of uncaught exceptions resulting in the instance going to error. blueprint create-error-notification Change-Id: I3f01753bde8af0750ba9067c8a98ded45d5a0eb7 --- nova/compute/manager.py | 205 +++++++++++++++++++++++++++--------------------- 1 file changed, 117 insertions(+), 88 deletions(-) (limited to 'nova/compute') diff --git a/nova/compute/manager.py b/nova/compute/manager.py index dc0725933..1e03d6f7e 100755 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -834,6 +834,56 @@ class ComputeManager(manager.SchedulerDependentManager): filter_properties, requested_networks, injected_files, admin_password, is_first_time, node, instance): """Launch a new instance with specified options.""" + + extra_usage_info = {} + + def notify(status, msg=None): + """Send a create.{start,end} notification.""" + type_ = "create.%(status)s" % dict(status=status) + self._notify_about_instance_usage(context, instance, type_, + extra_usage_info=extra_usage_info) + + try: + image_meta = self._prebuild_instance(context, instance) + if image_meta: + extra_usage_info = {"image_name": image_meta['name']} + + notify("start") # notify that build is starting + + instance = self._build_instance(context, request_spec, + filter_properties, requested_networks, injected_files, + admin_password, is_first_time, node, instance, image_meta) + notify("end") # notify that build is done + + except exception.RescheduledException as e: + # Instance build encountered an error, and has been rescheduled. + pass + + except exception.BuildAbortException as e: + # Instance build aborted due to a non-failure + LOG.info(e) + + except Exception as e: + # Instance build encountered a non-recoverable error: + with excutils.save_and_reraise_exception(): + self._set_instance_error_state(context, instance['uuid']) + + def _prebuild_instance(self, context, instance): + self._check_instance_exists(context, instance) + + try: + self._start_building(context, instance) + except exception.InstanceNotFound: + msg = _("Instance disappeared before we could start it") + # Quickly bail out of here + raise exception.BuildAbortException(instance_uuid=instance['uuid'], + reason=msg) + + return self._check_image_size(context, instance) + + def _build_instance(self, context, request_spec, filter_properties, + requested_networks, injected_files, admin_password, is_first_time, + node, instance, image_meta): context = context.elevated() # If quantum security groups pass requested security @@ -843,97 +893,81 @@ class ComputeManager(manager.SchedulerDependentManager): else: security_groups = [] - try: - self._check_instance_exists(context, instance) - - try: - self._start_building(context, instance) - except exception.InstanceNotFound: - LOG.info(_("Instance disappeared before we could start it"), - instance=instance) - # Quickly bail out of here - return - - image_meta = self._check_image_size(context, instance) - - if node is None: - node = self.driver.get_available_nodes()[0] - LOG.debug(_("No node specified, defaulting to %(node)s") % - locals()) - - if image_meta: - extra_usage_info = {"image_name": image_meta['name']} - else: - extra_usage_info = {} - - self._notify_about_instance_usage( - context, instance, "create.start", - extra_usage_info=extra_usage_info) + if node is None: + node = self.driver.get_available_nodes()[0] + LOG.debug(_("No node specified, defaulting to %(node)s") % + locals()) - network_info = None - bdms = self.conductor_api.block_device_mapping_get_all_by_instance( - context, instance) + network_info = None + bdms = self.conductor_api.block_device_mapping_get_all_by_instance( + context, instance) - # b64 decode the files to inject: - injected_files_orig = injected_files - injected_files = self._decode_files(injected_files) + # b64 decode the files to inject: + injected_files_orig = injected_files + injected_files = self._decode_files(injected_files) - rt = self._get_resource_tracker(node) - try: - limits = filter_properties.get('limits', {}) - with rt.instance_claim(context, instance, limits): - macs = self.driver.macs_for_instance(instance) + rt = self._get_resource_tracker(node) + try: + limits = filter_properties.get('limits', {}) + with rt.instance_claim(context, instance, limits): + macs = self.driver.macs_for_instance(instance) - network_info = self._allocate_network(context, instance, - requested_networks, macs, security_groups) + network_info = self._allocate_network(context, instance, + requested_networks, macs, security_groups) - self._instance_update( - context, instance['uuid'], - vm_state=vm_states.BUILDING, - task_state=task_states.BLOCK_DEVICE_MAPPING) + self._instance_update( + context, instance['uuid'], + vm_state=vm_states.BUILDING, + task_state=task_states.BLOCK_DEVICE_MAPPING) - block_device_info = self._prep_block_device( - context, instance, bdms) + block_device_info = self._prep_block_device( + context, instance, bdms) - set_access_ip = (is_first_time and - not instance['access_ip_v4'] and - not instance['access_ip_v6']) + set_access_ip = (is_first_time and + not instance['access_ip_v4'] and + not instance['access_ip_v6']) - instance = self._spawn(context, instance, image_meta, - network_info, block_device_info, - injected_files, admin_password, - set_access_ip=set_access_ip) - except exception.InstanceNotFound: - # the instance got deleted during the spawn - with excutils.save_and_reraise_exception(): - try: - self._deallocate_network(context, instance) - except Exception: - msg = _('Failed to dealloc network ' - 'for deleted instance') - LOG.exception(msg, instance=instance) - except exception.UnexpectedTaskStateError as e: - actual_task_state = e.kwargs.get('actual', None) - if actual_task_state == 'deleting': - msg = _('Instance was deleted during spawn.') - LOG.debug(msg, instance=instance) - else: - raise - except Exception: - exc_info = sys.exc_info() - # try to re-schedule instance: - self._reschedule_or_reraise(context, instance, exc_info, - requested_networks, admin_password, - injected_files_orig, is_first_time, request_spec, - filter_properties, bdms) + instance = self._spawn(context, instance, image_meta, + network_info, block_device_info, + injected_files, admin_password, + set_access_ip=set_access_ip) + except exception.InstanceNotFound: + # the instance got deleted during the spawn + with excutils.save_and_reraise_exception(): + try: + self._deallocate_network(context, instance) + except Exception: + msg = _('Failed to dealloc network ' + 'for deleted instance') + LOG.exception(msg, instance=instance) + except exception.UnexpectedTaskStateError as e: + actual_task_state = e.kwargs.get('actual', None) + if actual_task_state == 'deleting': + msg = _('Instance was deleted during spawn.') + LOG.debug(msg, instance=instance) + raise exception.BuildAbortException( + instance_uuid=instance['uuid'], reason=msg) else: - # Spawn success: - self._notify_about_instance_usage(context, instance, - "create.end", network_info=network_info, - extra_usage_info=extra_usage_info) + raise except Exception: - with excutils.save_and_reraise_exception(): - self._set_instance_error_state(context, instance['uuid']) + exc_info = sys.exc_info() + # try to re-schedule instance: + rescheduled = self._reschedule_or_error(context, instance, + exc_info, requested_networks, admin_password, + injected_files_orig, is_first_time, request_spec, + filter_properties, bdms) + if rescheduled: + # log the original build error + self._log_original_error(exc_info, instance['uuid']) + raise exception.RescheduledException( + instance_uuid=instance['uuid'], + reason=unicode(exc_info[1])) + else: + # not re-scheduling, go to error: + raise exc_info[0], exc_info[1], exc_info[2] + + # spawn success + return instance def _log_original_error(self, exc_info, instance_uuid): type_, value, tb = exc_info @@ -941,7 +975,7 @@ class ComputeManager(manager.SchedulerDependentManager): traceback.format_exception(type_, value, tb), instance_uuid=instance_uuid) - def _reschedule_or_reraise(self, context, instance, exc_info, + def _reschedule_or_error(self, context, instance, exc_info, requested_networks, admin_password, injected_files, is_first_time, request_spec, filter_properties, bdms=None): """Try to re-schedule the build or re-raise the original build error to @@ -982,12 +1016,7 @@ class ComputeManager(manager.SchedulerDependentManager): LOG.exception(_("Error trying to reschedule"), instance_uuid=instance_uuid) - if rescheduled: - # log the original build error - self._log_original_error(exc_info, instance_uuid) - else: - # not re-scheduling - raise exc_info[0], exc_info[1], exc_info[2] + return rescheduled def _reschedule(self, context, request_spec, filter_properties, instance_uuid, scheduler_method, method_args, task_state, -- cgit