From 433e6885fd276aa2b78688745aec3ef50a06452a Mon Sep 17 00:00:00 2001 From: Brian Elliott Date: Tue, 16 Apr 2013 21:42:23 +0000 Subject: Refactor _run_instance() to unify control flow _run_instance() had multiple points where various classes of errors were handled. This patch classifies the possible error/exit conditions into the following categories: * success * rescheduled - an error occurred, but the build will be returned to scheduled for another attempt. * aborted - build was aborted for a non-error. (e.x. instace was deleted during spawn) * fatal - general case of uncaught exceptions resulting in the instance going to error. blueprint create-error-notification Change-Id: I3f01753bde8af0750ba9067c8a98ded45d5a0eb7 --- nova/compute/manager.py | 205 +++++++++++++++++++++---------------- nova/exception.py | 9 ++ nova/tests/compute/test_compute.py | 63 +++++++----- 3 files changed, 161 insertions(+), 116 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index dc0725933..1e03d6f7e 100755 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -834,6 +834,56 @@ class ComputeManager(manager.SchedulerDependentManager): filter_properties, requested_networks, injected_files, admin_password, is_first_time, node, instance): """Launch a new instance with specified options.""" + + extra_usage_info = {} + + def notify(status, msg=None): + """Send a create.{start,end} notification.""" + type_ = "create.%(status)s" % dict(status=status) + self._notify_about_instance_usage(context, instance, type_, + extra_usage_info=extra_usage_info) + + try: + image_meta = self._prebuild_instance(context, instance) + if image_meta: + extra_usage_info = {"image_name": image_meta['name']} + + notify("start") # notify that build is starting + + instance = self._build_instance(context, request_spec, + filter_properties, requested_networks, injected_files, + admin_password, is_first_time, node, instance, image_meta) + notify("end") # notify that build is done + + except exception.RescheduledException as e: + # Instance build encountered an error, and has been rescheduled. + pass + + except exception.BuildAbortException as e: + # Instance build aborted due to a non-failure + LOG.info(e) + + except Exception as e: + # Instance build encountered a non-recoverable error: + with excutils.save_and_reraise_exception(): + self._set_instance_error_state(context, instance['uuid']) + + def _prebuild_instance(self, context, instance): + self._check_instance_exists(context, instance) + + try: + self._start_building(context, instance) + except exception.InstanceNotFound: + msg = _("Instance disappeared before we could start it") + # Quickly bail out of here + raise exception.BuildAbortException(instance_uuid=instance['uuid'], + reason=msg) + + return self._check_image_size(context, instance) + + def _build_instance(self, context, request_spec, filter_properties, + requested_networks, injected_files, admin_password, is_first_time, + node, instance, image_meta): context = context.elevated() # If quantum security groups pass requested security @@ -843,97 +893,81 @@ class ComputeManager(manager.SchedulerDependentManager): else: security_groups = [] - try: - self._check_instance_exists(context, instance) - - try: - self._start_building(context, instance) - except exception.InstanceNotFound: - LOG.info(_("Instance disappeared before we could start it"), - instance=instance) - # Quickly bail out of here - return - - image_meta = self._check_image_size(context, instance) - - if node is None: - node = self.driver.get_available_nodes()[0] - LOG.debug(_("No node specified, defaulting to %(node)s") % - locals()) - - if image_meta: - extra_usage_info = {"image_name": image_meta['name']} - else: - extra_usage_info = {} - - self._notify_about_instance_usage( - context, instance, "create.start", - extra_usage_info=extra_usage_info) + if node is None: + node = self.driver.get_available_nodes()[0] + LOG.debug(_("No node specified, defaulting to %(node)s") % + locals()) - network_info = None - bdms = self.conductor_api.block_device_mapping_get_all_by_instance( - context, instance) + network_info = None + bdms = self.conductor_api.block_device_mapping_get_all_by_instance( + context, instance) - # b64 decode the files to inject: - injected_files_orig = injected_files - injected_files = self._decode_files(injected_files) + # b64 decode the files to inject: + injected_files_orig = injected_files + injected_files = self._decode_files(injected_files) - rt = self._get_resource_tracker(node) - try: - limits = filter_properties.get('limits', {}) - with rt.instance_claim(context, instance, limits): - macs = self.driver.macs_for_instance(instance) + rt = self._get_resource_tracker(node) + try: + limits = filter_properties.get('limits', {}) + with rt.instance_claim(context, instance, limits): + macs = self.driver.macs_for_instance(instance) - network_info = self._allocate_network(context, instance, - requested_networks, macs, security_groups) + network_info = self._allocate_network(context, instance, + requested_networks, macs, security_groups) - self._instance_update( - context, instance['uuid'], - vm_state=vm_states.BUILDING, - task_state=task_states.BLOCK_DEVICE_MAPPING) + self._instance_update( + context, instance['uuid'], + vm_state=vm_states.BUILDING, + task_state=task_states.BLOCK_DEVICE_MAPPING) - block_device_info = self._prep_block_device( - context, instance, bdms) + block_device_info = self._prep_block_device( + context, instance, bdms) - set_access_ip = (is_first_time and - not instance['access_ip_v4'] and - not instance['access_ip_v6']) + set_access_ip = (is_first_time and + not instance['access_ip_v4'] and + not instance['access_ip_v6']) - instance = self._spawn(context, instance, image_meta, - network_info, block_device_info, - injected_files, admin_password, - set_access_ip=set_access_ip) - except exception.InstanceNotFound: - # the instance got deleted during the spawn - with excutils.save_and_reraise_exception(): - try: - self._deallocate_network(context, instance) - except Exception: - msg = _('Failed to dealloc network ' - 'for deleted instance') - LOG.exception(msg, instance=instance) - except exception.UnexpectedTaskStateError as e: - actual_task_state = e.kwargs.get('actual', None) - if actual_task_state == 'deleting': - msg = _('Instance was deleted during spawn.') - LOG.debug(msg, instance=instance) - else: - raise - except Exception: - exc_info = sys.exc_info() - # try to re-schedule instance: - self._reschedule_or_reraise(context, instance, exc_info, - requested_networks, admin_password, - injected_files_orig, is_first_time, request_spec, - filter_properties, bdms) + instance = self._spawn(context, instance, image_meta, + network_info, block_device_info, + injected_files, admin_password, + set_access_ip=set_access_ip) + except exception.InstanceNotFound: + # the instance got deleted during the spawn + with excutils.save_and_reraise_exception(): + try: + self._deallocate_network(context, instance) + except Exception: + msg = _('Failed to dealloc network ' + 'for deleted instance') + LOG.exception(msg, instance=instance) + except exception.UnexpectedTaskStateError as e: + actual_task_state = e.kwargs.get('actual', None) + if actual_task_state == 'deleting': + msg = _('Instance was deleted during spawn.') + LOG.debug(msg, instance=instance) + raise exception.BuildAbortException( + instance_uuid=instance['uuid'], reason=msg) else: - # Spawn success: - self._notify_about_instance_usage(context, instance, - "create.end", network_info=network_info, - extra_usage_info=extra_usage_info) + raise except Exception: - with excutils.save_and_reraise_exception(): - self._set_instance_error_state(context, instance['uuid']) + exc_info = sys.exc_info() + # try to re-schedule instance: + rescheduled = self._reschedule_or_error(context, instance, + exc_info, requested_networks, admin_password, + injected_files_orig, is_first_time, request_spec, + filter_properties, bdms) + if rescheduled: + # log the original build error + self._log_original_error(exc_info, instance['uuid']) + raise exception.RescheduledException( + instance_uuid=instance['uuid'], + reason=unicode(exc_info[1])) + else: + # not re-scheduling, go to error: + raise exc_info[0], exc_info[1], exc_info[2] + + # spawn success + return instance def _log_original_error(self, exc_info, instance_uuid): type_, value, tb = exc_info @@ -941,7 +975,7 @@ class ComputeManager(manager.SchedulerDependentManager): traceback.format_exception(type_, value, tb), instance_uuid=instance_uuid) - def _reschedule_or_reraise(self, context, instance, exc_info, + def _reschedule_or_error(self, context, instance, exc_info, requested_networks, admin_password, injected_files, is_first_time, request_spec, filter_properties, bdms=None): """Try to re-schedule the build or re-raise the original build error to @@ -982,12 +1016,7 @@ class ComputeManager(manager.SchedulerDependentManager): LOG.exception(_("Error trying to reschedule"), instance_uuid=instance_uuid) - if rescheduled: - # log the original build error - self._log_original_error(exc_info, instance_uuid) - else: - # not re-scheduling - raise exc_info[0], exc_info[1], exc_info[2] + return rescheduled def _reschedule(self, context, request_spec, filter_properties, instance_uuid, scheduler_method, method_args, task_state, diff --git a/nova/exception.py b/nova/exception.py index de922f50e..30b8ae0f1 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -1215,3 +1215,12 @@ class UnsupportedHardware(Invalid): class Base64Exception(NovaException): message = _("Invalid Base 64 data for file %(path)s") + + +class BuildAbortException(NovaException): + message = _("Build of instance %(instance_uuid)s aborted: %(reason)s") + + +class RescheduledException(NovaException): + message = _("Build of instance %(instance_uuid)s was re-scheduled: " + "%(reason)s") diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py index 88434078d..5e16167f4 100644 --- a/nova/tests/compute/test_compute.py +++ b/nova/tests/compute/test_compute.py @@ -7915,26 +7915,28 @@ class InnerTestingException(Exception): pass -class ComputeRescheduleOrReraiseTestCase(BaseTestCase): +class ComputeRescheduleOrErrorTestCase(BaseTestCase): """Test logic and exception handling around rescheduling or re-raising original exceptions when builds fail. """ def setUp(self): - super(ComputeRescheduleOrReraiseTestCase, self).setUp() + super(ComputeRescheduleOrErrorTestCase, self).setUp() self.instance = self._create_fake_instance() - def test_reschedule_or_reraise_called(self): - """Basic sanity check to make sure _reschedule_or_reraise is called + def test_reschedule_or_error_called(self): + """Basic sanity check to make sure _reschedule_or_error is called when a build fails. """ self.mox.StubOutWithMock(self.compute, '_spawn') - self.mox.StubOutWithMock(self.compute, '_reschedule_or_reraise') + self.mox.StubOutWithMock(self.compute, '_reschedule_or_error') - self.compute._spawn(mox.IgnoreArg(), self.instance, None, None, None, - False, None).AndRaise(test.TestingException("BuildError")) - self.compute._reschedule_or_reraise(mox.IgnoreArg(), self.instance, - mox.IgnoreArg(), None, None, None, False, None, {}, []) + self.compute._spawn(mox.IgnoreArg(), self.instance, mox.IgnoreArg(), + [], mox.IgnoreArg(), [], None, set_access_ip=False).AndRaise( + test.TestingException("BuildError")) + self.compute._reschedule_or_error(mox.IgnoreArg(), self.instance, + mox.IgnoreArg(), None, None, None, False, None, {}, []).\ + AndReturn(True) self.mox.ReplayAll() self.compute._run_instance(self.context, None, {}, None, None, None, @@ -7964,11 +7966,16 @@ class ComputeRescheduleOrReraiseTestCase(BaseTestCase): # should raise the deallocation exception, not the original build # error: self.assertRaises(InnerTestingException, - self.compute._reschedule_or_reraise, self.context, + self.compute._reschedule_or_error, self.context, self.instance, exc_info, None, None, None, False, None, {}) def test_reschedule_fail(self): # Test handling of exception from _reschedule. + try: + raise test.TestingException("Original") + except Exception: + exc_info = sys.exc_info() + instance_uuid = self.instance['uuid'] method_args = (None, None, None, None, False, {}) self.mox.StubOutWithMock(self.compute, '_shutdown_instance') @@ -7981,19 +7988,13 @@ class ComputeRescheduleOrReraiseTestCase(BaseTestCase): mox.IgnoreArg()) self.compute._reschedule(self.context, None, instance_uuid, {}, self.compute.scheduler_rpcapi.run_instance, - method_args, task_states.SCHEDULING).AndRaise( + method_args, task_states.SCHEDULING, exc_info).AndRaise( InnerTestingException("Inner")) self.mox.ReplayAll() - try: - raise test.TestingException("Original") - except Exception: - # not re-scheduling, should raise the original build error: - exc_info = sys.exc_info() - self.assertRaises(test.TestingException, - self.compute._reschedule_or_reraise, self.context, - self.instance, exc_info, None, None, None, False, None, {}) + self.assertFalse(self.compute._reschedule_or_error(self.context, + self.instance, exc_info, None, None, None, False, None, {})) def test_reschedule_false(self): # Test not-rescheduling, but no nested exception. @@ -8023,9 +8024,8 @@ class ComputeRescheduleOrReraiseTestCase(BaseTestCase): # re-scheduling is False, the original build error should be # raised here: - self.assertRaises(test.TestingException, - self.compute._reschedule_or_reraise, self.context, - self.instance, exc_info, None, None, None, False, None, {}) + self.assertFalse(self.compute._reschedule_or_error(self.context, + self.instance, exc_info, None, None, None, False, None, {})) def test_reschedule_true(self): # Test behavior when re-scheduling happens. @@ -8057,14 +8057,14 @@ class ComputeRescheduleOrReraiseTestCase(BaseTestCase): # re-scheduling is True, original error is logged, but nothing # is raised: - self.compute._reschedule_or_reraise(self.context, self.instance, + self.compute._reschedule_or_error(self.context, self.instance, exc_info, None, None, None, False, None, {}) def test_no_reschedule_on_delete_during_spawn(self): # instance should not be rescheduled if instance is deleted # during the build self.mox.StubOutWithMock(self.compute, '_spawn') - self.mox.StubOutWithMock(self.compute, '_reschedule_or_reraise') + self.mox.StubOutWithMock(self.compute, '_reschedule_or_error') exc = exception.UnexpectedTaskStateError(expected=task_states.SPAWNING, actual=task_states.DELETING) @@ -8073,7 +8073,7 @@ class ComputeRescheduleOrReraiseTestCase(BaseTestCase): mox.IgnoreArg(), set_access_ip=False).AndRaise(exc) self.mox.ReplayAll() - # test succeeds if mocked method '_reschedule_or_reraise' is not + # test succeeds if mocked method '_reschedule_or_error' is not # called. self.compute._run_instance(self.context, None, {}, None, None, None, False, None, self.instance) @@ -8082,7 +8082,7 @@ class ComputeRescheduleOrReraiseTestCase(BaseTestCase): # instance shouldn't be rescheduled if unexpected task state arises. # the exception should get reraised. self.mox.StubOutWithMock(self.compute, '_spawn') - self.mox.StubOutWithMock(self.compute, '_reschedule_or_reraise') + self.mox.StubOutWithMock(self.compute, '_reschedule_or_error') exc = exception.UnexpectedTaskStateError(expected=task_states.SPAWNING, actual=task_states.SCHEDULING) @@ -8413,12 +8413,19 @@ class ComputeInjectedFilesTestCase(BaseTestCase): ('/d/e/f', base64.b64encode('seespotrun')), ] - def _ror(context, instance, exc_info, requested_networks, + def _roe(context, instance, exc_info, requested_networks, admin_password, injected_files, is_first_time, request_spec, filter_properties, bdms=None): self.assertEqual(expected, injected_files) + return True + + def spawn_explode(context, instance, image_meta, injected_files, + admin_password, nw_info, block_device_info): + # force reschedule logic to execute + raise test.TestingException(_("spawn error")) - self.stubs.Set(self.compute, '_reschedule_or_reraise', _ror) + self.stubs.Set(self.compute.driver, 'spawn', spawn_explode) + self.stubs.Set(self.compute, '_reschedule_or_error', _roe) self.compute.run_instance(self.context, self.instance, injected_files=expected) -- cgit