From baf7e02f29600e79eacb6c0f747075afeb74fdd5 Mon Sep 17 00:00:00 2001 From: Chris Behrens Date: Thu, 15 Dec 2011 14:16:42 -0800 Subject: Fix scheduler error handler Fixes bug 904971 Scheduler error handler was looking for instance_id when it may or may not exist. Added the proper code for it to determine whether the instance was actually created in the DB or not and how to find its ID. Note: there's some pretty nasty stuff in here, but unavoidable without larger changes. I'd like to hold off on these larger changes, because the problem should be solved with some of the scalability work coming. Tests included. Change-Id: Ief5fde8128437c9dc257af9c4d0c2950d0962ce5 --- nova/compute/api.py | 2 +- nova/scheduler/chance.py | 5 ++++- nova/scheduler/distributed_scheduler.py | 7 ++++++- nova/scheduler/driver.py | 8 ++++++-- nova/scheduler/manager.py | 29 ++++++++++++++++++++--------- nova/scheduler/simple.py | 4 ++++ nova/tests/scheduler/test_scheduler.py | 31 +++++++++++++++++++++++++++++-- 7 files changed, 70 insertions(+), 16 deletions(-) diff --git a/nova/compute/api.py b/nova/compute/api.py index 3b79e660d..e939de62c 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -310,7 +310,7 @@ class API(base.Base): context, instance_type, image, base_options, security_group, block_device_mapping) # Tells scheduler we created the instance already. - base_options['id'] = instance['id'] + base_options['uuid'] = instance['uuid'] rpc_method = rpc.cast else: # We need to wait for the scheduler to create the instance diff --git a/nova/scheduler/chance.py b/nova/scheduler/chance.py index fe47e4f54..2995bda70 100644 --- a/nova/scheduler/chance.py +++ b/nova/scheduler/chance.py @@ -74,7 +74,10 @@ class ChanceScheduler(driver.Scheduler): driver.cast_to_compute_host(context, host, 'run_instance', instance_uuid=instance['uuid'], **kwargs) instances.append(driver.encode_instance(instance)) - + # So if we loop around, create_instance_db_entry will actually + # create a new entry, instead of assume it's been created + # already + del request_spec['instance_properties']['uuid'] return instances def schedule_prep_resize(self, context, request_spec, *args, **kwargs): diff --git a/nova/scheduler/distributed_scheduler.py b/nova/scheduler/distributed_scheduler.py index 888594731..1d149a139 100644 --- a/nova/scheduler/distributed_scheduler.py +++ b/nova/scheduler/distributed_scheduler.py @@ -173,7 +173,12 @@ class DistributedScheduler(driver.Scheduler): instance = self.create_instance_db_entry(context, request_spec) driver.cast_to_compute_host(context, weighted_host.host, 'run_instance', instance_uuid=instance['uuid'], **kwargs) - return driver.encode_instance(instance, local=True) + inst = driver.encode_instance(instance, local=True) + # So if another instance is created, create_instance_db_entry will + # actually create a new entry, instead of assume it's been created + # already + del request_spec['instance_properties']['uuid'] + return inst def _make_weighted_host_from_blob(self, blob): """Returns the decrypted blob as a WeightedHost object diff --git a/nova/scheduler/driver.py b/nova/scheduler/driver.py index 8c7945200..85acf6a09 100644 --- a/nova/scheduler/driver.py +++ b/nova/scheduler/driver.py @@ -147,9 +147,9 @@ class Scheduler(object): def create_instance_db_entry(self, context, request_spec): """Create instance DB entry based on request_spec""" base_options = request_spec['instance_properties'] - if base_options.get('id'): + if base_options.get('uuid'): # Instance was already created before calling scheduler - return db.instance_get(context, base_options['id']) + return db.instance_get_by_uuid(context, base_options['uuid']) image = request_spec['image'] instance_type = request_spec.get('instance_type') security_group = request_spec.get('security_group', 'default') @@ -158,6 +158,10 @@ class Scheduler(object): instance = self.compute_api.create_db_entry_for_new_instance( context, instance_type, image, base_options, security_group, block_device_mapping) + # NOTE(comstud): This needs to be set for the generic exception + # checking in scheduler manager, so that it'll set this instance + # to ERROR properly. + base_options['uuid'] = instance['uuid'] return instance def schedule(self, context, topic, method, *_args, **_kwargs): diff --git a/nova/scheduler/manager.py b/nova/scheduler/manager.py index 38dda9931..7edc4e4f2 100644 --- a/nova/scheduler/manager.py +++ b/nova/scheduler/manager.py @@ -108,18 +108,29 @@ class SchedulerManager(manager.Manager): with utils.save_and_reraise_exception(): self._set_instance_error(method, context, ex, *args, **kwargs) - # NOTE (David Subiros) : If the exception is raised ruing run_instance - # method, the DB record probably does not exist yet. + # NOTE (David Subiros) : If the exception is raised during run_instance + # method, we may or may not have an instance_id def _set_instance_error(self, method, context, ex, *args, **kwargs): """Sets VM to Error state""" LOG.warning(_("Failed to schedule_%(method)s: %(ex)s") % locals()) - if method == "start_instance" or method == "run_instance": - instance_id = kwargs['instance_id'] - if instance_id: - LOG.warning(_("Setting instance %(instance_id)s to " - "ERROR state.") % locals()) - db.instance_update(context, instance_id, - {'vm_state': vm_states.ERROR}) + if method != "start_instance" and method != "run_instance": + return + # FIXME(comstud): Clean this up after fully on UUIDs. + instance_id = kwargs.get('instance_uuid', kwargs.get('instance_id')) + if not instance_id: + # FIXME(comstud): We should make this easier. run_instance + # only sends a request_spec, and an instance may or may not + # have been created in the API (or scheduler) already. If it + # was created, there's a 'uuid' set in the instance_properties + # of the request_spec. + request_spec = kwargs.get('request_spec', {}) + properties = request_spec.get('instance_properties', {}) + instance_id = properties.get('uuid', {}) + if instance_id: + LOG.warning(_("Setting instance %(instance_id)s to " + "ERROR state.") % locals()) + db.instance_update(context, instance_id, + {'vm_state': vm_states.ERROR}) # NOTE (masumotok) : This method should be moved to nova.api.ec2.admin. # Based on bexar design summit discussion, diff --git a/nova/scheduler/simple.py b/nova/scheduler/simple.py index 6fed16841..dfd2115f3 100644 --- a/nova/scheduler/simple.py +++ b/nova/scheduler/simple.py @@ -82,6 +82,10 @@ class SimpleScheduler(chance.ChanceScheduler): driver.cast_to_compute_host(context, host, 'run_instance', instance_uuid=instance_ref['uuid'], **_kwargs) instances.append(driver.encode_instance(instance_ref)) + # So if we loop around, create_instance_db_entry will actually + # create a new entry, instead of assume it's been created + # already + del request_spec['instance_properties']['uuid'] return instances def schedule_start_instance(self, context, instance_id, *_args, **_kwargs): diff --git a/nova/tests/scheduler/test_scheduler.py b/nova/tests/scheduler/test_scheduler.py index c0fce0fe3..57218fca3 100644 --- a/nova/tests/scheduler/test_scheduler.py +++ b/nova/tests/scheduler/test_scheduler.py @@ -113,6 +113,7 @@ def _fake_create_instance_db_entry(simple_self, context, request_spec): instance = _create_instance_from_spec(request_spec) global instance_uuids instance_uuids.append(instance['uuid']) + request_spec['instance_properties']['uuid'] = instance['uuid'] return instance @@ -236,15 +237,41 @@ class SchedulerTestCase(test.TestCase): scheduler = manager.SchedulerManager() ins_ref = _create_instance(task_state=task_states.STARTING, vm_state=vm_states.STOPPED) - self.stubs = stubout.StubOutForTesting() self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser) - self.mox.StubOutWithMock(rpc, 'cast', use_mock_anything=True) ctxt = context.get_admin_context() scheduler.start_instance(ctxt, 'topic', instance_id=ins_ref['id']) # assert that the instance goes to ERROR state self._assert_state({'vm_state': vm_states.ERROR, 'task_state': task_states.STARTING}) + def test_no_valid_host_exception_on_run_with_id(self): + """check the vm goes to ERROR state if run_instance fails""" + + def NoValidHost_raiser(context, topic, *args, **kwargs): + raise exception.NoValidHost(_("Test NoValidHost exception")) + scheduler = manager.SchedulerManager() + ins_ref = _create_instance(task_state=task_states.STARTING, + vm_state=vm_states.STOPPED) + self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser) + ctxt = context.get_admin_context() + request_spec = {'instance_properties': {'uuid': ins_ref['uuid']}} + scheduler.run_instance(ctxt, 'topic', request_spec=request_spec) + # assert that the instance goes to ERROR state + self._assert_state({'vm_state': vm_states.ERROR, + 'task_state': task_states.STARTING}) + + def test_no_valid_host_exception_on_run_without_id(self): + """check error handler doesn't raise if instance wasn't created""" + + def NoValidHost_raiser(context, topic, *args, **kwargs): + raise exception.NoValidHost(_("Test NoValidHost exception")) + scheduler = manager.SchedulerManager() + self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser) + ctxt = context.get_admin_context() + request_spec = {'instance_properties': {}} + scheduler.run_instance(ctxt, 'topic', request_spec=request_spec) + # No error + def test_show_host_resources_no_project(self): """No instance are running on the given host.""" -- cgit