Adds generic retries for build failures.

Add a generic scheduler retry for build failures. Failed build requests are cast back to the scheduler for retry until they succeed or the maximum number of attempts is reached. The number of attempts is configurable and can be set to 1 to disable retries altogether. Partially implements blueprint: scheduler-resource-race DocImpact: Adds a new capability to the filter scheduler to enable retries of scheduling requests. 1) New flag: scheduler_max_attempts (int) - Number of attempts to make to schedule an instance before giving up and setting the instance to error. 2) New RetryFilter. Avoids re-scheduling to the same host multiple times. (nova.scheduler.filters.RetryFilter) Change-Id: I1127caeed4418c75372a42ca7fafacb4f061ffe3
author: Brian Elliott <brian.elliott@rackspace.com> 2012-06-21 00:44:24 +0000
committer: Brian Elliott <brian.elliott@rackspace.com> 2012-07-17 22:15:41 +0000
commit: 83fece1aa7e124a2adae05d95c8c6cecc12d5d41 (patch)
tree: 2fcaa6ff5838fe6714b372e9cb5d3c40d080d605 /nova/tests
parent: acb158714c562d3142bf2f3f560dc374daa2df7d (diff)
3 files changed, 219 insertions, 5 deletions
diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py
index 28020c762..3dcf9d6b4 100644
--- a/nova/tests/compute/test_compute.py
+++ b/nova/tests/compute/test_compute.py
@@ -20,6 +20,7 @@
 
 import copy
 import datetime
+import functools
 import sys
 import time
 
@@ -102,6 +103,11 @@ def nop_report_driver_status(self):
     pass
 
 
+class FakeSchedulerAPI(object):
+    def run_instance(self, *args, **kwargs):
+        pass
+
+
 class BaseTestCase(test.TestCase):
 
     def setUp(self):
@@ -129,6 +135,9 @@ class BaseTestCase(test.TestCase):
         self.stubs.Set(rpc, 'call', rpc_call_wrapper)
         self.stubs.Set(rpc, 'cast', rpc_cast_wrapper)
 
+        fake_rpcapi = FakeSchedulerAPI()
+        self.stubs.Set(self.compute, 'scheduler_rpcapi', fake_rpcapi)
+
     def tearDown(self):
         fake_image.FakeImageService_reset()
         instances = db.instance_get_all(self.context.elevated())
@@ -4290,3 +4299,100 @@ class DisabledInstanceTypesTestCase(BaseTestCase):
         self.assertNotRaises(exception.FlavorNotFound,
             self.compute_api.resize, self.context, instance, None,
             exc_msg="Disabled flavors can be migrated to")
+
+
+class ComputeReschedulingTestCase(BaseTestCase):
+    """Tests related to re-scheduling build requests"""
+
+    def setUp(self):
+        super(ComputeReschedulingTestCase, self).setUp()
+
+        self._reschedule = self._reschedule_partial()
+
+    def _reschedule_partial(self):
+        uuid = "12-34-56-78-90"
+
+        requested_networks = None
+        admin_password = None
+        injected_files = None
+        is_first_time = False
+
+        return functools.partial(self.compute._reschedule, self.context, uuid,
+                requested_networks, admin_password, injected_files,
+                is_first_time)
+
+    def test_reschedule_no_filter_properties(self):
+        """no filter_properties will disable re-scheduling"""
+        self.assertFalse(self._reschedule())
+
+    def test_reschedule_no_retry_info(self):
+        """no retry info will also disable re-scheduling"""
+        filter_properties = {}
+        self.assertFalse(self._reschedule(filter_properties=filter_properties))
+
+    def test_reschedule_no_request_spec(self):
+        """no request spec will also disable re-scheduling"""
+        retry = dict(num_attempts=1)
+        filter_properties = dict(retry=retry)
+        self.assertFalse(self._reschedule(filter_properties=filter_properties))
+
+    def test_reschedule_success(self):
+        retry = dict(num_attempts=1)
+        filter_properties = dict(retry=retry)
+        request_spec = {'num_instances': 42}
+        self.assertTrue(self._reschedule(filter_properties=filter_properties,
+            request_spec=request_spec))
+        self.assertEqual(1, request_spec['num_instances'])
+
+
+class ThatsNoOrdinaryRabbitException(Exception):
+    pass
+
+
+class ComputeReschedulingExceptionTestCase(BaseTestCase):
+    """Tests for re-scheduling exception handling logic"""
+
+    def setUp(self):
+        super(ComputeReschedulingExceptionTestCase, self).setUp()
+
+        # cause _spawn to raise an exception to test the exception logic:
+        def exploding_spawn(*args, **kwargs):
+            raise ThatsNoOrdinaryRabbitException()
+        self.stubs.Set(self.compute, '_spawn',
+                exploding_spawn)
+
+        self.instance_uuid = self._create_fake_instance()['uuid']
+
+    def test_exception_with_rescheduling_disabled(self):
+        """Spawn fails and re-scheduling is disabled."""
+        # this won't be re-scheduled:
+        self.assertRaises(ThatsNoOrdinaryRabbitException,
+                self.compute._run_instance, self.context, self.instance_uuid)
+
+    def test_exception_with_rescheduling_enabled(self):
+        """Spawn fails and re-scheduling is enabled.  Original exception
+        should *not* be re-raised.
+        """
+        # provide the expected status so that this one will be re-scheduled:
+        retry = dict(num_attempts=1)
+        filter_properties = dict(retry=retry)
+        request_spec = dict(num_attempts=1)
+        self.assertNotRaises(ThatsNoOrdinaryRabbitException,
+                self.compute._run_instance, self.context, self.instance_uuid,
+                filter_properties=filter_properties, request_spec=request_spec)
+
+    def test_exception_context_cleared(self):
+        """Test with no rescheduling and an additional exception occurs
+        clearing the original build error's exception context.
+        """
+        # clears the original exception context:
+        class FleshWoundException(Exception):
+            pass
+
+        def reschedule_explode(*args, **kwargs):
+            raise FleshWoundException()
+        self.stubs.Set(self.compute, '_reschedule', reschedule_explode)
+
+        # the original exception should now be raised:
+        self.assertRaises(ThatsNoOrdinaryRabbitException,
+                self.compute._run_instance, self.context, self.instance_uuid)
diff --git a/nova/tests/scheduler/test_filter_scheduler.py b/nova/tests/scheduler/test_filter_scheduler.py
index 73c23e07b..4fccb6299 100644
--- a/nova/tests/scheduler/test_filter_scheduler.py
+++ b/nova/tests/scheduler/test_filter_scheduler.py
@@ -79,7 +79,7 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
         sched = fakes.FakeFilterScheduler()
         fake_context = context.RequestContext('user', 'project')
         self.assertRaises(NotImplementedError, sched._schedule, fake_context,
-                          "foo", {})
+                          "foo", {}, {})
 
     def test_scheduler_includes_launch_index(self):
         ctxt = "fake-context"
@@ -111,18 +111,18 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
         self.mox.StubOutWithMock(self.driver, '_provision_resource')
 
         self.driver._schedule(context_fake, 'compute',
-                              request_spec, **fake_kwargs
+                              request_spec, {}, **fake_kwargs
                               ).AndReturn(['host1', 'host2'])
         # instance 1
         self.driver._provision_resource(
             ctxt, 'host1',
             mox.Func(_has_launch_index(0)), None,
-            fake_kwargs).AndReturn(instance1)
+            {}, fake_kwargs).AndReturn(instance1)
         # instance 2
         self.driver._provision_resource(
             ctxt, 'host2',
             mox.Func(_has_launch_index(1)), None,
-            fake_kwargs).AndReturn(instance2)
+            {}, fake_kwargs).AndReturn(instance2)
         self.mox.ReplayAll()
 
         self.driver.schedule_run_instance(context_fake, request_spec, None,
@@ -160,7 +160,7 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
                                                 'vcpus': 1}}
         self.mox.ReplayAll()
         weighted_hosts = sched._schedule(fake_context, 'compute',
-                request_spec)
+                request_spec, {})
         self.assertEquals(len(weighted_hosts), 10)
         for weighted_host in weighted_hosts:
             self.assertTrue(weighted_host.host_state is not None)
@@ -176,3 +176,88 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
         hostinfo.update_from_compute_node(dict(memory_mb=1000,
                 local_gb=0, vcpus=1))
         self.assertEquals(1000 - 128, fn(hostinfo, {}))
+
+    def test_max_attempts(self):
+        self.flags(scheduler_max_attempts=4)
+
+        sched = fakes.FakeFilterScheduler()
+        self.assertEqual(4, sched._max_attempts())
+
+    def test_invalid_max_attempts(self):
+        self.flags(scheduler_max_attempts=0)
+
+        sched = fakes.FakeFilterScheduler()
+        self.assertRaises(exception.NovaException, sched._max_attempts)
+
+    def test_retry_disabled(self):
+        """Retry info should not get populated when re-scheduling is off"""
+        self.flags(scheduler_max_attempts=1)
+        sched = fakes.FakeFilterScheduler()
+
+        instance_properties = {}
+        request_spec = dict(instance_properties=instance_properties)
+        filter_properties = {}
+
+        sched._schedule(self.context, 'compute', request_spec,
+                filter_properties=filter_properties)
+
+        # should not have retry info in the populated filter properties:
+        self.assertFalse("retry" in filter_properties)
+
+    def test_retry_attempt_one(self):
+        """Test retry logic on initial scheduling attempt"""
+        self.flags(scheduler_max_attempts=2)
+        sched = fakes.FakeFilterScheduler()
+
+        instance_properties = {}
+        request_spec = dict(instance_properties=instance_properties)
+        filter_properties = {}
+
+        sched._schedule(self.context, 'compute', request_spec,
+                filter_properties=filter_properties)
+
+        num_attempts = filter_properties['retry']['num_attempts']
+        self.assertEqual(1, num_attempts)
+
+    def test_retry_attempt_two(self):
+        """Test retry logic when re-scheduling"""
+        self.flags(scheduler_max_attempts=2)
+        sched = fakes.FakeFilterScheduler()
+
+        instance_properties = {}
+        request_spec = dict(instance_properties=instance_properties)
+
+        retry = dict(num_attempts=1)
+        filter_properties = dict(retry=retry)
+
+        sched._schedule(self.context, 'compute', request_spec,
+                filter_properties=filter_properties)
+
+        num_attempts = filter_properties['retry']['num_attempts']
+        self.assertEqual(2, num_attempts)
+
+    def test_retry_exceeded_max_attempts(self):
+        """Test for necessary explosion when max retries is exceeded"""
+        self.flags(scheduler_max_attempts=2)
+        sched = fakes.FakeFilterScheduler()
+
+        instance_properties = {}
+        request_spec = dict(instance_properties=instance_properties)
+
+        retry = dict(num_attempts=2)
+        filter_properties = dict(retry=retry)
+
+        self.assertRaises(exception.NoValidHost, sched._schedule, self.context,
+                'compute', request_spec, filter_properties=filter_properties)
+
+    def test_add_retry_host(self):
+        retry = dict(num_attempts=1, hosts=[])
+        filter_properties = dict(retry=retry)
+        host = "fakehost"
+
+        sched = fakes.FakeFilterScheduler()
+        sched._add_retry_host(filter_properties, host)
+
+        hosts = filter_properties['retry']['hosts']
+        self.assertEqual(1, len(hosts))
+        self.assertEqual(host, hosts[0])
diff --git a/nova/tests/scheduler/test_host_filters.py b/nova/tests/scheduler/test_host_filters.py
index c6fabc1b5..759bd70ec 100644
--- a/nova/tests/scheduler/test_host_filters.py
+++ b/nova/tests/scheduler/test_host_filters.py
@@ -885,3 +885,26 @@ class HostFiltersTestCase(test.TestCase):
         host = fakes.FakeHostState('host1', 'compute',
             {'capabilities': capabilities, 'service': service})
         self.assertFalse(filt_cls.host_passes(host, filter_properties))
+
+    def test_retry_filter_disabled(self):
+        """Test case where retry/re-scheduling is disabled"""
+        filt_cls = self.class_map['RetryFilter']()
+        host = fakes.FakeHostState('host1', 'compute', {})
+        filter_properties = {}
+        self.assertTrue(filt_cls.host_passes(host, filter_properties))
+
+    def test_retry_filter_pass(self):
+        """Host not previously tried"""
+        filt_cls = self.class_map['RetryFilter']()
+        host = fakes.FakeHostState('host1', 'compute', {})
+        retry = dict(num_attempts=1, hosts=['host2', 'host3'])
+        filter_properties = dict(retry=retry)
+        self.assertTrue(filt_cls.host_passes(host, filter_properties))
+
+    def test_retry_filter_fail(self):
+        """Host was already tried"""
+        filt_cls = self.class_map['RetryFilter']()
+        host = fakes.FakeHostState('host1', 'compute', {})
+        retry = dict(num_attempts=1, hosts=['host3', 'host1'])
+        filter_properties = dict(retry=retry)
+        self.assertFalse(filt_cls.host_passes(host, filter_properties))
author	Brian Elliott <brian.elliott@rackspace.com>	2012-06-21 00:44:24 +0000
committer	Brian Elliott <brian.elliott@rackspace.com>	2012-07-17 22:15:41 +0000
commit	83fece1aa7e124a2adae05d95c8c6cecc12d5d41 (patch)
tree	2fcaa6ff5838fe6714b372e9cb5d3c40d080d605 /nova/tests
parent	acb158714c562d3142bf2f3f560dc374daa2df7d (diff)