diff options
author | Chris Behrens <cbehrens@codestud.com> | 2013-06-25 20:00:02 +0000 |
---|---|---|
committer | Chris Behrens <cbehrens@codestud.com> | 2013-06-26 19:58:16 +0000 |
commit | f0cf1c0fc14ba44ae6af5aad93ccd2fe010094a5 (patch) | |
tree | a803f7a00fdfdb1318f7503f2e3383118abc92fe | |
parent | 9331c5c1115c7d8cc5bcab71b1100eeea1ce72fe (diff) | |
download | nova-f0cf1c0fc14ba44ae6af5aad93ccd2fe010094a5.tar.gz nova-f0cf1c0fc14ba44ae6af5aad93ccd2fe010094a5.tar.xz nova-f0cf1c0fc14ba44ae6af5aad93ccd2fe010094a5.zip |
Allow retrying network allocations separately
Introduce a new config option, 'network_allocate_retries', that allows
one to retry network allocations. The default is 0 (no retries), which
matches the current behavior.
Network allocations currently get retried only via a full retry of the
build through the scheduler, when such scheduler retries are enabled. This
patch reduces the need to re-schedule for simple network allocation issues.
The retrying happens in the network alloc async greenthread, so for virt
drivers that support the new NetworkModel, the retrying potentially
happens in the background while the image is being downloaded, etc.
DocImpact
Change-Id: I1a5fdcccbb736fc0b1d8c0cbc3b45a8372a6aef7
-rwxr-xr-x | nova/compute/manager.py | 69 | ||||
-rw-r--r-- | nova/tests/compute/test_compute.py | 97 |
2 files changed, 147 insertions, 19 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 22881f5bd..e3d84b1fd 100755 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -106,6 +106,9 @@ compute_opts = [ default=False, help='Whether to start guests that were running before the ' 'host rebooted'), + cfg.IntOpt('network_allocate_retries', + default=0, + help="Number of times to retry network allocation on failures"), ] interval_opts = [ @@ -1147,6 +1150,50 @@ class ComputeManager(manager.SchedulerDependentManager): expected_task_state=(task_states.SCHEDULING, None)) + def _allocate_network_async(self, context, instance, requested_networks, + macs, security_groups, is_vpn): + """Method used to allocate networks in the background. + + Broken out for testing. + """ + LOG.debug(_("Allocating IP information in the background."), + instance=instance) + retries = CONF.network_allocate_retries + if retries < 0: + LOG.warn(_("Treating negative config value (%(retries)s) for " + "'network_allocate_retries' as 0."), + {'retries': retries}) + attempts = retries > 1 and retries + 1 or 1 + retry_time = 1 + for attempt in range(1, attempts + 1): + try: + nwinfo = self.network_api.allocate_for_instance( + context, instance, vpn=is_vpn, + requested_networks=requested_networks, + macs=macs, + conductor_api=self.conductor_api, + security_groups=security_groups) + LOG.debug(_('Instance network_info: |%s|'), nwinfo, + instance=instance) + return nwinfo + except Exception: + exc_info = sys.exc_info() + log_info = {'attempt': attempt, + 'attempts': attempts} + if attempt == attempts: + LOG.exception(_('Instance failed network setup ' + 'after %(attempts)d attempt(s)'), + log_info) + raise exc_info[0], exc_info[1], exc_info[2] + LOG.warn(_('Instance failed network setup ' + '(attempt %(attempt)d of %(attempts)d)'), + log_info, instance=instance) + time.sleep(retry_time) + retry_time *= 2 + if retry_time > 30: + retry_time = 30 + # Not reached. 
+ def _allocate_network(self, context, instance, requested_networks, macs, security_groups): """Start network allocation asynchronously. Return an instance @@ -1161,25 +1208,9 @@ class ComputeManager(manager.SchedulerDependentManager): task_state=task_states.NETWORKING, expected_task_state=None) is_vpn = pipelib.is_vpn_image(instance['image_ref']) - - def async_alloc(): - LOG.debug(_("Allocating IP information in the background."), - instance=instance) - try: - nwinfo = self.network_api.allocate_for_instance( - context, instance, vpn=is_vpn, - requested_networks=requested_networks, - macs=macs, - conductor_api=self.conductor_api, - security_groups=security_groups) - except Exception: - with excutils.save_and_reraise_exception(): - LOG.exception(_('Instance failed network setup'), - instance=instance) - LOG.debug(_('Instance network_info: |%s|'), nwinfo, - instance=instance) - return nwinfo - return network_model.NetworkInfoAsyncWrapper(async_alloc) + return network_model.NetworkInfoAsyncWrapper( + self._allocate_network_async, context, instance, + requested_networks, macs, security_groups, is_vpn) def _prep_block_device(self, context, instance, bdms): """Set up the block device for an instance with error logging.""" diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py index 7953f8b63..466a039a1 100644 --- a/nova/tests/compute/test_compute.py +++ b/nova/tests/compute/test_compute.py @@ -1079,6 +1079,103 @@ class ComputeTestCase(BaseTestCase): self._assert_state({'vm_state': vm_states.ERROR, 'task_state': None}) + def test_allocate_network_succeeds_after_retries(self): + # Undo setUp() stubs as this is a true unit test + self.stubs.UnsetAll() + self.flags(network_allocate_retries=8) + + nwapi = self.compute.network_api + self.mox.StubOutWithMock(nwapi, 'allocate_for_instance') + self.mox.StubOutWithMock(time, 'sleep') + + instance = {} + is_vpn = 'fake-is-vpn' + req_networks = 'fake-req-networks' + macs = 'fake-macs' + sec_groups = 
'fake-sec-groups' + final_result = 'meow' + + expected_sleep_times = [1, 2, 4, 8, 16, 30, 30, 30] + + for sleep_time in expected_sleep_times: + nwapi.allocate_for_instance( + self.context, instance, vpn=is_vpn, + requested_networks=req_networks, macs=macs, + conductor_api=self.compute.conductor_api, + security_groups=sec_groups).AndRaise( + test.TestingException()) + time.sleep(sleep_time) + + nwapi.allocate_for_instance( + self.context, instance, vpn=is_vpn, + requested_networks=req_networks, macs=macs, + conductor_api=self.compute.conductor_api, + security_groups=sec_groups).AndReturn(final_result) + + self.mox.ReplayAll() + + res = self.compute._allocate_network_async(self.context, instance, + req_networks, + macs, + sec_groups, + is_vpn) + self.assertEqual(final_result, res) + + def test_allocate_network_fails(self): + # Undo setUp() stubs as this is a true unit test + self.stubs.UnsetAll() + self.flags(network_allocate_retries=0) + + nwapi = self.compute.network_api + self.mox.StubOutWithMock(nwapi, 'allocate_for_instance') + + instance = {} + is_vpn = 'fake-is-vpn' + req_networks = 'fake-req-networks' + macs = 'fake-macs' + sec_groups = 'fake-sec-groups' + + nwapi.allocate_for_instance( + self.context, instance, vpn=is_vpn, + requested_networks=req_networks, macs=macs, + conductor_api=self.compute.conductor_api, + security_groups=sec_groups).AndRaise(test.TestingException()) + + self.mox.ReplayAll() + + self.assertRaises(test.TestingException, + self.compute._allocate_network_async, + self.context, instance, req_networks, macs, + sec_groups, is_vpn) + + def test_allocate_network_neg_conf_value_treated_as_zero(self): + # Undo setUp() stubs as this is a true unit test + self.stubs.UnsetAll() + self.flags(network_allocate_retries=-1) + + nwapi = self.compute.network_api + self.mox.StubOutWithMock(nwapi, 'allocate_for_instance') + + instance = {} + is_vpn = 'fake-is-vpn' + req_networks = 'fake-req-networks' + macs = 'fake-macs' + sec_groups = 'fake-sec-groups' + 
+ # Only attempted once. + nwapi.allocate_for_instance( + self.context, instance, vpn=is_vpn, + requested_networks=req_networks, macs=macs, + conductor_api=self.compute.conductor_api, + security_groups=sec_groups).AndRaise(test.TestingException()) + + self.mox.ReplayAll() + + self.assertRaises(test.TestingException, + self.compute._allocate_network_async, + self.context, instance, req_networks, macs, + sec_groups, is_vpn) + def test_run_instance_dealloc_network_instance_not_found(self): """spawn network deallocate test. |