Make NBD retry logic more generic, add retry to loop.

Other device implementations require retry logic as well. This change pushes the retry logic up one layer so that it's easy to implement for the other drivers that will benefit from it, and then adds retry to the loop driver. This change also adds some unit test coverage for the loop driver. Change-Id: Iab0d42d5075e9d50b4e7eb8c7fcef12cae281b40
author: Michael Still <mikal@stillhq.com> 2012-12-15 18:32:20 +1100
committer: Michael Still <mikal@stillhq.com> 2012-12-19 12:00:03 +1100
commit: a246c5576d726c7bc385b49bc7b626eb6edcd137 (patch)
tree: e549c2a5ce5f8970d9692d2f44c926669db937a2 /nova/virt
parent: 30ddc85a1046d8792cb9c2f82033124aebe50d0f (diff)
3 files changed, 47 insertions, 16 deletions
diff --git a/nova/virt/disk/mount/api.py b/nova/virt/disk/mount/api.py
index 5354c8553..ac396ff80 100644
--- a/nova/virt/disk/mount/api.py
+++ b/nova/virt/disk/mount/api.py
@@ -16,6 +16,7 @@
 """Support for mounting virtual image files"""
 
 import os
+import time
 
 from nova.openstack.common import importutils
 from nova.openstack.common import log as logging
@@ -23,6 +24,8 @@ from nova import utils
 
 LOG = logging.getLogger(__name__)
 
+MAX_DEVICE_WAIT = 30
+
 
 class Mount(object):
     """Standard mounting operations, that can be overridden by subclasses.
@@ -102,6 +105,26 @@ class Mount(object):
         self.linked = True
         return True
 
+    def _get_dev_retry_helper(self):
+        """Some implementations need to retry their get_dev."""
+        # NOTE(mikal): This method helps implement retries. The implementation
+        # simply calls _get_dev_retry_helper from their get_dev, and implements
+        # _inner_get_dev with their device acquisition logic. The NBD
+        # implementation has an example.
+        start_time = time.time()
+        device = self._inner_get_dev()
+        while not device:
+            LOG.info(_('Device allocation failed. Will retry in 2 seconds.'))
+            time.sleep(2)
+            if time.time() - start_time > MAX_DEVICE_WAIT:
+                LOG.warn(_('Device allocation failed after repeated retries.'))
+                return False
+            device = self._inner_get_dev()
+        return True
+
+    def _inner_get_dev(self):
+        raise NotImplementedError()
+
     def unget_dev(self):
         """Release the block device from the file system namespace."""
         self.linked = False
diff --git a/nova/virt/disk/mount/loop.py b/nova/virt/disk/mount/loop.py
index 180e4d796..667ecee14 100644
--- a/nova/virt/disk/mount/loop.py
+++ b/nova/virt/disk/mount/loop.py
@@ -26,11 +26,14 @@ class LoopMount(api.Mount):
     """loop back support for raw images."""
     mode = 'loop'
 
-    def get_dev(self):
+    def _inner_get_dev(self):
         out, err = utils.trycmd('losetup', '--find', '--show', self.image,
                                 run_as_root=True)
         if err:
             self.error = _('Could not attach image to loopback: %s') % err
+            LOG.info(_('Loop mount error: %s'), self.error)
+            self.linked = False
+            self.device = None
             return False
 
         self.device = out.strip()
@@ -38,9 +41,22 @@ class LoopMount(api.Mount):
         self.linked = True
         return True
 
+    def get_dev(self):
+        # NOTE(mikal): the retry is required here in case we are low on loop
+        # devices. Note however that modern kernels will use more loop devices
+        # if they exist. If you're seeing lots of retries, consider adding
+        # more devices.
+        return self._get_dev_retry_helper()
+
     def unget_dev(self):
         if not self.linked:
             return
+
+        # NOTE(mikal): On some kernels, losetup -d will intermittently fail,
+        # thus leaking a loop device unless the losetup --detach is retried:
+        # https://lkml.org/lkml/2012/9/28/62
         LOG.debug(_("Release loop device %s"), self.device)
-        utils.execute('losetup', '--detach', self.device, run_as_root=True)
+        utils.execute('losetup', '--detach', self.device, run_as_root=True,
+                      attempts=3)
         self.linked = False
+        self.device = None
diff --git a/nova/virt/disk/mount/nbd.py b/nova/virt/disk/mount/nbd.py
index 1b6cc0778..ec9bde73c 100644
--- a/nova/virt/disk/mount/nbd.py
+++ b/nova/virt/disk/mount/nbd.py
@@ -37,7 +37,6 @@ CONF = cfg.CONF
 CONF.register_opts(nbd_opts)
 
 NBD_DEVICE_RE = re.compile('nbd[0-9]+')
-MAX_NBD_WAIT = 30
 
 
 class NbdMount(api.Mount):
@@ -89,6 +88,7 @@ class NbdMount(api.Mount):
                                  run_as_root=True)
         if err:
             self.error = _('qemu-nbd error: %s') % err
+            LOG.info(_('NBD mount error: %s'), self.error)
             return False
 
         # NOTE(vish): this forks into another process, so give it a chance
@@ -100,12 +100,15 @@ class NbdMount(api.Mount):
                 break
             time.sleep(1)
         else:
+            self.error = _('nbd device %s did not show up') % device
+            LOG.info(_('NBD mount error: %s'), self.error)
+
+            # Cleanup
             _out, err = utils.trycmd('qemu-nbd', '-d', device,
                                      run_as_root=True)
             if err:
                 LOG.warn(_('Detaching from erroneous nbd device returned '
                            'error: %s'), err)
-            self.error = _('nbd device %s did not show up') % device
             return False
 
         self.error = ''
@@ -114,18 +117,7 @@ class NbdMount(api.Mount):
 
     def get_dev(self):
         """Retry requests for NBD devices."""
-        start_time = time.time()
-        device = self._inner_get_dev()
-        while not device:
-            LOG.info(_('nbd device allocation failed. Will retry in 2 '
-                       'seconds.'))
-            time.sleep(2)
-            if time.time() - start_time > MAX_NBD_WAIT:
-                LOG.warn(_('nbd device allocation failed after repeated '
-                           'retries.'))
-                return False
-            device = self._inner_get_dev()
-        return True
+        return self._get_dev_retry_helper()
 
     def unget_dev(self):
         if not self.linked:
author	Michael Still <mikal@stillhq.com>	2012-12-15 18:32:20 +1100
committer	Michael Still <mikal@stillhq.com>	2012-12-19 12:00:03 +1100
commit	a246c5576d726c7bc385b49bc7b626eb6edcd137 (patch)
tree	e549c2a5ce5f8970d9692d2f44c926669db937a2 /nova/virt
parent	30ddc85a1046d8792cb9c2f82033124aebe50d0f (diff)