author     Jenkins <jenkins@review.openstack.org>      2013-03-12 23:59:59 +0000
committer  Gerrit Code Review <review@openstack.org>   2013-03-12 23:59:59 +0000
commit     b0694b33d3a7c32b582030d44231ef31bcc8ec60 (patch)
tree       d0a5a1b1a273e8c60738e8c9e94e68808ec2dc8e
parent     071719a3e16b747bc9c2cdadab829c3c1287eede (diff)
parent     40feb35898ed0a6d57b1f481c165e683796b045c (diff)
Merge "xenapi: Fix reboot with hung volumes"
-rwxr-xr-x  nova/compute/manager.py             34
-rw-r--r--  nova/tests/compute/test_compute.py  13
-rwxr-xr-x  nova/virt/baremetal/driver.py        2
-rwxr-xr-x  nova/virt/driver.py                  5
-rwxr-xr-x  nova/virt/fake.py                    2
-rwxr-xr-x  nova/virt/hyperv/driver.py           2
-rwxr-xr-x  nova/virt/libvirt/driver.py          2
-rwxr-xr-x  nova/virt/powervm/driver.py          5
-rwxr-xr-x  nova/virt/vmwareapi/driver.py        2
-rwxr-xr-x  nova/virt/xenapi/driver.py           5
-rw-r--r--  nova/virt/xenapi/vmops.py           34
-rw-r--r--  nova/virt/xenapi/volumeops.py       28

12 files changed, 116 insertions(+), 18 deletions(-)
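The mechanics of the change, in brief: `ComputeManager.reboot_instance` now builds a `bad_volumes_callback` closure and passes it to the driver's `reboot()`. Every virt driver's signature gains the optional argument, but only the XenAPI driver acts on it, so the manager can pass it unconditionally. A minimal runnable sketch of that wiring, reconstructed from the diff below — `SimpleDriver` and the bare `reboot_instance` function are illustrative stand-ins, not nova code:

```python
# Sketch of the callback wiring; SimpleDriver and reboot_instance are
# hypothetical stand-ins for nova's virt drivers and
# ComputeManager.reboot_instance.

class SimpleDriver(object):
    def reboot(self, context, instance, network_info, reboot_type,
               block_device_info=None, bad_volumes_callback=None):
        # A driver that had to detach hung volumes reports their device
        # names through the callback; other drivers simply ignore it.
        if bad_volumes_callback:
            bad_volumes_callback(['/dev/xvdb'])


def reboot_instance(driver, context, instance, network_info,
                    block_device_info, reboot_type='HARD'):
    def bad_volumes_callback(bad_devices):
        # In the real manager this is _handle_bad_volumes_detached(),
        # which detaches each device via the volume API and then via the
        # compute manager, keeping BDM and Cinder state consistent.
        print('virt layer detached: %s' % bad_devices)

    driver.reboot(context, instance, network_info, reboot_type,
                  block_device_info=block_device_info,
                  bad_volumes_callback=bad_volumes_callback)


reboot_instance(SimpleDriver(), context=None, instance={'name': 'vm0'},
                network_info=[], block_device_info={})
```

The closure is what lets the virt layer stay ignorant of the database: it detaches at the hypervisor level and hands the device names back up, and the manager performs the API-level detach.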
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index bb1447da4..24b79a0ab 100755
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1582,6 +1582,32 @@ class ComputeManager(manager.SchedulerDependentManager):
                 network_info=network_info,
                 extra_usage_info=extra_usage_info)
 
+    def _handle_bad_volumes_detached(self, context, instance, bad_devices,
+                                     block_device_info):
+        """Handle cases where the virt-layer had to detach non-working volumes
+        in order to complete an operation.
+        """
+        for bdm in block_device_info['block_device_mapping']:
+            if bdm.get('mount_device') in bad_devices:
+                try:
+                    volume_id = bdm['connection_info']['data']['volume_id']
+                except KeyError:
+                    continue
+
+                # NOTE(sirp): ideally we'd just call
+                # `compute_api.detach_volume` here but since that hits the
+                # DB directly, that's off limits from within the
+                # compute-manager.
+                #
+                # API-detach
+                LOG.info(_("Detaching from volume api: %s") % volume_id)
+                volume = self.volume_api.get(context, volume_id)
+                self.volume_api.check_detach(context, volume)
+                self.volume_api.begin_detaching(context, volume)
+
+                # Manager-detach
+                self.detach_volume(context, volume_id, instance)
+
     @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
     @reverts_task_state
     @wrap_instance_event
@@ -1616,10 +1642,16 @@ class ComputeManager(manager.SchedulerDependentManager):
                         'expected: %(running)s)') % locals(),
                       context=context, instance=instance)
 
+        def bad_volumes_callback(bad_devices):
+            self._handle_bad_volumes_detached(
+                    context, instance, bad_devices, block_device_info)
+
         try:
             self.driver.reboot(context, instance,
                                self._legacy_nw_info(network_info),
-                               reboot_type, block_device_info)
+                               reboot_type,
+                               block_device_info=block_device_info,
+                               bad_volumes_callback=bad_volumes_callback)
         except Exception, exc:
             LOG.error(_('Cannot reboot instance: %(exc)s'), locals(),
                       context=context, instance=instance)
diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py
index 9da253360..f8c0e86f2 100644
--- a/nova/tests/compute/test_compute.py
+++ b/nova/tests/compute/test_compute.py
@@ -1150,15 +1150,20 @@ class ComputeTestCase(BaseTestCase):
         # this is called with the wrong args, so we have to hack
         # around it.
         reboot_call_info = {}
-        expected_call_info = {'args': (econtext, updated_instance1,
-                                       expected_nw_info, reboot_type,
-                                       fake_block_dev_info),
-                              'kwargs': {}}
+        expected_call_info = {
+            'args': (econtext, updated_instance1, expected_nw_info,
+                     reboot_type),
+            'kwargs': {'block_device_info': fake_block_dev_info}}
 
         def fake_reboot(*args, **kwargs):
             reboot_call_info['args'] = args
             reboot_call_info['kwargs'] = kwargs
 
+            # NOTE(sirp): Since `bad_volumes_callback` is a function defined
+            # within `reboot_instance`, we don't have access to its value and
+            # can't stub it out, thus we skip that comparison.
+            kwargs.pop('bad_volumes_callback')
+
         self.stubs.Set(self.compute.driver, 'reboot', fake_reboot)
 
         # Power state should be updated again
diff --git a/nova/virt/baremetal/driver.py b/nova/virt/baremetal/driver.py
index 94d3f0a92..97d72cc74 100755
--- a/nova/virt/baremetal/driver.py
+++ b/nova/virt/baremetal/driver.py
@@ -275,7 +275,7 @@ class BareMetalDriver(driver.ComputeDriver):
             _update_state(context, node, None, baremetal_states.DELETED)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         node = _get_baremetal_node_by_instance_uuid(instance['uuid'])
         ctx = nova_context.get_admin_context()
         pm = get_power_manager(node=node, instance=instance)
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 11c65519c..71bcaef42 100755
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -248,7 +248,7 @@ class ComputeDriver(object):
         raise NotImplementedError()
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot the specified instance.
 
         After this is called successfully, the instance's state
@@ -261,6 +261,9 @@ class ComputeDriver(object):
         :param network_info:
            :py:meth:`~nova.network.manager.NetworkManager.get_instance_nw_info`
         :param reboot_type: Either a HARD or SOFT reboot
+        :param block_device_info: Info pertaining to attached volumes
+        :param bad_volumes_callback: Function to handle any bad volumes
+            encountered
         """
         raise NotImplementedError()
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 58f303efc..b2b102486 100755
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -131,7 +131,7 @@ class FakeDriver(driver.ComputeDriver):
         update_task_state(task_state=task_states.IMAGE_UPLOADING)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         pass
 
     @staticmethod
diff --git a/nova/virt/hyperv/driver.py b/nova/virt/hyperv/driver.py
index 289f3c4b6..477f8fa2a 100755
--- a/nova/virt/hyperv/driver.py
+++ b/nova/virt/hyperv/driver.py
@@ -54,7 +54,7 @@ class HyperVDriver(driver.ComputeDriver):
                           admin_password, network_info, block_device_info)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         self._vmops.reboot(instance, network_info, reboot_type)
 
     def destroy(self, instance, network_info, block_device_info=None,
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index e7e6b716f..4a9d2f3f5 100755
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -1279,7 +1279,7 @@ class LibvirtDriver(driver.ComputeDriver):
                                         out_path, image_format)
 
     def reboot(self, context, instance, network_info, reboot_type='SOFT',
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot a virtual machine, given an instance reference."""
         if reboot_type == 'SOFT':
             # NOTE(vish): This will attempt to do a graceful shutdown/restart.
diff --git a/nova/virt/powervm/driver.py b/nova/virt/powervm/driver.py
index c388eecfd..d06948f17 100755
--- a/nova/virt/powervm/driver.py
+++ b/nova/virt/powervm/driver.py
@@ -106,13 +106,16 @@ class PowerVMDriver(driver.ComputeDriver):
         self._powervm.destroy(instance['name'], destroy_disks)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot the specified instance.
 
         :param instance: Instance object as returned by DB layer.
         :param network_info:
            :py:meth:`~nova.network.manager.NetworkManager.get_instance_nw_info`
         :param reboot_type: Either a HARD or SOFT reboot
+        :param block_device_info: Info pertaining to attached volumes
+        :param bad_volumes_callback: Function to handle any bad volumes
+            encountered
         """
         pass
diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py
index eeec4c5c2..798a2fde3 100755
--- a/nova/virt/vmwareapi/driver.py
+++ b/nova/virt/vmwareapi/driver.py
@@ -181,7 +181,7 @@ class VMwareESXDriver(driver.ComputeDriver):
         self._vmops.snapshot(context, instance, name, update_task_state)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot VM instance."""
         self._vmops.reboot(instance, network_info)
diff --git a/nova/virt/xenapi/driver.py b/nova/virt/xenapi/driver.py
index decaed2b0..302679685 100755
--- a/nova/virt/xenapi/driver.py
+++ b/nova/virt/xenapi/driver.py
@@ -194,9 +194,10 @@ class XenAPIDriver(driver.ComputeDriver):
         self._vmops.snapshot(context, instance, image_id, update_task_state)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot VM instance."""
-        self._vmops.reboot(instance, reboot_type)
+        self._vmops.reboot(instance, reboot_type,
+                           bad_volumes_callback=bad_volumes_callback)
 
     def set_admin_password(self, instance, new_pass):
         """Set the root/admin password on the VM instance."""
diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py
index 56dd5bd3d..0593be52d 100644
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@@ -272,14 +272,31 @@ class VMOps(object):
                 step=5,
                 total_steps=RESIZE_TOTAL_STEPS)
 
-    def _start(self, instance, vm_ref=None):
+    def _start(self, instance, vm_ref=None, bad_volumes_callback=None):
         """Power on a VM instance."""
         vm_ref = vm_ref or self._get_vm_opaque_ref(instance)
         LOG.debug(_("Starting instance"), instance=instance)
+
+        # Attached volumes that have become non-responsive will prevent a VM
+        # from starting, so scan for these before attempting to start
+        #
+        # In order to make sure this detach is consistent (virt, BDM, cinder),
+        # we only detach in the virt-layer if a callback is provided.
+        if bad_volumes_callback:
+            bad_devices = self._volumeops.find_bad_volumes(vm_ref)
+            for device_name in bad_devices:
+                self._volumeops.detach_volume(
+                        None, instance['name'], device_name)
+
         self._session.call_xenapi('VM.start_on', vm_ref,
                                   self._session.get_xenapi_host(),
                                   False, False)
 
+        # Allow higher-layers a chance to detach bad-volumes as well (in order
+        # to cleanup BDM entries and detach in Cinder)
+        if bad_volumes_callback and bad_devices:
+            bad_volumes_callback(bad_devices)
+
     def _create_disks(self, context, instance, name_label, disk_image_type,
                       image_meta, block_device_info=None):
         vdis = vm_utils.get_vdis_for_instance(context, self._session,
@@ -930,7 +947,7 @@ class VMOps(object):
 
         return 'VDI.resize'
 
-    def reboot(self, instance, reboot_type):
+    def reboot(self, instance, reboot_type, bad_volumes_callback=None):
         """Reboot VM instance."""
         # Note (salvatore-orlando): security group rules are not re-enforced
         # upon reboot, since this action on the XenAPI drivers does not
@@ -948,9 +965,18 @@ class VMOps(object):
                     details[-1] == 'halted'):
                 LOG.info(_("Starting halted instance found during reboot"),
                          instance=instance)
-                self._session.call_xenapi('VM.start', vm_ref, False, False)
+                self._start(instance, vm_ref=vm_ref,
+                            bad_volumes_callback=bad_volumes_callback)
                 return
-            raise
+            elif details[0] == 'SR_BACKEND_FAILURE_46':
+                LOG.warn(_("Reboot failed due to bad volumes, detaching bad"
+                           " volumes and starting halted instance"),
+                         instance=instance)
+                self._start(instance, vm_ref=vm_ref,
+                            bad_volumes_callback=bad_volumes_callback)
+                return
+            else:
+                raise
 
     def set_admin_password(self, instance, new_pass):
         """Set the root/admin password on the VM instance."""
diff --git a/nova/virt/xenapi/volumeops.py b/nova/virt/xenapi/volumeops.py
index d3c3046b7..add3787a3 100644
--- a/nova/virt/xenapi/volumeops.py
+++ b/nova/virt/xenapi/volumeops.py
@@ -151,3 +151,31 @@ class VolumeOps(object):
         vbd_refs = self._get_all_volume_vbd_refs(vm_ref)
         for vbd_ref in vbd_refs:
             self._detach_vbd(vbd_ref, unplug=unplug)
+
+    def find_bad_volumes(self, vm_ref):
+        """Find any volumes with their connection severed.
+
+        Certain VM operations (e.g. `VM.start`, `VM.reboot`, etc.) will not
+        work when a VBD is present that points to a non-working volume. To work
+        around this, we scan for non-working volumes and detach them before
+        retrying a failed operation.
+        """
+        bad_devices = []
+        vbd_refs = self._get_all_volume_vbd_refs(vm_ref)
+        for vbd_ref in vbd_refs:
+            sr_ref = volume_utils.find_sr_from_vbd(self._session, vbd_ref)
+
+            try:
+                # TODO(sirp): bug1152401 This relies on a 120 sec timeout
+                # within XenServer, update this to fail-fast when this is fixed
+                # upstream
+                self._session.call_xenapi("SR.scan", sr_ref)
+            except self._session.XenAPI.Failure, exc:
+                if exc.details[0] == 'SR_BACKEND_FAILURE_40':
+                    vbd_rec = self._session.call_xenapi(
+                            "VBD.get_record", vbd_ref)
+                    bad_devices.append('/dev/%s' % vbd_rec['device'])
+                else:
+                    raise
+
+        return bad_devices
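For reference, the whole recovery path hinges on inspecting `exc.details[0]` of a `XenAPI.Failure`: `VM.clean_reboot` against a VM with a hung volume fails with `SR_BACKEND_FAILURE_46`, and `SR.scan` against the severed volume's SR fails with `SR_BACKEND_FAILURE_40`. A condensed, self-contained sketch of the detection idiom used in `find_bad_volumes` follows; the `FakeSession` and its canned failure are invented for illustration, and only the error code and the XenAPI call names come from the change:

```python
# Condensed sketch of the failure-code dispatch in find_bad_volumes().
# FakeSession and its canned failure are hypothetical; the error code
# SR_BACKEND_FAILURE_40 and the XenAPI call names come from the diff.

class Failure(Exception):
    def __init__(self, details):
        super(Failure, self).__init__(details)
        self.details = details


class FakeSession(object):
    XenAPI = type('XenAPI', (), {'Failure': Failure})

    def call_xenapi(self, method, *args):
        if method == 'SR.scan':
            # Simulate scanning an SR whose volume connection was severed.
            raise Failure(['SR_BACKEND_FAILURE_40'])
        if method == 'VBD.get_record':
            return {'device': 'xvdb'}


def find_bad_devices(session, vbd_refs, sr_for_vbd):
    """Return device names (e.g. ['/dev/xvdb']) whose SR fails to scan."""
    bad_devices = []
    for vbd_ref in vbd_refs:
        try:
            session.call_xenapi('SR.scan', sr_for_vbd(vbd_ref))
        except session.XenAPI.Failure as exc:
            if exc.details[0] == 'SR_BACKEND_FAILURE_40':
                rec = session.call_xenapi('VBD.get_record', vbd_ref)
                bad_devices.append('/dev/%s' % rec['device'])
            else:
                raise
    return bad_devices


print(find_bad_devices(FakeSession(), ['OpaqueRef:vbd0'], lambda _: 'sr0'))
# -> ['/dev/xvdb']
```

Note that, as the TODO in the diff points out (bug 1152401), the real `SR.scan` probe relies on a 120-second timeout inside XenServer, so detection of a severed volume is slow rather than fail-fast.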