diff options
| author | Rick Harris <rconradharris@gmail.com> | 2013-03-06 05:28:41 +0000 |
|---|---|---|
| committer | Rick Harris <rconradharris@gmail.com> | 2013-03-11 18:56:10 +0000 |
| commit | 40feb35898ed0a6d57b1f481c165e683796b045c (patch) | |
| tree | d354febeb8875cdbb71a65cef9aca3f1eab8561d /nova/compute | |
| parent | 00b5ec9a00d3d310ff8f924ab356a39215b6a528 (diff) | |
xenapi: Fix reboot with hung volumes
If a volume becomes inoperable (e.g. the ISCSI connection is severed)
and the user goes to reboot, the instance may enter a permanently halted
state.
The root cause is that a VBD that points to a 'bad' volume prevents VM
operations ('reboot', 'start') from completing under XenServer.
The workaround is to detect which volumes are bad, detach them in the
virt-layer, retry the operation (or in the case of reboot, just 'start'
the halted instance), and then notify the compute manager via a
callback so it can detach the volume in Cinder.
Fixes bug 1148614
Change-Id: Id4e8e84bb5748cfa267c2a418f9405fd86829e8f
Diffstat (limited to 'nova/compute')
| -rwxr-xr-x | nova/compute/manager.py | 34 |
1 files changed, 33 insertions, 1 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 338708f4e..a06c8cadd 100755 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1544,6 +1544,32 @@ class ComputeManager(manager.SchedulerDependentManager): network_info=network_info, extra_usage_info=extra_usage_info) + def _handle_bad_volumes_detached(self, context, instance, bad_devices, + block_device_info): + """Handle cases where the virt-layer had to detach non-working volumes + in order to complete an operation. + """ + for bdm in block_device_info['block_device_mapping']: + if bdm.get('mount_device') in bad_devices: + try: + volume_id = bdm['connection_info']['data']['volume_id'] + except KeyError: + continue + + # NOTE(sirp): ideally we'd just call + # `compute_api.detach_volume` here but since that hits the + # DB directly, that's off limits from within the + # compute-manager. + # + # API-detach + LOG.info(_("Detaching from volume api: %s") % volume_id) + volume = self.volume_api.get(context, volume_id) + self.volume_api.check_detach(context, volume) + self.volume_api.begin_detaching(context, volume) + + # Manager-detach + self.detach_volume(context, volume_id, instance) + @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id()) @reverts_task_state @wrap_instance_event @@ -1578,10 +1604,16 @@ class ComputeManager(manager.SchedulerDependentManager): 'expected: %(running)s)') % locals(), context=context, instance=instance) + def bad_volumes_callback(bad_devices): + self._handle_bad_volumes_detached( + context, instance, bad_devices, block_device_info) + try: self.driver.reboot(context, instance, self._legacy_nw_info(network_info), - reboot_type, block_device_info) + reboot_type, + block_device_info=block_device_info, + bad_volumes_callback=bad_volumes_callback) except Exception, exc: LOG.error(_('Cannot reboot instance: %(exc)s'), locals(), context=context, instance=instance) |
