summaryrefslogtreecommitdiffstats
path: root/nova/compute
diff options
context:
space:
mode:
authorRick Harris <rconradharris@gmail.com>2013-03-06 05:28:41 +0000
committerRick Harris <rconradharris@gmail.com>2013-03-11 18:56:10 +0000
commit40feb35898ed0a6d57b1f481c165e683796b045c (patch)
treed354febeb8875cdbb71a65cef9aca3f1eab8561d /nova/compute
parent00b5ec9a00d3d310ff8f924ab356a39215b6a528 (diff)
xenapi: Fix reboot with hung volumes
If a volume becomes inoperable (e.g. the ISCSI connection is severed) and the user goes to reboot, the instance may enter a permanently halted state. The root cause is that a VBD that points to a 'bad' volume prevents VM operations ('reboot', 'start') from completing under XenServer. The workaround is to detect which volumes are bad, detach them in the virt-layer, retry the operation (or, in the case of reboot, simply 'start' the halted instance), and then notify the compute manager via a callback so it can detach the volume in Cinder. Fixes bug 1148614 Change-Id: Id4e8e84bb5748cfa267c2a418f9405fd86829e8f
Diffstat (limited to 'nova/compute')
-rwxr-xr-xnova/compute/manager.py34
1 files changed, 33 insertions, 1 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 338708f4e..a06c8cadd 100755
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1544,6 +1544,32 @@ class ComputeManager(manager.SchedulerDependentManager):
network_info=network_info,
extra_usage_info=extra_usage_info)
def _handle_bad_volumes_detached(self, context, instance, bad_devices,
                                 block_device_info):
    """Handle cases where the virt-layer had to detach non-working volumes
    in order to complete an operation (e.g. a reboot with a hung ISCSI
    volume).

    For each block-device mapping whose mount device appears in
    `bad_devices`, mark the volume as detaching via the volume API and
    then perform the manager-side detach so Cinder's state stays in sync
    with what the hypervisor already did.

    :param context: request context
    :param instance: the instance whose volumes were force-detached
    :param bad_devices: list of mount-device names (e.g. '/dev/xvdb')
                        the virt-layer had to detach
    :param block_device_info: dict containing a 'block_device_mapping'
                              list describing the instance's volumes
    """
    for bdm in block_device_info['block_device_mapping']:
        if bdm.get('mount_device') in bad_devices:
            try:
                volume_id = bdm['connection_info']['data']['volume_id']
            except KeyError:
                # Mapping has no backing volume id; nothing to detach.
                continue

            # NOTE(sirp): ideally we'd just call
            # `compute_api.detach_volume` here but since that hits the
            # DB directly, that's off limits from within the
            # compute-manager.
            #
            # API-detach
            # Pass volume_id as a lazy %-arg so formatting is deferred
            # to the logging layer (matches logging conventions used
            # elsewhere in this module).
            LOG.info(_("Detaching from volume api: %s"), volume_id)
            volume = self.volume_api.get(context, volume_id)
            self.volume_api.check_detach(context, volume)
            self.volume_api.begin_detaching(context, volume)

            # Manager-detach: tears down the host-side connection and
            # completes the detach in Cinder.
            self.detach_volume(context, volume_id, instance)
+
@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@reverts_task_state
@wrap_instance_event
@@ -1578,10 +1604,16 @@ class ComputeManager(manager.SchedulerDependentManager):
'expected: %(running)s)') % locals(),
context=context, instance=instance)
+ def bad_volumes_callback(bad_devices):
+ self._handle_bad_volumes_detached(
+ context, instance, bad_devices, block_device_info)
+
try:
self.driver.reboot(context, instance,
self._legacy_nw_info(network_info),
- reboot_type, block_device_info)
+ reboot_type,
+ block_device_info=block_device_info,
+ bad_volumes_callback=bad_volumes_callback)
except Exception, exc:
LOG.error(_('Cannot reboot instance: %(exc)s'), locals(),
context=context, instance=instance)