diff options
| author | Rick Harris <rconradharris@gmail.com> | 2013-03-06 05:28:41 +0000 |
|---|---|---|
| committer | Rick Harris <rconradharris@gmail.com> | 2013-03-11 18:56:10 +0000 |
| commit | 40feb35898ed0a6d57b1f481c165e683796b045c (patch) | |
| tree | d354febeb8875cdbb71a65cef9aca3f1eab8561d /nova/compute | |
| parent | 00b5ec9a00d3d310ff8f924ab356a39215b6a528 (diff) | |
xenapi: Fix reboot with hung volumes
If a volume becomes inoperable (e.g. the ISCSI connection is severed)
and the user goes to reboot, the instance may enter a permanently halted
state.
The root cause is that a VBD that points to a 'bad' volume prevents VM
operations ('reboot', 'start') from completing under XenServer.
The workaround is to detect which volumes are bad, detach them in the
virt-layer, retry the operation (or in the case of reboot, just 'start'
the halted instance), and then notify the compute manager via a
callback so it can detach the volume in Cinder.
Fixes bug 1148614
Change-Id: Id4e8e84bb5748cfa267c2a418f9405fd86829e8f
Diffstat (limited to 'nova/compute')
| -rwxr-xr-x | nova/compute/manager.py | 34 |
1 files changed, 33 insertions, 1 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 338708f4e..a06c8cadd 100755 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1544,6 +1544,32 @@ class ComputeManager(manager.SchedulerDependentManager): network_info=network_info, extra_usage_info=extra_usage_info) + def _handle_bad_volumes_detached(self, context, instance, bad_devices, + block_device_info): + """Handle cases where the virt-layer had to detach non-working volumes + in order to complete an operation. + """ + for bdm in block_device_info['block_device_mapping']: + if bdm.get('mount_device') in bad_devices: + try: + volume_id = bdm['connection_info']['data']['volume_id'] + except KeyError: + continue + + # NOTE(sirp): ideally we'd just call + # `compute_api.detach_volume` here but since that hits the + # DB directly, that's off limits from within the + # compute-manager. + # + # API-detach + LOG.info(_("Detaching from volume api: %s") % volume_id) + volume = self.volume_api.get(context, volume_id) + self.volume_api.check_detach(context, volume) + self.volume_api.begin_detaching(context, volume) + + # Manager-detach + self.detach_volume(context, volume_id, instance) + @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id()) @reverts_task_state @wrap_instance_event @@ -1578,10 +1604,16 @@ class ComputeManager(manager.SchedulerDependentManager): 'expected: %(running)s)') % locals(), context=context, instance=instance) + def bad_volumes_callback(bad_devices): + self._handle_bad_volumes_detached( + context, instance, bad_devices, block_device_info) + try: self.driver.reboot(context, instance, self._legacy_nw_info(network_info), - reboot_type, block_device_info) + reboot_type, + block_device_info=block_device_info, + bad_volumes_callback=bad_volumes_callback) except Exception, exc: LOG.error(_('Cannot reboot instance: %(exc)s'), locals(), context=context, instance=instance) |
