summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJenkins <jenkins@review.openstack.org>2013-03-12 23:59:59 +0000
committerGerrit Code Review <review@openstack.org>2013-03-12 23:59:59 +0000
commitb0694b33d3a7c32b582030d44231ef31bcc8ec60 (patch)
treed0a5a1b1a273e8c60738e8c9e94e68808ec2dc8e
parent071719a3e16b747bc9c2cdadab829c3c1287eede (diff)
parent40feb35898ed0a6d57b1f481c165e683796b045c (diff)
downloadnova-b0694b33d3a7c32b582030d44231ef31bcc8ec60.tar.gz
nova-b0694b33d3a7c32b582030d44231ef31bcc8ec60.tar.xz
nova-b0694b33d3a7c32b582030d44231ef31bcc8ec60.zip
Merge "xenapi: Fix reboot with hung volumes"
-rwxr-xr-xnova/compute/manager.py34
-rw-r--r--nova/tests/compute/test_compute.py13
-rwxr-xr-xnova/virt/baremetal/driver.py2
-rwxr-xr-xnova/virt/driver.py5
-rwxr-xr-xnova/virt/fake.py2
-rwxr-xr-xnova/virt/hyperv/driver.py2
-rwxr-xr-xnova/virt/libvirt/driver.py2
-rwxr-xr-xnova/virt/powervm/driver.py5
-rwxr-xr-xnova/virt/vmwareapi/driver.py2
-rwxr-xr-xnova/virt/xenapi/driver.py5
-rw-r--r--nova/virt/xenapi/vmops.py34
-rw-r--r--nova/virt/xenapi/volumeops.py28
12 files changed, 116 insertions, 18 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index bb1447da4..24b79a0ab 100755
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1582,6 +1582,32 @@ class ComputeManager(manager.SchedulerDependentManager):
network_info=network_info,
extra_usage_info=extra_usage_info)
+ def _handle_bad_volumes_detached(self, context, instance, bad_devices,
+ block_device_info):
+ """Handle cases where the virt-layer had to detach non-working volumes
+ in order to complete an operation.
+ """
+ for bdm in block_device_info['block_device_mapping']:
+ if bdm.get('mount_device') in bad_devices:
+ try:
+ volume_id = bdm['connection_info']['data']['volume_id']
+ except KeyError:
+ continue
+
+ # NOTE(sirp): ideally we'd just call
+ # `compute_api.detach_volume` here but since that hits the
+ # DB directly, that's off limits from within the
+ # compute-manager.
+ #
+ # API-detach
+ LOG.info(_("Detaching from volume api: %s") % volume_id)
+ volume = self.volume_api.get(context, volume_id)
+ self.volume_api.check_detach(context, volume)
+ self.volume_api.begin_detaching(context, volume)
+
+ # Manager-detach
+ self.detach_volume(context, volume_id, instance)
+
@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@reverts_task_state
@wrap_instance_event
@@ -1616,10 +1642,16 @@ class ComputeManager(manager.SchedulerDependentManager):
'expected: %(running)s)') % locals(),
context=context, instance=instance)
+ def bad_volumes_callback(bad_devices):
+ self._handle_bad_volumes_detached(
+ context, instance, bad_devices, block_device_info)
+
try:
self.driver.reboot(context, instance,
self._legacy_nw_info(network_info),
- reboot_type, block_device_info)
+ reboot_type,
+ block_device_info=block_device_info,
+ bad_volumes_callback=bad_volumes_callback)
except Exception, exc:
LOG.error(_('Cannot reboot instance: %(exc)s'), locals(),
context=context, instance=instance)
diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py
index 9da253360..f8c0e86f2 100644
--- a/nova/tests/compute/test_compute.py
+++ b/nova/tests/compute/test_compute.py
@@ -1150,15 +1150,20 @@ class ComputeTestCase(BaseTestCase):
# this is called with the wrong args, so we have to hack
# around it.
reboot_call_info = {}
- expected_call_info = {'args': (econtext, updated_instance1,
- expected_nw_info, reboot_type,
- fake_block_dev_info),
- 'kwargs': {}}
+ expected_call_info = {
+ 'args': (econtext, updated_instance1, expected_nw_info,
+ reboot_type),
+ 'kwargs': {'block_device_info': fake_block_dev_info}}
def fake_reboot(*args, **kwargs):
reboot_call_info['args'] = args
reboot_call_info['kwargs'] = kwargs
+ # NOTE(sirp): Since `bad_volumes_callback` is a function defined
+ # within `reboot_instance`, we don't have access to its value and
+ # can't stub it out, thus we skip that comparison.
+ kwargs.pop('bad_volumes_callback')
+
self.stubs.Set(self.compute.driver, 'reboot', fake_reboot)
# Power state should be updated again
diff --git a/nova/virt/baremetal/driver.py b/nova/virt/baremetal/driver.py
index 94d3f0a92..97d72cc74 100755
--- a/nova/virt/baremetal/driver.py
+++ b/nova/virt/baremetal/driver.py
@@ -275,7 +275,7 @@ class BareMetalDriver(driver.ComputeDriver):
_update_state(context, node, None, baremetal_states.DELETED)
def reboot(self, context, instance, network_info, reboot_type,
- block_device_info=None):
+ block_device_info=None, bad_volumes_callback=None):
node = _get_baremetal_node_by_instance_uuid(instance['uuid'])
ctx = nova_context.get_admin_context()
pm = get_power_manager(node=node, instance=instance)
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 11c65519c..71bcaef42 100755
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -248,7 +248,7 @@ class ComputeDriver(object):
raise NotImplementedError()
def reboot(self, context, instance, network_info, reboot_type,
- block_device_info=None):
+ block_device_info=None, bad_volumes_callback=None):
"""Reboot the specified instance.
After this is called successfully, the instance's state
@@ -261,6 +261,9 @@ class ComputeDriver(object):
:param network_info:
:py:meth:`~nova.network.manager.NetworkManager.get_instance_nw_info`
:param reboot_type: Either a HARD or SOFT reboot
+ :param block_device_info: Info pertaining to attached volumes
+ :param bad_volumes_callback: Function to handle any bad volumes
+ encountered
"""
raise NotImplementedError()
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 58f303efc..b2b102486 100755
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -131,7 +131,7 @@ class FakeDriver(driver.ComputeDriver):
update_task_state(task_state=task_states.IMAGE_UPLOADING)
def reboot(self, context, instance, network_info, reboot_type,
- block_device_info=None):
+ block_device_info=None, bad_volumes_callback=None):
pass
@staticmethod
diff --git a/nova/virt/hyperv/driver.py b/nova/virt/hyperv/driver.py
index 289f3c4b6..477f8fa2a 100755
--- a/nova/virt/hyperv/driver.py
+++ b/nova/virt/hyperv/driver.py
@@ -54,7 +54,7 @@ class HyperVDriver(driver.ComputeDriver):
admin_password, network_info, block_device_info)
def reboot(self, context, instance, network_info, reboot_type,
- block_device_info=None):
+ block_device_info=None, bad_volumes_callback=None):
self._vmops.reboot(instance, network_info, reboot_type)
def destroy(self, instance, network_info, block_device_info=None,
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index e7e6b716f..4a9d2f3f5 100755
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -1279,7 +1279,7 @@ class LibvirtDriver(driver.ComputeDriver):
out_path, image_format)
def reboot(self, context, instance, network_info, reboot_type='SOFT',
- block_device_info=None):
+ block_device_info=None, bad_volumes_callback=None):
"""Reboot a virtual machine, given an instance reference."""
if reboot_type == 'SOFT':
# NOTE(vish): This will attempt to do a graceful shutdown/restart.
diff --git a/nova/virt/powervm/driver.py b/nova/virt/powervm/driver.py
index c388eecfd..d06948f17 100755
--- a/nova/virt/powervm/driver.py
+++ b/nova/virt/powervm/driver.py
@@ -106,13 +106,16 @@ class PowerVMDriver(driver.ComputeDriver):
self._powervm.destroy(instance['name'], destroy_disks)
def reboot(self, context, instance, network_info, reboot_type,
- block_device_info=None):
+ block_device_info=None, bad_volumes_callback=None):
"""Reboot the specified instance.
:param instance: Instance object as returned by DB layer.
:param network_info:
:py:meth:`~nova.network.manager.NetworkManager.get_instance_nw_info`
:param reboot_type: Either a HARD or SOFT reboot
+ :param block_device_info: Info pertaining to attached volumes
+ :param bad_volumes_callback: Function to handle any bad volumes
+ encountered
"""
pass
diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py
index eeec4c5c2..798a2fde3 100755
--- a/nova/virt/vmwareapi/driver.py
+++ b/nova/virt/vmwareapi/driver.py
@@ -181,7 +181,7 @@ class VMwareESXDriver(driver.ComputeDriver):
self._vmops.snapshot(context, instance, name, update_task_state)
def reboot(self, context, instance, network_info, reboot_type,
- block_device_info=None):
+ block_device_info=None, bad_volumes_callback=None):
"""Reboot VM instance."""
self._vmops.reboot(instance, network_info)
diff --git a/nova/virt/xenapi/driver.py b/nova/virt/xenapi/driver.py
index decaed2b0..302679685 100755
--- a/nova/virt/xenapi/driver.py
+++ b/nova/virt/xenapi/driver.py
@@ -194,9 +194,10 @@ class XenAPIDriver(driver.ComputeDriver):
self._vmops.snapshot(context, instance, image_id, update_task_state)
def reboot(self, context, instance, network_info, reboot_type,
- block_device_info=None):
+ block_device_info=None, bad_volumes_callback=None):
"""Reboot VM instance."""
- self._vmops.reboot(instance, reboot_type)
+ self._vmops.reboot(instance, reboot_type,
+ bad_volumes_callback=bad_volumes_callback)
def set_admin_password(self, instance, new_pass):
"""Set the root/admin password on the VM instance."""
diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py
index 56dd5bd3d..0593be52d 100644
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@@ -272,14 +272,31 @@ class VMOps(object):
step=5,
total_steps=RESIZE_TOTAL_STEPS)
- def _start(self, instance, vm_ref=None):
+ def _start(self, instance, vm_ref=None, bad_volumes_callback=None):
"""Power on a VM instance."""
vm_ref = vm_ref or self._get_vm_opaque_ref(instance)
LOG.debug(_("Starting instance"), instance=instance)
+
+ # Attached volumes that have become non-responsive will prevent a VM
+ # from starting, so scan for these before attempting to start
+ #
+ # In order to make sure this detach is consistent (virt, BDM, cinder),
+ # we only detach in the virt-layer if a callback is provided.
+ if bad_volumes_callback:
+ bad_devices = self._volumeops.find_bad_volumes(vm_ref)
+ for device_name in bad_devices:
+ self._volumeops.detach_volume(
+ None, instance['name'], device_name)
+
self._session.call_xenapi('VM.start_on', vm_ref,
self._session.get_xenapi_host(),
False, False)
+ # Allow higher-layers a chance to detach bad-volumes as well (in order
+ # to cleanup BDM entries and detach in Cinder)
+ if bad_volumes_callback and bad_devices:
+ bad_volumes_callback(bad_devices)
+
def _create_disks(self, context, instance, name_label, disk_image_type,
image_meta, block_device_info=None):
vdis = vm_utils.get_vdis_for_instance(context, self._session,
@@ -930,7 +947,7 @@ class VMOps(object):
return 'VDI.resize'
- def reboot(self, instance, reboot_type):
+ def reboot(self, instance, reboot_type, bad_volumes_callback=None):
"""Reboot VM instance."""
# Note (salvatore-orlando): security group rules are not re-enforced
# upon reboot, since this action on the XenAPI drivers does not
@@ -948,9 +965,18 @@ class VMOps(object):
details[-1] == 'halted'):
LOG.info(_("Starting halted instance found during reboot"),
instance=instance)
- self._session.call_xenapi('VM.start', vm_ref, False, False)
+ self._start(instance, vm_ref=vm_ref,
+ bad_volumes_callback=bad_volumes_callback)
return
- raise
+ elif details[0] == 'SR_BACKEND_FAILURE_46':
+ LOG.warn(_("Reboot failed due to bad volumes, detaching bad"
+ " volumes and starting halted instance"),
+ instance=instance)
+ self._start(instance, vm_ref=vm_ref,
+ bad_volumes_callback=bad_volumes_callback)
+ return
+ else:
+ raise
def set_admin_password(self, instance, new_pass):
"""Set the root/admin password on the VM instance."""
diff --git a/nova/virt/xenapi/volumeops.py b/nova/virt/xenapi/volumeops.py
index d3c3046b7..add3787a3 100644
--- a/nova/virt/xenapi/volumeops.py
+++ b/nova/virt/xenapi/volumeops.py
@@ -151,3 +151,31 @@ class VolumeOps(object):
vbd_refs = self._get_all_volume_vbd_refs(vm_ref)
for vbd_ref in vbd_refs:
self._detach_vbd(vbd_ref, unplug=unplug)
+
+ def find_bad_volumes(self, vm_ref):
+ """Find any volumes with their connection severed.
+
+ Certain VM operations (e.g. `VM.start`, `VM.reboot`, etc.) will not
+ work when a VBD is present that points to a non-working volume. To work
+ around this, we scan for non-working volumes and detach them before
+ retrying a failed operation.
+ """
+ bad_devices = []
+ vbd_refs = self._get_all_volume_vbd_refs(vm_ref)
+ for vbd_ref in vbd_refs:
+ sr_ref = volume_utils.find_sr_from_vbd(self._session, vbd_ref)
+
+ try:
+ # TODO(sirp): bug1152401 This relies on a 120 sec timeout
+ # within XenServer, update this to fail-fast when this is fixed
+ # upstream
+ self._session.call_xenapi("SR.scan", sr_ref)
+ except self._session.XenAPI.Failure, exc:
+ if exc.details[0] == 'SR_BACKEND_FAILURE_40':
vbd_rec = self._session.call_xenapi(
"VBD.get_record", vbd_ref)
+ bad_devices.append('/dev/%s' % vbd_rec['device'])
+ else:
+ raise
+
+ return bad_devices