author     Jenkins <jenkins@review.openstack.org>      2013-03-12 23:59:59 +0000
committer  Gerrit Code Review <review@openstack.org>   2013-03-12 23:59:59 +0000
commit     b0694b33d3a7c32b582030d44231ef31bcc8ec60 (patch)
tree       d0a5a1b1a273e8c60738e8c9e94e68808ec2dc8e
parent     071719a3e16b747bc9c2cdadab829c3c1287eede (diff)
parent     40feb35898ed0a6d57b1f481c165e683796b045c (diff)
Merge "xenapi: Fix reboot with hung volumes"
-rwxr-xr-x  nova/compute/manager.py             34
-rw-r--r--  nova/tests/compute/test_compute.py  13
-rwxr-xr-x  nova/virt/baremetal/driver.py        2
-rwxr-xr-x  nova/virt/driver.py                  5
-rwxr-xr-x  nova/virt/fake.py                    2
-rwxr-xr-x  nova/virt/hyperv/driver.py           2
-rwxr-xr-x  nova/virt/libvirt/driver.py          2
-rwxr-xr-x  nova/virt/powervm/driver.py          5
-rwxr-xr-x  nova/virt/vmwareapi/driver.py        2
-rwxr-xr-x  nova/virt/xenapi/driver.py           5
-rw-r--r--  nova/virt/xenapi/vmops.py           34
-rw-r--r--  nova/virt/xenapi/volumeops.py       28

12 files changed, 116 insertions(+), 18 deletions(-)
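The mechanics of the change, in brief: `ComputeManager.reboot_instance` now builds a `bad_volumes_callback` closure and passes it to the driver's `reboot()`. Every virt driver's signature gains the optional argument, but only the XenAPI driver acts on it, so the manager can pass it unconditionally. A minimal runnable sketch of that wiring, reconstructed from the diff below — `SimpleDriver` and the bare `reboot_instance` function are illustrative stand-ins, not nova code:

```python
# Sketch of the callback wiring; SimpleDriver and reboot_instance are
# hypothetical stand-ins for nova's virt drivers and
# ComputeManager.reboot_instance.

class SimpleDriver(object):
    def reboot(self, context, instance, network_info, reboot_type,
               block_device_info=None, bad_volumes_callback=None):
        # A driver that had to detach hung volumes reports their device
        # names through the callback; other drivers simply ignore it.
        if bad_volumes_callback:
            bad_volumes_callback(['/dev/xvdb'])


def reboot_instance(driver, context, instance, network_info,
                    block_device_info, reboot_type='HARD'):
    def bad_volumes_callback(bad_devices):
        # In the real manager this is _handle_bad_volumes_detached(),
        # which detaches each device via the volume API and then via the
        # compute manager, keeping BDM and Cinder state consistent.
        print('virt layer detached: %s' % bad_devices)

    driver.reboot(context, instance, network_info, reboot_type,
                  block_device_info=block_device_info,
                  bad_volumes_callback=bad_volumes_callback)


reboot_instance(SimpleDriver(), context=None, instance={'name': 'vm0'},
                network_info=[], block_device_info={})
```

The closure is what lets the virt layer stay ignorant of the database: it detaches at the hypervisor level and hands the device names back up, and the manager performs the API-level detach.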
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index bb1447da4..24b79a0ab 100755
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1582,6 +1582,32 @@ class ComputeManager(manager.SchedulerDependentManager):
                 network_info=network_info,
                 extra_usage_info=extra_usage_info)
 
+    def _handle_bad_volumes_detached(self, context, instance, bad_devices,
+                                     block_device_info):
+        """Handle cases where the virt-layer had to detach non-working volumes
+        in order to complete an operation.
+        """
+        for bdm in block_device_info['block_device_mapping']:
+            if bdm.get('mount_device') in bad_devices:
+                try:
+                    volume_id = bdm['connection_info']['data']['volume_id']
+                except KeyError:
+                    continue
+
+                # NOTE(sirp): ideally we'd just call
+                # `compute_api.detach_volume` here but since that hits the
+                # DB directly, that's off limits from within the
+                # compute-manager.
+                #
+                # API-detach
+                LOG.info(_("Detaching from volume api: %s") % volume_id)
+                volume = self.volume_api.get(context, volume_id)
+                self.volume_api.check_detach(context, volume)
+                self.volume_api.begin_detaching(context, volume)
+
+                # Manager-detach
+                self.detach_volume(context, volume_id, instance)
+
     @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
     @reverts_task_state
     @wrap_instance_event
@@ -1616,10 +1642,16 @@ class ComputeManager(manager.SchedulerDependentManager):
                         'expected: %(running)s)') % locals(),
                       context=context, instance=instance)
 
+        def bad_volumes_callback(bad_devices):
+            self._handle_bad_volumes_detached(
+                    context, instance, bad_devices, block_device_info)
+
         try:
             self.driver.reboot(context, instance,
                                self._legacy_nw_info(network_info),
-                               reboot_type, block_device_info)
+                               reboot_type,
+                               block_device_info=block_device_info,
+                               bad_volumes_callback=bad_volumes_callback)
         except Exception, exc:
             LOG.error(_('Cannot reboot instance: %(exc)s'), locals(),
                       context=context, instance=instance)
diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py
index 9da253360..f8c0e86f2 100644
--- a/nova/tests/compute/test_compute.py
+++ b/nova/tests/compute/test_compute.py
@@ -1150,15 +1150,20 @@ class ComputeTestCase(BaseTestCase):
         # this is called with the wrong args, so we have to hack
         # around it.
         reboot_call_info = {}
-        expected_call_info = {'args': (econtext, updated_instance1,
-                                       expected_nw_info, reboot_type,
-                                       fake_block_dev_info),
-                              'kwargs': {}}
+        expected_call_info = {
+            'args': (econtext, updated_instance1, expected_nw_info,
+                     reboot_type),
+            'kwargs': {'block_device_info': fake_block_dev_info}}
 
         def fake_reboot(*args, **kwargs):
             reboot_call_info['args'] = args
             reboot_call_info['kwargs'] = kwargs
 
+            # NOTE(sirp): Since `bad_volumes_callback` is a function defined
+            # within `reboot_instance`, we don't have access to its value and
+            # can't stub it out, thus we skip that comparison.
+            kwargs.pop('bad_volumes_callback')
+
         self.stubs.Set(self.compute.driver, 'reboot', fake_reboot)
 
         # Power state should be updated again
diff --git a/nova/virt/baremetal/driver.py b/nova/virt/baremetal/driver.py
index 94d3f0a92..97d72cc74 100755
--- a/nova/virt/baremetal/driver.py
+++ b/nova/virt/baremetal/driver.py
@@ -275,7 +275,7 @@ class BareMetalDriver(driver.ComputeDriver):
             _update_state(context, node, None, baremetal_states.DELETED)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         node = _get_baremetal_node_by_instance_uuid(instance['uuid'])
         ctx = nova_context.get_admin_context()
         pm = get_power_manager(node=node, instance=instance)
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 11c65519c..71bcaef42 100755
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -248,7 +248,7 @@ class ComputeDriver(object):
         raise NotImplementedError()
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot the specified instance.
 
         After this is called successfully, the instance's state
@@ -261,6 +261,9 @@ class ComputeDriver(object):
         :param network_info:
            :py:meth:`~nova.network.manager.NetworkManager.get_instance_nw_info`
         :param reboot_type: Either a HARD or SOFT reboot
+        :param block_device_info: Info pertaining to attached volumes
+        :param bad_volumes_callback: Function to handle any bad volumes
+            encountered
         """
         raise NotImplementedError()
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 58f303efc..b2b102486 100755
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -131,7 +131,7 @@ class FakeDriver(driver.ComputeDriver):
         update_task_state(task_state=task_states.IMAGE_UPLOADING)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         pass
 
     @staticmethod
diff --git a/nova/virt/hyperv/driver.py b/nova/virt/hyperv/driver.py
index 289f3c4b6..477f8fa2a 100755
--- a/nova/virt/hyperv/driver.py
+++ b/nova/virt/hyperv/driver.py
@@ -54,7 +54,7 @@ class HyperVDriver(driver.ComputeDriver):
                           admin_password, network_info, block_device_info)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         self._vmops.reboot(instance, network_info, reboot_type)
 
     def destroy(self, instance, network_info, block_device_info=None,
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index e7e6b716f..4a9d2f3f5 100755
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -1279,7 +1279,7 @@ class LibvirtDriver(driver.ComputeDriver):
                                         out_path, image_format)
 
     def reboot(self, context, instance, network_info, reboot_type='SOFT',
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot a virtual machine, given an instance reference."""
         if reboot_type == 'SOFT':
             # NOTE(vish): This will attempt to do a graceful shutdown/restart.
diff --git a/nova/virt/powervm/driver.py b/nova/virt/powervm/driver.py
index c388eecfd..d06948f17 100755
--- a/nova/virt/powervm/driver.py
+++ b/nova/virt/powervm/driver.py
@@ -106,13 +106,16 @@ class PowerVMDriver(driver.ComputeDriver):
         self._powervm.destroy(instance['name'], destroy_disks)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot the specified instance.
 
         :param instance: Instance object as returned by DB layer.
         :param network_info:
            :py:meth:`~nova.network.manager.NetworkManager.get_instance_nw_info`
         :param reboot_type: Either a HARD or SOFT reboot
+        :param block_device_info: Info pertaining to attached volumes
+        :param bad_volumes_callback: Function to handle any bad volumes
+            encountered
         """
         pass
diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py
index eeec4c5c2..798a2fde3 100755
--- a/nova/virt/vmwareapi/driver.py
+++ b/nova/virt/vmwareapi/driver.py
@@ -181,7 +181,7 @@ class VMwareESXDriver(driver.ComputeDriver):
         self._vmops.snapshot(context, instance, name, update_task_state)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot VM instance."""
         self._vmops.reboot(instance, network_info)
diff --git a/nova/virt/xenapi/driver.py b/nova/virt/xenapi/driver.py
index decaed2b0..302679685 100755
--- a/nova/virt/xenapi/driver.py
+++ b/nova/virt/xenapi/driver.py
@@ -194,9 +194,10 @@ class XenAPIDriver(driver.ComputeDriver):
         self._vmops.snapshot(context, instance, image_id, update_task_state)
 
     def reboot(self, context, instance, network_info, reboot_type,
-               block_device_info=None):
+               block_device_info=None, bad_volumes_callback=None):
         """Reboot VM instance."""
-        self._vmops.reboot(instance, reboot_type)
+        self._vmops.reboot(instance, reboot_type,
+                           bad_volumes_callback=bad_volumes_callback)
 
     def set_admin_password(self, instance, new_pass):
         """Set the root/admin password on the VM instance."""
diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py
index 56dd5bd3d..0593be52d 100644
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@@ -272,14 +272,31 @@ class VMOps(object):
                 step=5,
                 total_steps=RESIZE_TOTAL_STEPS)
 
-    def _start(self, instance, vm_ref=None):
+    def _start(self, instance, vm_ref=None, bad_volumes_callback=None):
         """Power on a VM instance."""
         vm_ref = vm_ref or self._get_vm_opaque_ref(instance)
         LOG.debug(_("Starting instance"), instance=instance)
+
+        # Attached volumes that have become non-responsive will prevent a VM
+        # from starting, so scan for these before attempting to start
+        #
+        # In order to make sure this detach is consistent (virt, BDM, cinder),
+        # we only detach in the virt-layer if a callback is provided.
+        if bad_volumes_callback:
+            bad_devices = self._volumeops.find_bad_volumes(vm_ref)
+            for device_name in bad_devices:
+                self._volumeops.detach_volume(
+                        None, instance['name'], device_name)
+
         self._session.call_xenapi('VM.start_on', vm_ref,
                                   self._session.get_xenapi_host(),
                                   False, False)
 
+        # Allow higher-layers a chance to detach bad-volumes as well (in order
+        # to cleanup BDM entries and detach in Cinder)
+        if bad_volumes_callback and bad_devices:
+            bad_volumes_callback(bad_devices)
+
     def _create_disks(self, context, instance, name_label, disk_image_type,
                       image_meta, block_device_info=None):
         vdis = vm_utils.get_vdis_for_instance(context, self._session,
@@ -930,7 +947,7 @@ class VMOps(object):
 
         return 'VDI.resize'
 
-    def reboot(self, instance, reboot_type):
+    def reboot(self, instance, reboot_type, bad_volumes_callback=None):
         """Reboot VM instance."""
         # Note (salvatore-orlando): security group rules are not re-enforced
         # upon reboot, since this action on the XenAPI drivers does not
@@ -948,9 +965,18 @@ class VMOps(object):
                     details[-1] == 'halted'):
                 LOG.info(_("Starting halted instance found during reboot"),
                          instance=instance)
-                self._session.call_xenapi('VM.start', vm_ref, False, False)
+                self._start(instance, vm_ref=vm_ref,
+                            bad_volumes_callback=bad_volumes_callback)
                 return
-            raise
+            elif details[0] == 'SR_BACKEND_FAILURE_46':
+                LOG.warn(_("Reboot failed due to bad volumes, detaching bad"
+                           " volumes and starting halted instance"),
+                         instance=instance)
+                self._start(instance, vm_ref=vm_ref,
+                            bad_volumes_callback=bad_volumes_callback)
+                return
+            else:
+                raise
 
     def set_admin_password(self, instance, new_pass):
         """Set the root/admin password on the VM instance."""
diff --git a/nova/virt/xenapi/volumeops.py b/nova/virt/xenapi/volumeops.py
index d3c3046b7..add3787a3 100644
--- a/nova/virt/xenapi/volumeops.py
+++ b/nova/virt/xenapi/volumeops.py
@@ -151,3 +151,31 @@ class VolumeOps(object):
         vbd_refs = self._get_all_volume_vbd_refs(vm_ref)
         for vbd_ref in vbd_refs:
             self._detach_vbd(vbd_ref, unplug=unplug)
+
+    def find_bad_volumes(self, vm_ref):
+        """Find any volumes with their connection severed.
+
+        Certain VM operations (e.g. `VM.start`, `VM.reboot`, etc.) will not
+        work when a VBD is present that points to a non-working volume. To work
+        around this, we scan for non-working volumes and detach them before
+        retrying a failed operation.
+        """
+        bad_devices = []
+        vbd_refs = self._get_all_volume_vbd_refs(vm_ref)
+        for vbd_ref in vbd_refs:
+            sr_ref = volume_utils.find_sr_from_vbd(self._session, vbd_ref)
+
+            try:
+                # TODO(sirp): bug1152401 This relies on a 120 sec timeout
+                # within XenServer, update this to fail-fast when this is fixed
+                # upstream
+                self._session.call_xenapi("SR.scan", sr_ref)
+            except self._session.XenAPI.Failure, exc:
+                if exc.details[0] == 'SR_BACKEND_FAILURE_40':
+                    vbd_rec = self._session.call_xenapi(
+                            "VBD.get_record", vbd_ref)
+                    bad_devices.append('/dev/%s' % vbd_rec['device'])
+                else:
+                    raise
+
+        return bad_devices
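For reference, the whole recovery path hinges on inspecting `exc.details[0]` of a `XenAPI.Failure`: `VM.clean_reboot` against a VM with a hung volume fails with `SR_BACKEND_FAILURE_46`, and `SR.scan` against the severed volume's SR fails with `SR_BACKEND_FAILURE_40`. A condensed, self-contained sketch of the detection idiom used in `find_bad_volumes` follows; the `FakeSession` and its canned failure are invented for illustration, and only the error code and the XenAPI call names come from the change:

```python
# Condensed sketch of the failure-code dispatch in find_bad_volumes().
# FakeSession and its canned failure are hypothetical; the error code
# SR_BACKEND_FAILURE_40 and the XenAPI call names come from the diff.

class Failure(Exception):
    def __init__(self, details):
        super(Failure, self).__init__(details)
        self.details = details


class FakeSession(object):
    XenAPI = type('XenAPI', (), {'Failure': Failure})

    def call_xenapi(self, method, *args):
        if method == 'SR.scan':
            # Simulate scanning an SR whose volume connection was severed.
            raise Failure(['SR_BACKEND_FAILURE_40'])
        if method == 'VBD.get_record':
            return {'device': 'xvdb'}


def find_bad_devices(session, vbd_refs, sr_for_vbd):
    """Return device names (e.g. ['/dev/xvdb']) whose SR fails to scan."""
    bad_devices = []
    for vbd_ref in vbd_refs:
        try:
            session.call_xenapi('SR.scan', sr_for_vbd(vbd_ref))
        except session.XenAPI.Failure as exc:
            if exc.details[0] == 'SR_BACKEND_FAILURE_40':
                rec = session.call_xenapi('VBD.get_record', vbd_ref)
                bad_devices.append('/dev/%s' % rec['device'])
            else:
                raise
    return bad_devices


print(find_bad_devices(FakeSession(), ['OpaqueRef:vbd0'], lambda _: 'sr0'))
# -> ['/dev/xvdb']
```

Note that, as the TODO in the diff points out (bug 1152401), the real `SR.scan` probe relies on a 120-second timeout inside XenServer, so detection of a severed volume is slow rather than fail-fast.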