| author | Yun Mao <yunmao@gmail.com> | 2012-08-30 14:55:13 -0400 |
|---|---|---|
| committer | Yun Mao <yunmao@gmail.com> | 2012-08-31 13:39:29 -0400 |
| commit | 4082c8375a6ae7e7e67c7ad2e263be2d5fc3dd1f (patch) | |
| tree | c4754f7470f4ca9bff17c79129a1f50348a4f585 /nova/compute | |
| parent | 3dd31e5a55c9932a3073d9988840066c0a7b87d0 (diff) | |
Address race condition from concurrent task state update
task_state acts as a guard to prevent concurrent tasks from being
scheduled. There are two possible race conditions:
1) Two tasks are accepted concurrently by the API; each checks that
task_state is None, and both tasks are allowed to execute concurrently.
2) An ordinary task is running, so task_state is set. A delete task is
then accepted at the API, "takes over", and changes task_state to
DELETING. However, the first task may continue to update task_state, or
set it to None as it finishes.
This patch specifies the currently expected task_state when updating
task_state. If an unexpected state is encountered, the task is aborted
without updating. Various compute tests are fixed accordingly to set the
precondition of the task state.
Part of bug 1037372
Change-Id: I5fdf0946c728a47febb56ad468043a828b2736c8
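
To make the guard concrete, here is a minimal, self-contained sketch of the compare-and-swap semantics described above. The guarded_update helper and the dict-based instance are illustrative stand-ins, not Nova's actual DB layer; UnexpectedTaskStateError is the exception the patch itself relies on.

```python
# Sketch only: compare-and-swap on task_state. The caller declares which
# task_state it expects; if another task changed it in the meantime, the
# update is refused instead of clobbering that task's state.


class UnexpectedTaskStateError(Exception):
    """Raised when task_state is not what the caller expected."""


def guarded_update(instance, expected_task_state, **updates):
    """Apply `updates` only if task_state matches the expectation.

    `expected_task_state` may be a single value (including None) or a
    tuple of acceptable values, mirroring the tuples used in manager.py.
    """
    expected = expected_task_state
    if not isinstance(expected, tuple):
        expected = (expected,)
    if instance['task_state'] not in expected:
        # Another task (e.g. a delete setting DELETING) got here first;
        # abort without overwriting its state.
        raise UnexpectedTaskStateError()
    instance.update(updates)
    return instance


# The delete-preemption scenario from the commit message:
instance = {'task_state': None}
guarded_update(instance, expected_task_state=None, task_state='powering-off')
instance['task_state'] = 'deleting'     # delete task takes over
try:
    # the power-off task tries to finish and clear task_state...
    guarded_update(instance, expected_task_state='powering-off',
                   task_state=None)
except UnexpectedTaskStateError:
    pass                                # ...and is aborted instead
```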
Diffstat (limited to 'nova/compute')
| -rw-r--r-- | nova/compute/api.py | 50 |
| -rw-r--r-- | nova/compute/manager.py | 98 |
2 files changed, 110 insertions, 38 deletions
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 6ce5b5526..68185cca9 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -830,6 +830,7 @@ class API(base.Base):
         if instance['host']:
             instance = self.update(context, instance,
                                    task_state=task_states.POWERING_OFF,
+                                   expected_task_state=None,
                                    deleted_at=timeutils.utcnow())
             self.compute_rpcapi.power_off_instance(context, instance)
@@ -846,6 +847,8 @@ class API(base.Base):
         host = instance['host']
         reservations = None
         try:
+
+            #Note(maoy): no expected_task_state needs to be set
             old, updated = self._update(context,
                                         instance,
                                         task_state=task_states.DELETING,
@@ -931,13 +934,16 @@ class API(base.Base):
         """Restore a previously deleted (but not reclaimed) instance."""
         if instance['host']:
             instance = self.update(context, instance,
-                                   task_state=task_states.POWERING_ON, deleted_at=None)
+                                   task_state=task_states.POWERING_ON,
+                                   expected_task_state=None,
+                                   deleted_at=None)
             self.compute_rpcapi.power_on_instance(context, instance)
         else:
             self.update(context,
                         instance,
                         vm_state=vm_states.ACTIVE,
                         task_state=None,
+                        expected_task_state=None,
                         deleted_at=None)
 
     @wrap_check_policy
@@ -957,7 +963,9 @@ class API(base.Base):
         LOG.debug(_("Going to try to stop instance"), instance=instance)
 
         instance = self.update(context, instance,
-                               task_state=task_states.STOPPING, progress=0)
+                               task_state=task_states.STOPPING,
+                               expected_task_state=None,
+                               progress=0)
 
         self.compute_rpcapi.stop_instance(context, instance, cast=do_cast)
@@ -969,7 +977,8 @@ class API(base.Base):
         LOG.debug(_("Going to try to start instance"), instance=instance)
 
         instance = self.update(context, instance,
-                               task_state=task_states.STARTING)
+                               task_state=task_states.STARTING,
+                               expected_task_state=None)
 
         # TODO(yamahata): injected_files isn't supported right now.
         # It is used only for osapi. not for ec2 api.
@@ -1235,7 +1244,8 @@ class API(base.Base):
         state = {'SOFT': task_states.REBOOTING,
                  'HARD': task_states.REBOOTING_HARD}[reboot_type]
         instance = self.update(context, instance, vm_state=vm_states.ACTIVE,
-                               task_state=state)
+                               task_state=state,
+                               expected_task_state=None)
         self.compute_rpcapi.reboot_instance(context, instance=instance,
                                             reboot_type=reboot_type)
@@ -1295,6 +1305,7 @@ class API(base.Base):
         instance = self.update(context, instance,
                                task_state=task_states.REBUILDING,
+                               expected_task_state=None,
                                # Unfortunately we need to set image_ref early,
                                # so API users can see it.
                                image_ref=image_href, progress=0, **kwargs)
@@ -1325,7 +1336,8 @@ class API(base.Base):
         reservations = self._reserve_quota_delta(context, deltas)
 
         instance = self.update(context, instance,
-                               task_state=task_states.RESIZE_REVERTING)
+                               task_state=task_states.RESIZE_REVERTING,
+                               expected_task_state=None)
 
         self.compute_rpcapi.revert_resize(context,
                 instance=instance, migration_id=migration_ref['id'],
@@ -1351,7 +1363,8 @@ class API(base.Base):
         reservations = self._reserve_quota_delta(context, deltas)
 
         instance = self.update(context, instance, vm_state=vm_states.ACTIVE,
-                               task_state=None)
+                               task_state=None,
+                               expected_task_state=None)
 
         self.compute_rpcapi.confirm_resize(context,
                 instance=instance, migration_id=migration_ref['id'],
@@ -1506,7 +1519,9 @@ class API(base.Base):
                     resource=resource)
 
         instance = self.update(context, instance,
-                task_state=task_states.RESIZE_PREP, progress=0, **kwargs)
+                task_state=task_states.RESIZE_PREP,
+                expected_task_state=None,
+                progress=0, **kwargs)
 
         request_spec = {
                 'instance_type': new_instance_type,
@@ -1550,7 +1565,8 @@ class API(base.Base):
         self.update(context,
                     instance,
                     vm_state=vm_states.ACTIVE,
-                    task_state=task_states.PAUSING)
+                    task_state=task_states.PAUSING,
+                    expected_task_state=None)
         self.compute_rpcapi.pause_instance(context, instance=instance)
 
     @wrap_check_policy
@@ -1561,7 +1577,8 @@ class API(base.Base):
         self.update(context,
                     instance,
                     vm_state=vm_states.PAUSED,
-                    task_state=task_states.UNPAUSING)
+                    task_state=task_states.UNPAUSING,
+                    expected_task_state=None)
         self.compute_rpcapi.unpause_instance(context, instance=instance)
 
     @wrap_check_policy
@@ -1577,7 +1594,8 @@ class API(base.Base):
         self.update(context,
                     instance,
                     vm_state=vm_states.ACTIVE,
-                    task_state=task_states.SUSPENDING)
+                    task_state=task_states.SUSPENDING,
+                    expected_task_state=None)
         self.compute_rpcapi.suspend_instance(context, instance=instance)
 
     @wrap_check_policy
@@ -1588,7 +1606,8 @@ class API(base.Base):
         self.update(context,
                     instance,
                     vm_state=vm_states.SUSPENDED,
-                    task_state=task_states.RESUMING)
+                    task_state=task_states.RESUMING,
+                    expected_task_state=None)
         self.compute_rpcapi.resume_instance(context, instance=instance)
 
     @wrap_check_policy
@@ -1599,7 +1618,8 @@ class API(base.Base):
         self.update(context,
                     instance,
                     vm_state=vm_states.ACTIVE,
-                    task_state=task_states.RESCUING)
+                    task_state=task_states.RESCUING,
+                    expected_task_state=None)
         self.compute_rpcapi.rescue_instance(context, instance=instance,
                                             rescue_password=rescue_password)
@@ -1612,7 +1632,8 @@ class API(base.Base):
         self.update(context,
                     instance,
                     vm_state=vm_states.RESCUED,
-                    task_state=task_states.UNRESCUING)
+                    task_state=task_states.UNRESCUING,
+                    expected_task_state=None)
         self.compute_rpcapi.unrescue_instance(context, instance=instance)
 
     @wrap_check_policy
@@ -1622,7 +1643,8 @@ class API(base.Base):
         """Set the root/admin password for the given instance."""
         self.update(context,
                     instance,
-                    task_state=task_states.UPDATING_PASSWORD)
+                    task_state=task_states.UPDATING_PASSWORD,
+                    expected_task_state=None)
 
         self.compute_rpcapi.set_admin_password(context, instance=instance,
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index f61f29293..f6acefc3f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -165,6 +165,12 @@ def reverts_task_state(function):
     def decorated_function(self, context, *args, **kwargs):
         try:
             return function(self, context, *args, **kwargs)
+        except exception.UnexpectedTaskStateError:
+            LOG.exception(_("Possibly task preempted."))
+            # Note(maoy): unexpected task state means the current
+            # task is preempted. Do not clear task state in this
+            # case.
+            raise
         except Exception:
             with excutils.save_and_reraise_exception():
                 try:
@@ -676,13 +682,16 @@ class ComputeManager(manager.SchedulerDependentManager):
         self._instance_update(context, instance['uuid'],
                               host=self.host, launched_on=self.host,
                               vm_state=vm_states.BUILDING,
-                              task_state=None)
+                              task_state=None,
+                              expected_task_state=(task_states.SCHEDULING,
+                                                   None))
 
     def _allocate_network(self, context, instance, requested_networks):
         """Allocate networks for an instance and return the network info"""
         self._instance_update(context, instance['uuid'],
                               vm_state=vm_states.BUILDING,
-                              task_state=task_states.NETWORKING)
+                              task_state=task_states.NETWORKING,
+                              expected_task_state=None)
         is_vpn = instance['image_ref'] == str(FLAGS.vpn_image_id)
         try:
             # allocate and get network info
@@ -716,7 +725,9 @@ class ComputeManager(manager.SchedulerDependentManager):
         """Spawn an instance with error logging and update its power state"""
         self._instance_update(context, instance['uuid'],
                               vm_state=vm_states.BUILDING,
-                              task_state=task_states.SPAWNING)
+                              task_state=task_states.SPAWNING,
+                              expected_task_state=task_states.
+                                  BLOCK_DEVICE_MAPPING)
         try:
             self.driver.spawn(context, instance, image_meta,
                               injected_files, admin_password,
@@ -731,6 +742,7 @@ class ComputeManager(manager.SchedulerDependentManager):
                               power_state=current_power_state,
                               vm_state=vm_states.ACTIVE,
                               task_state=None,
+                              expected_task_state=task_states.SPAWNING,
                               launched_at=timeutils.utcnow())
 
     def _notify_about_instance_usage(self, context, instance, event_suffix,
@@ -863,6 +875,8 @@ class ComputeManager(manager.SchedulerDependentManager):
         self._notify_about_instance_usage(context, instance, "delete.start")
         self._shutdown_instance(context, instance)
         self._cleanup_volumes(context, instance_uuid)
+        # if a delete task succeed, always update vm state and task state
+        # without expecting task state to be DELETING
         instance = self._instance_update(context,
                                          instance_uuid,
                                          vm_state=vm_states.DELETED,
@@ -929,6 +943,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance['uuid'],
                               power_state=current_power_state,
                               vm_state=final_state,
+                              expected_task_state=(task_states.POWERING_OFF,
+                                                   task_states.STOPPING),
                               task_state=None)
         self._notify_about_instance_usage(context, instance, "power_off.end")
@@ -944,7 +960,9 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance['uuid'],
                               power_state=current_power_state,
                               vm_state=vm_states.ACTIVE,
-                              task_state=None)
+                              task_state=None,
+                              expected_task_state=(task_states.POWERING_ON,
+                                                   task_states.STARTING))
         self._notify_about_instance_usage(context, instance, "power_on.end")
 
     @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@@ -989,15 +1007,17 @@ class ComputeManager(manager.SchedulerDependentManager):
         self._instance_update(context,
                               instance['uuid'],
                               power_state=current_power_state,
-                              task_state=task_states.REBUILDING)
+                              task_state=task_states.REBUILDING,
+                              expected_task_state=task_states.REBUILDING)
 
         network_info = self._get_instance_nw_info(context, instance)
         self.driver.destroy(instance, self._legacy_nw_info(network_info))
 
         instance = self._instance_update(context,
                                          instance['uuid'],
-                                         task_state=task_states.\
-                                             REBUILD_BLOCK_DEVICE_MAPPING)
+                                         task_state=task_states.
+                                             REBUILD_BLOCK_DEVICE_MAPPING,
+                                         expected_task_state=task_states.REBUILDING)
 
         instance.injected_files = injected_files
         network_info = self.network_api.get_instance_nw_info(context,
@@ -1006,8 +1026,10 @@ class ComputeManager(manager.SchedulerDependentManager):
 
         instance = self._instance_update(context,
                                          instance['uuid'],
-                                         task_state=task_states.\
-                                             REBUILD_SPAWNING)
+                                         task_state=task_states.
+                                             REBUILD_SPAWNING,
+                                         expected_task_state=task_states.
+                                             REBUILD_BLOCK_DEVICE_MAPPING)
         # pull in new password here since the original password isn't in
         # the db
         admin_password = new_pass
@@ -1023,6 +1045,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                               power_state=current_power_state,
                               vm_state=vm_states.ACTIVE,
                               task_state=None,
+                              expected_task_state=task_states.
+                                  REBUILD_SPAWNING,
                               launched_at=timeutils.utcnow())
 
         self._notify_about_instance_usage(
@@ -1114,7 +1138,8 @@ class ComputeManager(manager.SchedulerDependentManager):
             context, instance, "snapshot.start")
 
         self.driver.snapshot(context, instance, image_id)
-        self._instance_update(context, instance['uuid'], task_state=None)
+        self._instance_update(context, instance['uuid'], task_state=None,
+                              expected_task_state=task_states.IMAGE_SNAPSHOT)
 
         if image_type == 'snapshot' and rotation:
             raise exception.ImageRotationNotAllowed()
@@ -1201,7 +1226,9 @@ class ComputeManager(manager.SchedulerDependentManager):
                 if current_power_state != expected_state:
                     self._instance_update(context, instance['uuid'],
-                                          task_state=None)
+                                          task_state=None,
+                                          expected_task_state=task_states.
+                                              UPDATING_PASSWORD)
                     _msg = _('Failed to set admin password. Instance %s is not'
                              ' running') % instance["uuid"]
                     raise exception.InstancePasswordSetFailed(
@@ -1212,7 +1239,9 @@ class ComputeManager(manager.SchedulerDependentManager):
                     LOG.audit(_("Root password set"), instance=instance)
                     self._instance_update(context,
                                           instance['uuid'],
-                                          task_state=None)
+                                          task_state=None,
+                                          expected_task_state=task_states.
+                                              UPDATING_PASSWORD)
                     break
                 except NotImplementedError:
                     # NOTE(dprince): if the driver doesn't implement
@@ -1222,9 +1251,15 @@ class ComputeManager(manager.SchedulerDependentManager):
                     LOG.warn(_msg, instance=instance)
                     self._instance_update(context,
                                           instance['uuid'],
-                                          task_state=None)
+                                          task_state=None,
+                                          expected_task_state=task_states.
+                                              UPDATING_PASSWORD)
                     raise exception.InstancePasswordSetFailed(
                             instance=instance['uuid'], reason=_msg)
+                except exception.UnexpectedTaskStateError:
+                    # interrupted by another (most likely delete) task
+                    # do not retry
+                    raise
                 except Exception, e:
                     # Catch all here because this could be anything.
                     LOG.exception(_('set_admin_password failed: %s') % e,
@@ -1285,7 +1320,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance['uuid'],
                               vm_state=vm_states.RESCUED,
                               task_state=None,
-                              power_state=current_power_state)
+                              power_state=current_power_state,
+                              expected_task_state=task_states.RESCUING)
 
     @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
     @reverts_task_state
@@ -1306,6 +1342,7 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance['uuid'],
                               vm_state=vm_states.ACTIVE,
                               task_state=None,
+                              expected_task_state=task_states.UNRESCUING,
                               power_state=current_power_state)
 
     @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@@ -1405,7 +1442,9 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance_type_id=instance_type['id'],
                               launched_at=timeutils.utcnow(),
                               vm_state=vm_states.ACTIVE,
-                              task_state=None)
+                              task_state=None,
+                              expected_task_state=task_states.
+                                  RESIZE_REVERTING)
 
         self.db.migration_update(context, migration_id,
                                  {'status': 'reverted'})
@@ -1495,7 +1534,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                                  {'status': 'migrating'})
 
         self._instance_update(context, instance['uuid'],
-                              task_state=task_states.RESIZE_MIGRATING)
+                              task_state=task_states.RESIZE_MIGRATING,
+                              expected_task_state=task_states.RESIZE_PREP)
 
         self._notify_about_instance_usage(
             context, instance, "resize.start", network_info=network_info)
@@ -1509,7 +1549,9 @@ class ComputeManager(manager.SchedulerDependentManager):
                                  {'status': 'post-migrating'})
 
         self._instance_update(context, instance['uuid'],
-                              task_state=task_states.RESIZE_MIGRATED)
+                              task_state=task_states.RESIZE_MIGRATED,
+                              expected_task_state=task_states.
+                                  RESIZE_MIGRATING)
 
         self.compute_rpcapi.finish_resize(context, instance, migration_id,
             image, disk_info, migration_ref['dest_compute'], reservations)
@@ -1542,7 +1584,8 @@ class ComputeManager(manager.SchedulerDependentManager):
         network_info = self._get_instance_nw_info(context, instance)
 
         self._instance_update(context, instance['uuid'],
-                              task_state=task_states.RESIZE_FINISH)
+                              task_state=task_states.RESIZE_FINISH,
+                              expected_task_state=task_states.RESIZE_MIGRATED)
 
         self._notify_about_instance_usage(
             context, instance, "finish_resize.start",
@@ -1558,7 +1601,9 @@ class ComputeManager(manager.SchedulerDependentManager):
                                          vm_state=vm_states.RESIZED,
                                          host=migration_ref['dest_compute'],
                                          launched_at=timeutils.utcnow(),
-                                         task_state=None)
+                                         task_state=None,
+                                         expected_task_state=task_states.
+                                             RESIZE_FINISH)
 
         self.db.migration_update(context, migration_ref.id,
                                  {'status': 'finished'})
@@ -1647,7 +1692,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance['uuid'],
                               power_state=current_power_state,
                               vm_state=vm_states.PAUSED,
-                              task_state=None)
+                              task_state=None,
+                              expected_task_state=task_states.PAUSING)
 
     @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
     @reverts_task_state
@@ -1663,7 +1709,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance['uuid'],
                               power_state=current_power_state,
                               vm_state=vm_states.ACTIVE,
-                              task_state=None)
+                              task_state=None,
+                              expected_task_state=task_states.UNPAUSING)
 
     @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
     def host_power_action(self, context, host=None, action=None):
@@ -1711,7 +1758,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance['uuid'],
                               power_state=current_power_state,
                               vm_state=vm_states.SUSPENDED,
-                              task_state=None)
+                              task_state=None,
+                              expected_task_state=task_states.SUSPENDING)
 
         self._notify_about_instance_usage(context, instance, 'suspend')
@@ -2201,7 +2249,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                               host=self.host,
                               power_state=current_power_state,
                               vm_state=vm_states.ACTIVE,
-                              task_state=None)
+                              task_state=None,
+                              expected_task_state=task_states.MIGRATING)
 
         # NOTE(vish): this is necessary to update dhcp
         self.network_api.setup_networks_on_host(context, instance, self.host)
@@ -2223,7 +2272,8 @@ class ComputeManager(manager.SchedulerDependentManager):
                               instance_ref['uuid'],
                               host=host,
                               vm_state=vm_states.ACTIVE,
-                              task_state=None)
+                              task_state=None,
+                              expected_task_state=task_states.MIGRATING)
 
         # NOTE(tr3buchet): setup networks on source host (really it's re-setup)
         self.network_api.setup_networks_on_host(context, instance_ref,
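
The reverts_task_state change in manager.py above follows the same reasoning. Below is a rough, self-contained sketch of that behavior; the decorator and exception here are simplified stand-ins for Nova's real code, which is shown in the hunk at line 165 of manager.py.

```python
# Sketch only: why an unexpected task state must NOT trigger the usual
# task_state revert. If the running task was preempted (e.g. by a delete
# that set task_state to DELETING), clearing task_state here would wipe
# out the preempting task's state.
import functools


class UnexpectedTaskStateError(Exception):
    pass


def reverts_task_state(fn):
    @functools.wraps(fn)
    def wrapper(instance, *args, **kwargs):
        try:
            return fn(instance, *args, **kwargs)
        except UnexpectedTaskStateError:
            # Preempted: leave the other task's state untouched.
            raise
        except Exception:
            # Ordinary failure: clear task_state so the instance
            # is not stuck mid-task.
            instance['task_state'] = None
            raise
    return wrapper
```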
