summaryrefslogtreecommitdiffstats
path: root/nova/compute
diff options
context:
space:
mode:
authorYun Mao <yunmao@gmail.com>2012-08-30 14:55:13 -0400
committerYun Mao <yunmao@gmail.com>2012-08-31 13:39:29 -0400
commit4082c8375a6ae7e7e67c7ad2e263be2d5fc3dd1f (patch)
treec4754f7470f4ca9bff17c79129a1f50348a4f585 /nova/compute
parent3dd31e5a55c9932a3073d9988840066c0a7b87d0 (diff)
Address race condition from concurrent task state update
task_state acts as a guard to prevent concurrent tasks from being scheduled. There are two possible race conditions: 1) two tasks are concurrently accepted by the api, both check that task_state is None, and the tasks are allowed to execute concurrently. 2) one ordinary task is running, so task_state is set. A delete task is accepted at the API and will "take over", changing task_state to DELETING. However, the first task may continue to update task_state or set it to None as it finishes. This patch specifies the current expected task_state when updating task_state. If an unexpected state is met, the task is aborted without updating. Various compute tests are fixed accordingly to set the precondition of the task state. Part of bug 1037372 Change-Id: I5fdf0946c728a47febb56ad468043a828b2736c8
Diffstat (limited to 'nova/compute')
-rw-r--r--nova/compute/api.py50
-rw-r--r--nova/compute/manager.py98
2 files changed, 110 insertions, 38 deletions
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 6ce5b5526..68185cca9 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -830,6 +830,7 @@ class API(base.Base):
if instance['host']:
instance = self.update(context, instance,
task_state=task_states.POWERING_OFF,
+ expected_task_state=None,
deleted_at=timeutils.utcnow())
self.compute_rpcapi.power_off_instance(context, instance)
@@ -846,6 +847,8 @@ class API(base.Base):
host = instance['host']
reservations = None
try:
+
+ #Note(maoy): no expected_task_state needs to be set
old, updated = self._update(context,
instance,
task_state=task_states.DELETING,
@@ -931,13 +934,16 @@ class API(base.Base):
"""Restore a previously deleted (but not reclaimed) instance."""
if instance['host']:
instance = self.update(context, instance,
- task_state=task_states.POWERING_ON, deleted_at=None)
+ task_state=task_states.POWERING_ON,
+ expected_task_state=None,
+ deleted_at=None)
self.compute_rpcapi.power_on_instance(context, instance)
else:
self.update(context,
instance,
vm_state=vm_states.ACTIVE,
task_state=None,
+ expected_task_state=None,
deleted_at=None)
@wrap_check_policy
@@ -957,7 +963,9 @@ class API(base.Base):
LOG.debug(_("Going to try to stop instance"), instance=instance)
instance = self.update(context, instance,
- task_state=task_states.STOPPING, progress=0)
+ task_state=task_states.STOPPING,
+ expected_task_state=None,
+ progress=0)
self.compute_rpcapi.stop_instance(context, instance, cast=do_cast)
@@ -969,7 +977,8 @@ class API(base.Base):
LOG.debug(_("Going to try to start instance"), instance=instance)
instance = self.update(context, instance,
- task_state=task_states.STARTING)
+ task_state=task_states.STARTING,
+ expected_task_state=None)
# TODO(yamahata): injected_files isn't supported right now.
# It is used only for osapi. not for ec2 api.
@@ -1235,7 +1244,8 @@ class API(base.Base):
state = {'SOFT': task_states.REBOOTING,
'HARD': task_states.REBOOTING_HARD}[reboot_type]
instance = self.update(context, instance, vm_state=vm_states.ACTIVE,
- task_state=state)
+ task_state=state,
+ expected_task_state=None)
self.compute_rpcapi.reboot_instance(context, instance=instance,
reboot_type=reboot_type)
@@ -1295,6 +1305,7 @@ class API(base.Base):
instance = self.update(context, instance,
task_state=task_states.REBUILDING,
+ expected_task_state=None,
# Unfortunately we need to set image_ref early,
# so API users can see it.
image_ref=image_href, progress=0, **kwargs)
@@ -1325,7 +1336,8 @@ class API(base.Base):
reservations = self._reserve_quota_delta(context, deltas)
instance = self.update(context, instance,
- task_state=task_states.RESIZE_REVERTING)
+ task_state=task_states.RESIZE_REVERTING,
+ expected_task_state=None)
self.compute_rpcapi.revert_resize(context,
instance=instance, migration_id=migration_ref['id'],
@@ -1351,7 +1363,8 @@ class API(base.Base):
reservations = self._reserve_quota_delta(context, deltas)
instance = self.update(context, instance, vm_state=vm_states.ACTIVE,
- task_state=None)
+ task_state=None,
+ expected_task_state=None)
self.compute_rpcapi.confirm_resize(context,
instance=instance, migration_id=migration_ref['id'],
@@ -1506,7 +1519,9 @@ class API(base.Base):
resource=resource)
instance = self.update(context, instance,
- task_state=task_states.RESIZE_PREP, progress=0, **kwargs)
+ task_state=task_states.RESIZE_PREP,
+ expected_task_state=None,
+ progress=0, **kwargs)
request_spec = {
'instance_type': new_instance_type,
@@ -1550,7 +1565,8 @@ class API(base.Base):
self.update(context,
instance,
vm_state=vm_states.ACTIVE,
- task_state=task_states.PAUSING)
+ task_state=task_states.PAUSING,
+ expected_task_state=None)
self.compute_rpcapi.pause_instance(context, instance=instance)
@wrap_check_policy
@@ -1561,7 +1577,8 @@ class API(base.Base):
self.update(context,
instance,
vm_state=vm_states.PAUSED,
- task_state=task_states.UNPAUSING)
+ task_state=task_states.UNPAUSING,
+ expected_task_state=None)
self.compute_rpcapi.unpause_instance(context, instance=instance)
@wrap_check_policy
@@ -1577,7 +1594,8 @@ class API(base.Base):
self.update(context,
instance,
vm_state=vm_states.ACTIVE,
- task_state=task_states.SUSPENDING)
+ task_state=task_states.SUSPENDING,
+ expected_task_state=None)
self.compute_rpcapi.suspend_instance(context, instance=instance)
@wrap_check_policy
@@ -1588,7 +1606,8 @@ class API(base.Base):
self.update(context,
instance,
vm_state=vm_states.SUSPENDED,
- task_state=task_states.RESUMING)
+ task_state=task_states.RESUMING,
+ expected_task_state=None)
self.compute_rpcapi.resume_instance(context, instance=instance)
@wrap_check_policy
@@ -1599,7 +1618,8 @@ class API(base.Base):
self.update(context,
instance,
vm_state=vm_states.ACTIVE,
- task_state=task_states.RESCUING)
+ task_state=task_states.RESCUING,
+ expected_task_state=None)
self.compute_rpcapi.rescue_instance(context, instance=instance,
rescue_password=rescue_password)
@@ -1612,7 +1632,8 @@ class API(base.Base):
self.update(context,
instance,
vm_state=vm_states.RESCUED,
- task_state=task_states.UNRESCUING)
+ task_state=task_states.UNRESCUING,
+ expected_task_state=None)
self.compute_rpcapi.unrescue_instance(context, instance=instance)
@wrap_check_policy
@@ -1622,7 +1643,8 @@ class API(base.Base):
"""Set the root/admin password for the given instance."""
self.update(context,
instance,
- task_state=task_states.UPDATING_PASSWORD)
+ task_state=task_states.UPDATING_PASSWORD,
+ expected_task_state=None)
self.compute_rpcapi.set_admin_password(context,
instance=instance,
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index f61f29293..f6acefc3f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -165,6 +165,12 @@ def reverts_task_state(function):
def decorated_function(self, context, *args, **kwargs):
try:
return function(self, context, *args, **kwargs)
+ except exception.UnexpectedTaskStateError:
+ LOG.exception(_("Possibly task preempted."))
+ # Note(maoy): unexpected task state means the current
+ # task is preempted. Do not clear task state in this
+ # case.
+ raise
except Exception:
with excutils.save_and_reraise_exception():
try:
@@ -676,13 +682,16 @@ class ComputeManager(manager.SchedulerDependentManager):
self._instance_update(context, instance['uuid'],
host=self.host, launched_on=self.host,
vm_state=vm_states.BUILDING,
- task_state=None)
+ task_state=None,
+ expected_task_state=(task_states.SCHEDULING,
+ None))
def _allocate_network(self, context, instance, requested_networks):
"""Allocate networks for an instance and return the network info"""
self._instance_update(context, instance['uuid'],
vm_state=vm_states.BUILDING,
- task_state=task_states.NETWORKING)
+ task_state=task_states.NETWORKING,
+ expected_task_state=None)
is_vpn = instance['image_ref'] == str(FLAGS.vpn_image_id)
try:
# allocate and get network info
@@ -716,7 +725,9 @@ class ComputeManager(manager.SchedulerDependentManager):
"""Spawn an instance with error logging and update its power state"""
self._instance_update(context, instance['uuid'],
vm_state=vm_states.BUILDING,
- task_state=task_states.SPAWNING)
+ task_state=task_states.SPAWNING,
+ expected_task_state=task_states.
+ BLOCK_DEVICE_MAPPING)
try:
self.driver.spawn(context, instance, image_meta,
injected_files, admin_password,
@@ -731,6 +742,7 @@ class ComputeManager(manager.SchedulerDependentManager):
power_state=current_power_state,
vm_state=vm_states.ACTIVE,
task_state=None,
+ expected_task_state=task_states.SPAWNING,
launched_at=timeutils.utcnow())
def _notify_about_instance_usage(self, context, instance, event_suffix,
@@ -863,6 +875,8 @@ class ComputeManager(manager.SchedulerDependentManager):
self._notify_about_instance_usage(context, instance, "delete.start")
self._shutdown_instance(context, instance)
self._cleanup_volumes(context, instance_uuid)
+ # if a delete task succeed, always update vm state and task state
+ # without expecting task state to be DELETING
instance = self._instance_update(context,
instance_uuid,
vm_state=vm_states.DELETED,
@@ -929,6 +943,8 @@ class ComputeManager(manager.SchedulerDependentManager):
instance['uuid'],
power_state=current_power_state,
vm_state=final_state,
+ expected_task_state=(task_states.POWERING_OFF,
+ task_states.STOPPING),
task_state=None)
self._notify_about_instance_usage(context, instance, "power_off.end")
@@ -944,7 +960,9 @@ class ComputeManager(manager.SchedulerDependentManager):
instance['uuid'],
power_state=current_power_state,
vm_state=vm_states.ACTIVE,
- task_state=None)
+ task_state=None,
+ expected_task_state=(task_states.POWERING_ON,
+ task_states.STARTING))
self._notify_about_instance_usage(context, instance, "power_on.end")
@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@@ -989,15 +1007,17 @@ class ComputeManager(manager.SchedulerDependentManager):
self._instance_update(context,
instance['uuid'],
power_state=current_power_state,
- task_state=task_states.REBUILDING)
+ task_state=task_states.REBUILDING,
+ expected_task_state=task_states.REBUILDING)
network_info = self._get_instance_nw_info(context, instance)
self.driver.destroy(instance, self._legacy_nw_info(network_info))
instance = self._instance_update(context,
instance['uuid'],
- task_state=task_states.\
- REBUILD_BLOCK_DEVICE_MAPPING)
+ task_state=task_states.
+ REBUILD_BLOCK_DEVICE_MAPPING,
+ expected_task_state=task_states.REBUILDING)
instance.injected_files = injected_files
network_info = self.network_api.get_instance_nw_info(context,
@@ -1006,8 +1026,10 @@ class ComputeManager(manager.SchedulerDependentManager):
instance = self._instance_update(context,
instance['uuid'],
- task_state=task_states.\
- REBUILD_SPAWNING)
+ task_state=task_states.
+ REBUILD_SPAWNING,
+ expected_task_state=task_states.
+ REBUILD_BLOCK_DEVICE_MAPPING)
# pull in new password here since the original password isn't in
# the db
admin_password = new_pass
@@ -1023,6 +1045,8 @@ class ComputeManager(manager.SchedulerDependentManager):
power_state=current_power_state,
vm_state=vm_states.ACTIVE,
task_state=None,
+ expected_task_state=task_states.
+ REBUILD_SPAWNING,
launched_at=timeutils.utcnow())
self._notify_about_instance_usage(
@@ -1114,7 +1138,8 @@ class ComputeManager(manager.SchedulerDependentManager):
context, instance, "snapshot.start")
self.driver.snapshot(context, instance, image_id)
- self._instance_update(context, instance['uuid'], task_state=None)
+ self._instance_update(context, instance['uuid'], task_state=None,
+ expected_task_state=task_states.IMAGE_SNAPSHOT)
if image_type == 'snapshot' and rotation:
raise exception.ImageRotationNotAllowed()
@@ -1201,7 +1226,9 @@ class ComputeManager(manager.SchedulerDependentManager):
if current_power_state != expected_state:
self._instance_update(context, instance['uuid'],
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.
+ UPDATING_PASSWORD)
_msg = _('Failed to set admin password. Instance %s is not'
' running') % instance["uuid"]
raise exception.InstancePasswordSetFailed(
@@ -1212,7 +1239,9 @@ class ComputeManager(manager.SchedulerDependentManager):
LOG.audit(_("Root password set"), instance=instance)
self._instance_update(context,
instance['uuid'],
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.
+ UPDATING_PASSWORD)
break
except NotImplementedError:
# NOTE(dprince): if the driver doesn't implement
@@ -1222,9 +1251,15 @@ class ComputeManager(manager.SchedulerDependentManager):
LOG.warn(_msg, instance=instance)
self._instance_update(context,
instance['uuid'],
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.
+ UPDATING_PASSWORD)
raise exception.InstancePasswordSetFailed(
instance=instance['uuid'], reason=_msg)
+ except exception.UnexpectedTaskStateError:
+ # interrupted by another (most likely delete) task
+ # do not retry
+ raise
except Exception, e:
# Catch all here because this could be anything.
LOG.exception(_('set_admin_password failed: %s') % e,
@@ -1285,7 +1320,8 @@ class ComputeManager(manager.SchedulerDependentManager):
instance['uuid'],
vm_state=vm_states.RESCUED,
task_state=None,
- power_state=current_power_state)
+ power_state=current_power_state,
+ expected_task_state=task_states.RESCUING)
@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@reverts_task_state
@@ -1306,6 +1342,7 @@ class ComputeManager(manager.SchedulerDependentManager):
instance['uuid'],
vm_state=vm_states.ACTIVE,
task_state=None,
+ expected_task_state=task_states.UNRESCUING,
power_state=current_power_state)
@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@@ -1405,7 +1442,9 @@ class ComputeManager(manager.SchedulerDependentManager):
instance_type_id=instance_type['id'],
launched_at=timeutils.utcnow(),
vm_state=vm_states.ACTIVE,
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.
+ RESIZE_REVERTING)
self.db.migration_update(context, migration_id,
{'status': 'reverted'})
@@ -1495,7 +1534,8 @@ class ComputeManager(manager.SchedulerDependentManager):
{'status': 'migrating'})
self._instance_update(context, instance['uuid'],
- task_state=task_states.RESIZE_MIGRATING)
+ task_state=task_states.RESIZE_MIGRATING,
+ expected_task_state=task_states.RESIZE_PREP)
self._notify_about_instance_usage(
context, instance, "resize.start", network_info=network_info)
@@ -1509,7 +1549,9 @@ class ComputeManager(manager.SchedulerDependentManager):
{'status': 'post-migrating'})
self._instance_update(context, instance['uuid'],
- task_state=task_states.RESIZE_MIGRATED)
+ task_state=task_states.RESIZE_MIGRATED,
+ expected_task_state=task_states.
+ RESIZE_MIGRATING)
self.compute_rpcapi.finish_resize(context, instance, migration_id,
image, disk_info, migration_ref['dest_compute'], reservations)
@@ -1542,7 +1584,8 @@ class ComputeManager(manager.SchedulerDependentManager):
network_info = self._get_instance_nw_info(context, instance)
self._instance_update(context, instance['uuid'],
- task_state=task_states.RESIZE_FINISH)
+ task_state=task_states.RESIZE_FINISH,
+ expected_task_state=task_states.RESIZE_MIGRATED)
self._notify_about_instance_usage(
context, instance, "finish_resize.start",
@@ -1558,7 +1601,9 @@ class ComputeManager(manager.SchedulerDependentManager):
vm_state=vm_states.RESIZED,
host=migration_ref['dest_compute'],
launched_at=timeutils.utcnow(),
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.
+ RESIZE_FINISH)
self.db.migration_update(context, migration_ref.id,
{'status': 'finished'})
@@ -1647,7 +1692,8 @@ class ComputeManager(manager.SchedulerDependentManager):
instance['uuid'],
power_state=current_power_state,
vm_state=vm_states.PAUSED,
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.PAUSING)
@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@reverts_task_state
@@ -1663,7 +1709,8 @@ class ComputeManager(manager.SchedulerDependentManager):
instance['uuid'],
power_state=current_power_state,
vm_state=vm_states.ACTIVE,
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.UNPAUSING)
@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
def host_power_action(self, context, host=None, action=None):
@@ -1711,7 +1758,8 @@ class ComputeManager(manager.SchedulerDependentManager):
instance['uuid'],
power_state=current_power_state,
vm_state=vm_states.SUSPENDED,
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.SUSPENDING)
self._notify_about_instance_usage(context, instance, 'suspend')
@@ -2201,7 +2249,8 @@ class ComputeManager(manager.SchedulerDependentManager):
host=self.host,
power_state=current_power_state,
vm_state=vm_states.ACTIVE,
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.MIGRATING)
# NOTE(vish): this is necessary to update dhcp
self.network_api.setup_networks_on_host(context, instance, self.host)
@@ -2223,7 +2272,8 @@ class ComputeManager(manager.SchedulerDependentManager):
instance_ref['uuid'],
host=host,
vm_state=vm_states.ACTIVE,
- task_state=None)
+ task_state=None,
+ expected_task_state=task_states.MIGRATING)
# NOTE(tr3buchet): setup networks on source host (really it's re-setup)
self.network_api.setup_networks_on_host(context, instance_ref,