From 129b87e17d3333aeaa9e855a70dea51e6581ea63 Mon Sep 17 00:00:00 2001 From: Yun Mao Date: Tue, 5 Jun 2012 14:55:34 -0400 Subject: vm state and task state management partially implements bp task-management fixes bug 997867 also see http://wiki.openstack.org/VMState Refactored the following API/state: * rebuild * migrate * resize * start * stop * delete * soft delete * rework sync_power_state in compute/manager. fix broken tests, add transition diagram in dot Change-Id: I3c5a97508a6dad7175fba12828bd3fa6ef1e50ee --- nova/compute/api.py | 60 ++++++--------- nova/compute/manager.py | 182 +++++++++++++++++++++++++++----------------- nova/compute/task_states.py | 44 ++++++++++- nova/compute/vm_states.py | 35 +++++---- 4 files changed, 197 insertions(+), 124 deletions(-) (limited to 'nova/compute') diff --git a/nova/compute/api.py b/nova/compute/api.py index 0f8375058..caec8f4b6 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -62,7 +62,7 @@ flags.DECLARE('consoleauth_topic', 'nova.consoleauth') QUOTAS = quota.QUOTAS -def check_instance_state(vm_state=None, task_state=None): +def check_instance_state(vm_state=None, task_state=(None,)): """Decorator to check VM and/or task state before entry to API functions. If the instance is in the wrong state, the wrapper will raise an exception. 
@@ -811,7 +811,7 @@ class API(base.Base): return dict(instance_ref.iteritems()) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF, + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED, vm_states.ERROR]) def soft_delete(self, context, instance): """Terminate an instance.""" @@ -865,7 +865,7 @@ class API(base.Base): task_state=task_states.DELETING, progress=0) - if instance['task_state'] == task_states.RESIZE_VERIFY: + if instance['vm_state'] == vm_states.RESIZED: # If in the middle of a resize, use confirm_resize to # ensure the original instance is cleaned up too migration_ref = self.db.migration_get_by_instance_and_status( @@ -887,13 +887,9 @@ class API(base.Base): with excutils.save_and_reraise_exception(): QUOTAS.rollback(context, reservations) - # NOTE(jerdfelt): The API implies that only ACTIVE and ERROR are - # allowed but the EC2 API appears to allow from RESCUED and STOPPED - # too + # NOTE(maoy): we allow delete to be called no matter what vm_state says. 
@wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.BUILDING, - vm_states.ERROR, vm_states.RESCUED, - vm_states.SHUTOFF, vm_states.STOPPED]) + @check_instance_state(vm_state=None, task_state=None) def delete(self, context, instance): """Terminate an instance.""" LOG.debug(_("Going to try to terminate instance"), instance=instance) @@ -904,7 +900,7 @@ class API(base.Base): self._delete(context, instance) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.SOFT_DELETE]) + @check_instance_state(vm_state=[vm_states.SOFT_DELETED]) def restore(self, context, instance): """Restore a previously deleted (but not reclaimed) instance.""" if instance['host']: @@ -921,14 +917,14 @@ class API(base.Base): deleted_at=None) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.SOFT_DELETE]) + @check_instance_state(vm_state=[vm_states.SOFT_DELETED]) def force_delete(self, context, instance): """Force delete a previously deleted (but not reclaimed) instance.""" self._delete(context, instance) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF, - vm_states.RESCUED], + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.RESCUED, + vm_states.ERROR, vm_states.STOPPED], task_state=[None]) def stop(self, context, instance, do_cast=True): """Stop an instance.""" @@ -943,7 +939,7 @@ class API(base.Base): self.compute_rpcapi.stop_instance(context, instance, cast=do_cast) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.STOPPED, vm_states.SHUTOFF]) + @check_instance_state(vm_state=[vm_states.STOPPED]) def start(self, context, instance): """Start an instance.""" LOG.debug(_("Going to try to start instance"), instance=instance) @@ -1088,7 +1084,7 @@ class API(base.Base): sort_dir) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF]) + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED]) def backup(self, context, instance, name, 
backup_type, rotation, extra_properties=None): """Backup the given instance @@ -1106,7 +1102,7 @@ class API(base.Base): return recv_meta @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF]) + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED]) def snapshot(self, context, instance, name, extra_properties=None): """Snapshot the given instance. @@ -1201,7 +1197,7 @@ class API(base.Base): return min_ram, min_disk @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF, + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED, vm_states.RESCUED], task_state=[None]) def reboot(self, context, instance, reboot_type): @@ -1222,7 +1218,7 @@ class API(base.Base): return image_service.show(context, image_id) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF], + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED], task_state=[None]) def rebuild(self, context, instance, image_href, admin_password, **kwargs): """Rebuild the given instance with the provided attributes.""" @@ -1270,11 +1266,10 @@ class API(base.Base): self.update(context, instance, - vm_state=vm_states.REBUILDING, + task_state=task_states.REBUILDING, # Unfortunately we need to set image_ref early, # so API users can see it. 
image_ref=image_href, - task_state=None, progress=0, **kwargs) @@ -1288,8 +1283,7 @@ class API(base.Base): image_ref=image_href, orig_image_ref=orig_image_ref) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF], - task_state=[task_states.RESIZE_VERIFY]) + @check_instance_state(vm_state=[vm_states.RESIZED]) def revert_resize(self, context, instance): """Reverts a resize, deleting the 'new' instance in the process.""" context = context.elevated() @@ -1301,7 +1295,6 @@ class API(base.Base): self.update(context, instance, - vm_state=vm_states.RESIZING, task_state=task_states.RESIZE_REVERTING) self.compute_rpcapi.revert_resize(context, @@ -1312,8 +1305,7 @@ class API(base.Base): {'status': 'reverted'}) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF], - task_state=[task_states.RESIZE_VERIFY]) + @check_instance_state(vm_state=[vm_states.RESIZED]) def confirm_resize(self, context, instance): """Confirms a migration/resize and deletes the 'old' instance.""" context = context.elevated() @@ -1338,7 +1330,7 @@ class API(base.Base): {'host': migration_ref['dest_compute'], }) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF], + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED], task_state=[None]) def resize(self, context, instance, flavor_id=None, **kwargs): """Resize (ie, migrate) a running instance. 
@@ -1385,7 +1377,6 @@ class API(base.Base): self.update(context, instance, - vm_state=vm_states.RESIZING, task_state=task_states.RESIZE_PREP, progress=0, **kwargs) @@ -1424,9 +1415,7 @@ class API(base.Base): instance=instance, address=address) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF, - vm_states.RESCUED], - task_state=[None]) + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.RESCUED]) def pause(self, context, instance): """Pause the given instance.""" self.update(context, @@ -1451,9 +1440,7 @@ class API(base.Base): return self.compute_rpcapi.get_diagnostics(context, instance=instance) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF, - vm_states.RESCUED], - task_state=[None]) + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.RESCUED]) def suspend(self, context, instance): """Suspend the given instance.""" self.update(context, @@ -1473,9 +1460,7 @@ class API(base.Base): self.compute_rpcapi.resume_instance(context, instance=instance) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.SHUTOFF, - vm_states.STOPPED], - task_state=[None]) + @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED]) def rescue(self, context, instance, rescue_password=None): """Rescue the given instance.""" self.update(context, @@ -1497,8 +1482,7 @@ class API(base.Base): self.compute_rpcapi.unrescue_instance(context, instance=instance) @wrap_check_policy - @check_instance_state(vm_state=[vm_states.ACTIVE], - task_state=[None]) + @check_instance_state(vm_state=[vm_states.ACTIVE]) def set_admin_password(self, context, instance, password=None): """Set the root/admin password for the given instance.""" self.update(context, diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 7e5070bce..a9816817b 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -811,7 +811,7 @@ class 
ComputeManager(manager.SchedulerDependentManager): @checks_instance_lock @wrap_instance_fault def power_off_instance(self, context, instance_uuid, - final_state=vm_states.SOFT_DELETE): + final_state=vm_states.SOFT_DELETED): """Power off an instance on this host.""" instance = self.db.instance_get_by_uuid(context, instance_uuid) self._notify_about_instance_usage(context, instance, "power_off.start") @@ -895,16 +895,15 @@ class ComputeManager(manager.SchedulerDependentManager): self._instance_update(context, instance_uuid, power_state=current_power_state, - vm_state=vm_states.REBUILDING, - task_state=None) + task_state=task_states.REBUILDING) network_info = self._get_instance_nw_info(context, instance) self.driver.destroy(instance, self._legacy_nw_info(network_info)) instance = self._instance_update(context, instance_uuid, - vm_state=vm_states.REBUILDING, - task_state=task_states.BLOCK_DEVICE_MAPPING) + task_state=task_states.\ + REBUILD_BLOCK_DEVICE_MAPPING) instance.injected_files = kwargs.get('injected_files', []) network_info = self.network_api.get_instance_nw_info(context, @@ -913,8 +912,8 @@ class ComputeManager(manager.SchedulerDependentManager): instance = self._instance_update(context, instance_uuid, - vm_state=vm_states.REBUILDING, - task_state=task_states.SPAWNING) + task_state=task_states.\ + REBUILD_SPAWNING) # pull in new password here since the original password isn't in the db instance.admin_pass = kwargs.get('new_pass', utils.generate_password(FLAGS.password_length)) @@ -1459,10 +1458,10 @@ class ComputeManager(manager.SchedulerDependentManager): instance_ref = self._instance_update(context, instance_ref.uuid, - vm_state=vm_states.ACTIVE, + vm_state=vm_states.RESIZED, host=migration_ref['dest_compute'], launched_at=timeutils.utcnow(), - task_state=task_states.RESIZE_VERIFY) + task_state=None) self.db.migration_update(context, migration_ref.id, {'status': 'finished'}) @@ -2340,9 +2339,10 @@ class ComputeManager(manager.SchedulerDependentManager): 
_set_migration_to_error(migration_id, reason % locals(), instance=instance) continue - if instance['task_state'] != task_states.RESIZE_VERIFY: - state = instance['task_state'] - reason = _("In %(state)s task_state, not RESIZE_VERIFY") + if instance['vm_state'] != vm_states.RESIZED \ + and instance['task_state'] != None: + state = instance['vm_state'] + reason = _("In %(state)s vm_state, not RESIZED") _set_migration_to_error(migration_id, reason % locals(), instance=instance) continue @@ -2407,7 +2407,7 @@ class ComputeManager(manager.SchedulerDependentManager): each loop to allow the periodic task eventlet to do other work. If the instance is not found on the hypervisor, but is in the database, - then it will be set to power_state.NOSTATE. + then a stop() API will be called on the instance. """ db_instances = self.db.instance_get_all_by_host(context, self.host) @@ -2422,68 +2422,114 @@ class ComputeManager(manager.SchedulerDependentManager): # Allow other periodic tasks to do some work... greenthread.sleep(0) db_power_state = db_instance['power_state'] + if db_instance['task_state'] is not None: + LOG.info(_("During sync_power_state the instance has a " + "pending task. Skip."), instance=db_instance) + continue + # No pending tasks. Now try to figure out the real vm_power_state. try: vm_instance = self.driver.get_info(db_instance) vm_power_state = vm_instance['state'] except exception.InstanceNotFound: - # This exception might have been caused by a race condition - # between _sync_power_states and live migrations. Two cases - # are possible as documented below. To this aim, refresh the - # DB instance state. 
- try: - u = self.db.instance_get_by_uuid(context, - db_instance['uuid']) - if self.host != u['host']: - # on the sending end of nova-compute _sync_power_state - # may have yielded to the greenthread performing a live - # migration; this in turn has changed the resident-host - # for the VM; However, the instance is still active, it - # is just in the process of migrating to another host. - # This implies that the compute source must relinquish - # control to the compute destination. - LOG.info(_("During the sync_power process the " - "instance has moved from " - "host %(src)s to host %(dst)s") % - {'src': self.host, - 'dst': u['host']}, - instance=db_instance) - elif (u['host'] == self.host and - u['vm_state'] == vm_states.MIGRATING): - # on the receiving end of nova-compute, it could happen - # that the DB instance already report the new resident - # but the actual VM has not showed up on the hypervisor - # yet. In this case, let's allow the loop to continue - # and run the state sync in a later round - LOG.info(_("Instance is in the process of " - "migrating to this host. Wait next " - "sync_power cycle before setting " - "power state to NOSTATE"), - instance=db_instance) - else: - LOG.warn(_("Instance found in database but not " - "known by hypervisor. Setting power " - "state to NOSTATE"), locals(), - instance=db_instance) - vm_power_state = power_state.NOSTATE - except exception.InstanceNotFound: - # no need to update vm_state for deleted instances - continue - - if vm_power_state == db_power_state: + vm_power_state = power_state.NOSTATE + # Note(maoy): the above get_info call might take a long time, + # for example, because of a broken libvirt driver. + # We re-query the DB to get the latest instance info to minimize + # (not eliminate) race condition. 
+ u = self.db.instance_get_by_uuid(context, + db_instance['uuid']) + db_power_state = u["power_state"] + vm_state = u['vm_state'] + if self.host != u['host']: + # on the sending end of nova-compute _sync_power_state + # may have yielded to the greenthread performing a live + # migration; this in turn has changed the resident-host + # for the VM; However, the instance is still active, it + # is just in the process of migrating to another host. + # This implies that the compute source must relinquish + # control to the compute destination. + LOG.info(_("During the sync_power process the " + "instance has moved from " + "host %(src)s to host %(dst)s") % + {'src': self.host, + 'dst': u['host']}, + instance=db_instance) continue - - if (vm_power_state in (power_state.NOSTATE, - power_state.SHUTDOWN, - power_state.CRASHED) - and db_instance['vm_state'] == vm_states.ACTIVE): - self._instance_update(context, - db_instance['uuid'], - power_state=vm_power_state, - vm_state=vm_states.SHUTOFF) - else: + elif u['task_state'] is not None: + # on the receiving end of nova-compute, it could happen + # that the DB instance already report the new resident + # but the actual VM has not showed up on the hypervisor + # yet. In this case, let's allow the loop to continue + # and run the state sync in a later round + LOG.info(_("During sync_power_state the instance has a " + "pending task. Skip."), instance=db_instance) + continue + if vm_power_state != db_power_state: + # power_state is always updated from hypervisor to db self._instance_update(context, db_instance['uuid'], power_state=vm_power_state) + db_power_state = vm_power_state + # Note(maoy): Now resolve the discrepancy between vm_state and + # vm_power_state. We go through all possible vm_states. + if vm_state in (vm_states.BUILDING, + vm_states.RESCUED, + vm_states.RESIZED, + vm_states.SUSPENDED, + vm_states.PAUSED, + vm_states.ERROR): + # TODO(maoy): we ignore these vm_state for now. 
+ pass + elif vm_state == vm_states.ACTIVE: + # The only rational power state should be RUNNING + if vm_power_state in (power_state.NOSTATE, + power_state.SHUTDOWN, + power_state.CRASHED): + LOG.warn(_("Instance shutdown by itself. Calling " + "the stop API."), instance=db_instance) + try: + # Note(maoy): here we call the API instead of + # brutally updating the vm_state in the database + # to allow all the hooks and checks to be performed. + self.compute_api.stop(context, db_instance) + except Exception: + # Note(maoy): there is no need to propagate the error + # because the same power_state will be retrieved next + # time and retried. + # For example, there might be another task scheduled. + LOG.exception(_("error during stop() in " + "sync_power_state.")) + elif vm_power_state in (power_state.PAUSED, + power_state.SUSPENDED): + LOG.warn(_("Instance is paused or suspended " + "unexpectedly. Calling " + "the stop API."), instance=db_instance) + try: + self.compute_api.stop(context, db_instance) + except Exception: + LOG.exception(_("error during stop() in " + "sync_power_state.")) + elif vm_state == vm_states.STOPPED: + if vm_power_state not in (power_state.NOSTATE, + power_state.SHUTDOWN, + power_state.CRASHED): + LOG.warn(_("Instance is not stopped. Calling " + "the stop API."), instance=db_instance) + try: + # Note(maoy): this assumes that the stop API is + # idempotent. + self.compute_api.stop(context, db_instance) + except Exception: + LOG.exception(_("error during stop() in " + "sync_power_state.")) + elif vm_state in (vm_states.SOFT_DELETED, + vm_states.DELETED): + if vm_power_state not in (power_state.NOSTATE, + power_state.SHUTDOWN): + # Note(maoy): this should be taken care of periodically in + # _cleanup_running_deleted_instances(). 
+ LOG.warn(_("Instance is not (soft-)deleted."), + instance=db_instance) @manager.periodic_task def _reclaim_queued_deletes(self, context): @@ -2498,7 +2544,7 @@ class ComputeManager(manager.SchedulerDependentManager): old_enough = (not instance.deleted_at or timeutils.is_older_than(instance.deleted_at, interval)) - soft_deleted = instance.vm_state == vm_states.SOFT_DELETE + soft_deleted = instance.vm_state == vm_states.SOFT_DELETED if soft_deleted and old_enough: LOG.info(_('Reclaiming deleted instance'), instance=instance) diff --git a/nova/compute/task_states.py b/nova/compute/task_states.py index 795213be0..d4df75e60 100644 --- a/nova/compute/task_states.py +++ b/nova/compute/task_states.py @@ -25,36 +25,74 @@ necessary. """ +# possible task states during create() SCHEDULING = 'scheduling' BLOCK_DEVICE_MAPPING = 'block_device_mapping' NETWORKING = 'networking' SPAWNING = 'spawning' +# possible task states during snapshot() IMAGE_SNAPSHOT = 'image_snapshot' + +# possible task states during backup() IMAGE_BACKUP = 'image_backup' +# possible task states during set_admin_password() UPDATING_PASSWORD = 'updating_password' +# possible task states during resize() RESIZE_PREP = 'resize_prep' RESIZE_MIGRATING = 'resize_migrating' RESIZE_MIGRATED = 'resize_migrated' RESIZE_FINISH = 'resize_finish' + +# possible task states during revert_resize() RESIZE_REVERTING = 'resize_reverting' + +# possible task states during confirm_resize() RESIZE_CONFIRMING = 'resize_confirming' -RESIZE_VERIFY = 'resize_verify' +# possible task states during reboot() REBOOTING = 'rebooting' REBOOTING_HARD = 'rebooting_hard' + +# possible task states during pause() PAUSING = 'pausing' + +# possible task states during unpause() UNPAUSING = 'unpausing' + +# possible task states during suspend() SUSPENDING = 'suspending' + +# possible task states during resume() RESUMING = 'resuming' + +# possible task states during stop() +STOPPING = 'stopping' + +# possible task states during start() +STARTING = 
'starting' + +# possible task states during soft_delete() POWERING_OFF = 'powering-off' + +# possible task states during restore() POWERING_ON = 'powering-on' +# possible task states during rescue() RESCUING = 'rescuing' + +# possible task states during unrescue() UNRESCUING = 'unrescuing' +# possible task states during rebuild() +REBUILDING = 'rebuilding' +REBUILD_BLOCK_DEVICE_MAPPING = "rebuild_block_device_mapping" +REBUILD_SPAWNING = 'rebuild_spawning' + +# possible task states during live_migrate() +MIGRATING = "migrating" + +# possible task states during delete() DELETING = 'deleting' -STOPPING = 'stopping' -STARTING = 'starting' diff --git a/nova/compute/vm_states.py b/nova/compute/vm_states.py index 1d0aa6d62..94a566cce 100644 --- a/nova/compute/vm_states.py +++ b/nova/compute/vm_states.py @@ -18,24 +18,29 @@ """Possible vm states for instances. Compute instance vm states represent the state of an instance as it pertains to -a user or administrator. When combined with task states (task_states.py), a -better picture can be formed regarding the instance's health. +a user or administrator. -""" +vm_state describes a VM's current stable (not transition) state. That is, if +there are no ongoing compute API calls (running tasks), vm_state should reflect +what the customer expects the VM to be. When combined with task states +(task_states.py), a better picture can be formed regarding the instance's +health and progress. -ACTIVE = 'active' -BUILDING = 'building' -REBUILDING = 'rebuilding' +See http://wiki.openstack.org/VMState +""" +ACTIVE = 'active' # VM is running +BUILDING = 'building' # VM only exists in DB PAUSED = 'paused' -SUSPENDED = 'suspended' -SHUTOFF = 'shutoff' -RESCUED = 'rescued' -DELETED = 'deleted' -STOPPED = 'stopped' -SOFT_DELETE = 'soft-delete' - -MIGRATING = 'migrating' -RESIZING = 'resizing' +SUSPENDED = 'suspended' # VM is suspended to disk. +STOPPED = 'stopped' # VM is powered off, the disk image is still there. 
+RESCUED = 'rescued' # A rescue image is running with the original VM image +# attached. +RESIZED = 'resized' # a VM with the new size is active. The user is expected +# to manually confirm or revert. + +SOFT_DELETED = 'soft-delete' # VM is marked as deleted but the disk images are +# still available to restore. +DELETED = 'deleted' # VM is permanently deleted. ERROR = 'error' -- cgit