From 7314985d1a660c42d516d1440e284395355b47dd Mon Sep 17 00:00:00 2001
From: Brian Elliott <brian.elliott@rackspace.com>
Date: Mon, 1 Oct 2012 20:56:44 +0000
Subject: Add scheduler retries for prep_resize operations.

Retry operations are subject to race conditions for compute resources.
prep_resize can race for resources on the destination host and
revert_resize can race for resources on the original source host.
This type of problem is handled for new instance build requests
by the resource tracker in compute.

As a first step to extending the resource tracker to cover resizes,
this patch adds scheduler retries around the prep_resize operation.

bug 1065267

Change-Id: I1b32c0a282772d9580e322b50990932016058329
---
 nova/compute/manager.py | 162 +++++++++++++++++++++++++++++++-----------------
 nova/compute/rpcapi.py  |  11 +++-
 2 files changed, 114 insertions(+), 59 deletions(-)

(limited to 'nova/compute')

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index f43aa0096..1dd336ab9 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -212,7 +212,7 @@ def _get_image_meta(context, image_ref):
 class ComputeManager(manager.SchedulerDependentManager):
     """Manages the running instances from creation to destruction."""
 
-    RPC_API_VERSION = '2.9'
+    RPC_API_VERSION = '2.10'
 
     def __init__(self, compute_driver=None, *args, **kwargs):
         """Load configuration options and connect to the hypervisor."""
@@ -531,32 +531,39 @@ class ComputeManager(manager.SchedulerDependentManager):
             with excutils.save_and_reraise_exception():
                 self._set_instance_error_state(context, instance['uuid'])
 
+    def _log_original_error(self, exc_info, instance_uuid):
+        type_, value, tb = exc_info
+        LOG.error(_('Error: %s') %
+                  traceback.format_exception(type_, value, tb),
+                  instance_uuid=instance_uuid)
+
     def _reschedule_or_reraise(self, context, instance, requested_networks,
                                admin_password, injected_files, is_first_time,
                                request_spec, filter_properties):
         """Try to re-schedule the build or re-raise the original build error to
         error out the instance.
         """
-        type_, value, tb = sys.exc_info()  # save original exception
-        rescheduled = False
+        exc_info = sys.exc_info()
         instance_uuid = instance['uuid']
-
-        def _log_original_error():
-            LOG.error(_('Build error: %s') %
-                    traceback.format_exception(type_, value, tb),
-                    instance_uuid=instance_uuid)
+        rescheduled = False
 
         try:
             self._deallocate_network(context, instance)
         except Exception:
             # do not attempt retry if network de-allocation failed:
-            _log_original_error()
+            self._log_original_error(exc_info, instance_uuid)
             raise
 
         try:
-            rescheduled = self._reschedule(context, instance_uuid,
-                    requested_networks, admin_password, injected_files,
-                    is_first_time, request_spec, filter_properties)
+            method_args = (request_spec, admin_password, injected_files,
+                    requested_networks, is_first_time, filter_properties)
+            task_state = task_states.SCHEDULING
+
+            rescheduled = self._reschedule(context, request_spec,
+                    instance['uuid'], filter_properties,
+                    self.scheduler_rpcapi.run_instance, method_args,
+                    task_state)
+
         except Exception:
             rescheduled = False
             LOG.exception(_("Error trying to reschedule"),
@@ -564,14 +571,14 @@ class ComputeManager(manager.SchedulerDependentManager):
 
         if rescheduled:
             # log the original build error
-            _log_original_error()
+            self._log_original_error(exc_info, instance_uuid)
         else:
             # not re-scheduling
-            raise type_, value, tb
+            raise exc_info[0], exc_info[1], exc_info[2]
 
-    def _reschedule(self, context, instance_uuid, requested_networks,
-            admin_password, injected_files, is_first_time, request_spec,
-            filter_properties):
+    def _reschedule(self, context, request_spec, filter_properties,
+            instance_uuid, scheduler_method, method_args, task_state):
+        """Attempt to re-schedule a compute operation."""
 
         retry = filter_properties.get('retry', None)
         if not retry:
@@ -587,16 +594,14 @@ class ComputeManager(manager.SchedulerDependentManager):
 
         request_spec['instance_uuids'] = [instance_uuid]
 
-        LOG.debug(_("Re-scheduling instance: attempt %d"),
-                  retry['num_attempts'], instance_uuid=instance_uuid)
+        LOG.debug(_("Re-scheduling %(method)s: attempt %(num)d") %
+                {'method': scheduler_method.func_name,
+                 'num': retry['num_attempts']}, instance_uuid=instance_uuid)
 
         # reset the task state:
-        self._instance_update(context, instance_uuid,
-                task_state=task_states.SCHEDULING)
+        self._instance_update(context, instance_uuid, task_state=task_state)
 
-        self.scheduler_rpcapi.run_instance(context,
-                request_spec, admin_password, injected_files,
-                requested_networks, is_first_time, filter_properties)
+        scheduler_method(context, *method_args)
         return True
 
     @manager.periodic_task
@@ -1590,7 +1595,8 @@ class ComputeManager(manager.SchedulerDependentManager):
     @reverts_task_state
     @wrap_instance_fault
     def prep_resize(self, context, image, instance, instance_type,
-                    reservations=None):
+                    reservations=None, request_spec=None,
+                    filter_properties=None):
         """Initiates the process of moving a running instance to another host.
 
         Possibly changes the RAM and disk size in the process.
@@ -1603,38 +1609,82 @@ class ComputeManager(manager.SchedulerDependentManager):
             self._notify_about_instance_usage(
                     context, instance, "resize.prep.start")
 
-            same_host = instance['host'] == self.host
-            if same_host and not FLAGS.allow_resize_to_same_host:
-                self._set_instance_error_state(context, instance['uuid'])
-                msg = _('destination same as source!')
-                raise exception.MigrationError(msg)
-
-            # TODO(russellb): no-db-compute: Send the old instance type info
-            # that is needed via rpc so db access isn't required here.
-            old_instance_type_id = instance['instance_type_id']
-            old_instance_type = instance_types.get_instance_type(
-                    old_instance_type_id)
-
-            migration_ref = self.db.migration_create(context.elevated(),
-                    {'instance_uuid': instance['uuid'],
-                     'source_compute': instance['host'],
-                     'dest_compute': self.host,
-                     'dest_host': self.driver.get_host_ip_addr(),
-                     'old_instance_type_id': old_instance_type['id'],
-                     'new_instance_type_id': instance_type['id'],
-                     'status': 'pre-migrating'})
-
-            LOG.audit(_('Migrating'), context=context, instance=instance)
-            self.compute_rpcapi.resize_instance(context, instance,
-                    migration_ref, image, reservations)
-
-            extra_usage_info = dict(
-                    new_instance_type=instance_type['name'],
-                    new_instance_type_id=instance_type['id'])
+            try:
+                same_host = instance['host'] == self.host
+                if same_host and not FLAGS.allow_resize_to_same_host:
+                    self._set_instance_error_state(context, instance['uuid'])
+                    msg = _('destination same as source!')
+                    raise exception.MigrationError(msg)
+
+                # TODO(russellb): no-db-compute: Send the old instance type
+                # info that is needed via rpc so db access isn't required
+                # here.
+                old_instance_type_id = instance['instance_type_id']
+                old_instance_type = instance_types.get_instance_type(
+                        old_instance_type_id)
+
+                migration_ref = self.db.migration_create(context.elevated(),
+                        {'instance_uuid': instance['uuid'],
+                         'source_compute': instance['host'],
+                         'dest_compute': self.host,
+                         'dest_host': self.driver.get_host_ip_addr(),
+                         'old_instance_type_id': old_instance_type['id'],
+                         'new_instance_type_id': instance_type['id'],
+                         'status': 'pre-migrating'})
+
+                LOG.audit(_('Migrating'), context=context,
+                        instance=instance)
+                self.compute_rpcapi.resize_instance(context, instance,
+                        migration_ref, image, reservations)
 
-            self._notify_about_instance_usage(
-                context, instance, "resize.prep.end",
-                extra_usage_info=extra_usage_info)
+            except Exception:
+                # try to re-schedule the resize elsewhere:
+                self._reschedule_resize_or_reraise(context, image, instance,
+                        instance_type, reservations, request_spec,
+                        filter_properties)
+            finally:
+                extra_usage_info = dict(
+                        new_instance_type=instance_type['name'],
+                        new_instance_type_id=instance_type['id'])
+
+                self._notify_about_instance_usage(
+                    context, instance, "resize.prep.end",
+                    extra_usage_info=extra_usage_info)
+
+    def _reschedule_resize_or_reraise(self, context, image, instance,
+            instance_type, reservations, request_spec, filter_properties):
+        """Try to re-schedule the resize or re-raise the original error to
+        error out the instance.
+        """
+        if not request_spec:
+            request_spec = {}
+        if not filter_properties:
+            filter_properties = {}
+
+        exc_info = sys.exc_info()
+        rescheduled = False
+        instance_uuid = instance['uuid']
+
+        try:
+            scheduler_method = self.scheduler_rpcapi.prep_resize
+            method_args = (instance, instance_type, image, request_spec,
+                           filter_properties, reservations)
+            task_state = task_states.RESIZE_PREP
+
+            rescheduled = self._reschedule(context, request_spec,
+                    filter_properties, instance_uuid, scheduler_method,
+                    method_args, task_state)
+        except Exception:
+            rescheduled = False
+            LOG.exception(_("Error trying to reschedule"),
+                          instance_uuid=instance_uuid)
+
+        if rescheduled:
+            # log the original build error
+            self._log_original_error(exc_info, instance_uuid)
+        else:
+            # not re-scheduling
+            raise exc_info[0], exc_info[1], exc_info[2]
 
     @exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
     @reverts_task_state
diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py
index 38d9b3426..8b4a4a529 100644
--- a/nova/compute/rpcapi.py
+++ b/nova/compute/rpcapi.py
@@ -137,6 +137,7 @@ class ComputeAPI(nova.openstack.common.rpc.proxy.RpcProxy):
         2.7 - Remove migration_id, add migration to confirm_resize
         2.8 - Remove migration_id, add migration to finish_resize
         2.9 - Add publish_service_capabilities()
+        2.10 - Adds filter_properties and request_spec to prep_resize()
     '''
 
     #
@@ -343,13 +344,17 @@ class ComputeAPI(nova.openstack.common.rpc.proxy.RpcProxy):
                 disk=disk), _compute_topic(self.topic, ctxt, host, None))
 
     def prep_resize(self, ctxt, image, instance, instance_type, host,
-                    reservations=None):
+                    reservations=None, request_spec=None,
+                    filter_properties=None):
         instance_p = jsonutils.to_primitive(instance)
         instance_type_p = jsonutils.to_primitive(instance_type)
         self.cast(ctxt, self.make_msg('prep_resize',
                 instance=instance_p, instance_type=instance_type_p,
-                image=image, reservations=reservations),
-                _compute_topic(self.topic, ctxt, host, None))
+                image=image, reservations=reservations,
+                request_spec=request_spec,
+                filter_properties=filter_properties),
+                _compute_topic(self.topic, ctxt, host, None),
+                version='2.10')
 
     def reboot_instance(self, ctxt, instance,
                         block_device_info, network_info, reboot_type):
-- 
cgit