diff options
| author | Aaron Lee <aaron.lee@rackspace.com> | 2011-11-03 15:05:30 -0500 |
|---|---|---|
| committer | Aaron Lee <wwkeyboard@gmail.com> | 2011-11-07 17:03:40 -0600 |
| commit | c04b431cd63f4d934f40dd1f62a9107ae6dfde90 (patch) | |
| tree | 4f93445ef97b554459b3e1d8aba89d0cc5e3b2f4 | |
| parent | d90aaaafbc11c8e951ccde27cac11f70ae65c9b2 (diff) | |
Move failed instances to error state
On instance creation there is the possibility of
an instance raising an exception. Previously this
would not cause the instance to be moved to the
error state. This patch fixes that. lp885323
update 1: fixing exception handling
update 2: preserving the individual messages
update 3: rebase on master & fix spacing
Change-Id: I7584b527e408c08014f1b6a8abda343f1e2aa3b8
| -rw-r--r-- | nova/compute/manager.py | 42 | ||||
| -rw-r--r-- | nova/tests/test_compute.py | 45 |
2 files changed, 72 insertions, 15 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 4a51147a2..f70109d57 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -33,6 +33,7 @@ terminating it. """ +import contextlib import datetime import functools import os @@ -401,6 +402,23 @@ class ComputeManager(manager.SchedulerDependentManager): self.network_api.deallocate_for_instance(context, instance) + def _cleanup(): + with utils.save_and_reraise_exception(): + self._instance_update(context, + instance_id, + vm_state=vm_states.ERROR) + if network_info is not None: + _deallocate_network() + + @contextlib.contextmanager + def _logging_error(instance_id, message): + try: + yield + except Exception as error: + with utils.save_and_reraise_exception(): + LOG.exception(_("Instance '%(instance_id)s' " + "failed %(message)s.") % locals()) + context = context.elevated() instance = self.db.instance_get(context, instance_id) @@ -423,14 +441,17 @@ class ComputeManager(manager.SchedulerDependentManager): instance['admin_pass'] = kwargs.get('admin_password', None) is_vpn = instance['image_ref'] == str(FLAGS.vpn_image_id) - network_info = _make_network_info() try: + network_info = None + with _logging_error(instance_id, "network setup"): + network_info = _make_network_info() + self._instance_update(context, instance_id, vm_state=vm_states.BUILDING, task_state=task_states.BLOCK_DEVICE_MAPPING) - - block_device_info = _make_block_device_info() + with _logging_error(instance_id, "block device setup"): + block_device_info = _make_block_device_info() self._instance_update(context, instance_id, @@ -438,17 +459,9 @@ class ComputeManager(manager.SchedulerDependentManager): task_state=task_states.SPAWNING) # TODO(vish) check to make sure the availability zone matches - try: + with _logging_error(instance_id, "failed to spawn"): self.driver.spawn(context, instance, network_info, block_device_info) - except Exception as error: # pylint: disable=W0702 - LOG.exception(_("Instance '%(instance_id)s' 
failed to spawn. " - "Details: %(error)s") % locals()) - self._instance_update(context, - instance_id, - vm_state=vm_states.ERROR) - _deallocate_network() - return current_power_state = self._get_power_state(context, instance) self._instance_update(context, @@ -469,9 +482,8 @@ class ComputeManager(manager.SchedulerDependentManager): # deleted before it actually got created. This should # be fixed once we have no-db-messaging pass - except: - with utils.save_and_reraise_exception(): - _deallocate_network() + except Exception: + _cleanup() def _get_instance_volume_bdms(self, context, instance_id): bdms = self.db.block_device_mapping_get_all_by_instance(context, diff --git a/nova/tests/test_compute.py b/nova/tests/test_compute.py index b235bcade..4b80cc58a 100644 --- a/nova/tests/test_compute.py +++ b/nova/tests/test_compute.py @@ -44,6 +44,7 @@ from nova.db.sqlalchemy import models from nova.image import fake as fake_image from nova.notifier import test_notifier from nova.tests import fake_network +from nova.network.quantum import client as quantum_client LOG = logging.getLogger('nova.tests.compute') @@ -551,6 +552,50 @@ class ComputeTestCase(test.TestCase): instance_id) self.compute.terminate_instance(self.context, instance_id) + def test_instance_set_to_error_on_uncaught_exception(self): + """Test that instance is set to error state when exception is raised""" + instance_id = self._create_instance() + + self.mox.StubOutWithMock(self.compute.network_api, + "allocate_for_instance") + self.compute.network_api.allocate_for_instance(mox.IgnoreArg(), + mox.IgnoreArg(), + requested_networks=None, + vpn=False).\ + AndRaise(quantum_client.QuantumServerException()) + + FLAGS.stub_network = False + + self.mox.ReplayAll() + + self.assertRaises(quantum_client.QuantumServerException, + self.compute.run_instance, + self.context, + instance_id) + + instances = db.instance_get_all(context.get_admin_context()) + self.assertEqual(vm_states.ERROR, instances[0]['vm_state']) + + 
self.compute.terminate_instance(self.context, instance_id) + + def test_network_is_deallocated_on_spawn_failure(self): + """When a spawn fails the network must be deallocated""" + instance_id = self._create_instance() + + self.mox.StubOutWithMock(self.compute, "_setup_block_device_mapping") + self.compute._setup_block_device_mapping(mox.IgnoreArg(), + mox.IgnoreArg()).\ + AndRaise(rpc.common.RemoteError('', '', '')) + + self.mox.ReplayAll() + + self.assertRaises(rpc.common.RemoteError, + self.compute.run_instance, + self.context, + instance_id) + + self.compute.terminate_instance(self.context, instance_id) + def test_lock(self): """ensure locked instance cannot be changed""" instance_id = self._create_instance() |
