Adds the ability to automatically issue a hard reboot to instances that have been stuck in a 'rebooting' state for longer than a specified window.

Fixes bug 873099.

Change-Id: Ife2c64326fdb3ec849242583d1bd1d96f9f4be0f
author: Josh Kearney <josh@jk0.org> 2011-10-13 13:14:57 -0500
committer: Josh Kearney <josh@jk0.org> 2011-10-13 16:12:42 -0500
commit: e50e9b44ab2b8b1184f93d24734af4b5862777bf (patch)
tree: 9ed8165fa69eb2e1000441441de04a28fc071cb6
parent: 52b5611a863bd59102a492237f9cd7049c2908c2 (diff)
12 files changed, 104 insertions, 1 deletion
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index a10cb1bd6..708920c6a 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -73,6 +73,10 @@ flags.DEFINE_string('console_host', socket.gethostname(),
 flags.DEFINE_integer('live_migration_retry_count', 30,
                      "Retry count needed in live_migration."
                      " sleep 1 sec for each count")
+flags.DEFINE_integer("reboot_timeout", 0,
+                     "Automatically hard reboot an instance if it has been "
+                     "stuck in a rebooting state longer than N seconds."
+                     " Set to 0 to disable.")
 flags.DEFINE_integer("rescue_timeout", 0,
                      "Automatically unrescue an instance after N seconds."
                      " Set to 0 to disable.")
@@ -1785,6 +1789,14 @@ class ComputeManager(manager.SchedulerDependentManager):
             error_list = []
 
         try:
+            if FLAGS.reboot_timeout > 0:
+                self.driver.poll_rebooting_instances(FLAGS.reboot_timeout)
+        except Exception as ex:
+            LOG.warning(_("Error during poll_rebooting_instances: %s"),
+                    unicode(ex))
+            error_list.append(ex)
+
+        try:
             if FLAGS.rescue_timeout > 0:
                 self.driver.poll_rescued_instances(FLAGS.rescue_timeout)
         except Exception as ex:
diff --git a/nova/db/api.py b/nova/db/api.py
index a26cb3908..5dbc8b9ad 100644
--- a/nova/db/api.py
+++ b/nova/db/api.py
@@ -585,6 +585,12 @@ def instance_get_project_vpn(context, project_id):
     return IMPL.instance_get_project_vpn(context, project_id)
 
 
+def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
+    """Get all instances stuck in a rebooting state."""
+    return IMPL.instance_get_all_hung_in_rebooting(context, reboot_window,
+            session)
+
+
 def instance_set_state(context, instance_id, state, description=None):
     """Set the state of an instance."""
     return IMPL.instance_set_state(context, instance_id, state, description)
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index 077471e95..cb049652b 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -1529,6 +1529,21 @@ def instance_get_floating_address(context, instance_id):
     return fixed_ip_refs[0].floating_ips[0]['address']
 
 
+@require_admin_context
+def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
+    reboot_window = datetime.datetime.utcnow() - datetime.timedelta(
+            seconds=reboot_window)
+
+    if not session:
+        session = get_session()
+
+    results = session.query(models.Instance).\
+            filter(models.Instance.updated_at <= reboot_window).\
+            filter_by(task_state="rebooting").all()
+
+    return results
+
+
 @require_context
 def instance_update(context, instance_id, values):
     session = get_session()
diff --git a/nova/tests/test_db_api.py b/nova/tests/test_db_api.py
index 81194e3f9..6f6269e52 100644
--- a/nova/tests/test_db_api.py
+++ b/nova/tests/test_db_api.py
@@ -123,3 +123,27 @@ class DbApiTestCase(test.TestCase):
         results = db.migration_get_all_unconfirmed(ctxt, 10)
         self.assertEqual(0, len(results))
         db.migration_update(ctxt, migration.id, {"status": "CONFIRMED"})
+
+    def test_instance_get_all_hung_in_rebooting(self):
+        ctxt = context.get_admin_context()
+
+        # Ensure no instances are returned.
+        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+        self.assertEqual(0, len(results))
+
+        # Ensure one rebooting instance with updated_at older than 10 seconds
+        # is returned.
+        updated_at = datetime.datetime(2000, 01, 01, 12, 00, 00)
+        values = {"task_state": "rebooting", "updated_at": updated_at}
+        instance = db.instance_create(ctxt, values)
+        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+        self.assertEqual(1, len(results))
+        db.instance_update(ctxt, instance.id, {"task_state": None})
+
+        # Ensure the newly rebooted instance is not returned.
+        updated_at = datetime.datetime.utcnow()
+        values = {"task_state": "rebooting", "updated_at": updated_at}
+        instance = db.instance_create(ctxt, values)
+        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+        self.assertEqual(0, len(results))
+        db.instance_update(ctxt, instance.id, {"task_state": None})
diff --git a/nova/tests/test_virt_drivers.py b/nova/tests/test_virt_drivers.py
index fed89a2ec..be77dab2f 100644
--- a/nova/tests/test_virt_drivers.py
+++ b/nova/tests/test_virt_drivers.py
@@ -173,6 +173,10 @@ class _VirtDriverTestCase(test.TestCase):
         self.connection.unrescue(instance_ref, lambda x: None, network_info)
 
     @catch_notimplementederror
+    def test_poll_rebooting_instances(self):
+        self.connection.poll_rebooting_instances(10)
+
+    @catch_notimplementederror
     def test_poll_rescued_instances(self):
         self.connection.poll_rescued_instances(10)
 
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 3e57980f3..88a239002 100644
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -485,6 +485,11 @@ class ComputeDriver(object):
         # TODO(Vek): Need to pass context in for access to auth_token
         pass
 
+    def poll_rebooting_instances(self, timeout):
+        """Poll for rebooting instances"""
+        # TODO(Vek): Need to pass context in for access to auth_token
+        raise NotImplementedError()
+
     def poll_rescued_instances(self, timeout):
         """Poll for rescued instances"""
         # TODO(Vek): Need to pass context in for access to auth_token
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 1e07eb928..6b70be2bc 100644
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -131,6 +131,9 @@ class FakeConnection(driver.ComputeDriver):
     def unrescue(self, instance, callback, network_info):
         pass
 
+    def poll_rebooting_instances(self, timeout):
+        pass
+
     def poll_rescued_instances(self, timeout):
         pass
 
diff --git a/nova/virt/hyperv.py b/nova/virt/hyperv.py
index 0d48c3792..16fd94e7f 100644
--- a/nova/virt/hyperv.py
+++ b/nova/virt/hyperv.py
@@ -485,10 +485,16 @@ class HyperVConnection(driver.ComputeDriver):
         if vm is None:
             raise exception.InstanceNotFound(instance_id=instance_name)
 
+    def poll_rebooting_instances(self, timeout):
+        """See xenapi_conn.py implementation."""
+        pass
+
     def poll_rescued_instances(self, timeout):
+        """See xenapi_conn.py implementation."""
         pass
 
     def poll_unconfirmed_resizes(self, resize_confirm_window):
+        """See xenapi_conn.py implementation."""
         pass
 
     def update_available_resource(self, ctxt, host):
diff --git a/nova/virt/libvirt/connection.py b/nova/virt/libvirt/connection.py
index 97f90312b..4d6ecac28 100644
--- a/nova/virt/libvirt/connection.py
+++ b/nova/virt/libvirt/connection.py
@@ -614,6 +614,10 @@ class LibvirtConnection(driver.ComputeDriver):
         self.reboot(instance, network_info, xml=unrescue_xml)
 
     @exception.wrap_exception()
+    def poll_rebooting_instances(self, timeout):
+        pass
+
+    @exception.wrap_exception()
     def poll_rescued_instances(self, timeout):
         pass
 
diff --git a/nova/virt/xenapi/vm_utils.py b/nova/virt/xenapi/vm_utils.py
index 495317228..02c4158e9 100644
--- a/nova/virt/xenapi/vm_utils.py
+++ b/nova/virt/xenapi/vm_utils.py
@@ -713,7 +713,7 @@ class VMHelper(HelperBase):
 
     @classmethod
     def lookup(cls, session, name_label):
-        """Look the instance i up, and returns it if available"""
+        """Look the instance up and return it if available"""
         vm_refs = session.get_xenapi().VM.get_by_name_label(name_label)
         n = len(vm_refs)
         if n == 0:
diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py
index d539871f1..ee70c4e35 100644
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@@ -1117,6 +1117,26 @@ class VMOps(object):
         vm_ref = self._get_vm_opaque_ref(instance)
         self._start(instance, vm_ref)
 
+    def poll_rebooting_instances(self, timeout):
+        """Look for expirable rebooting instances.
+
+            - issue a "hard" reboot to any instance that has been stuck in a
+              reboot state for >= the given timeout
+        """
+        ctxt = nova_context.get_admin_context()
+        instances = db.instance_get_all_hung_in_rebooting(ctxt, timeout)
+
+        instances_info = dict(instance_count=len(instances),
+                timeout=timeout)
+
+        if instances_info["instance_count"] > 0:
+            LOG.info(_("Found %(instance_count)d hung reboots "
+                    "older than %(timeout)d seconds") % instances_info)
+
+        for instance in instances:
+            LOG.info(_("Automatically hard rebooting %d"), instance.id)
+            self.compute_api.reboot(ctxt, instance.id, "HARD")
+
     def poll_rescued_instances(self, timeout):
         """Look for expirable rescued instances.
 
diff --git a/nova/virt/xenapi_conn.py b/nova/virt/xenapi_conn.py
index 700934420..2e4a53c5b 100644
--- a/nova/virt/xenapi_conn.py
+++ b/nova/virt/xenapi_conn.py
@@ -265,6 +265,10 @@ class XenAPIConnection(driver.ComputeDriver):
         """Power on the specified instance"""
         self._vmops.power_on(instance)
 
+    def poll_rebooting_instances(self, timeout):
+        """Poll for rebooting instances"""
+        self._vmops.poll_rebooting_instances(timeout)
+
     def poll_rescued_instances(self, timeout):
         """Poll for rescued instances"""
         self._vmops.poll_rescued_instances(timeout)
author	Josh Kearney <josh@jk0.org>	2011-10-13 13:14:57 -0500
committer	Josh Kearney <josh@jk0.org>	2011-10-13 16:12:42 -0500
commit	e50e9b44ab2b8b1184f93d24734af4b5862777bf (patch)
tree	9ed8165fa69eb2e1000441441de04a28fc071cb6
parent	52b5611a863bd59102a492237f9cd7049c2908c2 (diff)