summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Josh Kearney <josh@jk0.org> 2011-10-13 13:14:57 -0500
committer Josh Kearney <josh@jk0.org> 2011-10-13 16:12:42 -0500
commit e50e9b44ab2b8b1184f93d24734af4b5862777bf (patch)
tree 9ed8165fa69eb2e1000441441de04a28fc071cb6
parent 52b5611a863bd59102a492237f9cd7049c2908c2 (diff)
Adds the ability to automatically issue a hard reboot to instances that have been stuck in a 'rebooting' state for longer than a specified window.
Fixes bug 873099. Change-Id: Ife2c64326fdb3ec849242583d1bd1d96f9f4be0f
-rw-r--r-- nova/compute/manager.py 12
-rw-r--r-- nova/db/api.py 6
-rw-r--r-- nova/db/sqlalchemy/api.py 15
-rw-r--r-- nova/tests/test_db_api.py 24
-rw-r--r-- nova/tests/test_virt_drivers.py 4
-rw-r--r-- nova/virt/driver.py 5
-rw-r--r-- nova/virt/fake.py 3
-rw-r--r-- nova/virt/hyperv.py 6
-rw-r--r-- nova/virt/libvirt/connection.py 4
-rw-r--r-- nova/virt/xenapi/vm_utils.py 2
-rw-r--r-- nova/virt/xenapi/vmops.py 20
-rw-r--r-- nova/virt/xenapi_conn.py 4
12 files changed, 104 insertions, 1 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index a10cb1bd6..708920c6a 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -73,6 +73,10 @@ flags.DEFINE_string('console_host', socket.gethostname(),
flags.DEFINE_integer('live_migration_retry_count', 30,
"Retry count needed in live_migration."
" sleep 1 sec for each count")
+flags.DEFINE_integer("reboot_timeout", 0,
+ "Automatically hard reboot an instance if it has been "
+ "stuck in a rebooting state longer than N seconds."
+ " Set to 0 to disable.")
flags.DEFINE_integer("rescue_timeout", 0,
"Automatically unrescue an instance after N seconds."
" Set to 0 to disable.")
@@ -1785,6 +1789,14 @@ class ComputeManager(manager.SchedulerDependentManager):
error_list = []
try:
+ if FLAGS.reboot_timeout > 0:
+ self.driver.poll_rebooting_instances(FLAGS.reboot_timeout)
+ except Exception as ex:
+ LOG.warning(_("Error during poll_rebooting_instances: %s"),
+ unicode(ex))
+ error_list.append(ex)
+
+ try:
if FLAGS.rescue_timeout > 0:
self.driver.poll_rescued_instances(FLAGS.rescue_timeout)
except Exception as ex:
diff --git a/nova/db/api.py b/nova/db/api.py
index a26cb3908..5dbc8b9ad 100644
--- a/nova/db/api.py
+++ b/nova/db/api.py
@@ -585,6 +585,12 @@ def instance_get_project_vpn(context, project_id):
return IMPL.instance_get_project_vpn(context, project_id)
+def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
+ """Get all instances stuck in a rebooting state."""
+ return IMPL.instance_get_all_hung_in_rebooting(context, reboot_window,
+ session)
+
+
def instance_set_state(context, instance_id, state, description=None):
"""Set the state of an instance."""
return IMPL.instance_set_state(context, instance_id, state, description)
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index 077471e95..cb049652b 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -1529,6 +1529,21 @@ def instance_get_floating_address(context, instance_id):
return fixed_ip_refs[0].floating_ips[0]['address']
+@require_admin_context
+def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
+ reboot_window = datetime.datetime.utcnow() - datetime.timedelta(
+ seconds=reboot_window)
+
+ if not session:
+ session = get_session()
+
+ results = session.query(models.Instance).\
+ filter(models.Instance.updated_at <= reboot_window).\
+ filter_by(task_state="rebooting").all()
+
+ return results
+
+
@require_context
def instance_update(context, instance_id, values):
session = get_session()
diff --git a/nova/tests/test_db_api.py b/nova/tests/test_db_api.py
index 81194e3f9..6f6269e52 100644
--- a/nova/tests/test_db_api.py
+++ b/nova/tests/test_db_api.py
@@ -123,3 +123,27 @@ class DbApiTestCase(test.TestCase):
results = db.migration_get_all_unconfirmed(ctxt, 10)
self.assertEqual(0, len(results))
db.migration_update(ctxt, migration.id, {"status": "CONFIRMED"})
+
+ def test_instance_get_all_hung_in_rebooting(self):
+ ctxt = context.get_admin_context()
+
+ # Ensure no instances are returned.
+ results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+ self.assertEqual(0, len(results))
+
+ # Ensure one rebooting instance with updated_at older than 10 seconds
+ # is returned.
+ updated_at = datetime.datetime(2000, 01, 01, 12, 00, 00)
+ values = {"task_state": "rebooting", "updated_at": updated_at}
+ instance = db.instance_create(ctxt, values)
+ results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+ self.assertEqual(1, len(results))
+ db.instance_update(ctxt, instance.id, {"task_state": None})
+
+ # Ensure the newly rebooted instance is not returned.
+ updated_at = datetime.datetime.utcnow()
+ values = {"task_state": "rebooting", "updated_at": updated_at}
+ instance = db.instance_create(ctxt, values)
+ results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+ self.assertEqual(0, len(results))
+ db.instance_update(ctxt, instance.id, {"task_state": None})
diff --git a/nova/tests/test_virt_drivers.py b/nova/tests/test_virt_drivers.py
index fed89a2ec..be77dab2f 100644
--- a/nova/tests/test_virt_drivers.py
+++ b/nova/tests/test_virt_drivers.py
@@ -173,6 +173,10 @@ class _VirtDriverTestCase(test.TestCase):
self.connection.unrescue(instance_ref, lambda x: None, network_info)
@catch_notimplementederror
+ def test_poll_rebooting_instances(self):
+ self.connection.poll_rebooting_instances(10)
+
+ @catch_notimplementederror
def test_poll_rescued_instances(self):
self.connection.poll_rescued_instances(10)
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 3e57980f3..88a239002 100644
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -485,6 +485,11 @@ class ComputeDriver(object):
# TODO(Vek): Need to pass context in for access to auth_token
pass
+ def poll_rebooting_instances(self, timeout):
+ """Poll for rebooting instances"""
+ # TODO(Vek): Need to pass context in for access to auth_token
+ raise NotImplementedError()
+
def poll_rescued_instances(self, timeout):
"""Poll for rescued instances"""
# TODO(Vek): Need to pass context in for access to auth_token
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 1e07eb928..6b70be2bc 100644
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -131,6 +131,9 @@ class FakeConnection(driver.ComputeDriver):
def unrescue(self, instance, callback, network_info):
pass
+ def poll_rebooting_instances(self, timeout):
+ pass
+
def poll_rescued_instances(self, timeout):
pass
diff --git a/nova/virt/hyperv.py b/nova/virt/hyperv.py
index 0d48c3792..16fd94e7f 100644
--- a/nova/virt/hyperv.py
+++ b/nova/virt/hyperv.py
@@ -485,10 +485,16 @@ class HyperVConnection(driver.ComputeDriver):
if vm is None:
raise exception.InstanceNotFound(instance_id=instance_name)
+ def poll_rebooting_instances(self, timeout):
+ """See xenapi_conn.py implementation."""
+ pass
+
def poll_rescued_instances(self, timeout):
+ """See xenapi_conn.py implementation."""
pass
def poll_unconfirmed_resizes(self, resize_confirm_window):
+ """See xenapi_conn.py implementation."""
pass
def update_available_resource(self, ctxt, host):
diff --git a/nova/virt/libvirt/connection.py b/nova/virt/libvirt/connection.py
index 97f90312b..4d6ecac28 100644
--- a/nova/virt/libvirt/connection.py
+++ b/nova/virt/libvirt/connection.py
@@ -614,6 +614,10 @@ class LibvirtConnection(driver.ComputeDriver):
self.reboot(instance, network_info, xml=unrescue_xml)
@exception.wrap_exception()
+ def poll_rebooting_instances(self, timeout):
+ pass
+
+ @exception.wrap_exception()
def poll_rescued_instances(self, timeout):
pass
diff --git a/nova/virt/xenapi/vm_utils.py b/nova/virt/xenapi/vm_utils.py
index 495317228..02c4158e9 100644
--- a/nova/virt/xenapi/vm_utils.py
+++ b/nova/virt/xenapi/vm_utils.py
@@ -713,7 +713,7 @@ class VMHelper(HelperBase):
@classmethod
def lookup(cls, session, name_label):
- """Look the instance i up, and returns it if available"""
+ """Look the instance up and return it if available"""
vm_refs = session.get_xenapi().VM.get_by_name_label(name_label)
n = len(vm_refs)
if n == 0:
diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py
index d539871f1..ee70c4e35 100644
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@@ -1117,6 +1117,26 @@ class VMOps(object):
vm_ref = self._get_vm_opaque_ref(instance)
self._start(instance, vm_ref)
+ def poll_rebooting_instances(self, timeout):
+ """Look for expirable rebooting instances.
+
+ - issue a "hard" reboot to any instance that has been stuck in a
+ reboot state for >= the given timeout
+ """
+ ctxt = nova_context.get_admin_context()
+ instances = db.instance_get_all_hung_in_rebooting(ctxt, timeout)
+
+ instances_info = dict(instance_count=len(instances),
+ timeout=timeout)
+
+ if instances_info["instance_count"] > 0:
+ LOG.info(_("Found %(instance_count)d hung reboots "
+ "older than %(timeout)d seconds") % instances_info)
+
+ for instance in instances:
+ LOG.info(_("Automatically hard rebooting %d"), instance.id)
+ self.compute_api.reboot(ctxt, instance.id, "HARD")
+
def poll_rescued_instances(self, timeout):
"""Look for expirable rescued instances.
diff --git a/nova/virt/xenapi_conn.py b/nova/virt/xenapi_conn.py
index 700934420..2e4a53c5b 100644
--- a/nova/virt/xenapi_conn.py
+++ b/nova/virt/xenapi_conn.py
@@ -265,6 +265,10 @@ class XenAPIConnection(driver.ComputeDriver):
"""Power on the specified instance"""
self._vmops.power_on(instance)
+ def poll_rebooting_instances(self, timeout):
+ """Poll for rebooting instances"""
+ self._vmops.poll_rebooting_instances(timeout)
+
def poll_rescued_instances(self, timeout):
"""Poll for rescued instances"""
self._vmops.poll_rescued_instances(timeout)