summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jenkins <jenkins@review.openstack.org> 2011-10-14 19:04:13 +0000
committer Gerrit Code Review <review@openstack.org> 2011-10-14 19:04:13 +0000
commit c9d2aa8a72c16bfdf76e9a8622143ef7cf500cca (patch)
tree 900b256935b084f8c94fcc3852d9c270f148fb98
parent 80105fbc530b7fc842f1fa8f8318128cf067fb77 (diff)
parent e50e9b44ab2b8b1184f93d24734af4b5862777bf (diff)
download nova-c9d2aa8a72c16bfdf76e9a8622143ef7cf500cca.tar.gz
nova-c9d2aa8a72c16bfdf76e9a8622143ef7cf500cca.tar.xz
nova-c9d2aa8a72c16bfdf76e9a8622143ef7cf500cca.zip
Merge "Adds the ability to automatically issue a hard reboot to instances that have been stuck in a 'rebooting' state for longer than a specified window."
-rw-r--r-- nova/compute/manager.py 12
-rw-r--r-- nova/db/api.py 6
-rw-r--r-- nova/db/sqlalchemy/api.py 15
-rw-r--r-- nova/tests/test_db_api.py 24
-rw-r--r-- nova/tests/test_virt_drivers.py 4
-rw-r--r-- nova/virt/driver.py 5
-rw-r--r-- nova/virt/fake.py 3
-rw-r--r-- nova/virt/hyperv.py 6
-rw-r--r-- nova/virt/libvirt/connection.py 4
-rw-r--r-- nova/virt/xenapi/vm_utils.py 2
-rw-r--r-- nova/virt/xenapi/vmops.py 20
-rw-r--r-- nova/virt/xenapi_conn.py 4
12 files changed, 104 insertions, 1 deletions
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 77283554e..2ccf44050 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -73,6 +73,10 @@ flags.DEFINE_string('console_host', socket.gethostname(),
flags.DEFINE_integer('live_migration_retry_count', 30,
"Retry count needed in live_migration."
" sleep 1 sec for each count")
+flags.DEFINE_integer("reboot_timeout", 0,
+ "Automatically hard reboot an instance if it has been "
+ "stuck in a rebooting state longer than N seconds."
+ " Set to 0 to disable.")
flags.DEFINE_integer("rescue_timeout", 0,
"Automatically unrescue an instance after N seconds."
" Set to 0 to disable.")
@@ -1777,6 +1781,14 @@ class ComputeManager(manager.SchedulerDependentManager):
error_list = []
try:
+ if FLAGS.reboot_timeout > 0:
+ self.driver.poll_rebooting_instances(FLAGS.reboot_timeout)
+ except Exception as ex:
+ LOG.warning(_("Error during poll_rebooting_instances: %s"),
+ unicode(ex))
+ error_list.append(ex)
+
+ try:
if FLAGS.rescue_timeout > 0:
self.driver.poll_rescued_instances(FLAGS.rescue_timeout)
except Exception as ex:
diff --git a/nova/db/api.py b/nova/db/api.py
index a26cb3908..5dbc8b9ad 100644
--- a/nova/db/api.py
+++ b/nova/db/api.py
@@ -585,6 +585,12 @@ def instance_get_project_vpn(context, project_id):
return IMPL.instance_get_project_vpn(context, project_id)
+def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
+ """Get all instances stuck in a rebooting state."""
+ return IMPL.instance_get_all_hung_in_rebooting(context, reboot_window,
+ session)
+
+
def instance_set_state(context, instance_id, state, description=None):
"""Set the state of an instance."""
return IMPL.instance_set_state(context, instance_id, state, description)
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index 077471e95..cb049652b 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -1529,6 +1529,21 @@ def instance_get_floating_address(context, instance_id):
return fixed_ip_refs[0].floating_ips[0]['address']
+@require_admin_context
+def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
+ reboot_window = datetime.datetime.utcnow() - datetime.timedelta(
+ seconds=reboot_window)
+
+ if not session:
+ session = get_session()
+
+ results = session.query(models.Instance).\
+ filter(models.Instance.updated_at <= reboot_window).\
+ filter_by(task_state="rebooting").all()
+
+ return results
+
+
@require_context
def instance_update(context, instance_id, values):
session = get_session()
diff --git a/nova/tests/test_db_api.py b/nova/tests/test_db_api.py
index 81194e3f9..6f6269e52 100644
--- a/nova/tests/test_db_api.py
+++ b/nova/tests/test_db_api.py
@@ -123,3 +123,27 @@ class DbApiTestCase(test.TestCase):
results = db.migration_get_all_unconfirmed(ctxt, 10)
self.assertEqual(0, len(results))
db.migration_update(ctxt, migration.id, {"status": "CONFIRMED"})
+
+ def test_instance_get_all_hung_in_rebooting(self):
+ ctxt = context.get_admin_context()
+
+ # Ensure no instances are returned.
+ results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+ self.assertEqual(0, len(results))
+
+ # Ensure one rebooting instance with updated_at older than 10 seconds
+ # is returned.
+ updated_at = datetime.datetime(2000, 01, 01, 12, 00, 00)
+ values = {"task_state": "rebooting", "updated_at": updated_at}
+ instance = db.instance_create(ctxt, values)
+ results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+ self.assertEqual(1, len(results))
+ db.instance_update(ctxt, instance.id, {"task_state": None})
+
+ # Ensure the newly rebooted instance is not returned.
+ updated_at = datetime.datetime.utcnow()
+ values = {"task_state": "rebooting", "updated_at": updated_at}
+ instance = db.instance_create(ctxt, values)
+ results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
+ self.assertEqual(0, len(results))
+ db.instance_update(ctxt, instance.id, {"task_state": None})
diff --git a/nova/tests/test_virt_drivers.py b/nova/tests/test_virt_drivers.py
index fed89a2ec..be77dab2f 100644
--- a/nova/tests/test_virt_drivers.py
+++ b/nova/tests/test_virt_drivers.py
@@ -173,6 +173,10 @@ class _VirtDriverTestCase(test.TestCase):
self.connection.unrescue(instance_ref, lambda x: None, network_info)
@catch_notimplementederror
+ def test_poll_rebooting_instances(self):
+ self.connection.poll_rebooting_instances(10)
+
+ @catch_notimplementederror
def test_poll_rescued_instances(self):
self.connection.poll_rescued_instances(10)
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 3e57980f3..88a239002 100644
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -485,6 +485,11 @@ class ComputeDriver(object):
# TODO(Vek): Need to pass context in for access to auth_token
pass
+ def poll_rebooting_instances(self, timeout):
+ """Poll for rebooting instances"""
+ # TODO(Vek): Need to pass context in for access to auth_token
+ raise NotImplementedError()
+
def poll_rescued_instances(self, timeout):
"""Poll for rescued instances"""
# TODO(Vek): Need to pass context in for access to auth_token
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 1e07eb928..6b70be2bc 100644
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -131,6 +131,9 @@ class FakeConnection(driver.ComputeDriver):
def unrescue(self, instance, callback, network_info):
pass
+ def poll_rebooting_instances(self, timeout):
+ pass
+
def poll_rescued_instances(self, timeout):
pass
diff --git a/nova/virt/hyperv.py b/nova/virt/hyperv.py
index 0d48c3792..16fd94e7f 100644
--- a/nova/virt/hyperv.py
+++ b/nova/virt/hyperv.py
@@ -485,10 +485,16 @@ class HyperVConnection(driver.ComputeDriver):
if vm is None:
raise exception.InstanceNotFound(instance_id=instance_name)
+ def poll_rebooting_instances(self, timeout):
+ """See xenapi_conn.py implementation."""
+ pass
+
def poll_rescued_instances(self, timeout):
+ """See xenapi_conn.py implementation."""
pass
def poll_unconfirmed_resizes(self, resize_confirm_window):
+ """See xenapi_conn.py implementation."""
pass
def update_available_resource(self, ctxt, host):
diff --git a/nova/virt/libvirt/connection.py b/nova/virt/libvirt/connection.py
index 97f90312b..4d6ecac28 100644
--- a/nova/virt/libvirt/connection.py
+++ b/nova/virt/libvirt/connection.py
@@ -614,6 +614,10 @@ class LibvirtConnection(driver.ComputeDriver):
self.reboot(instance, network_info, xml=unrescue_xml)
@exception.wrap_exception()
+ def poll_rebooting_instances(self, timeout):
+ pass
+
+ @exception.wrap_exception()
def poll_rescued_instances(self, timeout):
pass
diff --git a/nova/virt/xenapi/vm_utils.py b/nova/virt/xenapi/vm_utils.py
index 9f6a8d6b0..56a937c4f 100644
--- a/nova/virt/xenapi/vm_utils.py
+++ b/nova/virt/xenapi/vm_utils.py
@@ -759,7 +759,7 @@ class VMHelper(HelperBase):
@classmethod
def lookup(cls, session, name_label):
- """Look the instance i up, and returns it if available"""
+ """Look the instance up and return it if available"""
vm_refs = session.get_xenapi().VM.get_by_name_label(name_label)
n = len(vm_refs)
if n == 0:
diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py
index 55ec21155..4f8cceb09 100644
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@@ -1133,6 +1133,26 @@ class VMOps(object):
vm_ref = self._get_vm_opaque_ref(instance)
self._start(instance, vm_ref)
+ def poll_rebooting_instances(self, timeout):
+ """Look for expirable rebooting instances.
+
+ - issue a "hard" reboot to any instance that has been stuck in a
+ reboot state for >= the given timeout
+ """
+ ctxt = nova_context.get_admin_context()
+ instances = db.instance_get_all_hung_in_rebooting(ctxt, timeout)
+
+ instances_info = dict(instance_count=len(instances),
+ timeout=timeout)
+
+ if instances_info["instance_count"] > 0:
+ LOG.info(_("Found %(instance_count)d hung reboots "
+ "older than %(timeout)d seconds") % instances_info)
+
+ for instance in instances:
+ LOG.info(_("Automatically hard rebooting %d"), instance.id)
+ self.compute_api.reboot(ctxt, instance.id, "HARD")
+
def poll_rescued_instances(self, timeout):
"""Look for expirable rescued instances.
diff --git a/nova/virt/xenapi_conn.py b/nova/virt/xenapi_conn.py
index 700934420..2e4a53c5b 100644
--- a/nova/virt/xenapi_conn.py
+++ b/nova/virt/xenapi_conn.py
@@ -265,6 +265,10 @@ class XenAPIConnection(driver.ComputeDriver):
"""Power on the specified instance"""
self._vmops.power_on(instance)
+ def poll_rebooting_instances(self, timeout):
+ """Poll for rebooting instances"""
+ self._vmops.poll_rebooting_instances(timeout)
+
def poll_rescued_instances(self, timeout):
"""Poll for rescued instances"""
self._vmops.poll_rescued_instances(timeout)