diff options
| author | Jenkins <jenkins@review.openstack.org> | 2011-10-14 19:04:13 +0000 |
|---|---|---|
| committer | Gerrit Code Review <review@openstack.org> | 2011-10-14 19:04:13 +0000 |
| commit | c9d2aa8a72c16bfdf76e9a8622143ef7cf500cca (patch) | |
| tree | 900b256935b084f8c94fcc3852d9c270f148fb98 | |
| parent | 80105fbc530b7fc842f1fa8f8318128cf067fb77 (diff) | |
| parent | e50e9b44ab2b8b1184f93d24734af4b5862777bf (diff) | |
| download | nova-c9d2aa8a72c16bfdf76e9a8622143ef7cf500cca.tar.gz nova-c9d2aa8a72c16bfdf76e9a8622143ef7cf500cca.tar.xz nova-c9d2aa8a72c16bfdf76e9a8622143ef7cf500cca.zip | |
Merge "Adds the ability to automatically issue a hard reboot to instances that have been stuck in a 'rebooting' state for longer than a specified window."
| -rw-r--r-- | nova/compute/manager.py | 12 | ||||
| -rw-r--r-- | nova/db/api.py | 6 | ||||
| -rw-r--r-- | nova/db/sqlalchemy/api.py | 15 | ||||
| -rw-r--r-- | nova/tests/test_db_api.py | 24 | ||||
| -rw-r--r-- | nova/tests/test_virt_drivers.py | 4 | ||||
| -rw-r--r-- | nova/virt/driver.py | 5 | ||||
| -rw-r--r-- | nova/virt/fake.py | 3 | ||||
| -rw-r--r-- | nova/virt/hyperv.py | 6 | ||||
| -rw-r--r-- | nova/virt/libvirt/connection.py | 4 | ||||
| -rw-r--r-- | nova/virt/xenapi/vm_utils.py | 2 | ||||
| -rw-r--r-- | nova/virt/xenapi/vmops.py | 20 | ||||
| -rw-r--r-- | nova/virt/xenapi_conn.py | 4 |
12 files changed, 104 insertions, 1 deletion
diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 77283554e..2ccf44050 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -73,6 +73,10 @@ flags.DEFINE_string('console_host', socket.gethostname(), flags.DEFINE_integer('live_migration_retry_count', 30, "Retry count needed in live_migration." " sleep 1 sec for each count") +flags.DEFINE_integer("reboot_timeout", 0, + "Automatically hard reboot an instance if it has been " + "stuck in a rebooting state longer than N seconds." + " Set to 0 to disable.") flags.DEFINE_integer("rescue_timeout", 0, "Automatically unrescue an instance after N seconds." " Set to 0 to disable.") @@ -1777,6 +1781,14 @@ class ComputeManager(manager.SchedulerDependentManager): error_list = [] try: + if FLAGS.reboot_timeout > 0: + self.driver.poll_rebooting_instances(FLAGS.reboot_timeout) + except Exception as ex: + LOG.warning(_("Error during poll_rebooting_instances: %s"), + unicode(ex)) + error_list.append(ex) + + try: if FLAGS.rescue_timeout > 0: self.driver.poll_rescued_instances(FLAGS.rescue_timeout) except Exception as ex: diff --git a/nova/db/api.py b/nova/db/api.py index a26cb3908..5dbc8b9ad 100644 --- a/nova/db/api.py +++ b/nova/db/api.py @@ -585,6 +585,12 @@ def instance_get_project_vpn(context, project_id): return IMPL.instance_get_project_vpn(context, project_id) +def instance_get_all_hung_in_rebooting(context, reboot_window, session=None): + """Get all instances stuck in a rebooting state.""" + return IMPL.instance_get_all_hung_in_rebooting(context, reboot_window, + session) + + def instance_set_state(context, instance_id, state, description=None): """Set the state of an instance.""" return IMPL.instance_set_state(context, instance_id, state, description) diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py index 077471e95..cb049652b 100644 --- a/nova/db/sqlalchemy/api.py +++ b/nova/db/sqlalchemy/api.py @@ -1529,6 +1529,21 @@ def instance_get_floating_address(context, instance_id): 
return fixed_ip_refs[0].floating_ips[0]['address'] +@require_admin_context +def instance_get_all_hung_in_rebooting(context, reboot_window, session=None): + reboot_window = datetime.datetime.utcnow() - datetime.timedelta( + seconds=reboot_window) + + if not session: + session = get_session() + + results = session.query(models.Instance).\ + filter(models.Instance.updated_at <= reboot_window).\ + filter_by(task_state="rebooting").all() + + return results + + @require_context def instance_update(context, instance_id, values): session = get_session() diff --git a/nova/tests/test_db_api.py b/nova/tests/test_db_api.py index 81194e3f9..6f6269e52 100644 --- a/nova/tests/test_db_api.py +++ b/nova/tests/test_db_api.py @@ -123,3 +123,27 @@ class DbApiTestCase(test.TestCase): results = db.migration_get_all_unconfirmed(ctxt, 10) self.assertEqual(0, len(results)) db.migration_update(ctxt, migration.id, {"status": "CONFIRMED"}) + + def test_instance_get_all_hung_in_rebooting(self): + ctxt = context.get_admin_context() + + # Ensure no instances are returned. + results = db.instance_get_all_hung_in_rebooting(ctxt, 10) + self.assertEqual(0, len(results)) + + # Ensure one rebooting instance with updated_at older than 10 seconds + # is returned. + updated_at = datetime.datetime(2000, 01, 01, 12, 00, 00) + values = {"task_state": "rebooting", "updated_at": updated_at} + instance = db.instance_create(ctxt, values) + results = db.instance_get_all_hung_in_rebooting(ctxt, 10) + self.assertEqual(1, len(results)) + db.instance_update(ctxt, instance.id, {"task_state": None}) + + # Ensure the newly rebooted instance is not returned. 
+ updated_at = datetime.datetime.utcnow() + values = {"task_state": "rebooting", "updated_at": updated_at} + instance = db.instance_create(ctxt, values) + results = db.instance_get_all_hung_in_rebooting(ctxt, 10) + self.assertEqual(0, len(results)) + db.instance_update(ctxt, instance.id, {"task_state": None}) diff --git a/nova/tests/test_virt_drivers.py b/nova/tests/test_virt_drivers.py index fed89a2ec..be77dab2f 100644 --- a/nova/tests/test_virt_drivers.py +++ b/nova/tests/test_virt_drivers.py @@ -173,6 +173,10 @@ class _VirtDriverTestCase(test.TestCase): self.connection.unrescue(instance_ref, lambda x: None, network_info) @catch_notimplementederror + def test_poll_rebooting_instances(self): + self.connection.poll_rebooting_instances(10) + + @catch_notimplementederror def test_poll_rescued_instances(self): self.connection.poll_rescued_instances(10) diff --git a/nova/virt/driver.py b/nova/virt/driver.py index 3e57980f3..88a239002 100644 --- a/nova/virt/driver.py +++ b/nova/virt/driver.py @@ -485,6 +485,11 @@ class ComputeDriver(object): # TODO(Vek): Need to pass context in for access to auth_token pass + def poll_rebooting_instances(self, timeout): + """Poll for rebooting instances""" + # TODO(Vek): Need to pass context in for access to auth_token + raise NotImplementedError() + def poll_rescued_instances(self, timeout): """Poll for rescued instances""" # TODO(Vek): Need to pass context in for access to auth_token diff --git a/nova/virt/fake.py b/nova/virt/fake.py index 1e07eb928..6b70be2bc 100644 --- a/nova/virt/fake.py +++ b/nova/virt/fake.py @@ -131,6 +131,9 @@ class FakeConnection(driver.ComputeDriver): def unrescue(self, instance, callback, network_info): pass + def poll_rebooting_instances(self, timeout): + pass + def poll_rescued_instances(self, timeout): pass diff --git a/nova/virt/hyperv.py b/nova/virt/hyperv.py index 0d48c3792..16fd94e7f 100644 --- a/nova/virt/hyperv.py +++ b/nova/virt/hyperv.py @@ -485,10 +485,16 @@ class 
HyperVConnection(driver.ComputeDriver): if vm is None: raise exception.InstanceNotFound(instance_id=instance_name) + def poll_rebooting_instances(self, timeout): + """See xenapi_conn.py implementation.""" + pass + def poll_rescued_instances(self, timeout): + """See xenapi_conn.py implementation.""" pass def poll_unconfirmed_resizes(self, resize_confirm_window): + """See xenapi_conn.py implementation.""" pass def update_available_resource(self, ctxt, host): diff --git a/nova/virt/libvirt/connection.py b/nova/virt/libvirt/connection.py index 97f90312b..4d6ecac28 100644 --- a/nova/virt/libvirt/connection.py +++ b/nova/virt/libvirt/connection.py @@ -614,6 +614,10 @@ class LibvirtConnection(driver.ComputeDriver): self.reboot(instance, network_info, xml=unrescue_xml) @exception.wrap_exception() + def poll_rebooting_instances(self, timeout): + pass + + @exception.wrap_exception() def poll_rescued_instances(self, timeout): pass diff --git a/nova/virt/xenapi/vm_utils.py b/nova/virt/xenapi/vm_utils.py index 9f6a8d6b0..56a937c4f 100644 --- a/nova/virt/xenapi/vm_utils.py +++ b/nova/virt/xenapi/vm_utils.py @@ -759,7 +759,7 @@ class VMHelper(HelperBase): @classmethod def lookup(cls, session, name_label): - """Look the instance i up, and returns it if available""" + """Look the instance up and return it if available""" vm_refs = session.get_xenapi().VM.get_by_name_label(name_label) n = len(vm_refs) if n == 0: diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py index 55ec21155..4f8cceb09 100644 --- a/nova/virt/xenapi/vmops.py +++ b/nova/virt/xenapi/vmops.py @@ -1133,6 +1133,26 @@ class VMOps(object): vm_ref = self._get_vm_opaque_ref(instance) self._start(instance, vm_ref) + def poll_rebooting_instances(self, timeout): + """Look for expirable rebooting instances. 
+ + - issue a "hard" reboot to any instance that has been stuck in a + reboot state for >= the given timeout + """ + ctxt = nova_context.get_admin_context() + instances = db.instance_get_all_hung_in_rebooting(ctxt, timeout) + + instances_info = dict(instance_count=len(instances), + timeout=timeout) + + if instances_info["instance_count"] > 0: + LOG.info(_("Found %(instance_count)d hung reboots " + "older than %(timeout)d seconds") % instances_info) + + for instance in instances: + LOG.info(_("Automatically hard rebooting %d"), instance.id) + self.compute_api.reboot(ctxt, instance.id, "HARD") + def poll_rescued_instances(self, timeout): """Look for expirable rescued instances. diff --git a/nova/virt/xenapi_conn.py b/nova/virt/xenapi_conn.py index 700934420..2e4a53c5b 100644 --- a/nova/virt/xenapi_conn.py +++ b/nova/virt/xenapi_conn.py @@ -265,6 +265,10 @@ class XenAPIConnection(driver.ComputeDriver): """Power on the specified instance""" self._vmops.power_on(instance) + def poll_rebooting_instances(self, timeout): + """Poll for rebooting instances""" + self._vmops.poll_rebooting_instances(timeout) + def poll_rescued_instances(self, timeout): """Poll for rescued instances""" self._vmops.poll_rescued_instances(timeout) |
