From 07af4ceea7718c4c67b43ca41843910c23c68169 Mon Sep 17 00:00:00 2001
From: Brian Elliott
Date: Mon, 10 Dec 2012 20:49:06 +0000
Subject: Add accounting for orphans to resource tracker.

Add accounting for orphaned instances to resource tracker.

Orphans are instances that, for whatever reason, exist on the hypervisor
but are not accounted for in the Nova DB. Such instances would cause the
resource tracker to under-report usage numbers and result in out of
memory errors during build.

Change-Id: Icc970e34e01ff8c7dfb000889e5ea6e2d0421c77
---
 nova/compute/resource_tracker.py            | 43 ++++++++++++++++-
 nova/tests/compute/test_resource_tracker.py | 72 +++++++++++++++++++++--------
 nova/tests/test_xenapi.py                   | 27 +++++++++++
 nova/virt/driver.py                         |  7 +++
 nova/virt/xenapi/driver.py                  |  8 ++++
 nova/virt/xenapi/fake.py                    |  8 ++++
 nova/virt/xenapi/vmops.py                   | 21 +++++++++
 7 files changed, 164 insertions(+), 22 deletions(-)

diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 7306b5940..82f8ec461 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -268,6 +268,11 @@ class ResourceTracker(object):
 
         self._update_usage_from_migrations(resources, migrations)
 
+        # Detect and account for orphaned instances that may exist on the
+        # hypervisor, but are not in the DB:
+        orphans = self._find_orphaned_instances()
+        self._update_usage_from_orphans(resources, orphans)
+
         self._report_final_resource_view(resources)
 
         self._sync_compute_node(context, resources)
@@ -364,8 +369,8 @@ class ResourceTracker(object):
 
     def _update_usage(self, resources, usage, sign=1):
         resources['memory_mb_used'] += sign * usage['memory_mb']
-        resources['local_gb_used'] += sign * usage['root_gb']
-        resources['local_gb_used'] += sign * usage['ephemeral_gb']
+        resources['local_gb_used'] += sign * usage.get('root_gb', 0)
+        resources['local_gb_used'] += sign * usage.get('ephemeral_gb', 0)
 
         # free ram and disk may be negative, depending on policy:
         resources['free_ram_mb'] = (resources['memory_mb'] -
@@ -501,6 +506,40 @@ class ResourceTracker(object):
         for instance in instances:
             self._update_usage_from_instance(resources, instance)
 
+    def _find_orphaned_instances(self):
+        """Given the set of instances and migrations already accounted for
+        by the resource tracker, sanity check the hypervisor to determine
+        if there are any "orphaned" instances left hanging around.
+
+        Orphans could be consuming memory and should be accounted for in
+        usage calculations to guard against potential out of memory
+        errors.
+ """ + uuids1 = frozenset(self.tracked_instances.keys()) + uuids2 = frozenset(self.tracked_migrations.keys()) + uuids = uuids1 | uuids2 + + usage = self.driver.get_per_instance_usage() + vuuids = frozenset(usage.keys()) + + orphan_uuids = vuuids - uuids + orphans = [usage[uuid] for uuid in orphan_uuids] + + return orphans + + def _update_usage_from_orphans(self, resources, orphans): + """Include orphaned instances in usage.""" + for orphan in orphans: + uuid = orphan['uuid'] + memory_mb = orphan['memory_mb'] + + LOG.warn(_("Detected running orphan instance: %(uuid)s (consuming " + "%(memory_mb)s MB memory") % locals()) + + # just record memory usage for the orphan + usage = {'memory_mb': orphan['memory_mb']} + self._update_usage(resources, usage) + def _verify_resources(self, resources): resource_keys = ["vcpus", "memory_mb", "local_gb", "cpu_info", "vcpus_used", "memory_mb_used", "local_gb_used"] diff --git a/nova/tests/compute/test_resource_tracker.py b/nova/tests/compute/test_resource_tracker.py index 9bad14275..9cc235579 100644 --- a/nova/tests/compute/test_resource_tracker.py +++ b/nova/tests/compute/test_resource_tracker.py @@ -193,17 +193,17 @@ class BaseTestCase(test.TestCase): # only used in the subsequent notification: return (instance, instance) - def _tracker(self, host=None, unsupported=False): + def _driver(self): + return FakeVirtDriver() + + def _tracker(self, host=None): if host is None: host = self.host node = "fakenode" - if unsupported: - driver = UnsupportedVirtDriver() - else: - driver = FakeVirtDriver() + driver = self._driver() tracker = resource_tracker.ResourceTracker(host, driver, node) return tracker @@ -215,10 +215,13 @@ class UnsupportedDriverTestCase(BaseTestCase): """ def setUp(self): super(UnsupportedDriverTestCase, self).setUp() - self.tracker = self._tracker(unsupported=True) + self.tracker = self._tracker() # seed tracker with data: self.tracker.update_available_resource(self.context) + def _driver(self): + return UnsupportedVirtDriver() + def test_disabled(self): # disabled = no compute node stats self.assertTrue(self.tracker.disabled) @@ -248,7 +251,7 @@ class UnsupportedDriverTestCase(BaseTestCase): root_gb=10) self.tracker.update_usage(self.context, instance) - def testDisabledResizeClaim(self): + def test_disabled_resize_claim(self): instance = self._fake_instance() instance_type = self._fake_instance_type_create() claim = self.tracker.resize_claim(self.context, instance, @@ -258,7 +261,7 @@ class UnsupportedDriverTestCase(BaseTestCase): self.assertEqual(instance_type['id'], claim.migration['new_instance_type_id']) - def testDisabledResizeContextClaim(self): + def test_disabled_resize_context_claim(self): instance = self._fake_instance() instance_type = self._fake_instance_type_create() with self.tracker.resize_claim(self.context, instance, instance_type) \ @@ -327,18 +330,6 @@ class BaseTrackerTestCase(BaseTestCase): self.tracker.update_available_resource(self.context) self.limits = self._limits() - self._assert(FAKE_VIRT_MEMORY_MB, 'memory_mb') - self._assert(FAKE_VIRT_LOCAL_GB, 'local_gb') - self._assert(FAKE_VIRT_VCPUS, 'vcpus') - self._assert(0, 'memory_mb_used') - self._assert(0, 'local_gb_used') - self._assert(0, 'vcpus_used') - self._assert(0, 'running_vms') - self._assert(FAKE_VIRT_MEMORY_MB, 'free_ram_mb') - self._assert(FAKE_VIRT_LOCAL_GB, 'free_disk_gb') - self.assertFalse(self.tracker.disabled) - self.assertEqual(0, self.tracker.compute_node['current_workload']) - def _fake_service_get_all_compute_by_host(self, ctx, host): self.compute 
= self._create_compute_node() self.service = self._create_service(host, compute=self.compute) @@ -412,6 +403,19 @@ class TrackerTestCase(BaseTrackerTestCase): self.assertFalse(self.tracker.disabled) self.assertTrue(self.updated) + def test_init(self): + self._assert(FAKE_VIRT_MEMORY_MB, 'memory_mb') + self._assert(FAKE_VIRT_LOCAL_GB, 'local_gb') + self._assert(FAKE_VIRT_VCPUS, 'vcpus') + self._assert(0, 'memory_mb_used') + self._assert(0, 'local_gb_used') + self._assert(0, 'vcpus_used') + self._assert(0, 'running_vms') + self._assert(FAKE_VIRT_MEMORY_MB, 'free_ram_mb') + self._assert(FAKE_VIRT_LOCAL_GB, 'free_disk_gb') + self.assertFalse(self.tracker.disabled) + self.assertEqual(0, self.tracker.compute_node['current_workload']) + class InstanceClaimTestCase(BaseTrackerTestCase): @@ -817,3 +821,31 @@ class ResizeClaimTestCase(BaseTrackerTestCase): self.assertEqual('fakehost', instance['host']) self.assertEqual('fakehost', instance['launched_on']) self.assertEqual('fakenode', instance['node']) + + +class OrphanTestCase(BaseTrackerTestCase): + + def setUp(self): + super(OrphanTestCase, self).setUp() + + def _driver(self): + class OrphanVirtDriver(FakeVirtDriver): + def get_per_instance_usage(self): + return { + '1-2-3-4-5': {'memory_mb': 4, 'uuid': '1-2-3-4-5'}, + '2-3-4-5-6': {'memory_mb': 4, 'uuid': '2-3-4-5-6'}, + + } + + return OrphanVirtDriver() + + def test_usage(self): + # 2 instances, 4 mb each + self.assertEqual(8, self.tracker.compute_node['memory_mb_used']) + + def test_find(self): + # create one legit instance and verify the 2 orphans remain + self._fake_instance() + orphans = self.tracker._find_orphaned_instances() + + self.assertEqual(2, len(orphans)) diff --git a/nova/tests/test_xenapi.py b/nova/tests/test_xenapi.py index c49664aa8..f2799b8f3 100644 --- a/nova/tests/test_xenapi.py +++ b/nova/tests/test_xenapi.py @@ -1016,6 +1016,33 @@ class XenAPIVMTestCase(stubs.XenAPITestBase): pass self.assertTrue(was['called']) + def test_per_instance_usage_running(self): + instance = self._create_instance(spawn=True) + instance_type = instance_types.get_instance_type(3) + + expected = {instance['uuid']: {'memory_mb': instance_type['memory_mb'], + 'uuid': instance['uuid']}} + actual = self.conn.get_per_instance_usage() + self.assertEqual(expected, actual) + + # Paused instances still consume resources: + self.conn.pause(instance) + actual = self.conn.get_per_instance_usage() + self.assertEqual(expected, actual) + + def test_per_instance_usage_suspended(self): + # Suspended instances do not consume memory: + instance = self._create_instance(spawn=True) + self.conn.suspend(instance) + actual = self.conn.get_per_instance_usage() + self.assertEqual({}, actual) + + def test_per_instance_usage_halted(self): + instance = self._create_instance(spawn=True) + self.conn.power_off(instance) + actual = self.conn.get_per_instance_usage() + self.assertEqual({}, actual) + def _create_instance(self, instance_id=1, spawn=True): """Creates and spawns a test instance.""" instance_values = { diff --git a/nova/virt/driver.py b/nova/virt/driver.py index 991a0f6ce..cb72aa2dc 100644 --- a/nova/virt/driver.py +++ b/nova/virt/driver.py @@ -767,6 +767,13 @@ class ComputeDriver(object): stats = [stats] return [s['hypervisor_hostname'] for s in stats] + def get_per_instance_usage(self): + """Get information about instance resource usage. + + :returns: dict of nova uuid => dict of usage info + """ + return {} + def load_compute_driver(virtapi, compute_driver=None): """Load a compute driver module. 
diff --git a/nova/virt/xenapi/driver.py b/nova/virt/xenapi/driver.py
index 1649ffb47..8e9e74d02 100644
--- a/nova/virt/xenapi/driver.py
+++ b/nova/virt/xenapi/driver.py
@@ -607,6 +607,14 @@ class XenAPIDriver(driver.ComputeDriver):
         """resume guest state when a host is booted"""
         self._vmops.power_on(instance)
 
+    def get_per_instance_usage(self):
+        """Get information about instance resource usage.
+
+        :returns: dict of nova uuid => dict of usage
+                  info
+        """
+        return self._vmops.get_per_instance_usage()
+
 
 class XenAPISession(object):
     """The session to invoke XenAPI SDK calls"""
diff --git a/nova/virt/xenapi/fake.py b/nova/virt/xenapi/fake.py
index db4f5d03e..9af8a9f41 100644
--- a/nova/virt/xenapi/fake.py
+++ b/nova/virt/xenapi/fake.py
@@ -635,6 +635,14 @@ class SessionBase(object):
         db_ref['power_state'] = 'Halted'
 
     VM_clean_shutdown = VM_hard_shutdown
 
+    def VM_suspend(self, session, vm_ref):
+        db_ref = _db_content['VM'][vm_ref]
+        db_ref['power_state'] = 'Suspended'
+
+    def VM_pause(self, session, vm_ref):
+        db_ref = _db_content['VM'][vm_ref]
+        db_ref['power_state'] = 'Paused'
+
     def pool_eject(self, session, host_ref):
         pass
diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py
index ee466b998..8d4687fe8 100644
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@@ -1639,3 +1639,24 @@ class VMOps(object):
             with excutils.save_and_reraise_exception():
                 recover_method(context, instance, destination_hostname,
                         block_migration)
+
+    def get_per_instance_usage(self):
+        """Get usage info about each active instance."""
+        usage = {}
+
+        def _is_active(vm_rec):
+            power_state = vm_rec['power_state'].lower()
+            return power_state in ['running', 'paused']
+
+        def _get_uuid(vm_rec):
+            other_config = vm_rec['other_config']
+            return other_config.get('nova_uuid', None)
+
+        for vm_ref, vm_rec in vm_utils.list_vms(self._session):
+            uuid = _get_uuid(vm_rec)
+
+            if _is_active(vm_rec) and uuid is not None:
+                memory_mb = int(vm_rec['memory_static_max']) / 1024 / 1024
+                usage[uuid] = {'memory_mb': memory_mb, 'uuid': uuid}
+
+        return usage
-- 
cgit
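
Reviewer note: the sketch below is not part of the patch. It is a minimal, self-contained illustration of the accounting flow the change introduces; the MiniTracker and StubDriver classes and the uuid/memory values are invented for the example, and only the set-difference logic and the memory-only orphan accounting mirror _find_orphaned_instances() and _update_usage_from_orphans() above.

```python
# Illustrative sketch only -- simplified stand-ins, not Nova classes.


class StubDriver(object):
    def get_per_instance_usage(self):
        # What the hypervisor reports as running (uuids are made up).
        return {
            'known-uuid': {'uuid': 'known-uuid', 'memory_mb': 512},
            'orphan-uuid': {'uuid': 'orphan-uuid', 'memory_mb': 256},
        }


class MiniTracker(object):
    def __init__(self, driver):
        self.driver = driver
        # uuids the resource tracker already accounts for (from the Nova DB):
        self.tracked_instances = {'known-uuid': None}
        self.tracked_migrations = {}

    def _find_orphaned_instances(self):
        tracked = (frozenset(self.tracked_instances.keys()) |
                   frozenset(self.tracked_migrations.keys()))
        usage = self.driver.get_per_instance_usage()
        # Anything the hypervisor reports but the DB does not is an orphan.
        return [usage[uuid] for uuid in frozenset(usage.keys()) - tracked]

    def _update_usage_from_orphans(self, resources):
        for orphan in self._find_orphaned_instances():
            # Only memory is charged for an orphan; its disk and vcpu needs
            # are unknown, which is why _update_usage() in the patch switches
            # to usage.get('root_gb', 0) / usage.get('ephemeral_gb', 0).
            resources['memory_mb_used'] += orphan['memory_mb']
            resources['free_ram_mb'] = (resources['memory_mb'] -
                                        resources['memory_mb_used'])


resources = {'memory_mb': 2048, 'memory_mb_used': 512, 'free_ram_mb': 1536}
MiniTracker(StubDriver())._update_usage_from_orphans(resources)
# memory_mb_used rises to 768 and free_ram_mb drops to 1280, so the node no
# longer advertises memory that an orphaned VM is already consuming.
print(resources)
```

The XenAPI implementation feeds the tracker data of the same shape: get_per_instance_usage() in vmops.py returns {uuid: {'memory_mb': ..., 'uuid': ...}} for running and paused VMs only, so suspended and halted guests drop out of the accounting, which is exactly what the new test_xenapi tests assert.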