From c2de33a0a2132774dc295861cef138ec24bb0cf9 Mon Sep 17 00:00:00 2001 From: Michael Still Date: Wed, 14 Nov 2012 18:37:04 +1100 Subject: Detect shared storage; handle base cleanup better. If base image storage is shared, we need to care about remote instances when we clean up. This patch "learns" which storage is shared, and then decides what base images are in use anywhere on the set of compute nodes which share that base storage. This is complicated because shared instance storage doesn't have to be per-cluster. It could for example be per rack. We need to handle that properly. This should resolve bug 1078594. Change-Id: I36d0d6e965b114bb68c8f7b7fd43f8e96b2dd8f5 --- nova/compute/manager.py | 18 +++++++++++- nova/tests/test_imagecache.py | 17 ++++++----- nova/virt/driver.py | 1 + nova/virt/libvirt/driver.py | 2 +- nova/virt/libvirt/imagecache.py | 46 ++++++++++++++---------------- nova/virt/storage_users.py | 63 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 114 insertions(+), 33 deletions(-) create mode 100644 nova/virt/storage_users.py diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 7484a9252..acadd6f61 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -73,6 +73,7 @@ from nova import quota from nova.scheduler import rpcapi as scheduler_rpcapi from nova import utils from nova.virt import driver +from nova.virt import storage_users from nova.virt import virtapi from nova import volume @@ -3247,4 +3248,19 @@ class ComputeManager(manager.SchedulerDependentManager): return all_instances = self.db.instance_get_all(context) - self.driver.manage_image_cache(context, all_instances) + + # Determine what other nodes use this storage + storage_users.register_storage_use(CONF.instances_path, CONF.host) + nodes = storage_users.get_storage_users(CONF.instances_path) + + # Filter all_instances to only include those nodes which share this + # storage path. 
+ # TODO(mikal): this should be further refactored so that the cache + # cleanup code doesn't know what those instances are, just a remote + # count, and then this logic should be pushed up the stack. + filtered_instances = [] + for instance in all_instances: + if instance['host'] in nodes: + filtered_instances.append(instance) + + self.driver.manage_image_cache(context, filtered_instances) diff --git a/nova/tests/test_imagecache.py b/nova/tests/test_imagecache.py index 78b7248bc..09570204f 100644 --- a/nova/tests/test_imagecache.py +++ b/nova/tests/test_imagecache.py @@ -627,22 +627,22 @@ class ImageCacheManagerTestCase(test.TestCase): self.assertEquals(image_cache_manager.corrupt_base_files, []) def test_handle_base_image_used_remotely(self): + self.stubs.Set(virtutils, 'chown', lambda x, y: None) img = '123' with self._make_base_file() as fname: os.utime(fname, (-1, time.time() - 3601)) image_cache_manager = imagecache.ImageCacheManager() + image_cache_manager.unexplained_images = [fname] image_cache_manager.used_images = {'123': (0, 1, ['banana-42'])} - image_cache_manager._handle_base_image(img, None) + image_cache_manager._handle_base_image(img, fname) self.assertEquals(image_cache_manager.unexplained_images, []) self.assertEquals(image_cache_manager.removable_base_files, []) self.assertEquals(image_cache_manager.corrupt_base_files, []) def test_handle_base_image_absent(self): - """Ensure we warn for use of a missing base image.""" - img = '123' with self._intercept_log_messages() as stream: @@ -942,7 +942,10 @@ class ImageCacheManagerTestCase(test.TestCase): 'vm_state': '', 'task_state': ''}] - self.stubs.Set(db, 'instance_get_all', fake_get_all) - compute = importutils.import_object(CONF.compute_manager) - compute._run_image_cache_manager_pass(None) - self.assertTrue(was['called']) + with utils.tempdir() as tmpdir: + self.flags(instances_path=tmpdir) + + self.stubs.Set(db, 'instance_get_all', fake_get_all) + compute = 
importutils.import_object(CONF.compute_manager) + compute._run_image_cache_manager_pass(None) + self.assertTrue(was['called']) diff --git a/nova/virt/driver.py b/nova/virt/driver.py index 4dd7b1c66..3f4d94f19 100644 --- a/nova/virt/driver.py +++ b/nova/virt/driver.py @@ -708,6 +708,7 @@ class ComputeDriver(object): related to other calls into the driver. The prime example is to clean the cache and remove images which are no longer of interest. """ + pass def add_to_aggregate(self, context, aggregate, host, **kwargs): """Add a compute host to an aggregate.""" diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 865577105..14e8a75d2 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -2388,7 +2388,7 @@ class LibvirtDriver(driver.ComputeDriver): raise exception.InvalidCPUInfo(reason=m % locals()) def _create_shared_storage_test_file(self): - """Makes tmpfile under CONF.instance_path.""" + """Makes tmpfile under CONF.instances_path.""" dirpath = CONF.instances_path fd, tmp_file = tempfile.mkstemp(dir=dirpath) LOG.debug(_("Creating tmpfile %s to notify to other " diff --git a/nova/virt/libvirt/imagecache.py b/nova/virt/libvirt/imagecache.py index 634210a26..753fd966c 100644 --- a/nova/virt/libvirt/imagecache.py +++ b/nova/virt/libvirt/imagecache.py @@ -258,8 +258,8 @@ class ImageCacheManager(object): current_checksum = utils.hash_file(f) if current_checksum != stored_checksum: - LOG.error(_('%(id)s (%(base_file)s): image verification ' - 'failed'), + LOG.error(_('image %(id)s at (%(base_file)s): image ' + 'verification failed'), {'id': img_id, 'base_file': base_file}) return False @@ -268,8 +268,8 @@ class ImageCacheManager(object): return True else: - LOG.info(_('%(id)s (%(base_file)s): image verification ' - 'skipped, no hash stored'), + LOG.info(_('image %(id)s at (%(base_file)s): image ' + 'verification skipped, no hash stored'), {'id': img_id, 'base_file': base_file}) @@ -325,7 +325,7 @@ class 
ImageCacheManager(object): image_bad = False image_in_use = False - LOG.info(_('%(id)s (%(base_file)s): checking'), + LOG.info(_('image %(id)s at (%(base_file)s): checking'), {'id': img_id, 'base_file': base_file}) @@ -346,44 +346,42 @@ class ImageCacheManager(object): instances = [] if img_id in self.used_images: local, remote, instances = self.used_images[img_id] - if local > 0: - LOG.debug(_('%(id)s (%(base_file)s): ' - 'in use: on this node %(local)d local, ' - '%(remote)d on other nodes'), - {'id': img_id, - 'base_file': base_file, - 'local': local, - 'remote': remote}) + if local > 0 or remote > 0: image_in_use = True + LOG.info(_('image %(id)s at (%(base_file)s): ' + 'in use: on this node %(local)d local, ' + '%(remote)d on other nodes sharing this instance ' + 'storage'), + {'id': img_id, + 'base_file': base_file, + 'local': local, + 'remote': remote}) + self.active_base_files.append(base_file) if not base_file: - LOG.warning(_('%(id)s (%(base_file)s): warning -- an ' - 'absent base file is in use! instances: ' - '%(instance_list)s'), + LOG.warning(_('image %(id)s at (%(base_file)s): warning ' + '-- an absent base file is in use! 
' + 'instances: %(instance_list)s'), {'id': img_id, 'base_file': base_file, 'instance_list': ' '.join(instances)}) - else: - LOG.debug(_('%(id)s (%(base_file)s): in use on (%(remote)d on ' - 'other nodes)'), - {'id': img_id, - 'base_file': base_file, - 'remote': remote}) if image_bad: self.corrupt_base_files.append(base_file) if base_file: if not image_in_use: - LOG.debug(_('%(id)s (%(base_file)s): image is not in use'), + LOG.debug(_('image %(id)s at (%(base_file)s): image is not in ' + 'use'), {'id': img_id, 'base_file': base_file}) self.removable_base_files.append(base_file) else: - LOG.debug(_('%(id)s (%(base_file)s): image is in use'), + LOG.debug(_('image %(id)s at (%(base_file)s): image is in ' + 'use'), {'id': img_id, 'base_file': base_file}) if os.path.exists(base_file): diff --git a/nova/virt/storage_users.py b/nova/virt/storage_users.py new file mode 100644 index 000000000..6555609a4 --- /dev/null +++ b/nova/virt/storage_users.py @@ -0,0 +1,63 @@ +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +# Copyright 2012 Michael Still and Canonical Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ + +import json +import os +import time + +from nova.openstack.common import lockutils + + +TWENTY_FOUR_HOURS = 3600 * 24 + + +@lockutils.synchronized('storage-registry-lock', 'nova-', external=True) +def register_storage_use(storage_path, hostname): + """Identify the id of this instance storage.""" + + # NOTE(mikal): this is required to determine if the instance storage is + # shared, which is something that the image cache manager needs to + # know. I can imagine other uses as well though. + + d = {} + id_path = os.path.join(storage_path, 'compute_nodes') + if os.path.exists(id_path): + with open(id_path) as f: + d = json.loads(f.read()) + + d[hostname] = time.time() + + with open(id_path, 'w') as f: + f.write(json.dumps(d)) + + +@lockutils.synchronized('storage-registry-lock', 'nova-', external=True) +def get_storage_users(storage_path): + """Get a list of all the users of this storage path.""" + + d = {} + id_path = os.path.join(storage_path, 'compute_nodes') + if os.path.exists(id_path): + with open(id_path) as f: + d = json.loads(f.read()) + + recent_users = [] + for node in d: + if time.time() - d[node] < TWENTY_FOUR_HOURS: + recent_users.append(node) + + return recent_users -- cgit