From e4377fdb0ef2087a36d2b1fbee96543f735040de Mon Sep 17 00:00:00 2001 From: Michael Still Date: Wed, 12 Dec 2012 17:13:53 +1100 Subject: Stop nbd leaks, remove pid race. With the previous implementation, if a nbd device was found to be in use by something other than nova it was removed from the class scoped list of devices, but never re-added. This meant we "leaked" devices away over time if we were competing with other nbd users on the machine. Instead of tracking our use at all, we should rely on the presence of a user pid in /sys. Resolves bug 1088083. Change-Id: If777e270a0dda12034ea7ef1bc7fd688cadde8a9 --- nova/virt/disk/mount/nbd.py | 78 ++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 47 deletions(-) (limited to 'nova/virt') diff --git a/nova/virt/disk/mount/nbd.py b/nova/virt/disk/mount/nbd.py index 5298a3a8e..90d858a4b 100644 --- a/nova/virt/disk/mount/nbd.py +++ b/nova/virt/disk/mount/nbd.py @@ -16,6 +16,7 @@ """Support for mounting images with qemu-nbd""" import os +import random import re import time @@ -42,82 +43,66 @@ class NbdMount(api.Mount): """qemu-nbd support disk images.""" mode = 'nbd' - # NOTE(padraig): There are two issues with this nbd device handling - # 1. We assume nothing else on the system uses nbd devices - # 2. Multiple workers on a system can race against each other - # A patch has been proposed in Nov 2011, to add add a -f option to - # qemu-nbd, akin to losetup -f. One could test for this by running qemu-nbd - # with just the -f option, where it will fail if not supported, or if there - # are no free devices. Note that patch currently hardcodes 16 devices. - # We might be able to alleviate problem 2. by scanning /proc/partitions - # like the aformentioned patch does. - - _DEVICES_INITIALIZED = False - _DEVICES = None - - def __init__(self, image, mount_dir, partition=None, device=None): - super(NbdMount, self).__init__(image, mount_dir, partition=partition, - device=device) - - # NOTE(mikal): this must be done here, because we need configuration - # to have been parsed before we initialize. Note the scoping to ensure - # we're updating the class scoped variables. - if not self._DEVICES_INITIALIZED: - NbdMount._DEVICES = list(self._detect_nbd_devices()) - NbdMount._DEVICES_INITIALIZED = True + # NOTE(padraig): The remaining issue with this code is that multiple + # workers on a system can race against each other. def _detect_nbd_devices(self): """Detect nbd device files.""" return filter(NBD_DEVICE_RE.match, os.listdir('/sys/block/')) + def _find_unused(self, devices): + for device in devices: + if not os.path.exists(os.path.join('/sys/block/', device, 'pid')): + return device + LOG.warn(_('No free nbd devices')) + return None + def _allocate_nbd(self): - if not os.path.exists("/sys/block/nbd0"): + if not os.path.exists('/sys/block/nbd0'): + LOG.error(_('ndb module not loaded')) self.error = _('nbd unavailable: module not loaded') return None - while True: - if not self._DEVICES: - # really want to log this info, not raise - self.error = _('No free nbd devices') - return None - - device = self._DEVICES.pop() - if not os.path.exists('/sys/block/%s/pid' % device): - break + devices = self._detect_nbd_devices() + random.shuffle(devices) + device = self._find_unused(devices) + if not device: + # really want to log this info, not raise + self.error = _('No free nbd devices') + return None return os.path.join('/dev', device) - def _free_nbd(self, device): - # The device could already be present if unget_dev - # is called right after a nova restart - # (when destroying an LXC container for example). - device = os.path.basename(device) - if not device in self._DEVICES: - self._DEVICES.append(device) + def _read_pid_file(self, pidfile): + # This is for unit test convenience + with open(pidfile) as f: + pid = int(f.readline()) + return pid def get_dev(self): device = self._allocate_nbd() if not device: return False - LOG.debug(_("Get nbd device %(dev)s for %(imgfile)s") % + # NOTE(mikal): qemu-nbd will return an error if the device file is + # already in use. + LOG.debug(_('Get nbd device %(dev)s for %(imgfile)s'), {'dev': device, 'imgfile': self.image}) _out, err = utils.trycmd('qemu-nbd', '-c', device, self.image, run_as_root=True) if err: self.error = _('qemu-nbd error: %s') % err - self._free_nbd(device) return False # NOTE(vish): this forks into another process, so give it a chance - # to set up before continuing + # to set up before continuing + pidfile = "/sys/block/%s/pid" % os.path.basename(device) for _i in range(CONF.timeout_nbd): - if os.path.exists("/sys/block/%s/pid" % os.path.basename(device)): + if os.path.exists(pidfile): self.device = device break time.sleep(1) else: self.error = _('nbd device %s did not show up') % device - self._free_nbd(device) return False self.linked = True @@ -126,8 +111,7 @@ class NbdMount(api.Mount): def unget_dev(self): if not self.linked: return - LOG.debug(_("Release nbd device %s"), self.device) + LOG.debug(_('Release nbd device %s'), self.device) utils.execute('qemu-nbd', '-d', self.device, run_as_root=True) - self._free_nbd(self.device) self.linked = False self.device = None -- cgit