Rework virt.xenapi's concurrency model.

There were many places where we were inadvertently blocking the reactor thread. The rework puts all calls to XenAPI on background threads, so that they won't block the reactor thread. Long-lived operations (VM start, reboot, etc.) are invoked asynchronously at the XenAPI level (Async.VM.start, etc.). These return a XenAPI task. We relinquish the background thread at this point, so as not to hold threads in the pool for too long, and use reactor.callLater to poll the task. This combination of techniques means that we don't block the reactor thread at all, and at the same time we don't hold lots of threads waiting for long-running operations. There is a FIXME in here: get_info does not conform to these new rules. Changes are required in compute.service before we can make get_info non-blocking.
author: Ewan Mellor <ewan.mellor@citrix.com> 2010-08-15 22:48:54 +0100
committer: Ewan Mellor <ewan.mellor@citrix.com> 2010-08-15 22:48:54 +0100
commit: fb6bf337bc2fe702307842b57e33b9f5f9011147 (patch)
tree: bcd9e97fd3493e56aabfb4e2c13eccada803366e
parent: 2bbb2b86272c89b35a1042ab2866bbe4863bc3e3 (diff)
1 files changed, 147 insertions, 31 deletions
diff --git a/nova/virt/xenapi.py b/nova/virt/xenapi.py
index 9fe15644f..6b41061c1 100644
--- a/nova/virt/xenapi.py
+++ b/nova/virt/xenapi.py
@@ -16,15 +16,33 @@
 
 """
 A connection to XenServer or Xen Cloud Platform.
+
+The concurrency model for this class is as follows:
+
+All XenAPI calls are on a thread (using t.i.t.deferToThread, or the decorator
+deferredToThread).  They are remote calls, and so may hang for the usual
+reasons.  They should not be allowed to block the reactor thread.
+
+All long-running XenAPI calls (VM.start, VM.reboot, etc) are called async
+(using Async.VM.start etc).  These return a task, which can then be
+polled for completion.  Polling is handled using reactor.callLater.
+
+This combination of techniques means that we don't block the reactor thread at
+all, and at the same time we don't hold lots of threads waiting for
+long-running operations.
+
+FIXME: get_info currently doesn't conform to these rules, and will block the
+reactor thread if the VM.get_by_name_label or VM.get_record calls block.
 """
 
 import logging
 import xmlrpclib
 
 from twisted.internet import defer
+from twisted.internet import reactor
 from twisted.internet import task
+from twisted.internet.threads import deferToThread
 
-from nova import exception
 from nova import flags
 from nova import process
 from nova.auth.manager import AuthManager
@@ -43,6 +61,9 @@ flags.DEFINE_string('xenapi_connection_username',
 flags.DEFINE_string('xenapi_connection_password',
                     None,
                     'Password for connection to XenServer/Xen Cloud Platform.  Used only if connection_type=xenapi.')
+flags.DEFINE_float('xenapi_task_poll_interval',
+                   0.5,
+                   'The interval used for polling of remote tasks (Async.VM.start, etc).  Used only if connection_type=xenapi.')
 
 
 def get_connection(_):
@@ -61,6 +82,12 @@ def get_connection(_):
     return XenAPIConnection(url, username, password)
 
 
+def deferredToThread(f):
+    def g(*args, **kwargs):
+        return deferToThread(f, *args, **kwargs)
+    return g
+
+
 class XenAPIConnection(object):
 
     def __init__(self, url, user, pw):
@@ -72,9 +99,8 @@ class XenAPIConnection(object):
                   for vm in self._conn.xenapi.VM.get_all()]
 
     @defer.inlineCallbacks
-    @exception.wrap_exception
     def spawn(self, instance):
-        vm = yield self.lookup(instance.name)
+        vm = yield self._lookup(instance.name)
         if vm is not None:
             raise Exception('Attempted to create non-unique name %s' %
                             instance.name)
@@ -93,22 +119,28 @@ class XenAPIConnection(object):
 
         user = AuthManager().get_user(instance.datamodel['user_id'])
         project = AuthManager().get_project(instance.datamodel['project_id'])
-        vdi_uuid = yield self.fetch_image(
+        vdi_uuid = yield self._fetch_image(
             instance.datamodel['image_id'], user, project, True)
-        kernel = yield self.fetch_image(
+        kernel = yield self._fetch_image(
             instance.datamodel['kernel_id'], user, project, False)
-        ramdisk = yield self.fetch_image(
+        ramdisk = yield self._fetch_image(
             instance.datamodel['ramdisk_id'], user, project, False)
-        vdi_ref = yield self._conn.xenapi.VDI.get_by_uuid(vdi_uuid)
+        vdi_ref = yield self._call_xenapi('VDI.get_by_uuid', vdi_uuid)
 
-        vm_ref = yield self.create_vm(instance, kernel, ramdisk)
-        yield self.create_vbd(vm_ref, vdi_ref, 0, True)
+        vm_ref = yield self._create_vm(instance, kernel, ramdisk)
+        yield self._create_vbd(vm_ref, vdi_ref, 0, True)
         if network_ref:
             yield self._create_vif(vm_ref, network_ref, mac_address)
-        yield self._conn.xenapi.VM.start(vm_ref, False, False)
+        logging.debug('Starting VM %s...', vm_ref)
+        yield self._call_xenapi('VM.start', vm_ref, False, False)
+        logging.info('Spawning VM %s created %s.', instance.name, vm_ref)
 
 
-    def create_vm(self, instance, kernel, ramdisk):
+    @defer.inlineCallbacks
+    def _create_vm(self, instance, kernel, ramdisk):
+        """Create a VM record.  Returns a Deferred that gives the new
+        VM reference."""
+        
         mem = str(long(instance.datamodel['memory_kb']) * 1024)
         vcpus = str(instance.datamodel['vcpus'])
         rec = {
@@ -141,12 +173,16 @@ class XenAPIConnection(object):
             'other_config': {},
             }
         logging.debug('Created VM %s...', instance.name)
-        vm_ref = self._conn.xenapi.VM.create(rec)
+        vm_ref = yield self._call_xenapi('VM.create', rec)
         logging.debug('Created VM %s as %s.', instance.name, vm_ref)
-        return vm_ref
+        defer.returnValue(vm_ref)
 
 
-    def create_vbd(self, vm_ref, vdi_ref, userdevice, bootable):
+    @defer.inlineCallbacks
+    def _create_vbd(self, vm_ref, vdi_ref, userdevice, bootable):
+        """Create a VBD record.  Returns a Deferred that gives the new
+        VBD reference."""
+        
         vbd_rec = {}
         vbd_rec['VM'] = vm_ref
         vbd_rec['VDI'] = vdi_ref
@@ -161,13 +197,17 @@ class XenAPIConnection(object):
         vbd_rec['qos_algorithm_params'] = {}
         vbd_rec['qos_supported_algorithms'] = []
         logging.debug('Creating VBD for VM %s, VDI %s ... ', vm_ref, vdi_ref)
-        vbd_ref = self._conn.xenapi.VBD.create(vbd_rec)
+        vbd_ref = yield self._call_xenapi('VBD.create', vbd_rec)
         logging.debug('Created VBD %s for VM %s, VDI %s.', vbd_ref, vm_ref,
                       vdi_ref)
-        return vbd_ref
+        defer.returnValue(vbd_ref)
 
 
+    @defer.inlineCallbacks
     def _create_vif(self, vm_ref, network_ref, mac_address):
+        """Create a VIF record.  Returns a Deferred that gives the new
+        VIF reference."""
+        
         vif_rec = {}
         vif_rec['device'] = '0'
         vif_rec['network']= network_ref
@@ -179,27 +219,31 @@ class XenAPIConnection(object):
         vif_rec['qos_algorithm_params'] = {}
         logging.debug('Creating VIF for VM %s, network %s ... ', vm_ref,
                       network_ref)
-        vif_ref = self._conn.xenapi.VIF.create(vif_rec)
+        vif_ref = yield self._call_xenapi('VIF.create', vif_rec)
         logging.debug('Created VIF %s for VM %s, network %s.', vif_ref,
                       vm_ref, network_ref)
-        return vif_ref
+        defer.returnValue(vif_ref)
 
 
+    @defer.inlineCallbacks
     def _find_network_with_bridge(self, bridge):
         expr = 'field "bridge" = "%s"' % bridge
-        networks = self._conn.xenapi.network.get_all_records_where(expr)
+        networks = yield self._call_xenapi('network.get_all_records_where',
+                                           expr)
         if len(networks) == 1:
-            return networks.keys()[0]
+            defer.returnValue(networks.keys()[0])
         elif len(networks) > 1:
             raise Exception('Found non-unique network for bridge %s' % bridge)
         else:
             raise Exception('Found no network for bridge %s' % bridge)
 
 
-    def fetch_image(self, image, user, project, use_sr):
+    @defer.inlineCallbacks
+    def _fetch_image(self, image, user, project, use_sr):
         """use_sr: True to put the image as a VDI in an SR, False to place
         it on dom0's filesystem.  The former is for VM disks, the latter for
-        its kernel and ramdisk (if external kernels are being used)."""
+        its kernel and ramdisk (if external kernels are being used).
+        Returns a Deferred that gives the new VDI UUID."""
 
         url = images.image_url(image)
         access = AuthManager().get_access_key(user, project)
@@ -211,23 +255,31 @@ class XenAPIConnection(object):
         args['password'] = user.secret
         if use_sr:
             args['add_partition'] = 'true'
-        return self._call_plugin('objectstore', fn, args)
+        task = yield self._async_call_plugin('objectstore', fn, args)
+        uuid = yield self._wait_for_task(task)
+        defer.returnValue(uuid)
 
 
+    @defer.inlineCallbacks
     def reboot(self, instance):
-        vm = self.lookup(instance.name)
+        vm = yield self._lookup(instance.name)
         if vm is None:
             raise Exception('instance not present %s' % instance.name)
-        yield self._conn.xenapi.VM.clean_reboot(vm)
+        task = yield self._call_xenapi('Async.VM.clean_reboot', vm)
+        yield self._wait_for_task(task)
+
 
+    @defer.inlineCallbacks
     def destroy(self, instance):
-        vm = self.lookup(instance.name)
+        vm = yield self._lookup(instance.name)
         if vm is None:
             raise Exception('instance not present %s' % instance.name)
-        yield self._conn.xenapi.VM.destroy(vm)
+        task = yield self._call_xenapi('Async.VM.destroy', vm)
+        yield self._wait_for_task(task)
+
 
     def get_info(self, instance_id):
-        vm = self.lookup(instance_id)
+        vm = self._lookup_blocking(instance_id)
         if vm is None:
             raise Exception('instance not present %s' % instance_id)
         rec = self._conn.xenapi.VM.get_record(vm)
@@ -237,7 +289,13 @@ class XenAPIConnection(object):
                 'num_cpu': rec['VCPUs_max'],
                 'cpu_time': 0}
 
-    def lookup(self, i):
+
+    @deferredToThread
+    def _lookup(self, i):
+        return self._lookup_blocking(i)
+
+
+    def _lookup_blocking(self, i):
         vms = self._conn.xenapi.VM.get_by_name_label(i)
         n = len(vms) 
         if n == 0:
@@ -248,9 +306,55 @@ class XenAPIConnection(object):
             return vms[0]
 
 
-    def _call_plugin(self, plugin, fn, args):
+    def _wait_for_task(self, task):
+        """Return a Deferred that will give the result of the given task.
+        The task is polled until it completes."""
+        d = defer.Deferred()
+        reactor.callLater(0, self._poll_task, task, d)
+        return d
+
+
+    @deferredToThread
+    def _poll_task(self, task, deferred):
+        """Poll the given XenAPI task, and fire the given Deferred if we
+        get a result."""
+        try:
+            #logging.debug('Polling task %s...', task)
+            status = self._conn.xenapi.task.get_status(task)
+            if status == 'pending':
+                reactor.callLater(FLAGS.xenapi_task_poll_interval,
+                                  self._poll_task, task, deferred)
+            elif status == 'success':
+                result = self._conn.xenapi.task.get_result(task)
+                logging.info('Task %s status: success.  %s', task, result)
+                deferred.callback(_parse_xmlrpc_value(result))
+            else:
+                error_info = self._conn.xenapi.task.get_error_info(task)
+                logging.warn('Task %s status: %s.  %s', task, status,
+                             error_info)
+                deferred.errback(XenAPI.Failure(error_info))
+            #logging.debug('Polling task %s done.', task)
+        except Exception, exn:
+            logging.warn(exn)
+            deferred.errback(exn)
+
+
+    @deferredToThread
+    def _call_xenapi(self, method, *args):
+        """Call the specified XenAPI method on a background thread.  Returns
+        a Deferred for the result."""
+        f = self._conn.xenapi
+        for m in method.split('.'):
+            f = f.__getattr__(m)
+        return f(*args)
+
+
+    @deferredToThread
+    def _async_call_plugin(self, plugin, fn, args):
+        """Call Async.host.call_plugin on a background thread.  Returns a
+        Deferred with the task reference."""
         return _unwrap_plugin_exceptions(
-            self._conn.xenapi.host.call_plugin,
+            self._conn.xenapi.Async.host.call_plugin,
             self._get_xenapi_host(), plugin, fn, args)
 
 
@@ -286,3 +390,15 @@ def _unwrap_plugin_exceptions(func, *args, **kwargs):
     except xmlrpclib.ProtocolError, exn:
         logging.debug("Got exception: %s", exn)
         raise
+
+
+def _parse_xmlrpc_value(val):
+    """Parse the given value as if it were an XML-RPC value.  This is
+    sometimes used as the format for the task.result field."""
+    if not val:
+        return val
+    x = xmlrpclib.loads(
+        '<?xml version="1.0"?><methodResponse><params><param>' +
+        val +
+        '</param></params></methodResponse>')
+    return x[0][0]
author	Ewan Mellor <ewan.mellor@citrix.com>	2010-08-15 22:48:54 +0100
committer	Ewan Mellor <ewan.mellor@citrix.com>	2010-08-15 22:48:54 +0100
commit	fb6bf337bc2fe702307842b57e33b9f5f9011147 (patch)
tree	bcd9e97fd3493e56aabfb4e2c13eccada803366e
parent	2bbb2b86272c89b35a1042ab2866bbe4863bc3e3 (diff)