From 48439b98a1a7ac2dded34c8899918773f70667f2 Mon Sep 17 00:00:00 2001
From: Devananda van der Veen
Date: Fri, 8 Feb 2013 20:36:19 -0800
Subject: Wait for baremetal deploy inside driver.spawn

Previously, baremetal driver.spawn returned as soon as the machine power
turned on, but before the user-image was deployed to the hardware node,
and long before the node was available on the network. This meant the
nova instance was marked as ACTIVE before provisioning had actually
finished. If the deploy failed and the baremetal node was set to an
ERROR state, the nova instance could still be left as ACTIVE and the
user was never informed of the error.

This patch introduces a LoopingCall to monitor the deployment status in
the baremetal database. As the deployment is performed by
nova-baremetal-deploy-helper, the database record is updated. Once the
deployment is complete, driver.spawn() sets the baremetal node status
and the nova instance status is also set properly. If an error occurs
during the deployment, an exception is raised within driver.spawn()
allowing nova to follow the normal cleanup and notify paths.

This also allows the baremetal PXE driver to delete cached image files
when a baremetal deployment fails.

Fixes bug 1088655.

Change-Id: I4feefd462fd956c9780995ec8b05b13e78278c8b --- nova/exception.py | 4 +++ nova/tests/baremetal/test_pxe.py | 31 +++++++++++++++++++++++ nova/virt/baremetal/driver.py | 25 +++++++++---------- nova/virt/baremetal/pxe.py | 53 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 98 insertions(+), 15 deletions(-) diff --git a/nova/exception.py b/nova/exception.py index fd0122835..ada55ba32 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -335,6 +335,10 @@ class InstanceTerminationFailure(Invalid): message = _("Failed to terminate instance") + ": %(reason)s" +class InstanceDeployFailure(Invalid): + message = _("Failed to deploy instance") + ": %(reason)s" + + class ServiceUnavailable(Invalid): message = _("Service is unavailable at this time.") diff --git a/nova/tests/baremetal/test_pxe.py b/nova/tests/baremetal/test_pxe.py index a2f38c39f..d9e41bc67 100644 --- a/nova/tests/baremetal/test_pxe.py +++ b/nova/tests/baremetal/test_pxe.py @@ -32,6 +32,7 @@ from nova.tests.baremetal.db import base as bm_db_base from nova.tests.baremetal.db import utils as bm_db_utils from nova.tests.image import fake as fake_image from nova.tests import utils +from nova.virt.baremetal import baremetal_states from nova.virt.baremetal import db from nova.virt.baremetal import pxe from nova.virt.baremetal import utils as bm_utils @@ -536,3 +537,33 @@ class PXEPublicMethodsTestCase(BareMetalPXETestCase): self.driver.deactivate_bootloader( self.context, self.node, self.instance) self.mox.VerifyAll() + + def test_activate_node(self): + self._create_node() + self.instance['uuid'] = 'fake-uuid' + self.flags(pxe_deploy_timeout=1, group='baremetal') + + db.bm_node_update(self.context, 1, + {'task_state': baremetal_states.DEPLOYING, + 'instance_uuid': 'fake-uuid'}) + + # test timeout + self.assertRaises(exception.InstanceDeployFailure, + self.driver.activate_node, + self.context, self.node, self.instance) + + # test DEPLOYDONE + db.bm_node_update(self.context, 1, + {'task_state': 
baremetal_states.DEPLOYDONE}) + self.driver.activate_node(self.context, self.node, self.instance) + + # test no deploy -- state is just ACTIVE + db.bm_node_update(self.context, 1, + {'task_state': baremetal_states.ACTIVE}) + self.driver.activate_node(self.context, self.node, self.instance) + + # test node gone + db.bm_node_destroy(self.context, 1) + self.assertRaises(exception.InstanceDeployFailure, + self.driver.activate_node, + self.context, self.node, self.instance) diff --git a/nova/virt/baremetal/driver.py b/nova/virt/baremetal/driver.py index 379eaf04d..7fc03efe0 100755 --- a/nova/virt/baremetal/driver.py +++ b/nova/virt/baremetal/driver.py @@ -26,7 +26,6 @@ from oslo.config import cfg from nova.compute import power_state from nova import context as nova_context from nova import exception -from nova.openstack.common.db.sqlalchemy import session as db_session from nova.openstack.common import importutils from nova.openstack.common import log as logging from nova import paths @@ -219,6 +218,7 @@ class BareMetalDriver(driver.ComputeDriver): node = db.bm_node_set_uuid_safe(context, node_id, {'instance_uuid': instance['uuid'], 'task_state': baremetal_states.BUILDING}) + pm = get_power_manager(node=node, instance=instance) try: @@ -249,6 +249,15 @@ class BareMetalDriver(driver.ComputeDriver): ) try: self.driver.activate_bootloader(context, node, instance) + pm.activate_node() + pm.start_console() + if pm.state != baremetal_states.ACTIVE: + raise exception.NovaException(_( + "Baremetal power manager failed to start node " + "for instance %r") % instance['uuid']) + self.driver.activate_node(context, node, instance) + _update_state(context, node, instance, + baremetal_states.ACTIVE) except Exception, e: self.driver.deactivate_bootloader(context, node, instance) raise e @@ -257,20 +266,8 @@ class BareMetalDriver(driver.ComputeDriver): raise e except Exception, e: # TODO(deva): do network and volume cleanup here + _update_state(context, node, instance, 
baremetal_states.ERROR) raise e - else: - # NOTE(deva): pm.activate_node should not raise exceptions. - # We check its success in "finally" block - pm.activate_node() - pm.start_console() - finally: - if pm.state != baremetal_states.ACTIVE: - pm.state = baremetal_states.ERROR - try: - _update_state(context, node, instance, pm.state) - except db_session.DBError, e: - LOG.warning(_("Failed to update state record for " - "baremetal node %s") % instance['uuid']) def reboot(self, context, instance, network_info, reboot_type, block_device_info=None): diff --git a/nova/virt/baremetal/pxe.py b/nova/virt/baremetal/pxe.py index e6cefcca1..0abede93c 100644 --- a/nova/virt/baremetal/pxe.py +++ b/nova/virt/baremetal/pxe.py @@ -20,6 +20,7 @@ Class for PXE bare-metal nodes. """ +import datetime import os from oslo.config import cfg @@ -29,6 +30,9 @@ from nova import exception from nova.openstack.common.db.sqlalchemy import session as db_session from nova.openstack.common import fileutils from nova.openstack.common import log as logging +from nova.openstack.common import timeutils +from nova import utils +from nova.virt.baremetal import baremetal_states from nova.virt.baremetal import base from nova.virt.baremetal import db from nova.virt.baremetal import utils as bm_utils @@ -47,6 +51,9 @@ pxe_opts = [ cfg.StrOpt('pxe_config_template', default='$pybasedir/nova/virt/baremetal/pxe_config.template', help='Template file for PXE configuration'), + cfg.IntOpt('pxe_deploy_timeout', + help='Timeout for PXE deployments. 
Default: 0 (unlimited)', + default=0), ] LOG = logging.getLogger(__name__) @@ -431,7 +438,51 @@ class PXE(base.NodeDriver): os.path.join(CONF.baremetal.tftp_root, instance['uuid'])) def activate_node(self, context, node, instance): - pass + """Wait for PXE deployment to complete.""" + + locals = {'error': '', 'started': False} + + def _wait_for_deploy(): + """Called at an interval until the deployment completes.""" + try: + row = db.bm_node_get(context, node['id']) + if instance['uuid'] != row.get('instance_uuid'): + locals['error'] = _("Node associated with another instance" + " while waiting for deploy of %s") + raise utils.LoopingCallDone() + + status = row.get('task_state') + if (status == baremetal_states.DEPLOYING + and locals['started'] == False): + LOG.info(_("PXE deploy started for instance %s") + % instance['uuid']) + locals['started'] = True + elif status in (baremetal_states.DEPLOYDONE, + baremetal_states.ACTIVE): + LOG.info(_("PXE deploy completed for instance %s") + % instance['uuid']) + raise utils.LoopingCallDone() + elif status == baremetal_states.DEPLOYFAIL: + locals['error'] = _("PXE deploy failed for instance %s") + except exception.InstanceNotFound: + locals['error'] = _("Baremetal node deleted while waiting " + "for deployment of instance %s") + + if (CONF.baremetal.pxe_deploy_timeout and + timeutils.utcnow() > expiration): + locals['error'] = _("Timeout reached while waiting for " + "PXE deploy of instance %s") + if locals['error']: + raise utils.LoopingCallDone() + + expiration = timeutils.utcnow() + datetime.timedelta( + seconds=CONF.baremetal.pxe_deploy_timeout) + timer = utils.FixedIntervalLoopingCall(_wait_for_deploy) + timer.start(interval=1).wait() + + if locals['error']: + raise exception.InstanceDeployFailure( + locals['error'] % instance['uuid']) def deactivate_node(self, context, node, instance): pass -- cgit