From 5e6c69bc7a7e5ddaa1bf0fa83f64da116343dba8 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Tue, 29 Mar 2011 16:35:39 -0700 Subject: Narrowly focused bugfix - don't lose libvirt instances on host reboot or if they crash --- nova/compute/manager.py | 14 ++++++----- nova/virt/libvirt_conn.py | 59 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 55 insertions(+), 18 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index e0a5e2b3f..72f04ecb1 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1088,12 +1088,14 @@ class ComputeManager(manager.SchedulerDependentManager): db_instance['id'], vm_state) - if vm_state == power_state.SHUTOFF: - # TODO(soren): This is what the compute manager does when you - # terminate an instance. At some point I figure we'll have a - # "terminated" state and some sort of cleanup job that runs - # occasionally, cleaning them out. - self.db.instance_destroy(context, db_instance['id']) + # NOTE(justinsb): We no longer auto-remove SHUTOFF instances + # It's quite hard to get them back when we do. + #if vm_state == power_state.SHUTOFF: + # # TODO(soren): This is what the compute manager does when you + # # terminate an instance. At some point I figure we'll have a + # # "terminated" state and some sort of cleanup job that runs + # # occasionally, cleaning them out. + # self.db.instance_destroy(context, db_instance['id']) # Are there VMs not in the DB? for vm_not_found_in_db in vms_not_found_in_db: diff --git a/nova/virt/libvirt_conn.py b/nova/virt/libvirt_conn.py index c144e827e..533ff9394 100644 --- a/nova/virt/libvirt_conn.py +++ b/nova/virt/libvirt_conn.py @@ -116,6 +116,8 @@ flags.DEFINE_integer('live_migration_bandwidth', 0, 'Define live migration behavior') flags.DEFINE_string('qemu_img', 'qemu-img', 'binary to use for qemu-img commands') +flags.DEFINE_bool('start_guests_on_host_boot', False, + 'Whether to restart guests when the host reboots') def get_connection(read_only): @@ -230,12 +232,14 @@ class LibvirtConnection(driver.ComputeDriver): {'name': instance['name'], 'state': state}) db.instance_set_state(ctxt, instance['id'], state) - if state == power_state.SHUTOFF: - # TODO(soren): This is what the compute manager does when you - # terminate # an instance. At some point I figure we'll have a - # "terminated" state and some sort of cleanup job that runs - # occasionally, cleaning them out. - db.instance_destroy(ctxt, instance['id']) + # NOTE(justinsb): We no longer delete these instances, + # the user may want to power them back on + #if state == power_state.SHUTOFF: + # # TODO(soren): This is what the compute manager does when you + # # terminate # an instance. At some point I figure we'll have a + # # "terminated" state and some sort of cleanup job that runs + # # occasionally, cleaning them out. + # db.instance_destroy(ctxt, instance['id']) if state != power_state.RUNNING: continue @@ -474,7 +478,7 @@ class LibvirtConnection(driver.ComputeDriver): xml = self.to_xml(instance) self.firewall_driver.setup_basic_filtering(instance) self.firewall_driver.prepare_instance_filter(instance) - self._conn.createXML(xml, 0) + self._create_new_domain(xml) self.firewall_driver.apply_instance_filter(instance) timer = utils.LoopingCall(f=None) @@ -522,7 +526,7 @@ class LibvirtConnection(driver.ComputeDriver): 'kernel_id': FLAGS.rescue_kernel_id, 'ramdisk_id': FLAGS.rescue_ramdisk_id} self._create_image(instance, xml, '.rescue', rescue_images) - self._conn.createXML(xml, 0) + self._create_new_domain(xml) timer = utils.LoopingCall(f=None) @@ -565,10 +569,15 @@ class LibvirtConnection(driver.ComputeDriver): self.firewall_driver.setup_basic_filtering(instance, network_info) self.firewall_driver.prepare_instance_filter(instance, network_info) self._create_image(instance, xml, network_info) - self._conn.createXML(xml, 0) + domain = self._create_new_domain(xml) LOG.debug(_("instance %s: is running"), instance['name']) self.firewall_driver.apply_instance_filter(instance) + if FLAGS.start_guests_on_host_boot: + LOG.debug(_("instance %s: setting autostart ON") % + instance['name']) + domain.setAutostart(1) + timer = utils.LoopingCall(f=None) def _wait_for_boot(): @@ -964,11 +973,19 @@ class LibvirtConnection(driver.ComputeDriver): return xml def get_info(self, instance_name): + # NOTE(justinsb): When libvirt isn't running / can't connect, we get: + # libvir: Remote error : unable to connect to + # '/var/run/libvirt/libvirt-sock', libvirtd may need to be started: + # No such file or directory try: virt_dom = self._conn.lookupByName(instance_name) - except: - raise exception.NotFound(_("Instance %s not found") - % instance_name) + except libvirt.libvirtError as e: + if e.get_error_code() == libvirt.VIR_ERR_UNKNOWN_HOST: + raise exception.NotFound(_("Instance %s not found") + % instance_name) + LOG.warning(_("Error from libvirt during lookup: %s") % e) + raise + (state, max_mem, mem, num_cpu, cpu_time) = virt_dom.info() return {'state': state, 'max_mem': max_mem, @@ -976,6 +993,24 @@ class LibvirtConnection(driver.ComputeDriver): 'num_cpu': num_cpu, 'cpu_time': cpu_time} + def _create_new_domain(self, xml, persistent=True, launch_flags=0): + # NOTE(justinsb): libvirt has two types of domain: + # * a transient domain disappears when the guest is shutdown + # or the host is rebooted. + # * a permanent domain is not automatically deleted + # NOTE(justinsb): Even for ephemeral instances, transient seems risky + + if persistent: + # To create a persistent domain, first define it, then launch it. + domain = self._conn.defineXML(xml) + + domain.createWithFlags(launch_flags) + else: + # createXML call creates a transient domain + domain = self._conn.createXML(xml, launch_flags) + + return domain + def get_diagnostics(self, instance_name): raise exception.ApiError(_("diagnostics are not supported " "for libvirt")) -- cgit From 399056300a0be228ec4c56587ec0d9c0d09d927c Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Tue, 29 Mar 2011 16:54:37 -0700 Subject: Fix unit test to reflect fact that instance is no longer deleted, just marked SHUTOFF --- nova/tests/test_compute.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nova/tests/test_compute.py b/nova/tests/test_compute.py index d1ef68de4..b7f08dfbe 100644 --- a/nova/tests/test_compute.py +++ b/nova/tests/test_compute.py @@ -654,4 +654,5 @@ class ComputeTestCase(test.TestCase): instances = db.instance_get_all(context.get_admin_context()) LOG.info(_("After force-killing instances: %s"), instances) - self.assertEqual(len(instances), 0) + self.assertEqual(len(instances), 1) + self.assertEqual(power_state.SHUTOFF, instances[0]['state']) -- cgit From b54be0e29cdcd91e3d106fb587b89c39ca3a0bff Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Thu, 7 Apr 2011 15:52:27 -0700 Subject: Removed commented-out old 'delete instance on SHUTOFF' code --- nova/compute/manager.py | 6 ------ nova/virt/libvirt_conn.py | 8 +------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 72f04ecb1..d3cd2e51a 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1090,12 +1090,6 @@ class ComputeManager(manager.SchedulerDependentManager): # NOTE(justinsb): We no longer auto-remove SHUTOFF instances # It's quite hard to get them back when we do. - #if vm_state == power_state.SHUTOFF: - # # TODO(soren): This is what the compute manager does when you - # # terminate an instance. At some point I figure we'll have a - # # "terminated" state and some sort of cleanup job that runs - # # occasionally, cleaning them out. - # self.db.instance_destroy(context, db_instance['id']) # Are there VMs not in the DB? for vm_not_found_in_db in vms_not_found_in_db: diff --git a/nova/virt/libvirt_conn.py b/nova/virt/libvirt_conn.py index 533ff9394..4523cdd2f 100644 --- a/nova/virt/libvirt_conn.py +++ b/nova/virt/libvirt_conn.py @@ -232,14 +232,8 @@ class LibvirtConnection(driver.ComputeDriver): {'name': instance['name'], 'state': state}) db.instance_set_state(ctxt, instance['id'], state) - # NOTE(justinsb): We no longer delete these instances, + # NOTE(justinsb): We no longer delete SHUTOFF instances, # the user may want to power them back on - #if state == power_state.SHUTOFF: - # # TODO(soren): This is what the compute manager does when you - # # terminate # an instance. At some point I figure we'll have a - # # "terminated" state and some sort of cleanup job that runs - # # occasionally, cleaning them out. - # db.instance_destroy(ctxt, instance['id']) if state != power_state.RUNNING: continue -- cgit From 52478e039b094861e7d783b7995b9cafa68e32b9 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Thu, 7 Apr 2011 15:56:16 -0700 Subject: Fix to correct libvirt error code when the domain is not found --- nova/virt/libvirt_conn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nova/virt/libvirt_conn.py b/nova/virt/libvirt_conn.py index 4523cdd2f..a7a8a14b1 100644 --- a/nova/virt/libvirt_conn.py +++ b/nova/virt/libvirt_conn.py @@ -974,7 +974,7 @@ class LibvirtConnection(driver.ComputeDriver): try: virt_dom = self._conn.lookupByName(instance_name) except libvirt.libvirtError as e: - if e.get_error_code() == libvirt.VIR_ERR_UNKNOWN_HOST: + if e.get_error_code() == libvirt.VIR_ERR_NO_DOMAIN: raise exception.NotFound(_("Instance %s not found") % instance_name) LOG.warning(_("Error from libvirt during lookup: %s") % e) -- cgit From 0c5f70c0bcf9395fb25a231057d997b075d04fda Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Thu, 7 Apr 2011 16:00:55 -0700 Subject: Log libvirt errcode on exception --- nova/virt/libvirt_conn.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nova/virt/libvirt_conn.py b/nova/virt/libvirt_conn.py index a7a8a14b1..5f1c12ab3 100644 --- a/nova/virt/libvirt_conn.py +++ b/nova/virt/libvirt_conn.py @@ -974,10 +974,13 @@ class LibvirtConnection(driver.ComputeDriver): try: virt_dom = self._conn.lookupByName(instance_name) except libvirt.libvirtError as e: - if e.get_error_code() == libvirt.VIR_ERR_NO_DOMAIN: + errcode = e.get_error_code() + if errcode == libvirt.VIR_ERR_NO_DOMAIN: raise exception.NotFound(_("Instance %s not found") % instance_name) - LOG.warning(_("Error from libvirt during lookup: %s") % e) + LOG.warning(_("Error from libvirt during lookup. " + "Code=%(errcode)s Error=%(e)s") % + locals()) raise (state, max_mem, mem, num_cpu, cpu_time) = virt_dom.info() -- cgit