From a4cb456631f0e70685fbc361eced9baa477a8c56 Mon Sep 17 00:00:00 2001 From: Michael Kerrin Date: Tue, 16 Apr 2013 09:43:28 +0000 Subject: Volume IO usage gets reset to 0 after a reboot / crash Check for reset of block device stats before updating volume usage cache. If an instance is reboot or crashes, the results of block device stats for that domain get reset. Add check that if block device stats are lower than the current stats in the database, then add current values to totals and update. Change-Id: I3011872f8d80a1519771f8423001650dcbcc7f5b Fixes bug: 1169097 --- nova/db/sqlalchemy/api.py | 47 ++++++++++--- nova/tests/test_db_api.py | 156 ++++++++++++++++++++++++++++++++++++++------ nova/virt/libvirt/driver.py | 29 +++++--- 3 files changed, 195 insertions(+), 37 deletions(-) diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py index 2b278119a..ceb2741e7 100644 --- a/nova/db/sqlalchemy/api.py +++ b/nova/db/sqlalchemy/api.py @@ -4339,12 +4339,45 @@ def vol_usage_update(context, id, rd_req, rd_bytes, wr_req, wr_bytes, 'user_id': user_id, 'availability_zone': availability_zone} - rows = model_query(context, models.VolumeUsage, - session=session, read_deleted="yes").\ - filter_by(volume_id=id).\ - update(values, synchronize_session=False) - - if rows: + current_usage = model_query(context, models.VolumeUsage, + session=session, read_deleted="yes").\ + filter_by(volume_id=id).\ + first() + if current_usage: + if (rd_req < current_usage['curr_reads'] or + rd_bytes < current_usage['curr_read_bytes'] or + wr_req < current_usage['curr_writes'] or + wr_bytes < current_usage['curr_write_bytes']): + LOG.info(_("Volume(%s) has lower stats then what is in " + "the database. Instance must have been rebooted " + "or crashed. Updating totals.") % id) + if not update_totals: + values['tot_last_refreshed'] = last_refreshed + values['tot_reads'] = (models.VolumeUsage.tot_reads + + current_usage['curr_reads']) + values['tot_read_bytes'] = ( + models.VolumeUsage.tot_read_bytes + + current_usage['curr_read_bytes']) + values['tot_writes'] = (models.VolumeUsage.tot_writes + + current_usage['curr_writes']) + values['tot_write_bytes'] = ( + models.VolumeUsage.tot_write_bytes + + current_usage['curr_write_bytes']) + else: + values['tot_reads'] = (models.VolumeUsage.tot_reads + + current_usage['curr_reads'] + + rd_req) + values['tot_read_bytes'] = ( + models.VolumeUsage.tot_read_bytes + + current_usage['curr_read_bytes'] + rd_bytes) + values['tot_writes'] = (models.VolumeUsage.tot_writes + + current_usage['curr_writes'] + + wr_req) + values['tot_write_bytes'] = ( + models.VolumeUsage.tot_write_bytes + + current_usage['curr_write_bytes'] + wr_bytes) + + current_usage.update(values) return vol_usage = models.VolumeUsage() @@ -4369,8 +4402,6 @@ def vol_usage_update(context, id, rd_req, rd_bytes, wr_req, wr_bytes, vol_usage.save(session=session) - return - #################### diff --git a/nova/tests/test_db_api.py b/nova/tests/test_db_api.py index 6dc3287bf..3c817a6a0 100644 --- a/nova/tests/test_db_api.py +++ b/nova/tests/test_db_api.py @@ -2664,16 +2664,18 @@ class InstanceDestroyConstraints(test.TestCase): class VolumeUsageDBApiTestCase(test.TestCase): + def setUp(self): super(VolumeUsageDBApiTestCase, self).setUp() self.user_id = 'fake' self.project_id = 'fake' self.context = context.RequestContext(self.user_id, self.project_id) + self.useFixture(test.TimeOverride()) + def test_vol_usage_update_no_totals_update(self): ctxt = context.get_admin_context() now = timeutils.utcnow() - timeutils.set_time_override(now) start_time = now - datetime.timedelta(seconds=10) refreshed_time = now - datetime.timedelta(seconds=5) @@ -2684,7 +2686,11 @@ class VolumeUsageDBApiTestCase(test.TestCase): 'curr_reads': 1000, 'curr_read_bytes': 2000, 'curr_writes': 3000, - 'curr_write_bytes': 4000}, + 'curr_write_bytes': 4000, + 'tot_reads': 0, + 'tot_read_bytes': 0, + 'tot_writes': 0, + 'tot_write_bytes': 0}, {'volume_id': u'2', 'instance_uuid': 'fake-instance-uuid2', 'project_id': 'fake-project-uuid2', @@ -2692,7 +2698,11 @@ class VolumeUsageDBApiTestCase(test.TestCase): 'curr_reads': 100, 'curr_read_bytes': 200, 'curr_writes': 300, - 'curr_write_bytes': 400}] + 'curr_write_bytes': 400, + 'tot_reads': 0, + 'tot_read_bytes': 0, + 'tot_writes': 0, + 'tot_write_bytes': 0}] def _compare(vol_usage, expected): for key, value in expected.items(): @@ -2725,26 +2735,11 @@ class VolumeUsageDBApiTestCase(test.TestCase): self.assertEqual(len(vol_usages), 2) _compare(vol_usages[0], expected_vol_usages[0]) _compare(vol_usages[1], expected_vol_usages[1]) - timeutils.clear_time_override() def test_vol_usage_update_totals_update(self): ctxt = context.get_admin_context() now = timeutils.utcnow() - timeutils.set_time_override(now) start_time = now - datetime.timedelta(seconds=10) - expected_vol_usages = {'volume_id': u'1', - 'project_id': 'fake-project-uuid', - 'user_id': 'fake-user-uuid', - 'instance_uuid': 'fake-instance-uuid', - 'availability_zone': 'fake-az', - 'tot_reads': 600, - 'tot_read_bytes': 800, - 'tot_writes': 1000, - 'tot_write_bytes': 1200, - 'curr_reads': 0, - 'curr_read_bytes': 0, - 'curr_writes': 0, - 'curr_write_bytes': 0} vol_usage = db.vol_usage_update(ctxt, 1, rd_req=100, rd_bytes=200, wr_req=300, wr_bytes=400, @@ -2752,6 +2747,10 @@ class VolumeUsageDBApiTestCase(test.TestCase): project_id='fake-project-uuid', user_id='fake-user-uuid', availability_zone='fake-az') + current_usage = db.vol_get_usage_by_time(ctxt, start_time)[0] + self.assertEqual(current_usage['tot_reads'], 0) + self.assertEqual(current_usage['curr_reads'], 100) + vol_usage = db.vol_usage_update(ctxt, 1, rd_req=200, rd_bytes=300, wr_req=400, wr_bytes=500, instance_id='fake-instance-uuid', @@ -2759,12 +2758,20 @@ class VolumeUsageDBApiTestCase(test.TestCase): user_id='fake-user-uuid', availability_zone='fake-az', update_totals=True) + current_usage = db.vol_get_usage_by_time(ctxt, start_time)[0] + self.assertEqual(current_usage['tot_reads'], 200) + self.assertEqual(current_usage['curr_reads'], 0) + vol_usage = db.vol_usage_update(ctxt, 1, rd_req=300, rd_bytes=400, wr_req=500, wr_bytes=600, instance_id='fake-instance-uuid', project_id='fake-project-uuid', availability_zone='fake-az', user_id='fake-user-uuid') + current_usage = db.vol_get_usage_by_time(ctxt, start_time)[0] + self.assertEqual(current_usage['tot_reads'], 200) + self.assertEqual(current_usage['curr_reads'], 300) + vol_usage = db.vol_usage_update(ctxt, 1, rd_req=400, rd_bytes=500, wr_req=600, wr_bytes=700, instance_id='fake-instance-uuid', @@ -2775,10 +2782,121 @@ class VolumeUsageDBApiTestCase(test.TestCase): vol_usages = db.vol_get_usage_by_time(ctxt, start_time) + expected_vol_usages = {'volume_id': u'1', + 'project_id': 'fake-project-uuid', + 'user_id': 'fake-user-uuid', + 'instance_uuid': 'fake-instance-uuid', + 'availability_zone': 'fake-az', + 'tot_reads': 600, + 'tot_read_bytes': 800, + 'tot_writes': 1000, + 'tot_write_bytes': 1200, + 'curr_reads': 0, + 'curr_read_bytes': 0, + 'curr_writes': 0, + 'curr_write_bytes': 0} + self.assertEquals(1, len(vol_usages)) for key, value in expected_vol_usages.items(): self.assertEqual(vol_usages[0][key], value) - timeutils.clear_time_override() + + def test_vol_usage_update_when_blockdevicestats_reset(self): + ctxt = context.get_admin_context() + now = timeutils.utcnow() + start_time = now - datetime.timedelta(seconds=10) + + vol_usages = db.vol_get_usage_by_time(ctxt, start_time) + self.assertEqual(len(vol_usages), 0) + + db.vol_usage_update(ctxt, 1, + rd_req=10000, rd_bytes=20000, + wr_req=30000, wr_bytes=40000, + instance_id='fake-instance-uuid1', + project_id='fake-project-uuid1', + availability_zone='fake-az', + user_id='fake-user-uuid1') + + # Instance rebooted or crashed. block device stats were reset and are + # less then the previous values + db.vol_usage_update(ctxt, 1, + rd_req=100, rd_bytes=200, + wr_req=300, wr_bytes=400, + instance_id='fake-instance-uuid1', + project_id='fake-project-uuid1', + availability_zone='fake-az', + user_id='fake-user-uuid1') + + db.vol_usage_update(ctxt, 1, + rd_req=200, rd_bytes=300, + wr_req=400, wr_bytes=500, + instance_id='fake-instance-uuid1', + project_id='fake-project-uuid1', + availability_zone='fake-az', + user_id='fake-user-uuid1') + + vol_usage = db.vol_get_usage_by_time(ctxt, start_time)[0] + expected_vol_usage = {'volume_id': u'1', + 'instance_uuid': 'fake-instance-uuid1', + 'project_id': 'fake-project-uuid1', + 'availability_zone': 'fake-az', + 'user_id': 'fake-user-uuid1', + 'curr_reads': 200, + 'curr_read_bytes': 300, + 'curr_writes': 400, + 'curr_write_bytes': 500, + 'tot_reads': 10000, + 'tot_read_bytes': 20000, + 'tot_writes': 30000, + 'tot_write_bytes': 40000} + for key, value in expected_vol_usage.items(): + self.assertEqual(vol_usage[key], value, key) + + def test_vol_usage_update_totals_update_when_blockdevicestats_reset(self): + # This is unlikely to happen, but could when a volume is detached + # right after a instance has rebooted / recovered and before + # the system polled and updated the volume usage cache table. + ctxt = context.get_admin_context() + now = timeutils.utcnow() + start_time = now - datetime.timedelta(seconds=10) + + vol_usages = db.vol_get_usage_by_time(ctxt, start_time) + self.assertEqual(len(vol_usages), 0) + + db.vol_usage_update(ctxt, 1, + rd_req=10000, rd_bytes=20000, + wr_req=30000, wr_bytes=40000, + instance_id='fake-instance-uuid1', + project_id='fake-project-uuid1', + availability_zone='fake-az', + user_id='fake-user-uuid1') + + # Instance rebooted or crashed. block device stats were reset and are + # less then the previous values + db.vol_usage_update(ctxt, 1, + rd_req=100, rd_bytes=200, + wr_req=300, wr_bytes=400, + instance_id='fake-instance-uuid1', + project_id='fake-project-uuid1', + availability_zone='fake-az', + user_id='fake-user-uuid1', + update_totals=True) + + vol_usage = db.vol_get_usage_by_time(ctxt, start_time)[0] + expected_vol_usage = {'volume_id': u'1', + 'instance_uuid': 'fake-instance-uuid1', + 'project_id': 'fake-project-uuid1', + 'availability_zone': 'fake-az', + 'user_id': 'fake-user-uuid1', + 'curr_reads': 0, + 'curr_read_bytes': 0, + 'curr_writes': 0, + 'curr_write_bytes': 0, + 'tot_reads': 10100, + 'tot_read_bytes': 20200, + 'tot_writes': 30300, + 'tot_write_bytes': 40400} + for key, value in expected_vol_usage.items(): + self.assertEqual(vol_usage[key], value, key) class TaskLogTestCase(test.TestCase): diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index dc0af8539..d4daf5942 100755 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -2731,20 +2731,28 @@ class LibvirtDriver(driver.ComputeDriver): mountpoint = bdm['device_name'] if mountpoint.startswith('/dev/'): mountpoint = mountpoint[5:] + volume_id = bdm['volume_id'] LOG.debug(_("Trying to get stats for the volume %s"), - bdm['volume_id']) + volume_id) vol_stats = self.block_stats(instance['name'], mountpoint) if vol_stats: - rd_req, rd_bytes, wr_req, wr_bytes, flush_ops = vol_stats - vol_usage.append(dict(volume=bdm['volume_id'], - instance=instance, - rd_req=rd_req, - rd_bytes=rd_bytes, - wr_req=wr_req, - wr_bytes=wr_bytes, - flush_operations=flush_ops)) + stats = dict(volume=volume_id, + instance=instance, + rd_req=vol_stats[0], + rd_bytes=vol_stats[1], + wr_req=vol_stats[2], + wr_bytes=vol_stats[3], + flush_operations=vol_stats[4]) + LOG.debug( + _("Got volume usage stats for the volume=%(volume)s," + " instance=%(instance)s, rd_req=%(rd_req)d," + " rd_bytes=%(rd_bytes)d, wr_req=%(wr_req)d," + " wr_bytes=%(wr_bytes)d") + % stats) + vol_usage.append(stats) + return vol_usage def block_stats(self, instance_name, disk): @@ -2757,7 +2765,8 @@ class LibvirtDriver(driver.ComputeDriver): except libvirt.libvirtError as e: errcode = e.get_error_code() LOG.info(_("Getting block stats failed, device might have " - "been detached. Code=%(errcode)s Error=%(e)s") + "been detached. Instance=%(instance_name)s " + "Disk=%(disk)s Code=%(errcode)s Error=%(e)s") % locals()) except exception.InstanceNotFound: LOG.info(_("Could not find domain in libvirt for instance %s. " -- cgit