From 9f01b105fa3f240d9cae88a5b305b4cc312e95f4 Mon Sep 17 00:00:00 2001 From: Brian Elliott Date: Wed, 19 Dec 2012 22:57:26 +0000 Subject: Log last compute error when rescheduling. When a scheduling attempt is re-tried, log the last exception from the compute side. The goal is to provide simpler access to debugging information about why a build/resize is having issues. Change-Id: Ic31852dcbbf1cea9034a4b8eb1c7d20178c3229d --- nova/compute/manager.py | 11 ++++++++--- nova/scheduler/filter_scheduler.py | 21 ++++++++++++++++++++- nova/tests/compute/test_compute.py | 25 ++++++++++++++++++------- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 6efc83fb9..3f609c027 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -675,7 +675,7 @@ class ComputeManager(manager.SchedulerDependentManager): rescheduled = self._reschedule(context, request_spec, filter_properties, instance['uuid'], self.scheduler_rpcapi.run_instance, method_args, - task_state) + task_state, exc_info) except Exception: rescheduled = False @@ -690,7 +690,8 @@ class ComputeManager(manager.SchedulerDependentManager): raise exc_info[0], exc_info[1], exc_info[2] def _reschedule(self, context, request_spec, filter_properties, - instance_uuid, scheduler_method, method_args, task_state): + instance_uuid, scheduler_method, method_args, task_state, + exc_info=None): """Attempt to re-schedule a compute operation.""" retry = filter_properties.get('retry', None) @@ -714,6 +715,10 @@ class ComputeManager(manager.SchedulerDependentManager): # reset the task state: self._instance_update(context, instance_uuid, task_state=task_state) + if exc_info: + # stringify to avoid circular ref problem in json serialization: + retry['exc'] = traceback.format_exception(*exc_info) + scheduler_method(context, *method_args) return True @@ -1871,7 +1876,7 @@ class ComputeManager(manager.SchedulerDependentManager): rescheduled = self._reschedule(context, request_spec, filter_properties, instance_uuid, scheduler_method, - method_args, task_state) + method_args, task_state, exc_info) except Exception: rescheduled = False LOG.exception(_("Error trying to reschedule"), diff --git a/nova/scheduler/filter_scheduler.py b/nova/scheduler/filter_scheduler.py index c18daa4cc..ea9a39b6f 100644 --- a/nova/scheduler/filter_scheduler.py +++ b/nova/scheduler/filter_scheduler.py @@ -191,6 +191,23 @@ class FilterScheduler(driver.Scheduler): "'scheduler_max_attempts', must be >= 1")) return max_attempts + def _log_compute_error(self, instance_uuid, retry): + """If the request contained an exception from a previous compute + build/resize operation, log it to aid debugging + """ + exc = retry.pop('exc', None) # string-ified exception from compute + if not exc: + return # no exception info from a prevous attempt, skip + + hosts = retry.get('hosts', None) + if not hosts: + return # no previously attempted hosts, skip + + last_host, last_node = hosts[-1] + msg = _("Error from last host: %(last_host)s (node %(last_node)s): " + "%(exc)s") % locals() + LOG.error(msg, instance_uuid=instance_uuid) + def _populate_retry(self, filter_properties, instance_properties): """Populate filter properties with history of retries for this request. If maximum retries is exceeded, raise NoValidHost. @@ -212,8 +229,10 @@ class FilterScheduler(driver.Scheduler): } filter_properties['retry'] = retry + instance_uuid = instance_properties.get('uuid') + self._log_compute_error(instance_uuid, retry) + if retry['num_attempts'] > max_attempts: - instance_uuid = instance_properties.get('uuid') msg = _("Exceeded max scheduling attempts %(max_attempts)d for " "instance %(instance_uuid)s") % locals() raise exception.NoValidHost(reason=msg) diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py index 079a25d27..364dfb7be 100644 --- a/nova/tests/compute/test_compute.py +++ b/nova/tests/compute/test_compute.py @@ -23,6 +23,7 @@ import copy import datetime import sys import time +import traceback import uuid import mox @@ -5868,7 +5869,8 @@ class ComputeReschedulingTestCase(BaseTestCase): self.updated_task_state = kwargs.get('task_state') self.stubs.Set(self.compute, '_instance_update', fake_update) - def _reschedule(self, request_spec=None, filter_properties=None): + def _reschedule(self, request_spec=None, filter_properties=None, + exc_info=None): if not filter_properties: filter_properties = {} @@ -5884,7 +5886,7 @@ class ComputeReschedulingTestCase(BaseTestCase): requested_networks, is_first_time, filter_properties) return self.compute._reschedule(self.context, request_spec, filter_properties, instance_uuid, scheduler_method, - method_args, self.expected_task_state) + method_args, self.expected_task_state, exc_info=exc_info) def test_reschedule_no_filter_properties(self): """no filter_properties will disable re-scheduling""" @@ -5905,10 +5907,17 @@ class ComputeReschedulingTestCase(BaseTestCase): retry = dict(num_attempts=1) filter_properties = dict(retry=retry) request_spec = {'instance_uuids': ['foo', 'bar']} + try: + raise test.TestingException("just need an exception") + except test.TestingException: + exc_info = sys.exc_info() + exc_str = traceback.format_exception(*exc_info) + self.assertTrue(self._reschedule(filter_properties=filter_properties, - request_spec=request_spec)) + request_spec=request_spec, exc_info=exc_info)) self.assertEqual(1, len(request_spec['instance_uuids'])) self.assertEqual(self.updated_task_state, self.expected_task_state) + self.assertEqual(exc_str, filter_properties['retry']['exc']) class ComputeReschedulingResizeTestCase(ComputeReschedulingTestCase): @@ -5918,7 +5927,8 @@ class ComputeReschedulingResizeTestCase(ComputeReschedulingTestCase): super(ComputeReschedulingResizeTestCase, self).setUp() self.expected_task_state = task_states.RESIZE_PREP - def _reschedule(self, request_spec=None, filter_properties=None): + def _reschedule(self, request_spec=None, filter_properties=None, + exc_info=None): if not filter_properties: filter_properties = {} @@ -5935,7 +5945,7 @@ class ComputeReschedulingResizeTestCase(ComputeReschedulingTestCase): return self.compute._reschedule(self.context, request_spec, filter_properties, instance_uuid, scheduler_method, - method_args, self.expected_task_state) + method_args, self.expected_task_state, exc_info=exc_info) class InnerTestingException(Exception): @@ -6055,7 +6065,8 @@ class ComputeRescheduleOrReraiseTestCase(BaseTestCase): self.instance) self.compute._reschedule(self.context, None, {}, instance_uuid, self.compute.scheduler_rpcapi.run_instance, - method_args, task_states.SCHEDULING).AndReturn(True) + method_args, task_states.SCHEDULING, exc_info).AndReturn( + True) self.compute._log_original_error(exc_info, instance_uuid) self.mox.ReplayAll() @@ -6149,7 +6160,7 @@ class ComputeRescheduleResizeOrReraiseTestCase(BaseTestCase): self.compute._reschedule(self.context, {}, {}, self.instance_uuid, self.compute.scheduler_rpcapi.prep_resize, method_args, - task_states.RESIZE_PREP).AndReturn(True) + task_states.RESIZE_PREP, exc_info).AndReturn(True) self.compute._log_original_error(exc_info, self.instance_uuid) self.mox.ReplayAll() -- cgit