From 0c59003b1cc2ce17e3215e6c69b0dc2407f1b38b Mon Sep 17 00:00:00 2001 From: Chris Behrens Date: Thu, 26 Jul 2012 08:59:24 +0000 Subject: Refactor glance image service code Fixes bug 1029397 This adds better retry support, making sure that if we need to retry requests to glance, we cycle through all hosts before potentially attemping the hosts that failed. The old random selection could cause immediate retrying of the same host that failed. This also adds logging of the host:port that failed and fixes a bug in the retry test, which didn't actually successfully test retrying. Tests for new code added. Change-Id: I400616081e1e547b9ca2e0be622889d3a399a5bf --- nova/exception.py | 3 +- nova/image/glance.py | 182 +++++++++++++++++------------------ nova/tests/api/openstack/fakes.py | 6 +- nova/tests/image/test_glance.py | 196 +++++++++++++++++++++++++++++++++++--- nova/tests/test_xenapi.py | 6 -- nova/virt/xenapi/vm_utils.py | 7 +- 6 files changed, 287 insertions(+), 113 deletions(-) diff --git a/nova/exception.py b/nova/exception.py index b63406427..981363fa1 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -200,7 +200,8 @@ class VirtualInterfaceMacAddressException(NovaException): class GlanceConnectionFailed(NovaException): - message = _("Connection to glance failed") + ": %(reason)s" + message = _("Connection to glance host %(host)s:%(port)s failed: " + "%(reason)s") class MelangeConnectionFailed(NovaException): diff --git a/nova/image/glance.py b/nova/image/glance.py index 0422d2083..4bebd2c64 100644 --- a/nova/image/glance.py +++ b/nova/image/glance.py @@ -20,6 +20,7 @@ from __future__ import absolute_import import copy +import itertools import random import sys import time @@ -33,7 +34,6 @@ from nova import flags from nova.openstack.common import jsonutils from nova.openstack.common import log as logging from nova.openstack.common import timeutils -from nova import utils LOG = logging.getLogger(__name__) @@ -68,88 +68,85 @@ def _create_glance_client(context, host, port): return glance.client.Client(host, port, **params) -def pick_glance_api_server(): - """Return which Glance API server to use for the request - - This method provides a very primitive form of load-balancing suitable for - testing and sandbox environments. In production, it would be better to use - one IP and route that to a real load-balancer. - - Returns (host, port) +def get_api_servers(): """ - host_port = random.choice(FLAGS.glance_api_servers) - host, port_str = host_port.split(':') - port = int(port_str) - return host, port - - -def _get_glance_client(context, image_href): - """Get the correct glance client and id for the given image_href. - - The image_href param can be an href of the form - http://myglanceserver:9292/images/42, or just an int such as 42. If the - image_href is an int, then flags are used to create the default - glance client. - - NOTE: Do not use this or glance.client directly, all other code - should be using GlanceImageService. - - :param image_href: image ref/id for an image - :returns: a tuple of the form (glance_client, image_id) - + Shuffle a list of FLAGS.glance_api_servers and return an iterator + that will cycle through the list, looping around to the beginning + if necessary. """ - glance_host, glance_port = pick_glance_api_server() + api_servers = [] + for api_server in FLAGS.glance_api_servers: + host, port_str = api_server.split(':') + api_servers.append((host, int(port_str))) + random.shuffle(api_servers) + return itertools.cycle(api_servers) + + +class GlanceClientWrapper(object): + """Glance client wrapper class that implements retries.""" + + def __init__(self, context=None, host=None, port=None): + if host is not None: + self._create_static_client(context, host, port) + else: + self.client = None + self.api_servers = None + + def _create_static_client(self, context, host, port): + """Create a client that we'll use for every call.""" + self.host = host + self.port = port + self.client = _create_glance_client(context, self.host, self.port) + + def _create_onetime_client(self, context): + """Create a client that will be used for one call.""" + if self.api_servers is None: + self.api_servers = get_api_servers() + self.host, self.port = self.api_servers.next() + return _create_glance_client(context, self.host, self.port) + + def call(self, context, method, *args, **kwargs): + """ + Call a glance client method. If we get a connection error, + retry the request according to FLAGS.glance_num_retries. + """ - # check if this is an id - if '/' not in str(image_href): - glance_client = _create_glance_client(context, - glance_host, - glance_port) - return (glance_client, image_href) + retry_excs = (glance_exception.ClientConnectionError, + glance_exception.ServiceUnavailable) - else: - try: - (image_id, glance_host, glance_port) = _parse_image_ref(image_href) - glance_client = _create_glance_client(context, - glance_host, - glance_port) - except ValueError: - raise exception.InvalidImageRef(image_href=image_href) + num_attempts = 1 + FLAGS.glance_num_retries - return (glance_client, image_id) + for attempt in xrange(1, num_attempts + 1): + if self.client: + client = self.client + else: + client = self._create_onetime_client(context) + try: + return getattr(client, method)(*args, **kwargs) + except retry_excs as e: + host = self.host + port = self.port + extra = "retrying" + error_msg = _("Error contacting glance server " + "'%(host)s:%(port)s' for '%(method)s', %(extra)s.") + if attempt == num_attempts: + extra = 'done trying' + LOG.exception(error_msg, locals()) + raise exception.GlanceConnectionFailed( + host=host, port=port, reason=str(e)) + LOG.exception(error_msg, locals()) + time.sleep(1) + # Not reached class GlanceImageService(object): """Provides storage and retrieval of disk image objects within Glance.""" def __init__(self, client=None): + if client is None: + client = GlanceClientWrapper() self._client = client - def _get_client(self, context): - # NOTE(sirp): we want to load balance each request across glance - # servers. Since GlanceImageService is a long-lived object, `client` - # is made to choose a new server each time via this property. - if self._client is not None: - return self._client - glance_host, glance_port = pick_glance_api_server() - return _create_glance_client(context, glance_host, glance_port) - - def _call_retry(self, context, name, *args, **kwargs): - """Retry call to glance server if there is a connection error. - Suitable only for idempotent calls.""" - for i in xrange(FLAGS.glance_num_retries + 1): - client = self._get_client(context) - try: - return getattr(client, name)(*args, **kwargs) - except glance_exception.ClientConnectionError as e: - LOG.exception(_('Connection error contacting glance' - ' server, retrying')) - - time.sleep(1) - - raise exception.GlanceConnectionFailed( - reason=_('Maximum attempts reached')) - def detail(self, context, **kwargs): """Calls out to Glance for a list of detailed image information.""" params = self._extract_query_params(kwargs) @@ -180,13 +177,12 @@ class GlanceImageService(object): # NOTE(vish): don't filter out private images kwargs['filters'].setdefault('is_public', 'none') - client = self._get_client(context) - return self._fetch_images(client.get_images_detailed, **kwargs) + return self._fetch_images(context, 'get_images_detailed', **kwargs) - def _fetch_images(self, fetch_func, **kwargs): + def _fetch_images(self, context, fetch_method, **kwargs): """Paginate through results from glance server""" try: - images = fetch_func(**kwargs) + images = self._client.call(context, fetch_method, **kwargs) except Exception: _reraise_translated_exception() @@ -212,14 +208,14 @@ class GlanceImageService(object): # ignore missing limit, just proceed without it pass - for image in self._fetch_images(fetch_func, **kwargs): + for image in self._fetch_images(context, fetch_method, **kwargs): yield image def show(self, context, image_id): """Returns a dict with image data for the given opaque image id.""" try: - image_meta = self._call_retry(context, 'get_image_meta', - image_id) + image_meta = self._client.call(context, 'get_image_meta', + image_id) except Exception: _reraise_translated_image_exception(image_id) @@ -232,8 +228,8 @@ class GlanceImageService(object): def download(self, context, image_id, data): """Calls out to Glance for metadata and data and writes data.""" try: - image_meta, image_chunks = self._call_retry(context, 'get_image', - image_id) + image_meta, image_chunks = self._client.call(context, + 'get_image', image_id) except Exception: _reraise_translated_image_exception(image_id) @@ -253,8 +249,8 @@ class GlanceImageService(object): LOG.debug(_('Metadata after formatting for Glance %s'), sent_service_image_meta) - recv_service_image_meta = self._get_client(context).add_image( - sent_service_image_meta, data) + recv_service_image_meta = self._client.call(context, + 'add_image', sent_service_image_meta, data) # Translate Service -> Base base_image_meta = self._translate_from_glance(recv_service_image_meta) @@ -271,10 +267,9 @@ class GlanceImageService(object): # NOTE(vish): show is to check if image is available self.show(context, image_id) image_meta = self._translate_to_glance(image_meta) - client = self._get_client(context) try: - image_meta = client.update_image(image_id, image_meta, data, - features) + image_meta = self._client.call(context, 'update_image', + image_id, image_meta, data, features) except Exception: _reraise_translated_image_exception(image_id) @@ -289,9 +284,9 @@ class GlanceImageService(object): """ # NOTE(vish): show is to check if image is available - image_meta = self.show(context, image_id) + self.show(context, image_id) try: - result = self._get_client(context).delete_image(image_id) + result = self._client.call(context, 'delete_image', image_id) except glance_exception.NotFound: raise exception.ImageNotFound(image_id=image_id) return result @@ -480,12 +475,17 @@ def get_remote_image_service(context, image_href): # standalone image ID if '/' not in str(image_href): image_service = get_default_image_service() - image_id = image_href - else: - (glance_client, image_id) = _get_glance_client(context, image_href) - image_service = GlanceImageService(glance_client) + return image_service, image_href + + try: + (image_id, glance_host, glance_port) = _parse_image_ref(image_href) + glance_client = GlanceClientWrapper(context=context, + host=glance_host, port=glance_port) + except ValueError: + raise exception.InvalidImageRef(image_href=image_href) - return (image_service, image_id) + image_service = GlanceImageService(client=glance_client) + return image_service, image_id def get_default_image_service(): diff --git a/nova/tests/api/openstack/fakes.py b/nova/tests/api/openstack/fakes.py index 20e675a8c..43ca2aec9 100644 --- a/nova/tests/api/openstack/fakes.py +++ b/nova/tests/api/openstack/fakes.py @@ -262,7 +262,11 @@ def stub_out_glance_add_image(stubs, sent_to_glance): def stub_out_glance(stubs): def fake_get_remote_image_service(): client = glance_stubs.StubGlanceClient(_make_image_fixtures()) - return nova.image.glance.GlanceImageService(client) + client_wrapper = nova.image.glance.GlanceClientWrapper() + client_wrapper.host = 'fake_host' + client_wrapper.port = 9292 + client_wrapper.client = client + return nova.image.glance.GlanceImageService(client=client_wrapper) stubs.Set(nova.image.glance, 'get_default_image_service', fake_get_remote_image_service) diff --git a/nova/tests/image/test_glance.py b/nova/tests/image/test_glance.py index d6c9f8606..4616f0949 100644 --- a/nova/tests/image/test_glance.py +++ b/nova/tests/image/test_glance.py @@ -17,6 +17,8 @@ import datetime +import random +import time import glance.common.exception as glance_exception @@ -91,11 +93,23 @@ class TestGlanceImageService(test.TestCase): def setUp(self): super(TestGlanceImageService, self).setUp() fakes.stub_out_compute_api_snapshot(self.stubs) + client = glance_stubs.StubGlanceClient() - self.service = glance.GlanceImageService(client=client) + self.service = self._create_image_service(client) self.context = context.RequestContext('fake', 'fake', auth_token=True) self.service.delete_all() + def _create_image_service(self, client): + def _fake_create_glance_client(context, host, port): + return client + + self.stubs.Set(glance, '_create_glance_client', + _fake_create_glance_client) + + client_wrapper = glance.GlanceClientWrapper( + 'fake', 'fake_host', 9292) + return glance.GlanceImageService(client=client_wrapper) + @staticmethod def _make_fixture(**kwargs): fixture = {'name': None, @@ -430,29 +444,27 @@ class TestGlanceImageService(test.TestCase): def test_download_with_retries(self): tries = [0] - class GlanceBusyException(Exception): - pass - class MyGlanceStubClient(glance_stubs.StubGlanceClient): """A client that fails the first time, then succeeds.""" def get_image(self, image_id): if tries[0] == 0: tries[0] = 1 - raise GlanceBusyException() + raise glance_exception.ClientConnectionError() else: return {}, [] client = MyGlanceStubClient() - service = glance.GlanceImageService(client=client) + service = self._create_image_service(client) image_id = 1 # doesn't matter writer = NullWriter() # When retries are disabled, we should get an exception self.flags(glance_num_retries=0) - self.assertRaises(GlanceBusyException, service.download, self.context, - image_id, writer) + self.assertRaises(exception.GlanceConnectionFailed, + service.download, self.context, image_id, writer) # Now lets enable retries. No exception should happen now. + tries = [0] self.flags(glance_num_retries=1) service.download(self.context, image_id, writer) @@ -463,7 +475,7 @@ class TestGlanceImageService(test.TestCase): raise glance_exception.Forbidden() client = MyGlanceStubClient() - service = glance.GlanceImageService(client=client) + service = self._create_image_service(client) image_id = 1 # doesn't matter writer = NullWriter() self.assertRaises(exception.ImageNotAuthorized, service.download, @@ -472,13 +484,173 @@ class TestGlanceImageService(test.TestCase): def test_glance_client_image_id(self): fixture = self._make_fixture(name='test image') image_id = self.service.create(self.context, fixture)['id'] - client, same_id = glance._get_glance_client(self.context, image_id) + (service, same_id) = glance.get_remote_image_service( + self.context, image_id) self.assertEquals(same_id, image_id) def test_glance_client_image_ref(self): fixture = self._make_fixture(name='test image') image_id = self.service.create(self.context, fixture)['id'] image_url = 'http://something-less-likely/%s' % image_id - client, same_id = glance._get_glance_client(self.context, image_url) + (service, same_id) = glance.get_remote_image_service( + self.context, image_url) self.assertEquals(same_id, image_id) - self.assertEquals(client.host, 'something-less-likely') + self.assertEquals(service._client.host, + 'something-less-likely') + + +def _create_failing_glance_client(info): + class MyGlanceStubClient(glance_stubs.StubGlanceClient): + """A client that fails the first time, then succeeds.""" + def get_image(self, image_id): + info['num_calls'] += 1 + if info['num_calls'] == 1: + raise glance_exception.ClientConnectionError() + return {}, [] + + return MyGlanceStubClient() + + +class TestGlanceClientWrapper(test.TestCase): + + def setUp(self): + super(TestGlanceClientWrapper, self).setUp() + self.flags(glance_api_servers=['host1:9292', 'host2:9293', + 'host3:9294']) + + # Make the test run fast + def _fake_sleep(secs): + pass + self.stubs.Set(time, 'sleep', _fake_sleep) + + def test_static_client_without_retries(self): + self.flags(glance_num_retries=0) + + ctxt = context.RequestContext('fake', 'fake') + fake_host = 'host4' + fake_port = 9295 + + info = {'num_calls': 0} + + def _fake_create_glance_client(context, host, port): + self.assertEqual(host, fake_host) + self.assertEqual(port, fake_port) + return _create_failing_glance_client(info) + + self.stubs.Set(glance, '_create_glance_client', + _fake_create_glance_client) + + client = glance.GlanceClientWrapper(context=ctxt, + host=fake_host, port=fake_port) + self.assertRaises(exception.GlanceConnectionFailed, + client.call, ctxt, 'get_image', 'meow') + self.assertEqual(info['num_calls'], 1) + + def test_default_client_without_retries(self): + self.flags(glance_num_retries=0) + + ctxt = context.RequestContext('fake', 'fake') + + info = {'num_calls': 0, + 'host': 'host1', + 'port': 9292} + + # Leave the list in a known-order + def _fake_shuffle(servers): + pass + + def _fake_create_glance_client(context, host, port): + self.assertEqual(host, info['host']) + self.assertEqual(port, info['port']) + return _create_failing_glance_client(info) + + self.stubs.Set(random, 'shuffle', _fake_shuffle) + self.stubs.Set(glance, '_create_glance_client', + _fake_create_glance_client) + + client = glance.GlanceClientWrapper() + client2 = glance.GlanceClientWrapper() + self.assertRaises(exception.GlanceConnectionFailed, + client.call, ctxt, 'get_image', 'meow') + self.assertEqual(info['num_calls'], 1) + + info = {'num_calls': 0, + 'host': 'host2', + 'port': 9293} + + def _fake_shuffle2(servers): + # fake shuffle in a known manner + servers.append(servers.pop(0)) + + self.stubs.Set(random, 'shuffle', _fake_shuffle2) + + self.assertRaises(exception.GlanceConnectionFailed, + client2.call, ctxt, 'get_image', 'meow') + self.assertEqual(info['num_calls'], 1) + + def test_static_client_with_retries(self): + self.flags(glance_num_retries=1) + + ctxt = context.RequestContext('fake', 'fake') + fake_host = 'host4' + fake_port = 9295 + + info = {'num_calls': 0} + + def _fake_create_glance_client(context, host, port): + self.assertEqual(host, fake_host) + self.assertEqual(port, fake_port) + return _create_failing_glance_client(info) + + self.stubs.Set(glance, '_create_glance_client', + _fake_create_glance_client) + + client = glance.GlanceClientWrapper(context=ctxt, + host=fake_host, port=fake_port) + client.call(ctxt, 'get_image', 'meow') + self.assertEqual(info['num_calls'], 2) + + def test_default_client_with_retries(self): + self.flags(glance_num_retries=1) + + ctxt = context.RequestContext('fake', 'fake') + + info = {'num_calls': 0, + 'host0': 'host1', + 'port0': 9292, + 'host1': 'host2', + 'port1': 9293} + + # Leave the list in a known-order + def _fake_shuffle(servers): + pass + + def _fake_create_glance_client(context, host, port): + attempt = info['num_calls'] + self.assertEqual(host, info['host%s' % attempt]) + self.assertEqual(port, info['port%s' % attempt]) + return _create_failing_glance_client(info) + + self.stubs.Set(random, 'shuffle', _fake_shuffle) + self.stubs.Set(glance, '_create_glance_client', + _fake_create_glance_client) + + client = glance.GlanceClientWrapper() + client2 = glance.GlanceClientWrapper() + client.call(ctxt, 'get_image', 'meow') + self.assertEqual(info['num_calls'], 2) + + def _fake_shuffle2(servers): + # fake shuffle in a known manner + servers.append(servers.pop(0)) + + self.stubs.Set(random, 'shuffle', _fake_shuffle2) + + info = {'num_calls': 0, + 'host0': 'host2', + 'port0': 9293, + 'host1': 'host3', + 'port1': 9294} + + client2.call(ctxt, 'get_image', 'meow') + self.assertEqual(info['num_calls'], 2) diff --git a/nova/tests/test_xenapi.py b/nova/tests/test_xenapi.py index df35d54ba..069a98c14 100644 --- a/nova/tests/test_xenapi.py +++ b/nova/tests/test_xenapi.py @@ -20,7 +20,6 @@ import ast import contextlib import cPickle as pickle import functools -import mox import os import re @@ -31,7 +30,6 @@ from nova import context from nova import db from nova import exception from nova import flags -from nova.image import glance from nova.openstack.common import importutils from nova.openstack.common import jsonutils from nova.openstack.common import log as logging @@ -2053,8 +2051,6 @@ class VmUtilsTestCase(test.TestCase): def test_upload_image(self): """Ensure image properties include instance system metadata as well as few local settings.""" - def fake_pick_glance_api_server(): - return ("host", 80) def fake_instance_system_metadata_get(context, uuid): return dict(image_a=1, image_b=2, image_c='c', d='d') @@ -2076,8 +2072,6 @@ class VmUtilsTestCase(test.TestCase): def fake_dumps(thing): return thing - self.stubs.Set(glance, "pick_glance_api_server", - fake_pick_glance_api_server) self.stubs.Set(db, "instance_system_metadata_get", fake_instance_system_metadata_get) self.stubs.Set(vm_utils, "get_sr_path", fake_get_sr_path) diff --git a/nova/virt/xenapi/vm_utils.py b/nova/virt/xenapi/vm_utils.py index 9d340861d..bfbffd8cd 100644 --- a/nova/virt/xenapi/vm_utils.py +++ b/nova/virt/xenapi/vm_utils.py @@ -560,7 +560,8 @@ def upload_image(context, session, instance, vdi_uuids, image_id): LOG.debug(_("Asking xapi to upload %(vdi_uuids)s as" " ID %(image_id)s"), locals(), instance=instance) - glance_host, glance_port = glance.pick_glance_api_server() + glance_api_servers = glance.get_api_servers() + glance_host, glance_port = glance_api_servers.next() # TODO(sirp): this inherit-image-property code should probably go in # nova/compute/manager so it can be shared across hypervisors @@ -916,8 +917,10 @@ def _fetch_vhd_image(context, session, instance, image_id): 'sr_path': get_sr_path(session), 'auth_token': getattr(context, 'auth_token', None)} + glance_api_servers = glance.get_api_servers() + def pick_glance(params): - glance_host, glance_port = glance.pick_glance_api_server() + glance_host, glance_port = glance_api_servers.next() params['glance_host'] = glance_host params['glance_port'] = glance_port -- cgit