diff options
| author | Vishvananda Ishaya <vishvananda@yahoo.com> | 2010-10-14 08:38:37 +0000 |
|---|---|---|
| committer | Tarmac <> | 2010-10-14 08:38:37 +0000 |
| commit | 7120249b21ae56c070cd2e3d32a9502c320ebb07 (patch) | |
| tree | 7372b04b62216a2d73e4c10dbc8eec8799ea4925 | |
| parent | 134b846d23be923f7453e945e92f32dffbc54f50 (diff) | |
| parent | 7403ece82902e633fbd3f2e6f0303ad08c269541 (diff) | |
Fixes a few concurrency issues with creating volumes and instances. Most importantly, it adds retries to a number of the volume shell commands, and it adds a unique constraint on export_devices and a safe create so that there aren't multiple copies of export devices in the database.
| -rw-r--r-- | nova/api/ec2/cloud.py | 15 | ||||
| -rw-r--r-- | nova/compute/manager.py | 4 | ||||
| -rw-r--r-- | nova/db/api.py | 11 | ||||
| -rw-r--r-- | nova/db/sqlalchemy/api.py | 16 | ||||
| -rw-r--r-- | nova/db/sqlalchemy/models.py | 1 | ||||
| -rw-r--r-- | nova/volume/driver.py | 56 | ||||
| -rw-r--r-- | nova/volume/manager.py | 9 |
7 files changed, 76 insertions, 36 deletions
diff --git a/nova/api/ec2/cloud.py b/nova/api/ec2/cloud.py index a7693cadd..56bf2db03 100644 --- a/nova/api/ec2/cloud.py +++ b/nova/api/ec2/cloud.py @@ -893,14 +893,20 @@ class CloudController(object): instance_ref = db.instance_get_by_internal_id(context, internal_id) except exception.NotFound: - logging.warning("Instance %s was not found during terminate" - % id_str) + logging.warning("Instance %s was not found during terminate", + id_str) continue + if (instance_ref['state_description'] == 'terminating'): + logging.warning("Instance %s is already being terminated", + id_str) + continue now = datetime.datetime.utcnow() db.instance_update(context, instance_ref['id'], - {'terminated_at': now}) + {'state_description': 'terminating', + 'state': 0, + 'terminated_at': now}) # FIXME(ja): where should network deallocate occur? address = db.instance_get_floating_address(context, instance_ref['id']) @@ -959,7 +965,8 @@ class CloudController(object): if volume_ref['status'] != "available": raise exception.ApiError("Volume status must be available") now = datetime.datetime.utcnow() - db.volume_update(context, volume_ref['id'], {'terminated_at': now}) + db.volume_update(context, volume_ref['id'], {'status': 'deleting', + 'terminated_at': now}) host = volume_ref['host'] rpc.cast(db.queue_get_for(context, FLAGS.volume_topic, host), {"method": "delete_volume", diff --git a/nova/compute/manager.py b/nova/compute/manager.py index c602d013d..94c95038f 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -114,10 +114,6 @@ class ComputeManager(manager.Manager): raise exception.Error('trying to destroy already destroyed' ' instance: %s' % instance_id) - self.db.instance_set_state(context, - instance_id, - power_state.NOSTATE, - 'shutting_down') yield self.driver.destroy(instance_ref) # TODO(ja): should we keep it in a terminated state for a bit? 
diff --git a/nova/db/api.py b/nova/db/api.py index 7e6994b56..6dbf3b809 100644 --- a/nova/db/api.py +++ b/nova/db/api.py @@ -466,13 +466,18 @@ def export_device_count(context): return IMPL.export_device_count(context) -def export_device_create(context, values): - """Create an export_device from the values dictionary.""" - return IMPL.export_device_create(context, values) +def export_device_create_safe(context, values): + """Create an export_device from the values dictionary. + + The device is not returned. If the create violates the unique + constraints because the shelf_id and blade_id already exist, + no exception is raised.""" + return IMPL.export_device_create_safe(context, values) ################### + def auth_destroy_token(context, token): """Destroy an auth token""" return IMPL.auth_destroy_token(context, token) diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py index 6b979f0ae..f4a746cab 100644 --- a/nova/db/sqlalchemy/api.py +++ b/nova/db/sqlalchemy/api.py @@ -31,8 +31,7 @@ from sqlalchemy import or_ from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import joinedload from sqlalchemy.orm import joinedload_all -from sqlalchemy.sql import exists -from sqlalchemy.sql import func +from sqlalchemy.sql import exists, func from sqlalchemy.orm.exc import NoResultFound FLAGS = flags.FLAGS @@ -539,7 +538,7 @@ def instance_create(context, values): with session.begin(): while instance_ref.internal_id == None: internal_id = utils.generate_uid(instance_ref.__prefix__) - if not instance_internal_id_exists(context, internal_id, + if not instance_internal_id_exists(context, internal_id, session=session): instance_ref.internal_id = internal_id instance_ref.save(session=session) @@ -1023,12 +1022,15 @@ def export_device_count(context): @require_admin_context -def export_device_create(context, values): +def export_device_create_safe(context, values): export_device_ref = models.ExportDevice() for (key, value) in values.iteritems(): 
export_device_ref[key] = value - export_device_ref.save() - return export_device_ref + try: + export_device_ref.save() + return export_device_ref + except IntegrityError: + return None ################### @@ -1631,7 +1633,7 @@ def user_remove_project_role(context, user_id, project_id, role): with session.begin(): session.execute('delete from user_project_role_association where ' + \ 'user_id=:user_id and project_id=:project_id and ' + \ - 'role=:role', { 'user_id' : user_id, + 'role=:role', { 'user_id' : user_id, 'project_id' : project_id, 'role' : role }) diff --git a/nova/db/sqlalchemy/models.py b/nova/db/sqlalchemy/models.py index eed8f0578..a63bca2b0 100644 --- a/nova/db/sqlalchemy/models.py +++ b/nova/db/sqlalchemy/models.py @@ -277,6 +277,7 @@ class Quota(BASE, NovaBase): class ExportDevice(BASE, NovaBase): """Represates a shelf and blade that a volume can be exported on""" __tablename__ = 'export_devices' + __table_args__ = (schema.UniqueConstraint("shelf_id", "blade_id"), {'mysql_engine': 'InnoDB'}) id = Column(Integer, primary_key=True) shelf_id = Column(Integer) blade_id = Column(Integer) diff --git a/nova/volume/driver.py b/nova/volume/driver.py index 4604b85d5..cca619550 100644 --- a/nova/volume/driver.py +++ b/nova/volume/driver.py @@ -24,6 +24,7 @@ import logging from twisted.internet import defer +from nova import exception from nova import flags from nova import process @@ -33,6 +34,8 @@ flags.DEFINE_string('volume_group', 'nova-volumes', 'Name for the VG that will contain exported volumes') flags.DEFINE_string('aoe_eth_dev', 'eth0', 'Which device to export the volumes on') +flags.DEFINE_string('num_shell_tries', 3, + 'number of times to attempt to run flakey shell commands') class AOEDriver(object): @@ -41,6 +44,25 @@ class AOEDriver(object): self._execute = execute @defer.inlineCallbacks + def _try_execute(self, command): + # NOTE(vish): Volume commands can partially fail due to timing, but + # running them a second time on failure will usually + 
# recover nicely. + tries = 0 + while True: + try: + yield self._execute(command) + defer.returnValue(True) + except exception.ProcessExecutionError: + tries = tries + 1 + if tries >= FLAGS.num_shell_tries: + raise + logging.exception("Recovering from a failed execute." + "Try number %s", tries) + yield self._execute("sleep %s" % tries ** 2) + + + @defer.inlineCallbacks def create_volume(self, volume_name, size): """Creates a logical volume""" # NOTE(vish): makes sure that the volume group exists @@ -49,22 +71,22 @@ class AOEDriver(object): sizestr = '100M' else: sizestr = '%sG' % size - yield self._execute( - "sudo lvcreate -L %s -n %s %s" % (sizestr, - volume_name, - FLAGS.volume_group)) + yield self._try_execute("sudo lvcreate -L %s -n %s %s" % + (sizestr, + volume_name, + FLAGS.volume_group)) @defer.inlineCallbacks def delete_volume(self, volume_name): """Deletes a logical volume""" - yield self._execute( - "sudo lvremove -f %s/%s" % (FLAGS.volume_group, - volume_name)) + yield self._try_execute("sudo lvremove -f %s/%s" % + (FLAGS.volume_group, + volume_name)) @defer.inlineCallbacks def create_export(self, volume_name, shelf_id, blade_id): """Creates an export for a logical volume""" - yield self._execute( + yield self._try_execute( "sudo vblade-persist setup %s %s %s /dev/%s/%s" % (shelf_id, blade_id, @@ -81,16 +103,22 @@ class AOEDriver(object): @defer.inlineCallbacks def remove_export(self, _volume_name, shelf_id, blade_id): """Removes an export for a logical volume""" - yield self._execute( - "sudo vblade-persist stop %s %s" % (shelf_id, blade_id)) - yield self._execute( - "sudo vblade-persist destroy %s %s" % (shelf_id, blade_id)) + yield self._try_execute("sudo vblade-persist stop %s %s" % + (shelf_id, blade_id)) + yield self._try_execute("sudo vblade-persist destroy %s %s" % + (shelf_id, blade_id)) @defer.inlineCallbacks def ensure_exports(self): """Runs all existing exports""" - # NOTE(ja): wait for blades to appear - yield self._execute("sleep 5") + # 
NOTE(vish): The standard _try_execute does not work here + # because these methods throw errors if other + # volumes on this host are in the process of + # being created. The good news is the command + # still works for the other volumes, so we + # just wait a bit for the current volume to + # be ready and ignore any errors. + yield self._execute("sleep 2") yield self._execute("sudo vblade-persist auto all", check_exit_code=False) yield self._execute("sudo vblade-persist start all", diff --git a/nova/volume/manager.py b/nova/volume/manager.py index 8508f27b2..081a2d695 100644 --- a/nova/volume/manager.py +++ b/nova/volume/manager.py @@ -62,7 +62,7 @@ class AOEManager(manager.Manager): for shelf_id in xrange(FLAGS.num_shelves): for blade_id in xrange(FLAGS.blades_per_shelf): dev = {'shelf_id': shelf_id, 'blade_id': blade_id} - self.db.export_device_create(context, dev) + self.db.export_device_create_safe(context, dev) @defer.inlineCallbacks def create_volume(self, context, volume_id): @@ -95,20 +95,21 @@ class AOEManager(manager.Manager): yield self.driver.ensure_exports() now = datetime.datetime.utcnow() - self.db.volume_update(context, volume_id, {'status': 'available', - 'launched_at': now}) + self.db.volume_update(context, + volume_ref['id'], {'status': 'available', + 'launched_at': now}) logging.debug("volume %s: created successfully", volume_id) defer.returnValue(volume_id) @defer.inlineCallbacks def delete_volume(self, context, volume_id): """Deletes and unexports volume""" - logging.debug("Deleting volume with id of: %s", volume_id) volume_ref = self.db.volume_get(context, volume_id) if volume_ref['attach_status'] == "attached": raise exception.Error("Volume is still attached") if volume_ref['host'] != self.host: raise exception.Error("Volume is not local to this node") + logging.debug("Deleting volume with id of: %s", volume_id) shelf_id, blade_id = self.db.volume_get_shelf_and_blade(context, volume_id) yield self.driver.remove_export(volume_ref['ec2_id'], |
