summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVishvananda Ishaya <vishvananda@yahoo.com>2010-10-14 08:38:37 +0000
committerTarmac <>2010-10-14 08:38:37 +0000
commit7120249b21ae56c070cd2e3d32a9502c320ebb07 (patch)
tree7372b04b62216a2d73e4c10dbc8eec8799ea4925
parent134b846d23be923f7453e945e92f32dffbc54f50 (diff)
parent7403ece82902e633fbd3f2e6f0303ad08c269541 (diff)
Fixes a few concurrency issues with creating volumes and instances. Most importantly it adds retries to a number of the volume shell commands and it adds a unique constraint on export_devices and a safe create so that there aren't multiple copies of export devices in the database.
-rw-r--r--nova/api/ec2/cloud.py15
-rw-r--r--nova/compute/manager.py4
-rw-r--r--nova/db/api.py11
-rw-r--r--nova/db/sqlalchemy/api.py16
-rw-r--r--nova/db/sqlalchemy/models.py1
-rw-r--r--nova/volume/driver.py56
-rw-r--r--nova/volume/manager.py9
7 files changed, 76 insertions, 36 deletions
diff --git a/nova/api/ec2/cloud.py b/nova/api/ec2/cloud.py
index a7693cadd..56bf2db03 100644
--- a/nova/api/ec2/cloud.py
+++ b/nova/api/ec2/cloud.py
@@ -893,14 +893,20 @@ class CloudController(object):
instance_ref = db.instance_get_by_internal_id(context,
internal_id)
except exception.NotFound:
- logging.warning("Instance %s was not found during terminate"
- % id_str)
+ logging.warning("Instance %s was not found during terminate",
+ id_str)
continue
+ if (instance_ref['state_description'] == 'terminating'):
+ logging.warning("Instance %s is already being terminated",
+ id_str)
+ continue
now = datetime.datetime.utcnow()
db.instance_update(context,
instance_ref['id'],
- {'terminated_at': now})
+ {'state_description': 'terminating',
+ 'state': 0,
+ 'terminated_at': now})
# FIXME(ja): where should network deallocate occur?
address = db.instance_get_floating_address(context,
instance_ref['id'])
@@ -959,7 +965,8 @@ class CloudController(object):
if volume_ref['status'] != "available":
raise exception.ApiError("Volume status must be available")
now = datetime.datetime.utcnow()
- db.volume_update(context, volume_ref['id'], {'terminated_at': now})
+ db.volume_update(context, volume_ref['id'], {'status': 'deleting',
+ 'terminated_at': now})
host = volume_ref['host']
rpc.cast(db.queue_get_for(context, FLAGS.volume_topic, host),
{"method": "delete_volume",
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index c602d013d..94c95038f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -114,10 +114,6 @@ class ComputeManager(manager.Manager):
raise exception.Error('trying to destroy already destroyed'
' instance: %s' % instance_id)
- self.db.instance_set_state(context,
- instance_id,
- power_state.NOSTATE,
- 'shutting_down')
yield self.driver.destroy(instance_ref)
# TODO(ja): should we keep it in a terminated state for a bit?
diff --git a/nova/db/api.py b/nova/db/api.py
index 7e6994b56..6dbf3b809 100644
--- a/nova/db/api.py
+++ b/nova/db/api.py
@@ -466,13 +466,18 @@ def export_device_count(context):
return IMPL.export_device_count(context)
-def export_device_create(context, values):
- """Create an export_device from the values dictionary."""
- return IMPL.export_device_create(context, values)
+def export_device_create_safe(context, values):
+ """Create an export_device from the values dictionary.
+
+ The device is not returned. If the create violates the unique
+ constraints because the shelf_id and blade_id already exist,
+ no exception is raised."""
+ return IMPL.export_device_create_safe(context, values)
###################
+
def auth_destroy_token(context, token):
"""Destroy an auth token"""
return IMPL.auth_destroy_token(context, token)
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index 6b979f0ae..f4a746cab 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -31,8 +31,7 @@ from sqlalchemy import or_
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import joinedload_all
-from sqlalchemy.sql import exists
-from sqlalchemy.sql import func
+from sqlalchemy.sql import exists, func
from sqlalchemy.orm.exc import NoResultFound
FLAGS = flags.FLAGS
@@ -539,7 +538,7 @@ def instance_create(context, values):
with session.begin():
while instance_ref.internal_id == None:
internal_id = utils.generate_uid(instance_ref.__prefix__)
- if not instance_internal_id_exists(context, internal_id,
+ if not instance_internal_id_exists(context, internal_id,
session=session):
instance_ref.internal_id = internal_id
instance_ref.save(session=session)
@@ -1023,12 +1022,15 @@ def export_device_count(context):
@require_admin_context
-def export_device_create(context, values):
+def export_device_create_safe(context, values):
export_device_ref = models.ExportDevice()
for (key, value) in values.iteritems():
export_device_ref[key] = value
- export_device_ref.save()
- return export_device_ref
+ try:
+ export_device_ref.save()
+ return export_device_ref
+ except IntegrityError:
+ return None
###################
@@ -1631,7 +1633,7 @@ def user_remove_project_role(context, user_id, project_id, role):
with session.begin():
session.execute('delete from user_project_role_association where ' + \
'user_id=:user_id and project_id=:project_id and ' + \
- 'role=:role', { 'user_id' : user_id,
+ 'role=:role', { 'user_id' : user_id,
'project_id' : project_id,
'role' : role })
diff --git a/nova/db/sqlalchemy/models.py b/nova/db/sqlalchemy/models.py
index eed8f0578..a63bca2b0 100644
--- a/nova/db/sqlalchemy/models.py
+++ b/nova/db/sqlalchemy/models.py
@@ -277,6 +277,7 @@ class Quota(BASE, NovaBase):
class ExportDevice(BASE, NovaBase):
"""Represates a shelf and blade that a volume can be exported on"""
__tablename__ = 'export_devices'
+ __table_args__ = (schema.UniqueConstraint("shelf_id", "blade_id"), {'mysql_engine': 'InnoDB'})
id = Column(Integer, primary_key=True)
shelf_id = Column(Integer)
blade_id = Column(Integer)
diff --git a/nova/volume/driver.py b/nova/volume/driver.py
index 4604b85d5..cca619550 100644
--- a/nova/volume/driver.py
+++ b/nova/volume/driver.py
@@ -24,6 +24,7 @@ import logging
from twisted.internet import defer
+from nova import exception
from nova import flags
from nova import process
@@ -33,6 +34,8 @@ flags.DEFINE_string('volume_group', 'nova-volumes',
'Name for the VG that will contain exported volumes')
flags.DEFINE_string('aoe_eth_dev', 'eth0',
'Which device to export the volumes on')
+flags.DEFINE_string('num_shell_tries', 3,
+ 'number of times to attempt to run flakey shell commands')
class AOEDriver(object):
@@ -41,6 +44,25 @@ class AOEDriver(object):
self._execute = execute
@defer.inlineCallbacks
+ def _try_execute(self, command):
+ # NOTE(vish): Volume commands can partially fail due to timing, but
+ # running them a second time on failure will usually
+ # recover nicely.
+ tries = 0
+ while True:
+ try:
+ yield self._execute(command)
+ defer.returnValue(True)
+ except exception.ProcessExecutionError:
+ tries = tries + 1
+ if tries >= FLAGS.num_shell_tries:
+ raise
+ logging.exception("Recovering from a failed execute."
+ "Try number %s", tries)
+ yield self._execute("sleep %s" % tries ** 2)
+
+
+ @defer.inlineCallbacks
def create_volume(self, volume_name, size):
"""Creates a logical volume"""
# NOTE(vish): makes sure that the volume group exists
@@ -49,22 +71,22 @@ class AOEDriver(object):
sizestr = '100M'
else:
sizestr = '%sG' % size
- yield self._execute(
- "sudo lvcreate -L %s -n %s %s" % (sizestr,
- volume_name,
- FLAGS.volume_group))
+ yield self._try_execute("sudo lvcreate -L %s -n %s %s" %
+ (sizestr,
+ volume_name,
+ FLAGS.volume_group))
@defer.inlineCallbacks
def delete_volume(self, volume_name):
"""Deletes a logical volume"""
- yield self._execute(
- "sudo lvremove -f %s/%s" % (FLAGS.volume_group,
- volume_name))
+ yield self._try_execute("sudo lvremove -f %s/%s" %
+ (FLAGS.volume_group,
+ volume_name))
@defer.inlineCallbacks
def create_export(self, volume_name, shelf_id, blade_id):
"""Creates an export for a logical volume"""
- yield self._execute(
+ yield self._try_execute(
"sudo vblade-persist setup %s %s %s /dev/%s/%s" %
(shelf_id,
blade_id,
@@ -81,16 +103,22 @@ class AOEDriver(object):
@defer.inlineCallbacks
def remove_export(self, _volume_name, shelf_id, blade_id):
"""Removes an export for a logical volume"""
- yield self._execute(
- "sudo vblade-persist stop %s %s" % (shelf_id, blade_id))
- yield self._execute(
- "sudo vblade-persist destroy %s %s" % (shelf_id, blade_id))
+ yield self._try_execute("sudo vblade-persist stop %s %s" %
+ (shelf_id, blade_id))
+ yield self._try_execute("sudo vblade-persist destroy %s %s" %
+ (shelf_id, blade_id))
@defer.inlineCallbacks
def ensure_exports(self):
"""Runs all existing exports"""
- # NOTE(ja): wait for blades to appear
- yield self._execute("sleep 5")
+ # NOTE(vish): The standard _try_execute does not work here
+ # because these methods throw errors if other
+ # volumes on this host are in the process of
+ # being created. The good news is the command
+ # still works for the other volumes, so we
+ # just wait a bit for the current volume to
+ # be ready and ignore any errors.
+ yield self._execute("sleep 2")
yield self._execute("sudo vblade-persist auto all",
check_exit_code=False)
yield self._execute("sudo vblade-persist start all",
diff --git a/nova/volume/manager.py b/nova/volume/manager.py
index 8508f27b2..081a2d695 100644
--- a/nova/volume/manager.py
+++ b/nova/volume/manager.py
@@ -62,7 +62,7 @@ class AOEManager(manager.Manager):
for shelf_id in xrange(FLAGS.num_shelves):
for blade_id in xrange(FLAGS.blades_per_shelf):
dev = {'shelf_id': shelf_id, 'blade_id': blade_id}
- self.db.export_device_create(context, dev)
+ self.db.export_device_create_safe(context, dev)
@defer.inlineCallbacks
def create_volume(self, context, volume_id):
@@ -95,20 +95,21 @@ class AOEManager(manager.Manager):
yield self.driver.ensure_exports()
now = datetime.datetime.utcnow()
- self.db.volume_update(context, volume_id, {'status': 'available',
- 'launched_at': now})
+ self.db.volume_update(context,
+ volume_ref['id'], {'status': 'available',
+ 'launched_at': now})
logging.debug("volume %s: created successfully", volume_id)
defer.returnValue(volume_id)
@defer.inlineCallbacks
def delete_volume(self, context, volume_id):
"""Deletes and unexports volume"""
- logging.debug("Deleting volume with id of: %s", volume_id)
volume_ref = self.db.volume_get(context, volume_id)
if volume_ref['attach_status'] == "attached":
raise exception.Error("Volume is still attached")
if volume_ref['host'] != self.host:
raise exception.Error("Volume is not local to this node")
+ logging.debug("Deleting volume with id of: %s", volume_id)
shelf_id, blade_id = self.db.volume_get_shelf_and_blade(context,
volume_id)
yield self.driver.remove_export(volume_ref['ec2_id'],