Fixes a few concurrency issues with creating volumes and instances. Most importantly it adds retries to a number of the volume shell commands and it adds a unique constraint on export_devices and a safe create so that there aren't multiple copies of export devices in the database.

author: Vishvananda Ishaya <vishvananda@yahoo.com> 2010-10-14 08:38:37 +0000
committer: Tarmac <> 2010-10-14 08:38:37 +0000
commit: 7120249b21ae56c070cd2e3d32a9502c320ebb07 (patch)
tree: 7372b04b62216a2d73e4c10dbc8eec8799ea4925
parent: 134b846d23be923f7453e945e92f32dffbc54f50 (diff)
parent: 7403ece82902e633fbd3f2e6f0303ad08c269541 (diff)
7 files changed, 76 insertions, 36 deletions
diff --git a/nova/api/ec2/cloud.py b/nova/api/ec2/cloud.py
index a7693cadd..56bf2db03 100644
--- a/nova/api/ec2/cloud.py
+++ b/nova/api/ec2/cloud.py
@@ -893,14 +893,20 @@ class CloudController(object):
                 instance_ref = db.instance_get_by_internal_id(context, 
                                                               internal_id)
             except exception.NotFound:
-                logging.warning("Instance %s was not found during terminate"
-                                % id_str)
+                logging.warning("Instance %s was not found during terminate",
+                                id_str)
                 continue
 
+            if (instance_ref['state_description'] == 'terminating'):
+                logging.warning("Instance %s is already being terminated",
+                              id_str)
+                continue
             now = datetime.datetime.utcnow()
             db.instance_update(context,
                                instance_ref['id'],
-                               {'terminated_at': now})
+                               {'state_description': 'terminating',
+                                'state': 0,
+                                'terminated_at': now})
             # FIXME(ja): where should network deallocate occur?
             address = db.instance_get_floating_address(context,
                                                        instance_ref['id'])
@@ -959,7 +965,8 @@ class CloudController(object):
         if volume_ref['status'] != "available":
             raise exception.ApiError("Volume status must be available")
         now = datetime.datetime.utcnow()
-        db.volume_update(context, volume_ref['id'], {'terminated_at': now})
+        db.volume_update(context, volume_ref['id'], {'status': 'deleting',
+                                                     'terminated_at': now})
         host = volume_ref['host']
         rpc.cast(db.queue_get_for(context, FLAGS.volume_topic, host),
                             {"method": "delete_volume",
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index c602d013d..94c95038f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -114,10 +114,6 @@ class ComputeManager(manager.Manager):
             raise exception.Error('trying to destroy already destroyed'
                                   ' instance: %s' % instance_id)
 
-        self.db.instance_set_state(context,
-                                   instance_id,
-                                   power_state.NOSTATE,
-                                   'shutting_down')
         yield self.driver.destroy(instance_ref)
 
         # TODO(ja): should we keep it in a terminated state for a bit?
diff --git a/nova/db/api.py b/nova/db/api.py
index 7e6994b56..6dbf3b809 100644
--- a/nova/db/api.py
+++ b/nova/db/api.py
@@ -466,13 +466,18 @@ def export_device_count(context):
     return IMPL.export_device_count(context)
 
 
-def export_device_create(context, values):
-    """Create an export_device from the values dictionary."""
-    return IMPL.export_device_create(context, values)
+def export_device_create_safe(context, values):
+    """Create an export_device from the values dictionary.
+
+    The device is not returned. If the create violates the unique
+    constraints because the shelf_id and blade_id already exist,
+    no exception is raised."""
+    return IMPL.export_device_create_safe(context, values)
 
 
 ###################
 
+
 def auth_destroy_token(context, token):
     """Destroy an auth token"""
     return IMPL.auth_destroy_token(context, token)
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index 6b979f0ae..f4a746cab 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -31,8 +31,7 @@ from sqlalchemy import or_
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import joinedload
 from sqlalchemy.orm import joinedload_all
-from sqlalchemy.sql import exists
-from sqlalchemy.sql import func
+from sqlalchemy.sql import exists, func
 from sqlalchemy.orm.exc import NoResultFound
 
 FLAGS = flags.FLAGS
@@ -539,7 +538,7 @@ def instance_create(context, values):
     with session.begin():
         while instance_ref.internal_id == None:
             internal_id = utils.generate_uid(instance_ref.__prefix__)
-            if not instance_internal_id_exists(context, internal_id, 
+            if not instance_internal_id_exists(context, internal_id,
                                                session=session):
                 instance_ref.internal_id = internal_id
         instance_ref.save(session=session)
@@ -1023,12 +1022,15 @@ def export_device_count(context):
 
 
 @require_admin_context
-def export_device_create(context, values):
+def export_device_create_safe(context, values):
     export_device_ref = models.ExportDevice()
     for (key, value) in values.iteritems():
         export_device_ref[key] = value
-    export_device_ref.save()
-    return export_device_ref
+    try:
+        export_device_ref.save()
+        return export_device_ref
+    except IntegrityError:
+        return None
 
 
 ###################
@@ -1631,7 +1633,7 @@ def user_remove_project_role(context, user_id, project_id, role):
     with session.begin():
         session.execute('delete from user_project_role_association where ' + \
                         'user_id=:user_id and project_id=:project_id and ' + \
-                        'role=:role', { 'user_id'    : user_id, 
+                        'role=:role', { 'user_id'    : user_id,
                                         'project_id' : project_id,
                                         'role'       : role })
 
diff --git a/nova/db/sqlalchemy/models.py b/nova/db/sqlalchemy/models.py
index eed8f0578..a63bca2b0 100644
--- a/nova/db/sqlalchemy/models.py
+++ b/nova/db/sqlalchemy/models.py
@@ -277,6 +277,7 @@ class Quota(BASE, NovaBase):
 class ExportDevice(BASE, NovaBase):
     """Represates a shelf and blade that a volume can be exported on"""
     __tablename__ = 'export_devices'
+    __table_args__ = (schema.UniqueConstraint("shelf_id", "blade_id"), {'mysql_engine': 'InnoDB'})
     id = Column(Integer, primary_key=True)
     shelf_id = Column(Integer)
     blade_id = Column(Integer)
diff --git a/nova/volume/driver.py b/nova/volume/driver.py
index 4604b85d5..cca619550 100644
--- a/nova/volume/driver.py
+++ b/nova/volume/driver.py
@@ -24,6 +24,7 @@ import logging
 
 from twisted.internet import defer
 
+from nova import exception
 from nova import flags
 from nova import process
 
@@ -33,6 +34,8 @@ flags.DEFINE_string('volume_group', 'nova-volumes',
                     'Name for the VG that will contain exported volumes')
 flags.DEFINE_string('aoe_eth_dev', 'eth0',
                     'Which device to export the volumes on')
+flags.DEFINE_string('num_shell_tries', 3,
+                    'number of times to attempt to run flakey shell commands')
 
 
 class AOEDriver(object):
@@ -41,6 +44,25 @@ class AOEDriver(object):
         self._execute = execute
 
     @defer.inlineCallbacks
+    def _try_execute(self, command):
+        # NOTE(vish): Volume commands can partially fail due to timing, but
+        #             running them a second time on failure will usually
+        #             recover nicely.
+        tries = 0
+        while True:
+            try:
+                yield self._execute(command)
+                defer.returnValue(True)
+            except exception.ProcessExecutionError:
+                tries = tries + 1
+                if tries >= FLAGS.num_shell_tries:
+                    raise
+                logging.exception("Recovering from a failed execute."
+                                  "Try number %s", tries)
+                yield self._execute("sleep %s" % tries ** 2)
+
+
+    @defer.inlineCallbacks
     def create_volume(self, volume_name, size):
         """Creates a logical volume"""
         # NOTE(vish): makes sure that the volume group exists
@@ -49,22 +71,22 @@ class AOEDriver(object):
             sizestr = '100M'
         else:
             sizestr = '%sG' % size
-        yield self._execute(
-                "sudo lvcreate -L %s -n %s %s" % (sizestr,
-                                                  volume_name,
-                                                  FLAGS.volume_group))
+        yield self._try_execute("sudo lvcreate -L %s -n %s %s" %
+                            (sizestr,
+                             volume_name,
+                             FLAGS.volume_group))
 
     @defer.inlineCallbacks
     def delete_volume(self, volume_name):
         """Deletes a logical volume"""
-        yield self._execute(
-                "sudo lvremove -f %s/%s" % (FLAGS.volume_group,
-                                            volume_name))
+        yield self._try_execute("sudo lvremove -f %s/%s" %
+                                (FLAGS.volume_group,
+                                 volume_name))
 
     @defer.inlineCallbacks
     def create_export(self, volume_name, shelf_id, blade_id):
         """Creates an export for a logical volume"""
-        yield self._execute(
+        yield self._try_execute(
                 "sudo vblade-persist setup %s %s %s /dev/%s/%s" %
                 (shelf_id,
                  blade_id,
@@ -81,16 +103,22 @@ class AOEDriver(object):
     @defer.inlineCallbacks
     def remove_export(self, _volume_name, shelf_id, blade_id):
         """Removes an export for a logical volume"""
-        yield self._execute(
-                "sudo vblade-persist stop %s %s" % (shelf_id, blade_id))
-        yield self._execute(
-                "sudo vblade-persist destroy %s %s" % (shelf_id, blade_id))
+        yield self._try_execute("sudo vblade-persist stop %s %s" %
+                                (shelf_id, blade_id))
+        yield self._try_execute("sudo vblade-persist destroy %s %s" %
+                                (shelf_id, blade_id))
 
     @defer.inlineCallbacks
     def ensure_exports(self):
         """Runs all existing exports"""
-        # NOTE(ja): wait for blades to appear
-        yield self._execute("sleep 5")
+        # NOTE(vish): The standard _try_execute does not work here
+        #             because these methods throw errors if other
+        #             volumes on this host are in the process of
+        #             being created.  The good news is the command
+        #             still works for the other volumes, so we
+        #             just wait a bit for the current volume to
+        #             be ready and ignore any errors.
+        yield self._execute("sleep 2")
         yield self._execute("sudo vblade-persist auto all",
                             check_exit_code=False)
         yield self._execute("sudo vblade-persist start all",
diff --git a/nova/volume/manager.py b/nova/volume/manager.py
index 8508f27b2..081a2d695 100644
--- a/nova/volume/manager.py
+++ b/nova/volume/manager.py
@@ -62,7 +62,7 @@ class AOEManager(manager.Manager):
         for shelf_id in xrange(FLAGS.num_shelves):
             for blade_id in xrange(FLAGS.blades_per_shelf):
                 dev = {'shelf_id': shelf_id, 'blade_id': blade_id}
-                self.db.export_device_create(context, dev)
+                self.db.export_device_create_safe(context, dev)
 
     @defer.inlineCallbacks
     def create_volume(self, context, volume_id):
@@ -95,20 +95,21 @@ class AOEManager(manager.Manager):
         yield self.driver.ensure_exports()
 
         now = datetime.datetime.utcnow()
-        self.db.volume_update(context, volume_id, {'status': 'available',
-                                                   'launched_at': now})
+        self.db.volume_update(context,
+                              volume_ref['id'], {'status': 'available',
+                                                 'launched_at': now})
         logging.debug("volume %s: created successfully", volume_id)
         defer.returnValue(volume_id)
 
     @defer.inlineCallbacks
     def delete_volume(self, context, volume_id):
         """Deletes and unexports volume"""
-        logging.debug("Deleting volume with id of: %s", volume_id)
         volume_ref = self.db.volume_get(context, volume_id)
         if volume_ref['attach_status'] == "attached":
             raise exception.Error("Volume is still attached")
         if volume_ref['host'] != self.host:
             raise exception.Error("Volume is not local to this node")
+        logging.debug("Deleting volume with id of: %s", volume_id)
         shelf_id, blade_id = self.db.volume_get_shelf_and_blade(context,
                                                                 volume_id)
         yield self.driver.remove_export(volume_ref['ec2_id'],
author	Vishvananda Ishaya <vishvananda@yahoo.com>	2010-10-14 08:38:37 +0000
committer	Tarmac <>	2010-10-14 08:38:37 +0000
commit	7120249b21ae56c070cd2e3d32a9502c320ebb07 (patch)
tree	7372b04b62216a2d73e4c10dbc8eec8799ea4925
parent	134b846d23be923f7453e945e92f32dffbc54f50 (diff)
parent	7403ece82902e633fbd3f2e6f0303ad08c269541 (diff)