ctdb-recoverd: Don't release and re-take the recovery lock

Just continue to hold it; otherwise, a broken node might win an election and grab the lock.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
author: Martin Schwenke <martin@meltin.net> 2014-12-09 13:51:27 +1100
committer: Amitay Isaacs <amitay@samba.org> 2015-02-13 07:19:07 +0100
commit: 48c91407abd5e34463d3a10cb6fce47ec4a0d5f6 (patch)
tree: cd80f4aa58bb2af4046358ae0a51b54284dcca9e /ctdb
parent: 1d6ed91f5518d462ba368bca03be923428710157 (diff)
download: samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.tar.gz
samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.tar.xz
samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.zip
1 files changed, 26 insertions, 20 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 2045413ca0..99018be8d3 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1809,29 +1809,35 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	}
 
         if (ctdb->recovery_lock_file != NULL) {
-		DEBUG(DEBUG_ERR, ("Taking out recovery lock from recovery daemon (%s)\n", ctdb->recovery_lock_file));
-		start_time = timeval_current();
-		ctdb_recovery_unlock(ctdb);
-		DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock\n"));
-		if (!ctdb_recovery_lock(ctdb)) {
-			if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
-				/* If ctdb is trying first recovery, it's
-				 * possible that current node does not know yet
-				 * who the recmaster is.
-				 */
-				DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
-						" - retrying recovery\n"));
+		if (ctdb_recovery_have_lock(ctdb)) {
+			DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
+		} else {
+			start_time = timeval_current();
+			DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
+					     ctdb->recovery_lock_file));
+			if (!ctdb_recovery_lock(ctdb)) {
+				if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
+					/* If ctdb is trying first recovery, it's
+					 * possible that current node does not know
+					 * yet who the recmaster is.
+					 */
+					DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
+							  " - retrying recovery\n"));
+					return -1;
+				}
+
+				DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
+						 "and ban ourself for %u seconds\n",
+						 ctdb->tunable.recovery_ban_period));
+				ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
 				return -1;
 			}
-
-			DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
-					 "and ban ourself for %u seconds\n",
-					 ctdb->tunable.recovery_ban_period));
-			ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
-			return -1;
+			ctdb_ctrl_report_recd_lock_latency(ctdb,
+							   CONTROL_TIMEOUT(),
+							   timeval_elapsed(&start_time));
+			DEBUG(DEBUG_NOTICE,
+			      ("Recovery lock taken successfully by recovery daemon\n"));
 		}
-		ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
-		DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
 	}
 
 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
author	Martin Schwenke <martin@meltin.net>	2014-12-09 13:51:27 +1100
committer	Amitay Isaacs <amitay@samba.org>	2015-02-13 07:19:07 +0100
commit	48c91407abd5e34463d3a10cb6fce47ec4a0d5f6 (patch)
tree	cd80f4aa58bb2af4046358ae0a51b54284dcca9e /ctdb
parent	1d6ed91f5518d462ba368bca03be923428710157 (diff)
download	samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.tar.gz samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.tar.xz samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.zip