From 57310f80c9b8146a0978d912f73b0a64fde7697e Mon Sep 17 00:00:00 2001
From: Amitay Isaacs <amitay@gmail.com>
Date: Thu, 25 Sep 2014 17:17:04 +1000
Subject: ctdb-recoverd: If obtaining recovery lock fails, try again

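When the ctdb daemon starts up, it considers itself the recovery master
and tries to do the first recovery.  However, it's possible that there
is already a recovery master and the current node has not yet heard
from it.  So do not ban ourselves immediately if ctdb_recovery_lock()
fails during the first recovery; fail that attempt instead so that
recovery is retried.  The recovery daemon now also fetches the current
runstate in its main loop so that this check can be made.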

Signed-off-by: Amitay Isaacs <amitay@gmail.com>
Reviewed-by: Martin Schwenke <martin@meltin.net>
---
 ctdb/server/ctdb_recoverd.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 14e6ea85ad..945b01c4e9 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1815,6 +1815,16 @@ static int do_recovery(struct ctdb_recoverd *rec,
 		DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
 		start_time = timeval_current();
 		if (!ctdb_recovery_lock(ctdb, true)) {
+			if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
+				/* If ctdb is trying first recovery, it's
+				 * possible that current node does not know yet
+				 * who the recmaster is.
+				 */
+				DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
+						" - retrying recovery\n"));
+				return -1;
+			}
+
 			DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
 					 "and ban ourself for %u seconds\n",
 					 ctdb->tunable.recovery_ban_period));
@@ -3593,6 +3603,14 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 		return;
 	}
 
+	/* get runstate */
+	ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
+				     CTDB_CURRENT_NODE, &ctdb->runstate);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
+		return;
+	}
+
 	/* get the current recovery lock file from the server */
 	if (update_recovery_lock_file(ctdb) != 0) {
 		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
-- 
cgit