recoverd: Track failure of "recovered" event, banning culprits

Pair-programmed-with: Amitay Isaacs <amitay@gmail.com>
Signed-off-by: Martin Schwenke <martin@meltin.net>

(This used to be ctdb commit 9550c497e6d6ef5ee44826c4bd9ed5ad65174263)
author: Martin Schwenke <martin@meltin.net> 2012-09-24 14:32:04 +1000
committer: Martin Schwenke <martin@meltin.net> 2012-10-11 12:10:45 +1100
commit: 4719df62d6b11a84f3ba7a72748d70ac925f49bd (patch)
tree: 90462e9c1939c9f4555e34516f5f1468f250e46f
parent: 62046a8a4cac85d1679b5e573c8b9d1d2b731ded (diff)
download: samba-4719df62d6b11a84f3ba7a72748d70ac925f49bd.tar.gz
samba-4719df62d6b11a84f3ba7a72748d70ac925f49bd.tar.xz
samba-4719df62d6b11a84f3ba7a72748d70ac925f49bd.zip
1 files changed, 42 insertions, 29 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 1153a40c70..55d878bdd1 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -108,33 +108,6 @@ enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEED
 
 
 /*
-  run the "recovered" eventscript on all nodes
- */
-static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
-{
-	TALLOC_CTX *tmp_ctx;
-	uint32_t *nodes;
-
-	tmp_ctx = talloc_new(ctdb);
-	CTDB_NO_MEMORY(ctdb, tmp_ctx);
-
-	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
-	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
-					nodes, 0,
-					CONTROL_TIMEOUT(), false, tdb_null,
-					NULL, NULL,
-					NULL) != 0) {
-		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
-
-		talloc_free(tmp_ctx);
-		return -1;
-	}
-
-	talloc_free(tmp_ctx);
-	return 0;
-}
-
-/*
   remember the trouble maker
  */
 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
@@ -176,6 +149,46 @@ static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 
 
 /* this callback is called for every node that failed to execute the
+   recovered event
+*/
+static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
+
+	ctdb_set_culprit(rec, node_pnn);
+}
+
+/*
+  run the "recovered" eventscript on all nodes
+ */
+static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
+{
+	TALLOC_CTX *tmp_ctx;
+	uint32_t *nodes;
+	struct ctdb_context *ctdb = rec->ctdb;
+
+	tmp_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
+					nodes, 0,
+					CONTROL_TIMEOUT(), false, tdb_null,
+					NULL, recovered_fail_callback,
+					rec) != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
+
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	talloc_free(tmp_ctx);
+	return 0;
+}
+
+/* this callback is called for every node that failed to execute the
    start recovery event
 */
 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
@@ -1775,7 +1788,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	}
 
 	/* execute the "recovered" event script on all nodes */
-	ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
+	ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
 	if (ret!=0) {
 		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
 		return -1;
@@ -3726,7 +3739,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 		}
 
 		/* execute the "recovered" event script on all nodes */
-		ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
+		ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
 #if 0
 // we cant check whether the event completed successfully
 // since this script WILL fail if the node is in recovery mode
author	Martin Schwenke <martin@meltin.net>	2012-09-24 14:32:04 +1000
committer	Martin Schwenke <martin@meltin.net>	2012-10-11 12:10:45 +1100
commit	4719df62d6b11a84f3ba7a72748d70ac925f49bd (patch)
tree	90462e9c1939c9f4555e34516f5f1468f250e46f
parent	62046a8a4cac85d1679b5e573c8b9d1d2b731ded (diff)
download	samba-4719df62d6b11a84f3ba7a72748d70ac925f49bd.tar.gz samba-4719df62d6b11a84f3ba7a72748d70ac925f49bd.tar.xz samba-4719df62d6b11a84f3ba7a72748d70ac925f49bd.zip