10 files changed, 519 insertions, 262 deletions
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 677e02da19..fdd2b99f80 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -2486,3 +2486,194 @@ int ctdb_ctrl_uptime(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct time
 	return ctdb_ctrl_uptime_recv(ctdb, mem_ctx, state, uptime);
 }
 
+/*
+  send CTDB_CONTROL_END_RECOVERY to destnode so that it runs its "recovered" event script; returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_end_recovery(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+	int ret;
+	int32_t status;
+
+	ret = ctdb_control(ctdb, destnode, 0, 
+			   CTDB_CONTROL_END_RECOVERY, 0, tdb_null, 
+			   NULL, NULL, &status, &timeout, NULL);
+	if (ret != 0 || status != 0) { /* either the call itself failed or it reported a non-zero status */
+		DEBUG(0,(__location__ " ctdb_control for end_recovery failed\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/* 
+  callback for the async helpers used when sending the same control
+  to multiple nodes in parallel.
+*/
+static void async_callback(struct ctdb_client_control_state *state)
+{
+	struct client_async_data *data = talloc_get_type(state->async.private_data, struct client_async_data);
+	int ret;
+	int32_t res;
+
+	/* one more outstanding control has completed, successfully or not */
+	data->count--;
+
+	/* if the control did not reach CTDB_CONTROL_DONE, count it as a
+	   failure and return without trying to collect a reply.
+	*/
+	if (state->state != CTDB_CONTROL_DONE) {
+		if ( !data->dont_log_errors) {
+			DEBUG(0,("Async operation failed with state %d\n", state->state));
+		}
+		data->fail_count++;
+		return;
+	}
+	
+	state->async.fn = NULL; /* presumably keeps ctdb_control_recv from invoking this callback again - TODO confirm */
+
+	ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
+	if ((ret != 0) || (res != 0)) {
+		if ( !data->dont_log_errors) {
+			DEBUG(0,("Async operation failed with ret=%d res=%d\n", ret, (int)res));
+		}
+		data->fail_count++;
+	}
+}
+
+/* attach an already-sent control to 'data' so that ctdb_client_async_wait() waits for its completion */
+void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state)
+{
+	/* route completion of this control to async_callback, with 'data' as its context */
+	state->async.fn = async_callback;
+	state->async.private_data = data;
+	
+	/* one more control to wait for to complete */
+	data->count++;
+}
+
+
+/* run the event loop until every control added to 'data' has completed
+   (data->count reaches 0); returns -1 if any of them failed, 0 otherwise
+*/
+int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data)
+{
+	while (data->count > 0) {
+		event_loop_once(ctdb->ev);
+	}
+	if (data->fail_count != 0) {
+		if (!data->dont_log_errors) {
+			DEBUG(0,("Async wait failed - fail_count=%u\n", 
+				 data->fail_count));
+		}
+		return -1;
+	}
+	return 0;
+}
+
+
+/* 
+   send 'opcode' to every node in 'nodes' (must be a talloc array: its length comes from talloc_get_size) and wait for all replies.
+   The control cannot return data; returns 0 only if every control succeeded, -1 otherwise
+ */
+int ctdb_client_async_control(struct ctdb_context *ctdb,
+				enum ctdb_controls opcode,
+				uint32_t *nodes,
+				struct timeval timeout,
+				bool dont_log_errors,
+				TDB_DATA data)
+{
+	struct client_async_data *async_data;
+	struct ctdb_client_control_state *state;
+	int j, num_nodes;
+	
+	async_data = talloc_zero(ctdb, struct client_async_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+	async_data->dont_log_errors = dont_log_errors;
+
+	num_nodes = talloc_get_size(nodes) / sizeof(uint32_t);
+
+	/* loop over all nodes and send an async control to each of them */
+	for (j=0; j<num_nodes; j++) {
+		uint32_t pnn = nodes[j];
+
+		state = ctdb_control_send(ctdb, pnn, 0, opcode, 
+					  0, data, async_data, &timeout, NULL);
+		if (state == NULL) {
+			DEBUG(0,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
+			talloc_free(async_data);
+			return -1;
+		}
+		
+		ctdb_client_async_add(async_data, state);
+	}
+
+	if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+		talloc_free(async_data);
+		return -1;
+	}
+
+	talloc_free(async_data);
+	return 0;
+}
+/* return a talloc array (on mem_ctx) with the pnn of every vnn_map entry, leaving out our own pnn unless include_self is set */
+uint32_t *list_of_vnnmap_nodes(struct ctdb_context *ctdb,
+				struct ctdb_vnn_map *vnn_map,
+				TALLOC_CTX *mem_ctx,
+				bool include_self)
+{
+	int i, j, num_nodes;
+	uint32_t *nodes;
+
+	for (i=num_nodes=0;i<vnn_map->size;i++) { /* first pass: count the entries that will be returned */
+		if (vnn_map->map[i] == ctdb->pnn && !include_self) {
+			continue;
+		}
+		num_nodes++;
+	} 
+
+	nodes = talloc_array(mem_ctx, uint32_t, num_nodes);
+	CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+	for (i=j=0;i<vnn_map->size;i++) { /* second pass: same filter, now filling the array */
+		if (vnn_map->map[i] == ctdb->pnn && !include_self) {
+			continue;
+		}
+		nodes[j++] = vnn_map->map[i];
+	} 
+
+	return nodes;
+}
+/* return a talloc array (on mem_ctx) with the pnn of every node not flagged NODE_FLAGS_INACTIVE, leaving out our own pnn unless include_self is set */
+uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
+				struct ctdb_node_map *node_map,
+				TALLOC_CTX *mem_ctx,
+				bool include_self)
+{
+	int i, j, num_nodes;
+	uint32_t *nodes;
+
+	for (i=num_nodes=0;i<node_map->num;i++) { /* first pass: count the nodes that will be returned */
+		if (node_map->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+		if (node_map->nodes[i].pnn == ctdb->pnn && !include_self) {
+			continue;
+		}
+		num_nodes++;
+	} 
+
+	nodes = talloc_array(mem_ctx, uint32_t, num_nodes);
+	CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+	for (i=j=0;i<node_map->num;i++) { /* second pass: same filters, now filling the array */
+		if (node_map->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+		if (node_map->nodes[i].pnn == ctdb->pnn && !include_self) {
+			continue;
+		}
+		nodes[j++] = node_map->nodes[i].pnn;
+	} 
+
+	return nodes;
+}
diff --git a/ctdb/config/ctdb.sysconfig b/ctdb/config/ctdb.sysconfig
index f236cda6e7..9306884b64 100644
--- a/ctdb/config/ctdb.sysconfig
+++ b/ctdb/config/ctdb.sysconfig
@@ -42,10 +42,6 @@
 # default is to not manage Samba
 # CTDB_MANAGES_SAMBA=yes
 
-# should ctdb manage starting/stopping the http service for you?
-# default is to not manage http 
-# CTDB_MANAGES_HTTPD=yes
-
 # should ctdb manage starting/stopping Winbind service?
 # if left comented out then it will be autodetected based on smb.conf
 # CTDB_MANAGES_WINBIND=yes
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index eee6983417..b779b94dcd 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -499,4 +499,15 @@ struct ctdb_client_control_state *ctdb_ctrl_uptime_send(struct ctdb_context *ctd
 
 int ctdb_ctrl_uptime_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, struct ctdb_uptime **uptime);
 
+int ctdb_ctrl_end_recovery(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+
+uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
+				struct ctdb_node_map *node_map,
+				TALLOC_CTX *mem_ctx,
+				bool include_self);
+uint32_t *list_of_vnnmap_nodes(struct ctdb_context *ctdb,
+				struct ctdb_vnn_map *vnn_map,
+				TALLOC_CTX *mem_ctx,
+				bool include_self);
+
 #endif
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index 57501fc68a..ab875924fa 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -488,6 +488,8 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS          = 0,
 		    CTDB_CONTROL_WIPE_DATABASE           = 67,
 		    CTDB_CONTROL_DELETE_RECORD           = 68,
 		    CTDB_CONTROL_UPTIME                  = 69,
+		    CTDB_CONTROL_START_RECOVERY          = 70,
+		    CTDB_CONTROL_END_RECOVERY            = 71,
 };	
 
 /*
@@ -1082,6 +1084,12 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
 				 struct ctdb_req_control *c,
 				 TDB_DATA indata, 
 				 bool *async_reply);
+int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb, 
+				 struct ctdb_req_control *c,
+				 bool *async_reply);
+int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb, 
+				 struct ctdb_req_control *c,
+				 bool *async_reply);
 
 struct ctdb_public_ip {
 	uint32_t pnn;
@@ -1221,4 +1229,20 @@ void ctdb_unblock_signal(int signum);
 int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
 int ctdb_set_child_logging(struct ctdb_context *ctdb);
 
+/* bookkeeping shared by a batch of controls that were sent in parallel */
+struct client_async_data {
+	bool dont_log_errors;	/* when true the async helpers do not log failures */
+	uint32_t count;		/* controls added to the batch that have not completed yet */
+	uint32_t fail_count;	/* controls that completed with an error */
+};
+void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state);
+int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data);
+int ctdb_client_async_control(struct ctdb_context *ctdb,
+				enum ctdb_controls opcode,
+				uint32_t *nodes,
+				struct timeval timeout,
+				bool dont_log_errors,
+				TDB_DATA data);
+
+
 #endif
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
index 01a77fe887..884ed69177 100644
--- a/ctdb/server/ctdb_control.c
+++ b/ctdb/server/ctdb_control.c
@@ -355,6 +355,11 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
 	case CTDB_CONTROL_UPTIME:
 		return ctdb_control_uptime(ctdb, outdata);
 
+	case CTDB_CONTROL_START_RECOVERY:
+		return ctdb_control_start_recovery(ctdb, c, async_reply);
+
+	case CTDB_CONTROL_END_RECOVERY:
+		return ctdb_control_end_recovery(ctdb, c, async_reply);
 	default:
 		DEBUG(0,(__location__ " Unknown CTDB control opcode %u\n", opcode));
 		return -1;
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
index 65ad471564..b239554a02 100644
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@ -397,27 +397,6 @@ struct ctdb_set_recmode_state {
 };
 
 /*
-  called when the 'recovered' event script has finished
- */
-static void ctdb_recovered_callback(struct ctdb_context *ctdb, int status, void *p)
-{
-	struct ctdb_set_recmode_state *state = talloc_get_type(p, struct ctdb_set_recmode_state);
-
-	ctdb_enable_monitoring(state->ctdb);
-
-	if (status == 0) {
-		ctdb->recovery_mode = state->recmode;
-	} else {
-		DEBUG(0,(__location__ " recovered event script failed (status %d)\n", status));
-	}
-
-	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
-	talloc_free(state);
-
-	gettimeofday(&ctdb->last_recovery_time, NULL);
-}
-
-/*
   called if our set_recmode child times out. this would happen if
   ctdb_recovery_lock() would block.
  */
@@ -473,23 +452,11 @@ static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
 		return;
 	}
 
+	state->ctdb->recovery_mode = state->recmode;
 
-	ctdb_disable_monitoring(state->ctdb);
-
-	/* call the events script to tell all subsystems that we have recovered */
-	ret = ctdb_event_script_callback(state->ctdb, 
-					 timeval_current_ofs(state->ctdb->tunable.script_timeout, 0),
-					 state, 
-					 ctdb_recovered_callback, 
-					 state, "recovered");
-
-	if (ret != 0) {
-		ctdb_enable_monitoring(state->ctdb);
-
-		ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "failed to run eventscript from set_recmode");
-		talloc_free(state);
-		return;
-	}
+	ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
+	talloc_free(state);
+	return;
 }
 
 /*
@@ -742,3 +709,122 @@ int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata)
 	free(data.dptr);
 	return 0;	
 }
+
+/* state handed from the start/end recovery controls to their event script callbacks */
+struct recovery_callback_state {
+	struct ctdb_req_control *c;	/* the control request that is replied to when the script finishes */
+};
+
+
+/*
+  called when the 'recovered' event script has finished: re-enables monitoring and replies to the pending control with the script status
+ */
+static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+	struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
+
+	ctdb_enable_monitoring(ctdb);
+
+	if (status != 0) {
+		DEBUG(0,(__location__ " recovered event script failed (status %d)\n", status));
+	}
+
+	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+	talloc_free(state);
+
+	gettimeofday(&ctdb->last_recovery_time, NULL); /* recorded whether or not the script succeeded */
+}
+
+/*
+  recovery has finished: run the "recovered" event script. Returns 0 with *async_reply set
+  (ctdb_end_recovery_callback sends the reply when the script finishes), or -1 on failure
+ */
+int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb, 
+				struct ctdb_req_control *c,
+				bool *async_reply)
+{
+	int ret;
+	struct recovery_callback_state *state;
+
+	DEBUG(0,("Recovery has finished\n"));
+
+	state = talloc(ctdb, struct recovery_callback_state);
+	CTDB_NO_MEMORY(ctdb, state);
+	state->c    = c;
+
+	ctdb_disable_monitoring(ctdb);
+
+	ret = ctdb_event_script_callback(ctdb, 
+					 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
+					 state, 
+					 ctdb_end_recovery_callback, 
+					 state, "recovered");
+	if (ret != 0) {
+		ctdb_enable_monitoring(ctdb);
+		DEBUG(0,(__location__ " Failed to end recovery\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	/* we will reply asynchronously: only now take ownership of c. Stealing it earlier
+	   made talloc_free(state) on the error path free c while *async_reply was unset */
+	state->c    = talloc_steal(state, c);
+	*async_reply = true;
+	return 0;
+}
+
+/*
+  called when the 'startrecovery' event script has finished: re-enables monitoring and replies to the pending control with the script status
+ */
+static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+	struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
+
+	ctdb_enable_monitoring(ctdb);
+
+	if (status != 0) {
+		DEBUG(0,(__location__ " startrecovery event script failed (status %d)\n", status));
+	}
+
+	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+	talloc_free(state);
+}
+
+/*
+  start a recovery: run the "startrecovery" event script. Returns 0 with *async_reply set
+  (ctdb_start_recovery_callback sends the reply when the script finishes), or -1 on failure
+ */
+int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb, 
+				struct ctdb_req_control *c,
+				bool *async_reply)
+{
+	int ret;
+	struct recovery_callback_state *state;
+
+	DEBUG(0,("Recovery has started\n"));
+
+	state = talloc(ctdb, struct recovery_callback_state);
+	CTDB_NO_MEMORY(ctdb, state);
+	state->c    = c;
+
+	ctdb_disable_monitoring(ctdb);
+
+	ret = ctdb_event_script_callback(ctdb, 
+					 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
+					 state, 
+					 ctdb_start_recovery_callback, 
+					 state, "startrecovery");
+	if (ret != 0) {
+		ctdb_enable_monitoring(ctdb);
+		DEBUG(0,(__location__ " Failed to start recovery\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	/* we will reply asynchronously: only now take ownership of c. Stealing it earlier
+	   made talloc_free(state) on the error path free c while *async_reply was unset */
+	state->c    = talloc_steal(state, c);
+	*async_reply = true;
+	return 0;
+}
+
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index c13136e848..8595706cc0 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -58,66 +58,6 @@ struct ctdb_recoverd {
 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
 
 
-struct async_data {
-	uint32_t count;
-	uint32_t fail_count;
-};
-
-static void async_callback(struct ctdb_client_control_state *state)
-{
-	struct async_data *data = talloc_get_type(state->async.private_data, struct async_data);
-	int ret;
-	int32_t res;
-
-	/* one more node has responded with recmode data */
-	data->count--;
-
-	/* if we failed to push the db, then return an error and let
-	   the main loop try again.
-	*/
-	if (state->state != CTDB_CONTROL_DONE) {
-		DEBUG(0,("Async operation failed with state %d\n", state->state));
-		data->fail_count++;
-		return;
-	}
-	
-	state->async.fn = NULL;
-
-	ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
-	if ((ret != 0) || (res != 0)) {
-		DEBUG(0,("Async operation failed with ret=%d res=%d\n", ret, (int)res));
-		data->fail_count++;
-	}
-}
-
-
-static void async_add(struct async_data *data, struct ctdb_client_control_state *state)
-{
-	/* set up the callback functions */
-	state->async.fn = async_callback;
-	state->async.private_data = data;
-	
-	/* one more control to wait for to complete */
-	data->count++;
-}
-
-
-/* wait for up to the maximum number of seconds allowed
-   or until all nodes we expect a response from has replied
-*/
-static int async_wait(struct ctdb_context *ctdb, struct async_data *data)
-{
-	while (data->count > 0) {
-		event_loop_once(ctdb->ev);
-	}
-	if (data->fail_count != 0) {
-		DEBUG(0,("Async wait failed - fail_count=%u\n", data->fail_count));
-		return -1;
-	}
-	return 0;
-}
-
-
 /*
   unban a node
  */
@@ -255,50 +195,49 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_
 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 
 
-/* 
-   perform a simple control on all active nodes. The control cannot return data
+/*
+  run the "recovered" eventscript on all active nodes, including ourselves
  */
-static int async_control_on_active_nodes(struct ctdb_context *ctdb, enum ctdb_controls opcode,
-					 struct ctdb_node_map *nodemap, TDB_DATA data, bool include_self)
+static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 {
-	struct async_data *async_data;
-	struct ctdb_client_control_state *state;
-	int j;
-	struct timeval timeout = CONTROL_TIMEOUT();
-	
-	async_data = talloc_zero(ctdb, struct async_data);
-	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+	TALLOC_CTX *tmp_ctx;
 
-	/* loop over all active nodes and send an async control to each of them */
-	for (j=0; j<nodemap->num; j++) {
-		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
-			continue;
-		}
-		if (nodemap->nodes[j].pnn == ctdb->pnn && !include_self) {
-			continue;
-		}
-		state = ctdb_control_send(ctdb, nodemap->nodes[j].pnn, 0, opcode, 
-					  0, data, async_data, &timeout, NULL);
-		if (state == NULL) {
-			DEBUG(0,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
-			talloc_free(async_data);
-			return -1;
-		}
-		
-		async_add(async_data, state);
-	}
+	tmp_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
 
-	if (async_wait(ctdb, async_data) != 0) {
-		DEBUG(0,(__location__ " Failed async control %u\n", (unsigned)opcode));
-		talloc_free(async_data);
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
+			list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
+			CONTROL_TIMEOUT(), false, tdb_null) != 0) {
+		DEBUG(0, (__location__ " Unable to run the 'recovered' event. Recovery failed.\n"));
+		talloc_free(tmp_ctx);
 		return -1;
 	}
 
-	talloc_free(async_data);
+	talloc_free(tmp_ctx);
 	return 0;
 }
 
+/*
+  run the "startrecovery" eventscript on all active nodes, including ourselves
+ */
+static int run_startrecovery_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+{
+	TALLOC_CTX *tmp_ctx;
 
+	tmp_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
+			list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
+			CONTROL_TIMEOUT(), false, tdb_null) != 0) {
+		DEBUG(0, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	talloc_free(tmp_ctx);
+	return 0;
+}
 
 /*
   change recovery mode on all nodes
@@ -306,12 +245,21 @@ static int async_control_on_active_nodes(struct ctdb_context *ctdb, enum ctdb_co
 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
 {
 	TDB_DATA data;
+	uint32_t *nodes;
+	TALLOC_CTX *tmp_ctx;
+
+	tmp_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 
 	/* freeze all nodes */
 	if (rec_mode == CTDB_RECOVERY_ACTIVE) {
-		if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_FREEZE, 
-						  nodemap, tdb_null, true) != 0) {
+		if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
+						nodes, CONTROL_TIMEOUT(),
+						false, tdb_null) != 0) {
 			DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
+			talloc_free(tmp_ctx);
 			return -1;
 		}
 	}
@@ -320,20 +268,25 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
 	data.dsize = sizeof(uint32_t);
 	data.dptr = (unsigned char *)&rec_mode;
 
-	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_SET_RECMODE, 
-					  nodemap, data, true) != 0) {
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
+					nodes, CONTROL_TIMEOUT(),
+					false, data) != 0) {
 		DEBUG(0, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
+		talloc_free(tmp_ctx);
 		return -1;
 	}
 
 	if (rec_mode == CTDB_RECOVERY_NORMAL) {
-		if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_THAW, 
-						  nodemap, tdb_null, true) != 0) {
+		if (ctdb_client_async_control(ctdb, CTDB_CONTROL_THAW,
+						nodes, CONTROL_TIMEOUT(),
+						false, tdb_null) != 0) {
 			DEBUG(0, (__location__ " Unable to thaw nodes. Recovery failed.\n"));
+			talloc_free(tmp_ctx);
 			return -1;
 		}
 	}
 
+	talloc_free(tmp_ctx);
 	return 0;
 }
 
@@ -343,16 +296,23 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
 {
 	TDB_DATA data;
+	TALLOC_CTX *tmp_ctx;
+
+	tmp_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
 
 	data.dsize = sizeof(uint32_t);
 	data.dptr = (unsigned char *)&pnn;
 
-	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_SET_RECMASTER, 
-					  nodemap, data, true) != 0) {
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
+			list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
+			CONTROL_TIMEOUT(), false, data) != 0) {
 		DEBUG(0, (__location__ " Unable to set recmaster. Recovery failed.\n"));
+		talloc_free(tmp_ctx);
 		return -1;
 	}
 
+	talloc_free(tmp_ctx);
 	return 0;
 }
 
@@ -1141,6 +1101,10 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
 	struct recdb_data params;
 	struct ctdb_control_pulldb_reply *recdata;
 	TDB_DATA outdata;
+	TALLOC_CTX *tmp_ctx;
+
+	tmp_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
 
 	recdata = talloc_zero(recdb, struct ctdb_control_pulldb_reply);
 	CTDB_NO_MEMORY(ctdb, recdata);
@@ -1155,12 +1119,14 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
 	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
 		DEBUG(0,(__location__ " Failed to traverse recdb database\n"));
 		talloc_free(params.recdata);
+		talloc_free(tmp_ctx);
 		return -1;
 	}
 
 	if (params.failed) {
 		DEBUG(0,(__location__ " Failed to traverse recdb database\n"));
 		talloc_free(params.recdata);
+		talloc_free(tmp_ctx);
 		return -1;		
 	}
 
@@ -1169,9 +1135,12 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
 	outdata.dptr = (void *)recdata;
 	outdata.dsize = params.len;
 
-	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_PUSH_DB, nodemap, outdata, true) != 0) {
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
+			list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
+			CONTROL_TIMEOUT(), false, outdata) != 0) {
 		DEBUG(0,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
 		talloc_free(recdata);
+		talloc_free(tmp_ctx);
 		return -1;
 	}
 
@@ -1179,6 +1148,7 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
 		  dbid, recdata->count));
 
 	talloc_free(recdata);
+	talloc_free(tmp_ctx);
 
 	return 0;
 }
@@ -1221,9 +1191,11 @@ static int recover_database(struct ctdb_recoverd *rec,
 	data.dptr = (void *)&w;
 	data.dsize = sizeof(w);
 
-	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_WIPE_DATABASE, 
-					  nodemap, data, true) != 0) {
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
+			list_of_active_nodes(ctdb, nodemap, recdb, true),
+			CONTROL_TIMEOUT(), false, data) != 0) {
 		DEBUG(0, (__location__ " Unable to wipe database. Recovery failed.\n"));
+		talloc_free(recdb);
 		return -1;
 	}
 	
@@ -1304,6 +1276,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
 
 	DEBUG(0, (__location__ " Recovery - created remote databases\n"));
 
+
 	/* set recovery mode to active on all nodes */
 	ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
 	if (ret!=0) {
@@ -1311,6 +1284,13 @@ static int do_recovery(struct ctdb_recoverd *rec,
 		return -1;
 	}
 
+	/* execute the "startrecovery" event script on all nodes */
+	ret = run_startrecovery_eventscript(ctdb, nodemap);
+	if (ret!=0) {
+		DEBUG(0, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
+		return -1;
+	}
+
 	/* pick a new generation number */
 	generation = new_generation();
 
@@ -1334,8 +1314,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	data.dptr = (void *)&generation;
 	data.dsize = sizeof(uint32_t);
 
-	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_TRANSACTION_START, 
-					  nodemap, data, true) != 0) {
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
+			list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
+			CONTROL_TIMEOUT(), false, data) != 0) {
 		DEBUG(0, (__location__ " Unable to start transactions. Recovery failed.\n"));
 		return -1;
 	}
@@ -1352,8 +1333,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	DEBUG(0, (__location__ " Recovery - starting database commits\n"));
 
 	/* commit all the changes */
-	if (async_control_on_active_nodes(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT, 
-					  nodemap, data, true) != 0) {
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
+			list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
+			CONTROL_TIMEOUT(), false, data) != 0) {
 		DEBUG(0, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
 		return -1;
 	}
@@ -1417,6 +1399,13 @@ static int do_recovery(struct ctdb_recoverd *rec,
 		DEBUG(1, (__location__ " Recovery - done takeover\n"));
 	}
 
+	/* execute the "recovered" event script on all nodes */
+	ret = run_recovered_eventscript(ctdb, nodemap);
+	if (ret!=0) {
+		DEBUG(0, (__location__ " Unable to run the 'recovered' event on cluster\n"));
+		return -1;
+	}
+
 	/* disable recovery mode */
 	ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
 	if (ret!=0) {
@@ -2445,12 +2434,29 @@ again:
 	/* we might need to change who has what IP assigned */
 	if (rec->need_takeover_run) {
 		rec->need_takeover_run = false;
+
+		/* execute the "startrecovery" event script on all nodes */
+		ret = run_startrecovery_eventscript(ctdb, nodemap);
+		if (ret!=0) {
+			DEBUG(0, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
+			do_recovery(rec, mem_ctx, pnn, num_active, nodemap, 
+				    vnnmap, ctdb->pnn);
+		}
+
 		ret = ctdb_takeover_run(ctdb, nodemap);
 		if (ret != 0) {
 			DEBUG(0, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
 			do_recovery(rec, mem_ctx, pnn, num_active, nodemap, 
 				    vnnmap, ctdb->pnn);
 		}
+
+		/* execute the "recovered" event script on all nodes */
+		ret = run_recovered_eventscript(ctdb, nodemap);
+		if (ret!=0) {
+			DEBUG(0, (__location__ " Unable to run the 'recovered' event on cluster\n"));
+			do_recovery(rec, mem_ctx, pnn, num_active, nodemap, 
+				    vnnmap, ctdb->pnn);
+		}
 	}
 
 	goto again;
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
index b63b88f4c2..cea3f95e34 100644
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@ -641,11 +641,14 @@ create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 {
 	int i, num_healthy, retries;
-	int ret;
 	struct ctdb_public_ip ip;
 	uint32_t mask;
 	struct ctdb_public_ip_list *all_ips, *tmp_ip;
 	int maxnode, maxnum=0, minnode, minnum=0, num;
+	TDB_DATA data;
+	struct timeval timeout;
+	struct client_async_data *async_data;
+	struct ctdb_client_control_state *state;
 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
 
 
@@ -813,6 +816,9 @@ try_again:
 	/* now tell all nodes to delete any alias that they should not
 	   have.  This will be a NOOP on nodes that don't currently
 	   hold the given alias */
+	async_data = talloc_zero(tmp_ctx, struct client_async_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+
 	for (i=0;i<nodemap->num;i++) {
 		/* don't talk to unconnected nodes, but do talk to banned nodes */
 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
@@ -830,21 +836,33 @@ try_again:
 			ip.sin.sin_family = AF_INET;
 			ip.sin.sin_addr   = tmp_ip->sin.sin_addr;
 
-			ret = ctdb_ctrl_release_ip(ctdb, TAKEOVER_TIMEOUT(),
-						   nodemap->nodes[i].pnn, 
-						   &ip);
-			if (ret != 0) {
-				DEBUG(0,("Failed to tell vnn %u to release IP %s\n",
-					 nodemap->nodes[i].pnn,
-					 inet_ntoa(tmp_ip->sin.sin_addr)));
+			timeout = TAKEOVER_TIMEOUT();
+			data.dsize = sizeof(ip);
+			data.dptr  = (uint8_t *)&ip;
+			state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
+					0, CTDB_CONTROL_RELEASE_IP, 0,
+					data, async_data,
+					&timeout, NULL);
+			if (state == NULL) {
+				DEBUG(0,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
 				talloc_free(tmp_ctx);
 				return -1;
 			}
+		
+			ctdb_client_async_add(async_data, state);
 		}
 	}
+	if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+		DEBUG(0,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+	talloc_free(async_data);
 
 
 	/* tell all nodes to get their own IPs */
+	async_data = talloc_zero(tmp_ctx, struct client_async_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
 	for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
 		if (tmp_ip->pnn == -1) {
 			/* this IP won't be taken over */
@@ -854,16 +872,25 @@ try_again:
 		ip.sin.sin_family = AF_INET;
 		ip.sin.sin_addr = tmp_ip->sin.sin_addr;
 
-		ret = ctdb_ctrl_takeover_ip(ctdb, TAKEOVER_TIMEOUT(), 
-					    tmp_ip->pnn, 
-					    &ip);
-		if (ret != 0) {
-			DEBUG(0,("Failed asking vnn %u to take over IP %s\n",
-				 tmp_ip->pnn, 
-				 inet_ntoa(tmp_ip->sin.sin_addr)));
+		timeout = TAKEOVER_TIMEOUT();
+		data.dsize = sizeof(ip);
+		data.dptr  = (uint8_t *)&ip;
+		state = ctdb_control_send(ctdb, tmp_ip->pnn,
+				0, CTDB_CONTROL_TAKEOVER_IP, 0,
+				data, async_data,
+				&timeout, NULL);
+		if (state == NULL) {
+			DEBUG(0,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
 			talloc_free(tmp_ctx);
 			return -1;
 		}
+		
+		ctdb_client_async_add(async_data, state);
+	}
+	if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+		DEBUG(0,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
+		talloc_free(tmp_ctx);
+		return -1;
 	}
 
 	talloc_free(tmp_ctx);
diff --git a/ctdb/tests/events.d/00.test b/ctdb/tests/events.d/00.test
index 026cf6cba7..c5933be36a 100755
--- a/ctdb/tests/events.d/00.test
+++ b/ctdb/tests/events.d/00.test
@@ -10,6 +10,12 @@ case $cmd in
 	echo "monitor event stderr" >&2
 	exit 0
 	;;
+
+     startrecovery)
+	echo "ctdb startrecovery event"
+	exit 0;	
+	;;
+
      startup)
 	echo "ctdb startup event"
 	exit 0;	
diff --git a/ctdb/tools/ctdb_vacuum.c b/ctdb/tools/ctdb_vacuum.c
index 2bc9908c7e..f412c04303 100644
--- a/ctdb/tools/ctdb_vacuum.c
+++ b/ctdb/tools/ctdb_vacuum.c
@@ -28,103 +28,6 @@
 /* should be tunable */
 #define TIMELIMIT() timeval_current_ofs(10, 0)
 
-struct async_data {
-	uint32_t count;
-	uint32_t fail_count;
-};
-
-static void async_callback(struct ctdb_client_control_state *state)
-{
-	struct async_data *data = talloc_get_type(state->async.private_data, struct async_data);
-	int ret;
-	int32_t res;
-
-	/* one more node has responded with recmode data */
-	data->count--;
-
-	/* if we failed to push the db, then return an error and let
-	   the main loop try again.
-	*/
-	if (state->state != CTDB_CONTROL_DONE) {
-		data->fail_count++;
-		return;
-	}
-	
-	state->async.fn = NULL;
-
-	ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
-	if ((ret != 0) || (res != 0)) {
-		data->fail_count++;
-	}
-}
-
-static void async_add(struct async_data *data, struct ctdb_client_control_state *state)
-{
-	/* set up the callback functions */
-	state->async.fn = async_callback;
-	state->async.private_data = data;
-	
-	/* one more control to wait for to complete */
-	data->count++;
-}
-
-
-/* wait for up to the maximum number of seconds allowed
-   or until all nodes we expect a response from has replied
-*/
-static int async_wait(struct ctdb_context *ctdb, struct async_data *data)
-{
-	while (data->count > 0) {
-		event_loop_once(ctdb->ev);
-	}
-	if (data->fail_count != 0) {
-		return -1;
-	}
-	return 0;
-}
-
-/* 
-   perform a simple control on nodes in the vnn map except ourselves.
-   The control cannot return data
- */
-static int async_control_on_vnnmap(struct ctdb_context *ctdb, enum ctdb_controls opcode,
-				   TDB_DATA data)
-{
-	struct async_data *async_data;
-	struct ctdb_client_control_state *state;
-	int j;
-	struct timeval timeout = TIMELIMIT();
-	
-	async_data = talloc_zero(ctdb, struct async_data);
-	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
-
-	/* loop over all active nodes and send an async control to each of them */
-	for (j=0; j<ctdb->vnn_map->size; j++) {
-		uint32_t pnn = ctdb->vnn_map->map[j];
-		if (pnn == ctdb->pnn) {
-			continue;
-		}
-		state = ctdb_control_send(ctdb, pnn, 0, opcode, 
-					  0, data, async_data, &timeout, NULL);
-		if (state == NULL) {
-			DEBUG(0,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
-			talloc_free(async_data);
-			return -1;
-		}
-		
-		async_add(async_data, state);
-	}
-
-	if (async_wait(ctdb, async_data) != 0) {
-		talloc_free(async_data);
-		return -1;
-	}
-
-	talloc_free(async_data);
-	return 0;
-}
-
-
 /*
   vacuum one record
  */
@@ -172,7 +75,9 @@ static int ctdb_vacuum_one(struct ctdb_context *ctdb, TDB_DATA key,
 	data.dptr = (void *)rec;
 	data.dsize = rec->length;
 
-	if (async_control_on_vnnmap(ctdb, CTDB_CONTROL_DELETE_RECORD, data) != 0) {
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_DELETE_RECORD,
+			list_of_vnnmap_nodes(ctdb, ctdb->vnn_map, rec, false),
+			TIMELIMIT(), true, data) != 0) {
 		/* one or more nodes failed to delete a record - no problem! */
 		talloc_free(rec);
 		return 0;