10 files changed, 532 insertions, 6 deletions
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index 5f63c8f97b..f29e5dec41 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -129,6 +129,7 @@ struct ctdb_tunable {
 	uint32_t vacuum_max_interval;
 	uint32_t max_queue_depth_drop_msg;
 	uint32_t use_status_events_for_monitoring;
+	uint32_t allow_unhealthy_db_read;
 };
 
 /*
@@ -407,6 +408,9 @@ struct ctdb_context {
 	const char *db_directory_persistent;
 	const char *db_directory_state;
 	struct tdb_wrap *db_persistent_health;
+	uint32_t db_persistent_startup_generation;
+	uint64_t db_persistent_check_errors;
+	uint64_t max_persistent_check_errors;
 	const char *transport;
 	char *recovery_lock_file;
 	int recovery_lock_fd;
@@ -479,6 +483,7 @@ struct ctdb_db_context {
 	struct ctdb_traverse_local_handle *traverse;
 	bool transaction_active;
 	struct ctdb_vacuum_handle *vacuum_handle;
+	char *unhealthy_reason;
 };
 
 
@@ -1543,4 +1548,12 @@ int32_t ctdb_control_get_db_seqnum(struct ctdb_context *ctdb,
 				   TDB_DATA indata,
 				   TDB_DATA *outdata);
 
+int ctdb_load_persistent_health(struct ctdb_context *ctdb,
+				struct ctdb_db_context *ctdb_db);
+int ctdb_update_persistent_health(struct ctdb_context *ctdb,
+				  struct ctdb_db_context *ctdb_db,
+				  const char *reason,/* NULL means healthy */
+				  int num_healthy_nodes);
+int ctdb_recheck_persistent_health(struct ctdb_context *ctdb);
+
 #endif
diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c
index 42b2c124b1..7f3128b469 100644
--- a/ctdb/server/ctdb_daemon.c
+++ b/ctdb/server/ctdb_daemon.c
@@ -354,6 +354,16 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 		return;
 	}
 
+	if (ctdb_db->unhealthy_reason) {
+		/*
+		 * this is just a warning, as the tdb should be empty anyway,
+		 * and only persistent databases can be unhealthy, which don't
+		 * use this code path
+		 */
+		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
+				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
 	key.dptr = c->data;
 	key.dsize = c->keylen;
 
diff --git a/ctdb/server/ctdb_freeze.c b/ctdb/server/ctdb_freeze.c
index 37c90a275b..38520087ce 100644
--- a/ctdb/server/ctdb_freeze.c
+++ b/ctdb/server/ctdb_freeze.c
@@ -489,7 +489,8 @@ int32_t ctdb_control_transaction_commit(struct ctdb_context *ctdb, uint32_t id)
 {
 	struct ctdb_db_context *ctdb_db;
 	int i;
-	
+	int healthy_nodes = 0;
+
 	for (i=1;i<=NUM_DB_PRIORITIES; i++) {
 		if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
 			DEBUG(DEBUG_ERR,(__location__ " Failed transaction_start while not frozen\n"));
@@ -507,6 +508,16 @@ int32_t ctdb_control_transaction_commit(struct ctdb_context *ctdb, uint32_t id)
 		return -1;
 	}
 
+	DEBUG(DEBUG_DEBUG,(__location__ " num_nodes[%d]\n", ctdb->num_nodes));
+	for (i=0; i < ctdb->num_nodes; i++) {
+		DEBUG(DEBUG_DEBUG,(__location__ " node[%d].flags[0x%X]\n",
+				   i, ctdb->nodes[i]->flags));
+		if (ctdb->nodes[i]->flags == 0) {
+			healthy_nodes++;
+		}
+	}
+	DEBUG(DEBUG_INFO,(__location__ " healthy_nodes[%d]\n", healthy_nodes));
+
 	for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
 		int ret;
 
@@ -518,6 +529,14 @@ int32_t ctdb_control_transaction_commit(struct ctdb_context *ctdb, uint32_t id)
 			goto fail;
 		}
 		tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+		ret = ctdb_update_persistent_health(ctdb, ctdb_db, NULL, healthy_nodes);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__ " Failed to update persistent health for db '%s'. "
+					 "Cancel all remaining transactions and resetting transaction_started to false.\n",
+					 ctdb_db->db_name));
+			goto fail;
+		}
 	}
 
 	ctdb->freeze_transaction_started = false;
diff --git a/ctdb/server/ctdb_ltdb_server.c b/ctdb/server/ctdb_ltdb_server.c
index 9a4044e0ca..b966386b13 100644
--- a/ctdb/server/ctdb_ltdb_server.c
+++ b/ctdb/server/ctdb_ltdb_server.c
@@ -23,6 +23,7 @@
 #include "system/network.h"
 #include "system/filesys.h"
 #include "system/dir.h"
+#include "system/time.h"
 #include "../include/ctdb_private.h"
 #include "db_wrap.h"
 #include "lib/util/dlinklist.h"
@@ -190,6 +191,250 @@ static void ctdb_check_db_empty(struct ctdb_db_context *ctdb_db)
 	}
 }
 
+/* Load the stored health record of a db into ctdb_db->unhealthy_reason.
+ * Returns 0 on success, -1 (previous reason kept) on allocation failure. */
+int ctdb_load_persistent_health(struct ctdb_context *ctdb,
+				struct ctdb_db_context *ctdb_db)
+{
+	char *previous = ctdb_db->unhealthy_reason;
+	char *loaded = NULL;
+	TDB_DATA name;
+	TDB_DATA data;
+
+	/* the record is keyed by the db name, without the trailing NUL */
+	name.dsize = strlen(ctdb_db->db_name);
+	name.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+
+	ctdb_db->unhealthy_reason = NULL;
+	data = tdb_fetch(ctdb->db_persistent_health->tdb, name);
+	if (data.dsize > 0) {
+		loaded = talloc_strndup(ctdb_db, (const char *)data.dptr,
+					data.dsize);
+	}
+	if (data.dsize > 0 && loaded == NULL) {
+		DEBUG(DEBUG_ALERT,(__location__ " talloc_strndup(%d) failed\n",
+				   (int)data.dsize));
+		/* keep the reason that was set before this call */
+		ctdb_db->unhealthy_reason = previous;
+		free(data.dptr);
+		return -1;
+	}
+
+	if (data.dptr != NULL) {
+		free(data.dptr);
+	}
+	/* with no data fetched the reason stays NULL, i.e. healthy */
+	talloc_free(previous);
+	ctdb_db->unhealthy_reason = loaded;
+	return 0;
+}
+
+/*
+  Store the health of a persistent db in the persistent_health tdb and
+  mirror it in ctdb_db->unhealthy_reason.  Returns 0 on success and -1 on
+  failure; failures before the commit cancel the started transaction.
+ */
+int ctdb_update_persistent_health(struct ctdb_context *ctdb,
+				  struct ctdb_db_context *ctdb_db,
+				  const char *given_reason,/* NULL means healthy */
+				  int num_healthy_nodes)
+{
+	struct tdb_context *tdb = ctdb->db_persistent_health->tdb;
+	int ret;
+	TDB_DATA key;
+	TDB_DATA val;
+	char *new_reason = NULL;
+	char *old_reason = NULL;
+
+	ret = tdb_transaction_start(tdb);
+	if (ret != 0) {
+		DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_start('%s') failed: %d - %s\n",
+				   tdb_name(tdb), ret, tdb_errorstr(tdb)));
+		return -1;
+	}
+
+	ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+	if (ret != 0) {
+		DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+				   ctdb_db->db_name, ret));
+		goto fail;
+	}
+	old_reason = ctdb_db->unhealthy_reason;
+
+	key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+	key.dsize = strlen(ctdb_db->db_name);
+
+	if (given_reason) {
+		new_reason = talloc_strdup(ctdb_db, given_reason);
+		if (new_reason == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup(%s) failed\n",
+					  given_reason));
+			goto fail;
+		}
+	} else if (old_reason && num_healthy_nodes == 0) {
+		/*
+		 * The caller reports ok, but there were no healthy nodes
+		 * available, so we have not recovered valid content of the
+		 * db. Keep the old reason, prefixed with "NO-HEALTHY-NODES - ".
+		 */
+		const char *prefix = "NO-HEALTHY-NODES - ";
+
+		if (strncmp(prefix, old_reason, strlen(prefix)) == 0) {
+			prefix = ""; /* old reason already starts with it */
+		}
+		new_reason = talloc_asprintf(ctdb_db, "%s%s",
+					 prefix, old_reason);
+		if (new_reason == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_asprintf(%s%s) failed\n",
+					  prefix, old_reason));
+			goto fail;
+		}
+	}
+
+	if (new_reason) {
+		val.dptr = discard_const_p(uint8_t, new_reason);
+		val.dsize = strlen(new_reason);
+
+		ret = tdb_store(tdb, key, val, TDB_REPLACE);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__ " tdb_store('%s', %s, %s) failed: %d - %s\n",
+					   tdb_name(tdb), ctdb_db->db_name, new_reason,
+					   ret, tdb_errorstr(tdb)));
+			goto fail;
+		}
+		DEBUG(DEBUG_ALERT,("Updated db health for db(%s) to: %s\n",
+				   ctdb_db->db_name, new_reason));
+	} else if (old_reason) {
+		ret = tdb_delete(tdb, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__ " tdb_delete('%s', %s) failed: %d - %s\n",
+					   tdb_name(tdb), ctdb_db->db_name,
+					   ret, tdb_errorstr(tdb)));
+			goto fail;
+		}
+		DEBUG(DEBUG_NOTICE,("Updated db health for db(%s): OK\n",
+				   ctdb_db->db_name));
+	}
+
+	ret = tdb_transaction_commit(tdb);
+	if (ret != TDB_SUCCESS) {
+		DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_commit('%s') failed: %d - %s\n",
+				   tdb_name(tdb), ret, tdb_errorstr(tdb)));
+		talloc_free(new_reason);
+		return -1;
+	}
+
+	talloc_free(old_reason);
+	ctdb_db->unhealthy_reason = new_reason;
+	return 0;
+
+fail:
+	/* do not leave the transaction started above open */
+	tdb_transaction_cancel(tdb);
+	talloc_free(new_reason);
+	return -1;
+}
+
+/* Flag the db as unhealthy, then rename its tdb file to
+ * "<db_path>.corrupted.<timestamp>.0Z". Returns 0 on success, -1 on error. */
+static int ctdb_backup_corrupted_tdb(struct ctdb_context *ctdb,
+				     struct ctdb_db_context *ctdb_db)
+{
+	time_t now = time(NULL);
+	struct tm *tm = gmtime(&now);
+	char *new_path = NULL;
+	char *new_reason;
+	int status = -1;
+	int ret;
+
+	if (tm == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " gmtime() failed\n"));
+		return -1;
+	}
+
+	/* formatted like: foo.tdb.0.corrupted.20091204160825.0Z */
+	new_path = talloc_asprintf(ctdb_db, "%s.corrupted."
+				   "%04u%02u%02u%02u%02u%02u.0Z",
+				   ctdb_db->db_path,
+				   tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday,
+				   tm->tm_hour, tm->tm_min, tm->tm_sec);
+	if (new_path == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		return -1;
+	}
+
+	new_reason = talloc_asprintf(ctdb_db, "ERROR - Backup of corrupted TDB in '%s'",
+				     new_path);
+	if (new_reason == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		goto done;
+	}
+	ret = ctdb_update_persistent_health(ctdb, ctdb_db, new_reason, 0);
+	talloc_free(new_reason);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,(__location__ ": ctdb_backup_corrupted_tdb(%s) failed to update persistent health\n",
+				 ctdb_db->db_path));
+		goto done;
+	}
+
+	if (rename(ctdb_db->db_path, new_path) != 0) {
+		DEBUG(DEBUG_CRIT,(__location__
+				  ": ctdb_backup_corrupted_tdb(%s) rename to %s failed: %d - %s\n",
+				  ctdb_db->db_path, new_path, errno, strerror(errno)));
+		goto done;
+	}
+	DEBUG(DEBUG_CRIT,(__location__ ": ctdb_backup_corrupted_tdb(%s) renamed to %s\n",
+			 ctdb_db->db_path, new_path));
+	status = 0;
+done:
+	talloc_free(new_path);
+	return status;
+}
+
+/*
+  Reload the stored health of every attached persistent db and log it.
+  Returns 0 if all of them are healthy, -1 if at least one is unhealthy
+  or its health record could not be loaded.
+ */
+int ctdb_recheck_persistent_health(struct ctdb_context *ctdb)
+{
+	struct ctdb_db_context *ctdb_db;
+	int ret;
+	int ok = 0;
+	int fail = 0;
+
+	for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
+		if (!ctdb_db->persistent) {
+			continue;
+		}
+
+		ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__
+					   " load persistent health for '%s' failed\n",
+					   ctdb_db->db_path));
+			return -1;
+		}
+
+		if (ctdb_db->unhealthy_reason == NULL) {
+			ok++;
+			DEBUG(DEBUG_INFO,(__location__
+				   " persistent db '%s' healthy\n",
+				   ctdb_db->db_path));
+			continue;
+		}
+
+		fail++;
+		DEBUG(DEBUG_ALERT,(__location__
+				   " persistent db '%s' unhealthy: %s\n",
+				   ctdb_db->db_path,
+				   ctdb_db->unhealthy_reason));
+	}
+	DEBUG((fail!=0)?DEBUG_ALERT:DEBUG_NOTICE,
+	      ("ctdb_recheck_persistent_health: OK[%d] FAIL[%d]\n", ok, fail));
+
+	return (fail != 0) ? -1 : 0;
+}
 
 /*
   attach to a database, handling both persistent and non-persistent databases
@@ -202,6 +447,8 @@ static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
 	int ret;
 	struct TDB_DATA key;
 	unsigned tdb_flags;
+	int mode = 0600;
+	int remaining_tries = 0;
 
 	ctdb_db = talloc_zero(ctdb, struct ctdb_db_context);
 	CTDB_NO_MEMORY(ctdb, ctdb_db);
@@ -226,6 +473,47 @@ static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
 		}
 	}
 
+	if (persistent) {
+		if (unhealthy_reason) {
+			ret = ctdb_update_persistent_health(ctdb, ctdb_db,
+							    unhealthy_reason, 0);
+			if (ret != 0) {
+				DEBUG(DEBUG_ALERT,(__location__ " ctdb_update_persistent_health('%s','%s') failed: %d\n",
+						   ctdb_db->db_name, unhealthy_reason, ret));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+		}
+
+		if (ctdb->max_persistent_check_errors > 0) {
+			remaining_tries = 1;
+		}
+		if (ctdb->done_startup) {
+			remaining_tries = 0;
+		}
+
+		ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+				   ctdb_db->db_name, ret));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+	}
+
+	if (ctdb_db->unhealthy_reason && remaining_tries == 0) {
+		DEBUG(DEBUG_ALERT,(__location__ "ERROR: tdb %s is marked as unhealthy: %s\n",
+				   ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason) {
+		/* this is just a warning, but we want that in the log file! */
+		DEBUG(DEBUG_ALERT,(__location__ "Warning: tdb %s is marked as unhealthy: %s\n",
+				   ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
 	/* open the database */
 	ctdb_db->db_path = talloc_asprintf(ctdb_db, "%s/%s.%u", 
 					   persistent?ctdb->db_directory_persistent:ctdb->db_directory, 
@@ -237,18 +525,105 @@ static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
 	}
 	tdb_flags |= TDB_DISALLOW_NESTING;
 
+again:
 	ctdb_db->ltdb = tdb_wrap_open(ctdb, ctdb_db->db_path, 
 				      ctdb->tunable.database_hash_size, 
 				      tdb_flags, 
-				      O_CREAT|O_RDWR, 0600);
+				      O_CREAT|O_RDWR, mode);
 	if (ctdb_db->ltdb == NULL) {
-		DEBUG(DEBUG_CRIT,("Failed to open tdb '%s'\n", ctdb_db->db_path));
-		talloc_free(ctdb_db);
-		return -1;
+		struct stat st;
+		int saved_errno = errno;
+
+		if (!persistent) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		if (remaining_tries == 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		ret = stat(ctdb_db->db_path, &st);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		remaining_tries--;
+		mode = st.st_mode;
+		goto again;
 	}
 
 	if (!persistent) {
 		ctdb_check_db_empty(ctdb_db);
+	} else {
+		ret = tdb_check(ctdb_db->ltdb->tdb, NULL, NULL);
+		if (ret != 0) {
+			int fd;
+			struct stat st;
+
+			DEBUG(DEBUG_CRIT,("tdb_check(%s) failed: %d - %s\n",
+					  ctdb_db->db_path, ret,
+					  tdb_errorstr(ctdb_db->ltdb->tdb)));
+			if (remaining_tries == 0) {
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			fd = tdb_fd(ctdb_db->ltdb->tdb);
+			ret = fstat(fd, &st);
+			if (ret != 0) {
+				DEBUG(DEBUG_CRIT,(__location__
+						  "Failed to fstat() persistent tdb '%s': %d - %s\n",
+						  ctdb_db->db_path,
+						  errno,
+						  strerror(errno)));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			/* close the TDB */
+			talloc_free(ctdb_db->ltdb);
+			ctdb_db->ltdb = NULL;
+
+			ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+			if (ret != 0) {
+				DEBUG(DEBUG_CRIT,("Failed to backup corrupted tdb '%s'\n",
+						  ctdb_db->db_path));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			remaining_tries--;
+			mode = st.st_mode;
+			goto again;
+		}
 	}
 
 	DLIST_ADD(ctdb->db_list, ctdb_db);
@@ -587,6 +962,12 @@ int32_t ctdb_ltdb_update_seqnum(struct ctdb_context *ctdb, uint32_t db_id, uint3
 		return -1;
 	}
 
+	if (ctdb_db->unhealthy_reason) {
+		DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_ltdb_update_seqnum: %s\n",
+				 ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		return -1;
+	}
+
 	tdb_increment_seqnum_nonblock(ctdb_db->ltdb->tdb);
 	ctdb_db->seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb);
 	return 0;
diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c
index 2bf5dcb99f..729895c68a 100644
--- a/ctdb/server/ctdb_monitor.c
+++ b/ctdb/server/ctdb_monitor.c
@@ -220,10 +220,13 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
 			      struct timeval t, void *private_data)
 {
 	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	int ret;
 
 	DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n"));
 
 	if (ctdb->vnn_map->generation == INVALID_GENERATION) {
+		ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
 		DEBUG(DEBUG_NOTICE,(__location__ " generation is INVALID. Wait one more second\n"));
 		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 				     timeval_current_ofs(1, 0), 
@@ -232,6 +235,8 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
 	}
 
 	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+		ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
 		DEBUG(DEBUG_NOTICE,(__location__ " in recovery. Wait one more second\n"));
 		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 				     timeval_current_ofs(1, 0), 
@@ -241,6 +246,8 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
 
 
 	if (timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) {
+		ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
 		DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n"));
 
 		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
@@ -249,6 +256,48 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
 		return;
 	}
 
+	if (ctdb->vnn_map->generation == ctdb->db_persistent_startup_generation) {
+		DEBUG(DEBUG_INFO,(__location__ " skip ctdb_recheck_persistent_health() "
+				  "until the next recovery\n"));
+		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+				     timeval_current_ofs(1, 0),
+				     ctdb_wait_until_recovered, ctdb);
+		return;
+	}
+
+	ctdb->db_persistent_startup_generation = ctdb->vnn_map->generation;
+	ret = ctdb_recheck_persistent_health(ctdb);
+	if (ret != 0) {
+		ctdb->db_persistent_check_errors++;
+		if (ctdb->db_persistent_check_errors < ctdb->max_persistent_check_errors) {
+			DEBUG(ctdb->db_persistent_check_errors==1?DEBUG_ERR:DEBUG_WARNING,
+			      (__location__ "ctdb_recheck_persistent_health() "
+			      "failed (%llu of %llu times) - retry later\n",
+			      (unsigned long long)ctdb->db_persistent_check_errors,
+			      (unsigned long long)ctdb->max_persistent_check_errors));
+			event_add_timed(ctdb->ev,
+					ctdb->monitor->monitor_context,
+					timeval_current_ofs(1, 0),
+					ctdb_wait_until_recovered, ctdb);
+			return;
+		}
+		DEBUG(DEBUG_ALERT,(__location__
+				  "ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown\n",
+				  (unsigned long long)ctdb->db_persistent_check_errors));
+		ctdb_stop_recoverd(ctdb);
+		ctdb_stop_keepalive(ctdb);
+		ctdb_stop_monitoring(ctdb);
+		ctdb_release_all_ips(ctdb);
+		if (ctdb->methods != NULL) {
+			ctdb->methods->shutdown(ctdb);
+		}
+		ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
+		DEBUG(DEBUG_ALERT,("ctdb_recheck_persistent_health() failed - Stopping CTDB daemon\n"));
+		exit(11);
+	}
+	ctdb->db_persistent_check_errors = 0;
+	DEBUG(DEBUG_NOTICE,(__location__
+			   "ctdb_start_monitoring: ctdb_recheck_persistent_health() OK\n"));
 
 	DEBUG(DEBUG_NOTICE,(__location__ " Recoveries finished. Running the \"startup\" event.\n"));
 	event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
@@ -421,6 +470,11 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
 
 	DEBUG(DEBUG_INFO, ("Control modflags on node %u - flags now 0x%x\n", c->pnn, node->flags));
 
+	if (node->flags == 0 && !ctdb->done_startup) {
+		DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n",
+				  c->pnn));
+		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+	}
 
 	/* tell the recovery daemon something has changed */
 	ctdb_daemon_send_message(ctdb, ctdb->pnn,
diff --git a/ctdb/server/ctdb_persistent.c b/ctdb/server/ctdb_persistent.c
index 59ddadb042..b686cbdee8 100644
--- a/ctdb/server/ctdb_persistent.c
+++ b/ctdb/server/ctdb_persistent.c
@@ -117,6 +117,12 @@ int32_t ctdb_control_trans2_commit(struct ctdb_context *ctdb,
 		return -1;
 	}
 
+	if (ctdb_db->unhealthy_reason) {
+		DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_trans2_commit: %s\n",
+				 ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		return -1;
+	}
+
 	/* handling num_persistent_updates is a bit strange - 
 	   there are 3 cases
 	     1) very old clients, which never called CTDB_CONTROL_START_PERSISTENT_UPDATE
@@ -597,6 +603,12 @@ int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
 		return -1;
 	}
 
+	if (ctdb_db->unhealthy_reason) {
+		DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_update_record: %s\n",
+				 ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		return -1;
+	}
+
 	state = talloc(ctdb, struct ctdb_persistent_write_state);
 	CTDB_NO_MEMORY(ctdb, state);
 
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
index 8568e8bbe0..ecc01e6206 100644
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@ -386,6 +386,12 @@ int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DAT
 	params.len = offsetof(struct ctdb_marshall_buffer, data);
 	params.failed = false;
 
+	if (ctdb_db->unhealthy_reason) {
+		/* this is just a warning, as the tdb should be empty anyway */
+		DEBUG(DEBUG_WARNING,("db(%s) unhealty in ctdb_control_pull_db: %s\n",
+				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
 	if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
 		DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
 		return -1;
diff --git a/ctdb/server/ctdb_traverse.c b/ctdb/server/ctdb_traverse.c
index d66036f9a1..26d4328062 100644
--- a/ctdb/server/ctdb_traverse.c
+++ b/ctdb/server/ctdb_traverse.c
@@ -388,6 +388,16 @@ int32_t ctdb_control_traverse_all(struct ctdb_context *ctdb, TDB_DATA data, TDB_
 		return -1;
 	}
 
+	if (ctdb_db->unhealthy_reason) {
+		if (ctdb->tunable.allow_unhealthy_db_read == 0) {
+			DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+					ctdb_db->db_name, ctdb_db->unhealthy_reason));
+			return -1;
+		}
+		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
 	state = talloc(ctdb_db, struct traverse_all_state);
 	if (state == NULL) {
 		return -1;
@@ -561,6 +571,16 @@ int32_t ctdb_control_traverse_start(struct ctdb_context *ctdb, TDB_DATA data,
 		return -1;
 	}
 
+	if (ctdb_db->unhealthy_reason) {
+		if (ctdb->tunable.allow_unhealthy_db_read == 0) {
+			DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_start: %s\n",
+					ctdb_db->db_name, ctdb_db->unhealthy_reason));
+			return -1;
+		}
+		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_start: %s\n",
+				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
 	state = talloc(client, struct traverse_start_state);
 	if (state == NULL) {
 		return -1;
diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c
index 17949d10ae..e75dcbd74f 100644
--- a/ctdb/server/ctdb_tunables.c
+++ b/ctdb/server/ctdb_tunables.c
@@ -63,7 +63,8 @@ static const struct {
 	{ "VacuumMinInterval",   60,  offsetof(struct ctdb_tunable, vacuum_min_interval) },
 	{ "VacuumMaxInterval",  600,  offsetof(struct ctdb_tunable, vacuum_max_interval) },
 	{ "MaxQueueDropMsg",  1000,  offsetof(struct ctdb_tunable, max_queue_depth_drop_msg) },
-	{ "UseStatusEvents",     0,  offsetof(struct ctdb_tunable, use_status_events_for_monitoring) }
+	{ "UseStatusEvents",     0,  offsetof(struct ctdb_tunable, use_status_events_for_monitoring) },
+	{ "AllowUnhealthyDBRead", 0,  offsetof(struct ctdb_tunable, allow_unhealthy_db_read) }
 };
 
 /*
diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c
index 7cffde09ee..e32aa6580f 100644
--- a/ctdb/server/ctdbd.c
+++ b/ctdb/server/ctdbd.c
@@ -51,6 +51,7 @@ static struct {
 	int         lvs;
 	int	    script_log_level;
 	int         no_publicipcheck;
+	int         max_persistent_check_errors;
 } options = {
 	.nlist = ETCDIR "/ctdb/nodes",
 	.transport = "tcp",
@@ -139,6 +140,9 @@ int main(int argc, const char *argv[])
 		{ "lvs", 0, POPT_ARG_NONE, &options.lvs, 0, "lvs is enabled on this node", NULL },
 		{ "script-log-level", 0, POPT_ARG_INT, &options.script_log_level, DEBUG_ERR, "log level of event script output", NULL },
 		{ "nopublicipcheck", 0, POPT_ARG_NONE, &options.no_publicipcheck, 0, "dont check we have/dont have the correct public ip addresses", NULL },
+		{ "max-persistent-check-errors", 0, POPT_ARG_INT,
+		  &options.max_persistent_check_errors, 0,
+		  "max allowed persistent check errors (default 0)", NULL },
 		POPT_TABLEEND
 	};
 	int opt, ret;
@@ -325,6 +329,12 @@ int main(int argc, const char *argv[])
 
 	ctdb->do_checkpublicip = !options.no_publicipcheck;
 
+	if (options.max_persistent_check_errors < 0) {
+		ctdb->max_persistent_check_errors = 0xFFFFFFFFFFFFFFFFLL;
+	} else {
+		ctdb->max_persistent_check_errors = (uint64_t)options.max_persistent_check_errors;
+	}
+
 	if (getenv("CTDB_BASE") == NULL) {
 		/* setup a environment variable for the event scripts to use
 		   to find the installation directory */