summaryrefslogtreecommitdiffstats
path: root/ctdb/server/ctdb_monitor.c
diff options
context:
space:
mode:
authorStefan Metzmacher <metze@samba.org>2009-12-07 13:28:11 +0100
committerStefan Metzmacher <metze@samba.org>2009-12-16 08:06:10 +0100
commit94bc40307ac62d552af478ef3b05630a4a54d8cc (patch)
treea7ab8f3e727e300ce576d5a3b593d29467d6dcaf /ctdb/server/ctdb_monitor.c
parent9069d3a7fb013a87c1b4e4c0a83651d3921e9955 (diff)
downloadsamba-94bc40307ac62d552af478ef3b05630a4a54d8cc.tar.gz
samba-94bc40307ac62d552af478ef3b05630a4a54d8cc.tar.xz
samba-94bc40307ac62d552af478ef3b05630a4a54d8cc.zip
server: Use tdb_check to verify persistent tdbs on startup
Depending on --max-persistent-check-errors we allow ctdb to start with unhealthy persistent databases. The default is 0 which means to reject a startup with unhealthy dbs. The health of the persistent databases is checked after each recovery. Node monitoring and the "startup" is deferred until all persistent databases are healthy. Databases can become healthy automatically by a completely HEALTHY node joining the cluster. Or by an administrator with "ctdb backupdb/restoredb" or "ctdb wipedb". metze (This used to be ctdb commit 15f133d5150ed1badb4fef7d644f10cd08a25cb5)
Diffstat (limited to 'ctdb/server/ctdb_monitor.c')
-rw-r--r--ctdb/server/ctdb_monitor.c54
1 files changed, 54 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c
index 2bf5dcb99f5..729895c68a6 100644
--- a/ctdb/server/ctdb_monitor.c
+++ b/ctdb/server/ctdb_monitor.c
@@ -220,10 +220,13 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
struct timeval t, void *private_data)
{
struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+ int ret;
DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n"));
if (ctdb->vnn_map->generation == INVALID_GENERATION) {
+ ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
DEBUG(DEBUG_NOTICE,(__location__ " generation is INVALID. Wait one more second\n"));
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(1, 0),
@@ -232,6 +235,8 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
}
if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+ ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
DEBUG(DEBUG_NOTICE,(__location__ " in recovery. Wait one more second\n"));
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(1, 0),
@@ -241,6 +246,8 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
if (timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) {
+ ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n"));
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
@@ -249,6 +256,48 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
return;
}
+ if (ctdb->vnn_map->generation == ctdb->db_persistent_startup_generation) {
+ DEBUG(DEBUG_INFO,(__location__ " skip ctdb_recheck_persistent_health() "
+ "until the next recovery\n"));
+ event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+ timeval_current_ofs(1, 0),
+ ctdb_wait_until_recovered, ctdb);
+ return;
+ }
+
+ ctdb->db_persistent_startup_generation = ctdb->vnn_map->generation;
+ ret = ctdb_recheck_persistent_health(ctdb);
+ if (ret != 0) {
+ ctdb->db_persistent_check_errors++;
+ if (ctdb->db_persistent_check_errors < ctdb->max_persistent_check_errors) {
+ DEBUG(ctdb->db_persistent_check_errors==1?DEBUG_ERR:DEBUG_WARNING,
+ (__location__ "ctdb_recheck_persistent_health() "
+ "failed (%llu of %llu times) - retry later\n",
+ (unsigned long long)ctdb->db_persistent_check_errors,
+ (unsigned long long)ctdb->max_persistent_check_errors));
+ event_add_timed(ctdb->ev,
+ ctdb->monitor->monitor_context,
+ timeval_current_ofs(1, 0),
+ ctdb_wait_until_recovered, ctdb);
+ return;
+ }
+ DEBUG(DEBUG_ALERT,(__location__
+ "ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown\n",
+ (unsigned long long)ctdb->db_persistent_check_errors));
+ ctdb_stop_recoverd(ctdb);
+ ctdb_stop_keepalive(ctdb);
+ ctdb_stop_monitoring(ctdb);
+ ctdb_release_all_ips(ctdb);
+ if (ctdb->methods != NULL) {
+ ctdb->methods->shutdown(ctdb);
+ }
+ ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
+ DEBUG(DEBUG_ALERT,("ctdb_recheck_persistent_health() failed - Stopping CTDB daemon\n"));
+ exit(11);
+ }
+ ctdb->db_persistent_check_errors = 0;
+ DEBUG(DEBUG_NOTICE,(__location__
+ "ctdb_start_monitoring: ctdb_recheck_persistent_health() OK\n"));
DEBUG(DEBUG_NOTICE,(__location__ " Recoveries finished. Running the \"startup\" event.\n"));
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
@@ -421,6 +470,11 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
DEBUG(DEBUG_INFO, ("Control modflags on node %u - flags now 0x%x\n", c->pnn, node->flags));
+ if (node->flags == 0 && !ctdb->done_startup) {
+ DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n",
+ c->pnn));
+ ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+ }
/* tell the recovery daemon something has changed */
ctdb_daemon_send_message(ctdb, ctdb->pnn,