summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRonnie Sahlberg <ronniesahlberg@gmail.com>2011-03-01 12:09:42 +1100
committerRonnie Sahlberg <ronniesahlberg@gmail.com>2011-03-01 12:13:58 +1100
commit49a30783d39ccc4d29b1c4cd8479a9ab0513ba35 (patch)
tree94bd41038708a2c0a59e0b43cb4445e1f8847964
parentb611de93ad1af694b5cc36abee62781f7935c51e (diff)
downloadsamba-49a30783d39ccc4d29b1c4cd8479a9ab0513ba35.tar.gz
samba-49a30783d39ccc4d29b1c4cd8479a9ab0513ba35.tar.xz
samba-49a30783d39ccc4d29b1c4cd8479a9ab0513ba35.zip
If/when the recovery daemon terminates unexpectedly, try to restart it from the main daemon instead of just shutting down the main daemon too.
While it does not address the reason for the recovery daemon shutting down, it reduces the impact of such issues and makes the system more robust. (This used to be ctdb commit 0566ef3d6cef809bda204877c493c80ff9eb2c40)
-rw-r--r--ctdb/server/ctdb_recoverd.c25
1 files changed, 15 insertions, 10 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index d75370dfb4..cc0be36069 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -70,6 +70,7 @@ struct ctdb_recoverd {
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
+static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
/*
ban a node for a period of time
@@ -3521,18 +3522,12 @@ static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
if (kill(ctdb->recoverd_pid, 0) != 0) {
- DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
+ DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
- ctdb_stop_recoverd(ctdb);
- ctdb_stop_keepalive(ctdb);
- ctdb_stop_monitoring(ctdb);
- ctdb_release_all_ips(ctdb);
- if (ctdb->methods != NULL) {
- ctdb->methods->shutdown(ctdb);
- }
- ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
+ event_add_timed(ctdb->ev, ctdb, timeval_zero(),
+ ctdb_restart_recd, ctdb);
- exit(10);
+ return;
}
event_add_timed(ctdb->ev, ctdb,
@@ -3634,3 +3629,13 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb)
DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
kill(ctdb->recoverd_pid, SIGTERM);
}
+
+static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+ DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
+ ctdb_stop_recoverd(ctdb);
+ ctdb_start_recoverd(ctdb);
+}