diff options
author | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2008-09-09 13:44:46 +1000 |
---|---|---|
committer | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2008-09-09 13:44:46 +1000 |
commit | 6474f3278d33107bee8fb499d71c2a682c8c3403 (patch) | |
tree | ae73a84ca8a0e75ab54ecf7c79202b6e14f191a6 /ctdb | |
parent | 7a78a78a1c0de5ef3f470326eee2c1ca7e57e607 (diff) | |
download | samba-6474f3278d33107bee8fb499d71c2a682c8c3403.tar.gz samba-6474f3278d33107bee8fb499d71c2a682c8c3403.tar.xz samba-6474f3278d33107bee8fb499d71c2a682c8c3403.zip |
additional monitoring between the two daemons.
we currently only monitor that the daemons are running by kill(pid, 0)
and verifying that the domain socket between them is ok.
this is not sufficient since we can have a situation where the recovery
daemon is hung.
this new code monitors that the recovery daemon is operating.
if the recovery daemon hangs, we log this and shut down the main daemon.
(This used to be ctdb commit cd69d292292eaab3aac0e9d9fc57cb621597c63c)
Diffstat (limited to 'ctdb')
-rw-r--r-- | ctdb/client/ctdb_client.c | 18 | ||||
-rw-r--r-- | ctdb/include/ctdb.h | 2 | ||||
-rw-r--r-- | ctdb/include/ctdb_private.h | 4 | ||||
-rw-r--r-- | ctdb/server/ctdb_control.c | 4 | ||||
-rw-r--r-- | ctdb/server/ctdb_daemon.c | 3 | ||||
-rw-r--r-- | ctdb/server/ctdb_recover.c | 38 | ||||
-rw-r--r-- | ctdb/server/ctdb_recoverd.c | 3 | ||||
-rw-r--r-- | ctdb/server/ctdb_tunables.c | 1 |
8 files changed, 73 insertions, 0 deletions
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c index dfcd4d90e7..6d80efc205 100644 --- a/ctdb/client/ctdb_client.c +++ b/ctdb/client/ctdb_client.c @@ -3280,3 +3280,21 @@ again: talloc_free(h); return 0; } + +/* + recovery daemon ping to main daemon + */ +int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb) +{ + int ret; + int32_t res; + + ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_PING, 0, tdb_null, + ctdb, NULL, &res, NULL, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,("Failed to send recd ping\n")); + return -1; + } + + return 0; +} diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h index d43ab50707..60fa60be58 100644 --- a/ctdb/include/ctdb.h +++ b/ctdb/include/ctdb.h @@ -566,4 +566,6 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h, TDB_DATA key, TDB_DATA data); int ctdb_transaction_commit(struct ctdb_transaction_handle *h); +int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb); + #endif diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index a25674c9b4..b2ded310b5 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -114,6 +114,7 @@ struct ctdb_tunable { uint32_t reclock_ping_period; uint32_t no_ip_failback; uint32_t verbose_memory_names; + uint32_t recd_ping_timeout; }; /* @@ -417,6 +418,7 @@ struct ctdb_context { int start_as_disabled; uint32_t event_script_timeouts; /* counting how many consecutive times an eventscript has timedout */ TALLOC_CTX *eventscripts_ctx; /* a context to hold data for the RUN_EVENTSCRIPTS control */ + TALLOC_CTX *recd_ping_ctx; }; struct ctdb_db_context { @@ -550,6 +552,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0, CTDB_CONTROL_TRANS2_FINISHED = 84, CTDB_CONTROL_TRANS2_ERROR = 85, CTDB_CONTROL_TRANS2_COMMIT_RETRY = 86, + CTDB_CONTROL_RECD_PING = 87, }; /* @@ -1378,5 +1381,6 @@ int32_t ctdb_control_trans2_error(struct ctdb_context *ctdb, char *ctdb_addr_to_str(ctdb_sock_addr *addr); void 
ctdb_canonicalize_ip(const ctdb_sock_addr *ip, ctdb_sock_addr *cip); +int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb); #endif diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c index 4128797866..94736fb568 100644 --- a/ctdb/server/ctdb_control.c +++ b/ctdb/server/ctdb_control.c @@ -406,6 +406,10 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, case CTDB_CONTROL_TRANS2_FINISHED: return ctdb_control_trans2_finished(ctdb, c); + case CTDB_CONTROL_RECD_PING: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_recd_ping(ctdb); + default: DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode)); return -1; diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c index efe3d75349..885ce7e6f6 100644 --- a/ctdb/server/ctdb_daemon.c +++ b/ctdb/server/ctdb_daemon.c @@ -103,6 +103,9 @@ static void ctdb_start_transport(struct ctdb_context *ctdb) /* start periodic update of tcp tickle lists */ ctdb_start_tcp_tickle_update(ctdb); + + /* start listening for recovery daemon pings */ + ctdb_control_recd_ping(ctdb); } static void block_signal(int signum) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 3243f42faa..6b207d55bc 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -971,3 +971,41 @@ int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outda return 0; } +static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p) +{ + struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context); + + DEBUG(DEBUG_ERR, (__location__ " Recovery daemon ping timeout. Shutting down ctdb daemon\n")); + + ctdb_stop_recoverd(ctdb); + ctdb_stop_keepalive(ctdb); + ctdb_stop_monitoring(ctdb); + ctdb_release_all_ips(ctdb); + if (ctdb->methods != NULL) { + ctdb->methods->shutdown(ctdb); + } + ctdb_event_script(ctdb, "shutdown"); + DEBUG(DEBUG_ERR, (__location__ " Recovery daemon ping timeout. 
Daemon has been shut down.\n")); + exit(0); +} + +/* The recovery daemon will ping us at regular intervals. + If we havent been pinged for a while we assume the recovery + daemon is inoperable and we shut down. +*/ +int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb) +{ + talloc_free(ctdb->recd_ping_ctx); + + ctdb->recd_ping_ctx = talloc_new(ctdb); + CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_ctx); + + if (ctdb->tunable.recd_ping_timeout != 0) { + event_add_timed(ctdb->ev, ctdb->recd_ping_ctx, + timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0), + ctdb_recd_ping_timeout, ctdb); + } + + return 0; +} + diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index c6a4ab322a..a8c004ae0c 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2317,6 +2317,9 @@ again: exit(-1); } + /* ping the local daemon to tell it we are alive */ + ctdb_ctrl_recd_ping(ctdb); + if (rec->election_timeout) { /* an election is in progress */ goto again; diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c index d138137afd..de3e46667c 100644 --- a/ctdb/server/ctdb_tunables.c +++ b/ctdb/server/ctdb_tunables.c @@ -50,6 +50,7 @@ static const struct { { "ReclockPingPeriod", 60, offsetof(struct ctdb_tunable, reclock_ping_period) }, { "NoIPFailback", 0, offsetof(struct ctdb_tunable, no_ip_failback) }, { "VerboseMemoryNames", 0, offsetof(struct ctdb_tunable, verbose_memory_names) }, + { "RecdPingTimeout", 60, offsetof(struct ctdb_tunable, recd_ping_timeout) }, }; /* |