From e95a4b5cdb2758022b1b65a6f403a79b461a96b8 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 23 Nov 2007 09:54:38 +1100 Subject: when we print "Remote node had flags xx local had flags xx we swapped the flags when printing them to the log (This used to be ctdb commit 9fc8831a7fcd34763567227d61cd525ec441ebf2) --- ctdb/server/ctdb_recoverd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index e54c53d935..b4b90b2661 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -772,8 +772,8 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n } if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) { DEBUG(0,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n", - nodemap->nodes[j].pnn, nodemap->nodes[j].flags, - remote_nodemap->nodes[j].flags)); + nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, + nodemap->nodes[j].flags)); nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags; } talloc_free(remote_nodemap); -- cgit From c36ce05d08ebff723dac22347d1faa3d78ff6891 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 23 Nov 2007 10:31:51 +1100 Subject: if we get a modflag control but the flags remain unchanged, log this (This used to be ctdb commit 5a0cd9b37b21665054bd35facd87f0a6ff4dcd55) --- ctdb/server/ctdb_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 0e2dc29c6a..774d6accb2 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -243,7 +243,7 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) node->flags &= ~m->clear; if (node->flags == old_flags) { - /* no change */ + DEBUG(0, ("Control modflags on node %u - Unchanged - flags 0x%x\n", ctdb->pnn, node->flags)); return 0; } -- cgit From 
af5bc9b9156b0fe3a07d74f43bb5d03fb01d60fa Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 23 Nov 2007 10:52:29 +1100 Subject: add an extra log if we get a modflags control but it doesn't change any flags. In update_local_flags() (this is only called if we are, or we believe we are, the recmaster), when we detect that the flags of a remote node are different from what our local node thinks the flags should be for that remote node, we should send a node-flag-changed message to the local daemon so that it updates the flags for that node. (This used to be ctdb commit 36225e4e271f7a4065398253747fb20054f99a53) --- ctdb/server/ctdb_recoverd.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index b4b90b2661..4601ae69eb 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -67,6 +67,7 @@ static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn) } if (rec->banned_nodes[pnn] == NULL) { + DEBUG(0,("No ban recorded for this node. ctdb_unban_node() request ignored\n")); return; } @@ -753,6 +754,8 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n */ for (j=0; j<nodemap->num; j++) { struct ctdb_node_map *remote_nodemap=NULL; + struct ctdb_node_flag_change c; + TDB_DATA data; int ret; if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) { @@ -775,6 +778,22 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, nodemap->nodes[j].flags)); nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags; + + /* We also should tell our daemon about this so it + updates its flags or else we will log the same + message again in the next iteration of recovery. 
+ */ + c.pnn = nodemap->nodes[j].pnn; + c.old_flags = nodemap->nodes[j].flags; + c.new_flags = remote_nodemap->nodes[j].flags; + + data.dptr = (uint8_t *)&c; + data.dsize = sizeof(c); + + ctdb_send_message(ctdb, CTDB_CURRENT_NODE, + CTDB_SRVID_NODE_FLAGS_CHANGED, + data); + } talloc_free(remote_nodemap); } -- cgit From b2a81fb6b11984a9857ad3a617dd74dbe96f817f Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 23 Nov 2007 11:31:42 +1100 Subject: when we as the recovery daemon on the recovery master detects that the flags differ between the local ctdb daemon and the remote node we can force a flags update on all nodes and not just the local daemon (This used to be ctdb commit a924eb89c966ecbae029ca137e06cffd40cc70fd) --- ctdb/server/ctdb_recoverd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 4601ae69eb..0eefbfd4fc 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -742,7 +742,8 @@ static void ctdb_wait_election(struct ctdb_recoverd *rec) /* - update our local flags from all remote connected nodes. + Update our local flags from all remote connected nodes. + This is only run when we are or we belive we are the recovery master */ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap) { @@ -782,6 +783,8 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n /* We also should tell our daemon about this so it updates its flags or else we will log the same message again in the next iteration of recovery. + Since we are the recovery master we can just as + well update the flags on all nodes. 
*/ c.pnn = nodemap->nodes[j].pnn; c.old_flags = nodemap->nodes[j].flags; @@ -790,7 +793,7 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n data.dptr = (uint8_t *)&c; data.dsize = sizeof(c); - ctdb_send_message(ctdb, CTDB_CURRENT_NODE, + ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_NODE_FLAGS_CHANGED, data); -- cgit From b5e79fb06f235786069954331e79c2616d27859d Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 23 Nov 2007 11:53:06 +1100 Subject: If update_local_flags() finds that a node has changed its BANNED status so it differs from what the local ctdb daemon on the recovery master thinks it should be, we should call for a re-election (This used to be ctdb commit 21ad6039c31ef5cc0e40a35a41220f91943947cb) --- ctdb/server/ctdb_recoverd.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 0eefbfd4fc..b5ed27b39f 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -755,8 +755,6 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n */ for (j=0; j<nodemap->num; j++) { struct ctdb_node_map *remote_nodemap=NULL; - struct ctdb_node_flag_change c; - TDB_DATA data; int ret; if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) { @@ -772,15 +770,13 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n", nodemap->nodes[j].pnn)); talloc_free(mem_ctx); - return -1; + return MONITOR_FAILED; } if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) { - DEBUG(0,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n", - nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, - nodemap->nodes[j].flags)); - nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags; + struct ctdb_node_flag_change c; + TDB_DATA data; - /* We also 
should tell our daemon about this so it + /* We should tell our daemon about this so it updates its flags or else we will log the same message again in the next iteration of recovery. Since we are the recovery master we can just as @@ -797,11 +793,30 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n CTDB_SRVID_NODE_FLAGS_CHANGED, data); + /* Update our local copy of the flags in the recovery + daemon. + */ + DEBUG(0,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n", + nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, + nodemap->nodes[j].flags)); + nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags; + + /* If the BANNED flag has changed for the node + this is a good reason to do a new election. + */ + if ((c.old_flags ^ c.new_flags) & NODE_FLAGS_BANNED) { + DEBUG(0,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n", + nodemap->nodes[j].pnn, c.new_flags, + c.old_flags)); + talloc_free(mem_ctx); + return MONITOR_ELECTION_NEEDED; + } + } talloc_free(remote_nodemap); } talloc_free(mem_ctx); - return 0; + return MONITOR_OK; } @@ -1787,7 +1802,12 @@ again: /* ensure our local copies of flags are right */ ret = update_local_flags(ctdb, nodemap); - if (ret != 0) { + if (ret == MONITOR_ELECTION_NEEDED) { + DEBUG(0,("update_local_flags() called for a re-election.\n")); + force_election(rec, mem_ctx, pnn, nodemap); + goto again; + } + if (ret != MONITOR_OK) { DEBUG(0,("Unable to update local flags\n")); goto again; } -- cgit From 6b284e5905841f0518cc0fb69a6ce5138abc2507 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 23 Nov 2007 12:36:14 +1100 Subject: add log output for when ctdb_ban_node() and ctdb_unban_node() are called when these functions are called to ban or unban a node make sure we update the CTDB_NODE_BANNED flag in rec->node_flags since this field and flag are checked during the election process (This used to be ctdb commit 
740c632ae96a2d34327d1b575780aaf079d93f4f) --- ctdb/server/ctdb_recoverd.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index b5ed27b39f..8b1f747cb6 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -61,11 +61,19 @@ static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn) { struct ctdb_context *ctdb = rec->ctdb; + DEBUG(0,("Unbanning node %u\n", pnn)); + if (!ctdb_validate_pnn(ctdb, pnn)) { DEBUG(0,("Bad pnn %u in ctdb_unban_node\n", pnn)); return; } + if (pnn == ctdb->pnn) { + /* make sure we remember we are no longer banned in case + there is an election */ + rec->node_flags &= ~NODE_FLAGS_BANNED; + } + if (rec->banned_nodes[pnn] == NULL) { DEBUG(0,("No ban recorded for this node. ctdb_unban_node() request ignored\n")); return; @@ -98,6 +106,8 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_ { struct ctdb_context *ctdb = rec->ctdb; + DEBUG(0,("Banning node %u for %u seconds\n", pnn, ban_time)); + if (!ctdb_validate_pnn(ctdb, pnn)) { DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn)); return; @@ -112,6 +122,10 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_ DEBUG(0,("self ban - lowering our election priority\n")); /* banning ourselves - lower our election priority */ rec->priority_time = timeval_current(); + + /* make sure we remember we are banned in case there is an + election */ + rec->node_flags |= NODE_FLAGS_BANNED; } ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0); -- cgit From a260145f9f60fa8e33564abd2d9b7d10191c2dc0 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 23 Nov 2007 12:38:37 +1100 Subject: check for recursive bans in ctdb_ban_node() and remove the previous ban if this is an attempt to ban an already banned node (This used to be ctdb commit 214f2d7b04d0a491d466fc85c8d016efde416f9e) --- ctdb/server/ctdb_recoverd.c 
| 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 8b1f747cb6..5c72e90e93 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -130,6 +130,12 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_ ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0); + if (rec->banned_nodes[pnn] != NULL) { + DEBUG(0,("Re-banning an already banned node. Remove previous ban and set a new ban.\n")); + talloc_free(rec->banned_nodes[pnn]); + rec->banned_nodes[pnn] = NULL; + } + rec->banned_nodes[pnn] = talloc(rec, struct ban_state); CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]); -- cgit From 0597be338660ce9d80e9011a7a7158dcb66be272 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 23 Nov 2007 12:41:29 +1100 Subject: when monitoring the node from the recovery daemon, check that the recovery daemon and the ctdb daemon both agree on whether the node is banned or not and if they disagree then reban the node again after logging an error to the debug log (This used to be ctdb commit 6cd6e534493066edd4bb2c6ae5be0e9a9d495aa0) --- ctdb/server/ctdb_recoverd.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 5c72e90e93..18b7bfe9d0 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -1700,6 +1700,29 @@ again: goto again; } + /* check that we (recovery daemon) and the local ctdb daemon + agrees on whether we are banned or not + */ + if (nodemap->nodes[pnn].flags & NODE_FLAGS_BANNED) { + if (rec->banned_nodes[pnn] == NULL) { + DEBUG(0,("Local ctdb daemon thinks this node is BANNED but the recovery master disagrees. 
Re-banning the node\n")); + + ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period); + ctdb_set_culprit(rec, pnn); + + goto again; + } + } else { + if (rec->banned_nodes[pnn] != NULL) { + DEBUG(0,("Local ctdb daemon does not think this node is BANNED but the recovery master disagrees. Re-banning the node\n")); + + ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period); + ctdb_set_culprit(rec, pnn); + + goto again; + } + } + /* remember our own node flags */ rec->node_flags = nodemap->nodes[pnn].flags; -- cgit From 9e73dc87ccfbb148af4cd83ae1415fcb7b6e2f23 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 26 Nov 2007 10:52:55 +1100 Subject: Add a --node-ip argument so that one can specify which ip address a specific instance of ctdbd should bind to. This helps when running a "virtual" cluster on a single machine where all instances bind to different alias interfaces. If --node-ip is specified, then we will try to bind to this ip address only. Otherwise we fall back to the original method of trying the ip addresses in /etc/ctdb/nodes one by one until we find one we can bind to. No variable was added to /etc/sysconfig/ctdb since this parameter only makes sense in a virtual test/debug cluster. 
(This used to be ctdb commit d96cb02c2c24f9eabbc53d3d38e90dea49cff3e0) --- ctdb/include/ctdb.h | 6 ++++++ ctdb/include/ctdb_private.h | 1 + ctdb/server/ctdb_server.c | 17 +++++++++++++++++ ctdb/server/ctdbd.c | 16 ++++++++++++++++ ctdb/tcp/tcp_connect.c | 26 ++++++++++++++++---------- 5 files changed, 56 insertions(+), 10 deletions(-) (limited to 'ctdb') diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h index b3a65a36ab..a2245d220b 100644 --- a/ctdb/include/ctdb.h +++ b/ctdb/include/ctdb.h @@ -159,6 +159,12 @@ int ctdb_set_socketname(struct ctdb_context *ctdb, const char *socketname); */ int ctdb_set_nlist(struct ctdb_context *ctdb, const char *nlist); +/* + Check that a specific ip address exists in the node list and returns + the id for the node or -1 +*/ +int ctdb_ip_to_nodeid(struct ctdb_context *ctdb, const char *nodeip); + /* start the ctdb protocol */ diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index cb76bb0074..10abafa1d9 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -367,6 +367,7 @@ struct ctdb_context { const char *default_public_interface; pid_t recoverd_pid; bool done_startup; + const char *node_ip; }; struct ctdb_db_context { diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index dddf90753b..2a80798dd9 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -34,6 +34,23 @@ int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport) return 0; } +/* + Check whether an ip is a valid node ip + Returns the node id for this ip address or -1 +*/ +int ctdb_ip_to_nodeid(struct ctdb_context *ctdb, const char *nodeip) +{ + int nodeid; + + for (nodeid=0;nodeid<ctdb->num_nodes;nodeid++) { + if (!strcmp(ctdb->nodes[nodeid]->address.address, nodeip)) { + return nodeid; + } + } + + return -1; +} + /* choose the recovery lock file */ diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c index eaf79a0ed7..ba0eac42fb 100644 --- a/ctdb/server/ctdbd.c +++ 
b/ctdb/server/ctdbd.c @@ -50,6 +50,7 @@ static struct { const char *db_dir_persistent; const char *public_interface; const char *single_public_ip; + const char *node_ip; int no_setsched; } options = { .nlist = ETCDIR "/ctdb/nodes", @@ -110,6 +111,7 @@ int main(int argc, const char *argv[]) { "event-script-dir", 0, POPT_ARG_STRING, &options.event_script_dir, 0, "event script directory", "dirname" }, { "logfile", 0, POPT_ARG_STRING, &options.logfile, 0, "log file location", "filename" }, { "nlist", 0, POPT_ARG_STRING, &options.nlist, 0, "node list file", "filename" }, + { "node-ip", 0, POPT_ARG_STRING, &options.node_ip, 0, "node ip", "ip-address"}, { "listen", 0, POPT_ARG_STRING, &options.myaddress, 0, "address to listen on", "address" }, { "transport", 0, POPT_ARG_STRING, &options.transport, 0, "protocol transport", NULL }, { "dbdir", 0, POPT_ARG_STRING, &options.db_dir, 0, "directory for the tdb files", NULL }, @@ -198,6 +200,20 @@ int main(int argc, const char *argv[]) exit(1); } + /* if a node-ip was specified, verify that it exists in the + nodes file + */ + if (options.node_ip != NULL) { + DEBUG(0,("IP for this node is %s\n", options.node_ip)); + ret = ctdb_ip_to_nodeid(ctdb, options.node_ip); + if (ret == -1) { + DEBUG(0,("The specified node-ip:%s is not a valid node address. 
Exiting.\n", options.node_ip)); + exit(1); + } + ctdb->node_ip = options.node_ip; + DEBUG(0,("This is node %d\n", ret)); + } + if (options.db_dir) { ret = ctdb_set_tdb_dir(ctdb, options.db_dir); if (ret == -1) { diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 3548f82ed7..3c4e7bfb10 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -214,13 +214,9 @@ static void ctdb_listen_event(struct event_context *ev, struct fd_event *fde, if (fd == -1) return; incoming_node = inet_ntoa(addr.sin_addr); - for (nodeid=0;nodeid<ctdb->num_nodes;nodeid++) { - if (!strcmp(incoming_node, ctdb->nodes[nodeid]->address.address)) { - DEBUG(0, ("Incoming connection from node:%d %s\n",nodeid,incoming_node)); - break; - } - } - if (nodeid>=ctdb->num_nodes) { + nodeid = ctdb_ip_to_nodeid(ctdb, incoming_node); + + if (nodeid == -1) { DEBUG(0, ("Refused connection from unknown node %s\n", incoming_node)); close(fd); return; @@ -275,17 +271,27 @@ static int ctdb_tcp_listen_automatic(struct ctdb_context *ctdb) } for (i=0;i<ctdb->num_nodes;i++) { + /* if node_ip is specified we will only try to bind to that + ip. 
+ */ + if (ctdb->node_ip != NULL) { + if (strcmp(ctdb->node_ip, ctdb->nodes[i]->address.address)) { + continue; + } + } + ZERO_STRUCT(sock); #ifdef HAVE_SOCK_SIN_LEN sock.sin_len = sizeof(sock); #endif sock.sin_port = htons(ctdb->nodes[i]->address.port); sock.sin_family = PF_INET; - if (ctdb_tcp_get_address(ctdb, ctdb->nodes[i]->address.address, - &sock.sin_addr) != 0) { + if (ctdb_tcp_get_address(ctdb, + ctdb->nodes[i]->address.address, + &sock.sin_addr) != 0) { continue; } - + if (bind(ctcp->listen_fd, (struct sockaddr * )&sock, sizeof(sock)) == 0) { break; -- cgit From 5c3a2709911d4a75d39d156570be952ee0b050e3 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 28 Nov 2007 15:04:20 +1100 Subject: move ctdb_set_culprit higher up in the file when we are the recmaster and we update the local flags for all the nodes, if one of the nodes fail to respond and give us his flags, set that node as a "culprit" as one of the first things to do in the monitor_cluster loop, check if the current culprit has caused too many (20) failures and if so ban that node. this is for the situation where a remote node may still be CONNECTED but it fails to respond to the getnodemap control causing the recovery master to loop in monitor_cluster aborting the monitoring when the node fails to respond but before anything will trigger a call to do_recovery(). If one or more of the databases or nodes are frozen at this stage, this would lead to smbd being blocked for potentially a longish time. 
(This used to be ctdb commit 83b0261f2cb453195b86f547d360400103a8b795) --- ctdb/server/ctdb_recoverd.c | 47 +++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 18b7bfe9d0..163793dc8e 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -760,6 +760,23 @@ static void ctdb_wait_election(struct ctdb_recoverd *rec) } } +/* + remember the trouble maker + */ +static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit) +{ + struct ctdb_context *ctdb = rec->ctdb; + + if (rec->last_culprit != culprit || + timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) { + DEBUG(0,("New recovery culprit %u\n", culprit)); + /* either a new node is the culprit, or we've decide to forgive them */ + rec->last_culprit = culprit; + rec->first_recover_time = timeval_current(); + rec->culprit_counter = 0; + } + rec->culprit_counter++; +} /* Update our local flags from all remote connected nodes. 
@@ -789,6 +806,7 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n if (ret != 0) { DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n", nodemap->nodes[j].pnn)); + ctdb_set_culprit(rec, nodemap->nodes[j].pnn); talloc_free(mem_ctx); return MONITOR_FAILED; } @@ -858,23 +876,6 @@ static uint32_t new_generation(void) return generation; } -/* - remember the trouble maker - */ -static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit) -{ - struct ctdb_context *ctdb = rec->ctdb; - - if (rec->last_culprit != culprit || - timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) { - DEBUG(0,("New recovery culprit %u\n", culprit)); - /* either a new node is the culprit, or we've decide to forgive them */ - rec->last_culprit = culprit; - rec->first_recover_time = timeval_current(); - rec->culprit_counter = 0; - } - rec->culprit_counter++; -} /* we are the recmaster, and recovery is needed - start a recovery run @@ -1672,6 +1673,18 @@ again: goto again; } + + /* We must check if we need to ban a node here but we want to do this + as early as possible so we dont wait until we have pulled the node + map from the local node. 
thats why we have the hardcoded value 20 + */ + if (rec->culprit_counter > 20) { + DEBUG(0,("Node %u has caused %u failures in %.0f seconds - banning it for %u seconds\n", + rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time), + ctdb->tunable.recovery_ban_period)); + ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period); + } + /* get relevant tunables */ ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable); if (ret != 0) { -- cgit From 8ac8cce487acadb485396ba6dad431ee66dae75d Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 30 Nov 2007 08:44:34 +1100 Subject: don't manipulate ctdb->monitoring_mode directly from the SET_MON_MODE control, instead call ctdb_start/stop_monitoring(). ctdb_stop_monitoring(): don't allocate a new monitoring context, leave it NULL. Also set the monitoring_mode in this function so that ctdb_stop/start_monitoring() and ->monitoring_mode are kept in sync. Add a debug message to log that we have stopped monitoring. ctdb_start_monitoring(): check whether monitoring is already active and make the function idempotent. Create the monitoring context when monitoring is started. Update ->monitoring_mode once the monitoring has been started. Add a debug message to log that we have started monitoring. When we temporarily stop monitoring while running an event script, restart monitoring after the event script wrapper returns instead of in the event script callback. Let monitoring_mode start out as DISABLED and let it be enabled once we call ctdb_start_monitoring. Don't check for MONITORING_DISABLED in ctdb_check_for_dead_nodes(). If monitoring is disabled, this event handler will not be called. 
(This used to be ctdb commit 3a93ae8bdcffb1adbd6243844f3058fc742f76aa) --- ctdb/server/ctdb_control.c | 6 +++++- ctdb/server/ctdb_monitor.c | 23 ++++++++++++++--------- ctdb/server/ctdb_recover.c | 5 +++-- ctdb/server/ctdb_recoverd.c | 5 +++-- ctdb/server/ctdb_takeover.c | 8 ++++---- ctdb/server/ctdbd.c | 2 +- 6 files changed, 30 insertions(+), 19 deletions(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c index f13f39dbcc..1a79dc4c11 100644 --- a/ctdb/server/ctdb_control.c +++ b/ctdb/server/ctdb_control.c @@ -225,7 +225,11 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, case CTDB_CONTROL_SET_MONMODE: CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); - ctdb->monitoring_mode = *(uint32_t *)indata.dptr; + if (*(uint32_t *)indata.dptr == CTDB_MONITORING_ACTIVE) { + ctdb_start_monitoring(ctdb); + } else { + ctdb_stop_monitoring(ctdb); + } return 0; case CTDB_CONTROL_GET_MONMODE: diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 774d6accb2..9a3986711a 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -33,13 +33,6 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); int i; - if (ctdb->monitoring_mode == CTDB_MONITORING_DISABLED) { - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), - ctdb_check_for_dead_nodes, ctdb); - return; - } - /* send a keepalive to all other nodes, unless */ for (i=0;i<ctdb->num_nodes;i++) { struct ctdb_node *node = ctdb->nodes[i]; @@ -203,8 +196,10 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te, void ctdb_stop_monitoring(struct ctdb_context *ctdb) { talloc_free(ctdb->monitor_context); - ctdb->monitor_context = talloc_new(ctdb); - CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor_context); + ctdb->monitor_context = NULL; + + ctdb->monitoring_mode = 
CTDB_MONITORING_DISABLED; + DEBUG(0,("Monitoring has been stopped\n")); } /* @@ -214,8 +209,15 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb) { struct timed_event *te; + if (ctdb->monitoring_mode == CTDB_MONITORING_ACTIVE) { + return; + } + ctdb_stop_monitoring(ctdb); + ctdb->monitor_context = talloc_new(ctdb); + CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor_context); + te = event_add_timed(ctdb->ev, ctdb->monitor_context, timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), ctdb_check_for_dead_nodes, ctdb); @@ -225,6 +227,9 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb) timeval_current_ofs(ctdb->tunable.monitor_retry, 0), ctdb_check_health, ctdb); CTDB_NO_MEMORY_FATAL(ctdb, te); + + ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE; + DEBUG(0,("Monitoring has been started\n")); } diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 3721facdba..89e6d3893a 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -415,8 +415,6 @@ static void ctdb_recovered_callback(struct ctdb_context *ctdb, int status, void { struct ctdb_set_recmode_state *state = talloc_get_type(p, struct ctdb_set_recmode_state); - ctdb_start_monitoring(ctdb); - if (status == 0) { ctdb->recovery_mode = state->recmode; } else { @@ -492,6 +490,9 @@ static void set_recmode_handler(struct event_context *ev, struct fd_event *fde, state, ctdb_recovered_callback, state, "recovered"); + + ctdb_start_monitoring(state->ctdb); + if (ret != 0) { ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "failed to run eventscript from set_recmode"); talloc_free(state); diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 163793dc8e..88f2254366 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -782,9 +782,10 @@ static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit) Update our local flags from all remote connected nodes. 
This is only run when we are or we belive we are the recovery master */ -static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap) +static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap) { int j; + struct ctdb_context *ctdb = rec->ctdb; TALLOC_CTX *mem_ctx = talloc_new(ctdb); /* get the nodemap for all active remote nodes and verify @@ -1857,7 +1858,7 @@ again: /* ensure our local copies of flags are right */ - ret = update_local_flags(ctdb, nodemap); + ret = update_local_flags(rec, nodemap); if (ret == MONITOR_ELECTION_NEEDED) { DEBUG(0,("update_local_flags() called for a re-election.\n")); force_election(rec, mem_ctx, pnn, nodemap); diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c index ec3455e4c0..cd9c385dfb 100644 --- a/ctdb/server/ctdb_takeover.c +++ b/ctdb/server/ctdb_takeover.c @@ -131,8 +131,6 @@ static void takeover_ip_callback(struct ctdb_context *ctdb, int status, char *ip = inet_ntoa(state->sin->sin_addr); struct ctdb_tcp_array *tcparray; - ctdb_start_monitoring(ctdb); - if (status != 0) { DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n", ip, state->vnn->iface)); @@ -247,6 +245,8 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, vnn->iface, inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits); + ctdb_start_monitoring(ctdb); + if (ret != 0) { DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n", inet_ntoa(pip->sin.sin_addr), vnn->iface)); @@ -299,8 +299,6 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status, char *ip = inet_ntoa(state->sin->sin_addr); TDB_DATA data; - ctdb_start_monitoring(ctdb); - /* send a message to all clients of this node telling them that the cluster has been reconfigured and they should release any sockets on this IP */ @@ -373,6 +371,8 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, vnn->iface, inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits); + 
ctdb_start_monitoring(ctdb); + if (ret != 0) { DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n", inet_ntoa(pip->sin.sin_addr), vnn->iface)); diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c index ba0eac42fb..5613ea91b7 100644 --- a/ctdb/server/ctdbd.c +++ b/ctdb/server/ctdbd.c @@ -168,7 +168,7 @@ int main(int argc, const char *argv[]) ctdb->upcalls = &ctdb_upcalls; ctdb->idr = idr_init(ctdb); ctdb->recovery_lock_fd = -1; - ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE; + ctdb->monitoring_mode = CTDB_MONITORING_DISABLED; ctdb_tunables_set_defaults(ctdb); -- cgit From 192ba82b73fa791538a11303734b0546534a1026 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 30 Nov 2007 09:02:37 +1100 Subject: ->monitor_context is NULL when monitoring is disabled. Check whether monitoring is enabled or not before creating new events and log why the event is not set up othervise (This used to be ctdb commit 2f352b2606c04a65ce461fc2e99e6d6251ac4f20) --- ctdb/server/ctdb_monitor.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 9a3986711a..ce172fa345 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -110,9 +110,15 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) next_interval = ctdb->tunable.monitor_interval; } - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_current_ofs(next_interval, 0), - ctdb_check_health, ctdb); + if (ctdb->monitor_context == NULL) { + DEBUG(0,(__location__ " monitoring was disabled while running" + " healthcheck. 
Health checks postphoned until" + " monitoring is re-enabled.\n")); + } else { + event_add_timed(ctdb->ev, ctdb->monitor_context, + timeval_current_ofs(next_interval, 0), + ctdb_check_health, ctdb); + } if (c.old_flags == node->flags) { return; @@ -143,13 +149,27 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p } if (ctdb->done_startup) { - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_zero(), - ctdb_check_health, ctdb); + if (ctdb->monitor_context == NULL) { + DEBUG(0,(__location__ " monitoring was disabled while " + "running startup event. " + "startup event postphoned until " + "monitoring is re-enabled.\n")); + } else { + event_add_timed(ctdb->ev, ctdb->monitor_context, + timeval_zero(), + ctdb_check_health, ctdb); + } } else { - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_current_ofs(ctdb->tunable.monitor_interval, 0), - ctdb_check_health, ctdb); + if (ctdb->monitor_context == NULL) { + DEBUG(0,(__location__ " monitoring was disabled while " + "running startup event. " + "Health cheack postphoned until " + "monitoring is re-enabled.\n")); + } else { + event_add_timed(ctdb->ev, ctdb->monitor_context, + timeval_current_ofs(ctdb->tunable.monitor_interval, 0), + ctdb_check_health, ctdb); + } } } -- cgit From 0eb6c04dc1b0f01906a61c16b2447cdbd2724c74 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 30 Nov 2007 10:00:04 +1100 Subject: get rid of the control to set the monitoring mode. 
monitoring should always be enabled (though a node may want to temporarily disable running the "monitor" event scripts but can do so internally without the need for this control) (This used to be ctdb commit e3a33618026823e6af845fd8513cddb08e6b5584) --- ctdb/client/ctdb_client.c | 23 ----------------------- ctdb/include/ctdb.h | 4 ---- ctdb/include/ctdb_private.h | 2 +- ctdb/server/ctdb_control.c | 9 --------- ctdb/tools/ctdb.c | 24 ------------------------ 5 files changed, 1 insertion(+), 61 deletions(-) (limited to 'ctdb') diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c index 4d91e6d70c..1935f5b7b1 100644 --- a/ctdb/client/ctdb_client.c +++ b/ctdb/client/ctdb_client.c @@ -1931,29 +1931,6 @@ int ctdb_ctrl_getpnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t return res; } -/* - set the monitoring mode of a remote node - */ -int ctdb_ctrl_setmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t monmode) -{ - int ret; - TDB_DATA data; - int32_t res; - - data.dsize = sizeof(uint32_t); - data.dptr = (uint8_t *)&monmode; - - ret = ctdb_control(ctdb, destnode, 0, - CTDB_CONTROL_SET_MONMODE, 0, data, - NULL, NULL, &res, &timeout, NULL); - if (ret != 0 || res != 0) { - DEBUG(0,(__location__ " ctdb_control for setmonmode failed\n")); - return -1; - } - - return 0; -} - /* get the monitoring mode of a remote node */ diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h index a2245d220b..e44706d848 100644 --- a/ctdb/include/ctdb.h +++ b/ctdb/include/ctdb.h @@ -349,10 +349,6 @@ int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint get the monitoring mode of a remote node */ int ctdb_ctrl_getmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *monmode); -/* - set the monitoringmode of a remote node - */ -int ctdb_ctrl_setmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t monmode); /* get the recovery master of a 
remote node diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index 10abafa1d9..dbe4f74689 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -452,7 +452,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0, CTDB_CONTROL_GET_PNN = 35, CTDB_CONTROL_SHUTDOWN = 36, CTDB_CONTROL_GET_MONMODE = 37, - CTDB_CONTROL_SET_MONMODE = 38, + /* #38 removed */ CTDB_CONTROL_MAX_RSN = 39, CTDB_CONTROL_SET_RSN_NONEMPTY = 40, CTDB_CONTROL_DELETE_LOW_RSN = 41, diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c index 1a79dc4c11..35d2e155db 100644 --- a/ctdb/server/ctdb_control.c +++ b/ctdb/server/ctdb_control.c @@ -223,15 +223,6 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg); - case CTDB_CONTROL_SET_MONMODE: - CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); - if (*(uint32_t *)indata.dptr == CTDB_MONITORING_ACTIVE) { - ctdb_start_monitoring(ctdb); - } else { - ctdb_stop_monitoring(ctdb); - } - return 0; - case CTDB_CONTROL_GET_MONMODE: return ctdb->monitoring_mode; diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c index e0c6621429..0a4f370697 100644 --- a/ctdb/tools/ctdb.c +++ b/ctdb/tools/ctdb.c @@ -753,29 +753,6 @@ static int control_getmonmode(struct ctdb_context *ctdb, int argc, const char ** return 0; } -/* - set the monitoring mode of a remote node - */ -static int control_setmonmode(struct ctdb_context *ctdb, int argc, const char **argv) -{ - uint32_t monmode; - int ret; - - if (argc < 1) { - usage(); - } - - monmode = strtoul(argv[0], NULL, 0); - - ret = ctdb_ctrl_setmonmode(ctdb, TIMELIMIT(), options.pnn, monmode); - if (ret != 0) { - DEBUG(0, ("Unable to set monmode on node %u\n", options.pnn)); - return ret; - } - - return 0; -} - /* display remote list of keys/data for a db */ @@ -1082,7 +1059,6 @@ static const struct { { "getdbmap", control_getdbmap, true, "show the database 
map" }, { "catdb", control_catdb, true, "dump a database" , ""}, { "getmonmode", control_getmonmode, true, "show monitoring mode" }, - { "setmonmode", control_setmonmode, true, "set monitoring mode", "<0|1>" }, { "setdebug", control_setdebug, true, "set debug level", "" }, { "getdebug", control_getdebug, true, "get debug level" }, { "attach", control_attach, true, "attach to a database", "" }, -- cgit From 50573c539195f441ccfd3b9192c4f281a4dc4d69 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 30 Nov 2007 10:09:54 +1100 Subject: add ctdb_disable/enable_monitoring() that only modifies the monitoring flag. change calling of the recovered/takeip/releaseip event scripts to use these enable/disable functions instead of stopping/starting monitoring. when we disable monitoring we want all events to still be running in particular the events to monitor for dead nodes and we only want to supress running the monitor event scripts (This used to be ctdb commit a006dcc4f75aba950dd701ad7d1a84e89df285e8) --- ctdb/include/ctdb_private.h | 2 ++ ctdb/server/ctdb_monitor.c | 23 ++++++++++++++++++++++- ctdb/server/ctdb_recover.c | 4 ++-- ctdb/server/ctdb_takeover.c | 8 ++++---- 4 files changed, 30 insertions(+), 7 deletions(-) (limited to 'ctdb') diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index dbe4f74689..7f5ff2d1b2 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -1045,6 +1045,8 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb); uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb); +void ctdb_disable_monitoring(struct ctdb_context *ctdb); +void ctdb_enable_monitoring(struct ctdb_context *ctdb); void ctdb_stop_monitoring(struct ctdb_context *ctdb); void ctdb_start_monitoring(struct ctdb_context *ctdb); void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb); diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index ce172fa345..7053451ea4 100644 --- a/ctdb/server/ctdb_monitor.c +++ 
b/ctdb/server/ctdb_monitor.c @@ -212,7 +212,28 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te, } } -/* stop any monitoring */ +/* + (Temporaily) Disabling monitoring will stop the monitor event scripts + from running but node health checks will still occur +*/ +void ctdb_disable_monitoring(struct ctdb_context *ctdb) +{ + ctdb->monitoring_mode = CTDB_MONITORING_DISABLED; + DEBUG(0,("Monitoring has been stopped\n")); +} + +/* + Re-enable running monitor events after they have been disabled + */ +void ctdb_enable_monitoring(struct ctdb_context *ctdb) +{ + ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE; + DEBUG(0,("Monitoring has been enabled\n")); +} + +/* stop any monitoring + this should only be done when shutting down the daemon +*/ void ctdb_stop_monitoring(struct ctdb_context *ctdb) { talloc_free(ctdb->monitor_context); diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 89e6d3893a..1c9f6a91bd 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -482,7 +482,7 @@ static void set_recmode_handler(struct event_context *ev, struct fd_event *fde, } - ctdb_stop_monitoring(state->ctdb); + ctdb_disable_monitoring(state->ctdb); /* call the events script to tell all subsystems that we have recovered */ ret = ctdb_event_script_callback(state->ctdb, @@ -491,7 +491,7 @@ static void set_recmode_handler(struct event_context *ev, struct fd_event *fde, ctdb_recovered_callback, state, "recovered"); - ctdb_start_monitoring(state->ctdb); + ctdb_enable_monitoring(state->ctdb); if (ret != 0) { ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "failed to run eventscript from set_recmode"); diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c index cd9c385dfb..90d692e355 100644 --- a/ctdb/server/ctdb_takeover.c +++ b/ctdb/server/ctdb_takeover.c @@ -236,7 +236,7 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits, 
vnn->iface)); - ctdb_stop_monitoring(ctdb); + ctdb_disable_monitoring(ctdb); ret = ctdb_event_script_callback(ctdb, timeval_current_ofs(ctdb->tunable.script_timeout, 0), @@ -245,7 +245,7 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, vnn->iface, inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits); - ctdb_start_monitoring(ctdb); + ctdb_enable_monitoring(ctdb); if (ret != 0) { DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n", @@ -362,7 +362,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, state->vnn = vnn; - ctdb_stop_monitoring(ctdb); + ctdb_disable_monitoring(ctdb); ret = ctdb_event_script_callback(ctdb, timeval_current_ofs(ctdb->tunable.script_timeout, 0), @@ -371,7 +371,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, vnn->iface, inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits); - ctdb_start_monitoring(ctdb); + ctdb_enable_monitoring(ctdb); if (ret != 0) { DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n", -- cgit From 975fbc8e225187475e653d548a614626a6fd8b9e Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 30 Nov 2007 10:14:43 +1100 Subject: always set up a new monitoring event regardless of whether monitoring is enabled or not (This used to be ctdb commit c3035f46d1a65d2d97c8be7e679d59e471c092c2) --- ctdb/server/ctdb_monitor.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 7053451ea4..3fadf407ab 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -110,15 +110,9 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) next_interval = ctdb->tunable.monitor_interval; } - if (ctdb->monitor_context == NULL) { - DEBUG(0,(__location__ " monitoring was disabled while running" - " healthcheck. 
Health checks postphoned until" - " monitoring is re-enabled.\n")); - } else { - event_add_timed(ctdb->ev, ctdb->monitor_context, + event_add_timed(ctdb->ev, ctdb->monitor_context, timeval_current_ofs(next_interval, 0), ctdb_check_health, ctdb); - } if (c.old_flags == node->flags) { return; @@ -149,27 +143,13 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p } if (ctdb->done_startup) { - if (ctdb->monitor_context == NULL) { - DEBUG(0,(__location__ " monitoring was disabled while " - "running startup event. " - "startup event postphoned until " - "monitoring is re-enabled.\n")); - } else { - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_zero(), - ctdb_check_health, ctdb); - } + event_add_timed(ctdb->ev, ctdb->monitor_context, + timeval_zero(), + ctdb_check_health, ctdb); } else { - if (ctdb->monitor_context == NULL) { - DEBUG(0,(__location__ " monitoring was disabled while " - "running startup event. " - "Health cheack postphoned until " - "monitoring is re-enabled.\n")); - } else { - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_current_ofs(ctdb->tunable.monitor_interval, 0), - ctdb_check_health, ctdb); - } + event_add_timed(ctdb->ev, ctdb->monitor_context, + timeval_current_ofs(ctdb->tunable.monitor_interval, 0), + ctdb_check_health, ctdb); } } -- cgit From 07dd0f6ff033ffe5b97c3fef14d766bd075c8f93 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 30 Nov 2007 10:53:35 +1100 Subject: log that monitoring has been "disabled" not that it has been "stopped" when monitoring is disabled (This used to be ctdb commit e7c92f661a523deae9544b679d412ae79cc0ede7) --- ctdb/server/ctdb_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 3fadf407ab..bccee57f1e 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -199,7 +199,7 @@ static void ctdb_check_health(struct event_context *ev, 
struct timed_event *te, void ctdb_disable_monitoring(struct ctdb_context *ctdb) { ctdb->monitoring_mode = CTDB_MONITORING_DISABLED; - DEBUG(0,("Monitoring has been stopped\n")); + DEBUG(0,("Monitoring has been disabled\n")); } /* -- cgit From 2f1baf34d39de59656bac37f3445bb6fc2b4b3d1 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Sat, 1 Dec 2007 10:06:42 +1100 Subject: up the loglevel for the enable/disable monitoring to level 1 (This used to be ctdb commit 5043a0afeedbd30c7f64c2733c8ae5bf75479a98) --- ctdb/server/ctdb_monitor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'ctdb') diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index bccee57f1e..dfa91abe71 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -199,7 +199,7 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te, void ctdb_disable_monitoring(struct ctdb_context *ctdb) { ctdb->monitoring_mode = CTDB_MONITORING_DISABLED; - DEBUG(0,("Monitoring has been disabled\n")); + DEBUG(1,("Monitoring has been disabled\n")); } /* @@ -208,7 +208,7 @@ void ctdb_disable_monitoring(struct ctdb_context *ctdb) void ctdb_enable_monitoring(struct ctdb_context *ctdb) { ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE; - DEBUG(0,("Monitoring has been enabled\n")); + DEBUG(1,("Monitoring has been enabled\n")); } /* stop any monitoring -- cgit