summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Andrew Tridgell <tridge@samba.org> 2007-12-03 10:19:24 +1100
committer Andrew Tridgell <tridge@samba.org> 2007-12-03 10:19:24 +1100
commit 7edb41692eb8d32dfeadb08af61db744f39900c2 (patch)
tree d8baa16c422f364bff2aa87f0c2032fbfe20b17f
parent 330bf59ab1c14ca09bb4d60b9dbd02793debff1c (diff)
parent 2f1baf34d39de59656bac37f3445bb6fc2b4b3d1 (diff)
download samba-7edb41692eb8d32dfeadb08af61db744f39900c2.tar.gz
samba-7edb41692eb8d32dfeadb08af61db744f39900c2.tar.xz
samba-7edb41692eb8d32dfeadb08af61db744f39900c2.zip
merge from ronnie
(This used to be ctdb commit 6653a0b67381310236e548e5fc0a9e27209b44e0)
-rw-r--r-- ctdb/client/ctdb_client.c 23
-rw-r--r-- ctdb/include/ctdb.h 10
-rw-r--r-- ctdb/include/ctdb_private.h 5
-rw-r--r-- ctdb/server/ctdb_control.c 5
-rw-r--r-- ctdb/server/ctdb_monitor.c 54
-rw-r--r-- ctdb/server/ctdb_recover.c 7
-rw-r--r-- ctdb/server/ctdb_recoverd.c 149
-rw-r--r-- ctdb/server/ctdb_server.c 17
-rw-r--r-- ctdb/server/ctdb_takeover.c 12
-rw-r--r-- ctdb/server/ctdbd.c 18
-rw-r--r-- ctdb/tcp/tcp_connect.c 26
-rw-r--r-- ctdb/tools/ctdb.c 24
12 files changed, 237 insertions, 113 deletions
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 4d91e6d70c..1935f5b7b1 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -1932,29 +1932,6 @@ int ctdb_ctrl_getpnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t
}
/*
- set the monitoring mode of a remote node
- */
-int ctdb_ctrl_setmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t monmode)
-{
- int ret;
- TDB_DATA data;
- int32_t res;
-
- data.dsize = sizeof(uint32_t);
- data.dptr = (uint8_t *)&monmode;
-
- ret = ctdb_control(ctdb, destnode, 0,
- CTDB_CONTROL_SET_MONMODE, 0, data,
- NULL, NULL, &res, &timeout, NULL);
- if (ret != 0 || res != 0) {
- DEBUG(0,(__location__ " ctdb_control for setmonmode failed\n"));
- return -1;
- }
-
- return 0;
-}
-
-/*
get the monitoring mode of a remote node
*/
int ctdb_ctrl_getmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *monmode)
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index b3a65a36ab..e44706d848 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -160,6 +160,12 @@ int ctdb_set_socketname(struct ctdb_context *ctdb, const char *socketname);
int ctdb_set_nlist(struct ctdb_context *ctdb, const char *nlist);
/*
+ Check whether a specific ip address exists in the node list. Returns
+ the id of the matching node, or -1 when no node has that address
+*/
+int ctdb_ip_to_nodeid(struct ctdb_context *ctdb, const char *nodeip);
+
+/*
start the ctdb protocol
*/
int ctdb_start(struct ctdb_context *ctdb);
@@ -343,10 +349,6 @@ int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint
get the monitoring mode of a remote node
*/
int ctdb_ctrl_getmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *monmode);
-/*
- set the monitoringmode of a remote node
- */
-int ctdb_ctrl_setmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t monmode);
/*
get the recovery master of a remote node
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index cb76bb0074..7f5ff2d1b2 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -367,6 +367,7 @@ struct ctdb_context {
const char *default_public_interface;
pid_t recoverd_pid;
bool done_startup;
+ const char *node_ip;
};
struct ctdb_db_context {
@@ -451,7 +452,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0,
CTDB_CONTROL_GET_PNN = 35,
CTDB_CONTROL_SHUTDOWN = 36,
CTDB_CONTROL_GET_MONMODE = 37,
- CTDB_CONTROL_SET_MONMODE = 38,
+ /* #38 removed */
CTDB_CONTROL_MAX_RSN = 39,
CTDB_CONTROL_SET_RSN_NONEMPTY = 40,
CTDB_CONTROL_DELETE_LOW_RSN = 41,
@@ -1044,6 +1045,8 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb);
uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb);
+void ctdb_disable_monitoring(struct ctdb_context *ctdb);
+void ctdb_enable_monitoring(struct ctdb_context *ctdb);
void ctdb_stop_monitoring(struct ctdb_context *ctdb);
void ctdb_start_monitoring(struct ctdb_context *ctdb);
void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb);
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
index f13f39dbcc..35d2e155db 100644
--- a/ctdb/server/ctdb_control.c
+++ b/ctdb/server/ctdb_control.c
@@ -223,11 +223,6 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg);
- case CTDB_CONTROL_SET_MONMODE:
- CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
- ctdb->monitoring_mode = *(uint32_t *)indata.dptr;
- return 0;
-
case CTDB_CONTROL_GET_MONMODE:
return ctdb->monitoring_mode;
diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c
index 0e2dc29c6a..bdb3d45eda 100644
--- a/ctdb/server/ctdb_monitor.c
+++ b/ctdb/server/ctdb_monitor.c
@@ -33,13 +33,6 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve
struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
int i;
- if (ctdb->monitoring_mode == CTDB_MONITORING_DISABLED) {
- event_add_timed(ctdb->ev, ctdb->monitor_context,
- timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
- ctdb_check_for_dead_nodes, ctdb);
- return;
- }
-
/* send a keepalive to all other nodes, unless */
for (i=0;i<ctdb->num_nodes;i++) {
struct ctdb_node *node = ctdb->nodes[i];
@@ -118,8 +111,8 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
}
event_add_timed(ctdb->ev, ctdb->monitor_context,
- timeval_current_ofs(next_interval, 0),
- ctdb_check_health, ctdb);
+ timeval_current_ofs(next_interval, 0),
+ ctdb_check_health, ctdb);
if (c.old_flags == node->flags) {
return;
@@ -155,7 +148,7 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p
ctdb_check_health, ctdb);
} else {
event_add_timed(ctdb->ev, ctdb->monitor_context,
- timeval_current_ofs(ctdb->tunable.monitor_interval, 0),
+ timeval_current_ofs(ctdb->tunable.monitor_interval, 0),
ctdb_check_health, ctdb);
}
@@ -199,12 +192,35 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
}
}
-/* stop any monitoring */
+/*
+ (Temporarily) disabling monitoring will stop the monitor event scripts
+ from running but node health checks will still occur
+*/
+void ctdb_disable_monitoring(struct ctdb_context *ctdb)
+{
+ ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
+ DEBUG(2,("Monitoring has been disabled\n"));
+}
+
+/*
+ Re-enable running monitor events after they have been disabled
+ */
+void ctdb_enable_monitoring(struct ctdb_context *ctdb)
+{
+ ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
+ DEBUG(2,("Monitoring has been enabled\n"));
+}
+
+/* stop any monitoring
+ this should only be done when shutting down the daemon
+*/
void ctdb_stop_monitoring(struct ctdb_context *ctdb)
{
talloc_free(ctdb->monitor_context);
- ctdb->monitor_context = talloc_new(ctdb);
- CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor_context);
+ ctdb->monitor_context = NULL;
+
+ ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
+ DEBUG(0,("Monitoring has been stopped\n"));
}
/*
@@ -214,8 +230,15 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb)
{
struct timed_event *te;
+ if (ctdb->monitoring_mode == CTDB_MONITORING_ACTIVE) {
+ return;
+ }
+
ctdb_stop_monitoring(ctdb);
+ ctdb->monitor_context = talloc_new(ctdb);
+ CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor_context);
+
te = event_add_timed(ctdb->ev, ctdb->monitor_context,
timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
ctdb_check_for_dead_nodes, ctdb);
@@ -225,6 +248,9 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb)
timeval_current_ofs(ctdb->tunable.monitor_retry, 0),
ctdb_check_health, ctdb);
CTDB_NO_MEMORY_FATAL(ctdb, te);
+
+ ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
+ DEBUG(0,("Monitoring has been started\n"));
}
@@ -243,7 +269,7 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
node->flags &= ~m->clear;
if (node->flags == old_flags) {
- /* no change */
+ DEBUG(2, ("Control modflags on node %u - Unchanged - flags 0x%x\n", ctdb->pnn, node->flags));
return 0;
}
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
index 3721facdba..8b2dfb7583 100644
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@ -415,7 +415,7 @@ static void ctdb_recovered_callback(struct ctdb_context *ctdb, int status, void
{
struct ctdb_set_recmode_state *state = talloc_get_type(p, struct ctdb_set_recmode_state);
- ctdb_start_monitoring(ctdb);
+ ctdb_enable_monitoring(state->ctdb);
if (status == 0) {
ctdb->recovery_mode = state->recmode;
@@ -484,7 +484,7 @@ static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
}
- ctdb_stop_monitoring(state->ctdb);
+ ctdb_disable_monitoring(state->ctdb);
/* call the events script to tell all subsystems that we have recovered */
ret = ctdb_event_script_callback(state->ctdb,
@@ -492,7 +492,10 @@ static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
state,
ctdb_recovered_callback,
state, "recovered");
+
if (ret != 0) {
+ ctdb_enable_monitoring(state->ctdb);
+
ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "failed to run eventscript from set_recmode");
talloc_free(state);
return;
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index e54c53d935..8e297e9f52 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -61,12 +61,21 @@ static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
{
struct ctdb_context *ctdb = rec->ctdb;
+ DEBUG(0,("Unbanning node %u\n", pnn));
+
if (!ctdb_validate_pnn(ctdb, pnn)) {
DEBUG(0,("Bad pnn %u in ctdb_unban_node\n", pnn));
return;
}
+ if (pnn == ctdb->pnn) {
+ /* make sure we remember we are no longer banned in case
+ there is an election */
+ rec->node_flags &= ~NODE_FLAGS_BANNED;
+ }
+
if (rec->banned_nodes[pnn] == NULL) {
+ DEBUG(0,("No ban recorded for this node. ctdb_unban_node() request ignored\n"));
return;
}
@@ -97,6 +106,8 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_
{
struct ctdb_context *ctdb = rec->ctdb;
+ DEBUG(0,("Banning node %u for %u seconds\n", pnn, ban_time));
+
if (!ctdb_validate_pnn(ctdb, pnn)) {
DEBUG(0,("Bad pnn %u in ctdb_ban_node\n", pnn));
return;
@@ -111,10 +122,20 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_
DEBUG(0,("self ban - lowering our election priority\n"));
/* banning ourselves - lower our election priority */
rec->priority_time = timeval_current();
+
+ /* make sure we remember we are banned in case there is an
+ election */
+ rec->node_flags |= NODE_FLAGS_BANNED;
}
ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
+ if (rec->banned_nodes[pnn] != NULL) {
+ DEBUG(0,("Re-banning an already banned node. Remove previous ban and set a new ban.\n"));
+ talloc_free(rec->banned_nodes[pnn]);
+ rec->banned_nodes[pnn] = NULL;
+ }
+
rec->banned_nodes[pnn] = talloc(rec, struct ban_state);
CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
@@ -739,13 +760,32 @@ static void ctdb_wait_election(struct ctdb_recoverd *rec)
}
}
+/*
+ remember the trouble maker
+ */
+static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
+{
+ struct ctdb_context *ctdb = rec->ctdb;
+
+ if (rec->last_culprit != culprit ||
+ timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
+ DEBUG(0,("New recovery culprit %u\n", culprit));
+ /* either a new node is the culprit, or we've decided to forgive them */
+ rec->last_culprit = culprit;
+ rec->first_recover_time = timeval_current();
+ rec->culprit_counter = 0;
+ }
+ rec->culprit_counter++;
+}
/*
- update our local flags from all remote connected nodes.
+ Update our local flags from all remote connected nodes.
+ This is only run when we are, or believe we are, the recovery master
*/
-static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
int j;
+ struct ctdb_context *ctdb = rec->ctdb;
TALLOC_CTX *mem_ctx = talloc_new(ctdb);
/* get the nodemap for all active remote nodes and verify
@@ -767,19 +807,55 @@ static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *n
if (ret != 0) {
DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n",
nodemap->nodes[j].pnn));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
talloc_free(mem_ctx);
- return -1;
+ return MONITOR_FAILED;
}
if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
+ struct ctdb_node_flag_change c;
+ TDB_DATA data;
+
+ /* We should tell our daemon about this so it
+ updates its flags or else we will log the same
+ message again in the next iteration of recovery.
+ Since we are the recovery master we can just as
+ well update the flags on all nodes.
+ */
+ c.pnn = nodemap->nodes[j].pnn;
+ c.old_flags = nodemap->nodes[j].flags;
+ c.new_flags = remote_nodemap->nodes[j].flags;
+
+ data.dptr = (uint8_t *)&c;
+ data.dsize = sizeof(c);
+
+ ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
+ CTDB_SRVID_NODE_FLAGS_CHANGED,
+ data);
+
+ /* Update our local copy of the flags in the recovery
+ daemon.
+ */
DEBUG(0,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
- nodemap->nodes[j].pnn, nodemap->nodes[j].flags,
- remote_nodemap->nodes[j].flags));
+ nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
+ nodemap->nodes[j].flags));
nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
+
+ /* If the BANNED flag has changed for the node
+ this is a good reason to do a new election.
+ */
+ if ((c.old_flags ^ c.new_flags) & NODE_FLAGS_BANNED) {
+ DEBUG(0,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
+ nodemap->nodes[j].pnn, c.new_flags,
+ c.old_flags));
+ talloc_free(mem_ctx);
+ return MONITOR_ELECTION_NEEDED;
+ }
+
}
talloc_free(remote_nodemap);
}
talloc_free(mem_ctx);
- return 0;
+ return MONITOR_OK;
}
@@ -801,23 +877,6 @@ static uint32_t new_generation(void)
return generation;
}
-/*
- remember the trouble maker
- */
-static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
-{
- struct ctdb_context *ctdb = rec->ctdb;
-
- if (rec->last_culprit != culprit ||
- timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
- DEBUG(0,("New recovery culprit %u\n", culprit));
- /* either a new node is the culprit, or we've decide to forgive them */
- rec->last_culprit = culprit;
- rec->first_recover_time = timeval_current();
- rec->culprit_counter = 0;
- }
- rec->culprit_counter++;
-}
/*
we are the recmaster, and recovery is needed - start a recovery run
@@ -1615,6 +1674,18 @@ again:
goto again;
}
+
+ /* We must check if we need to ban a node here but we want to do this
+ as early as possible so we don't wait until we have pulled the node
+ map from the local node. That's why we have the hardcoded value 20
+ */
+ if (rec->culprit_counter > 20) {
+ DEBUG(0,("Node %u has caused %u failures in %.0f seconds - banning it for %u seconds\n",
+ rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
+ ctdb->tunable.recovery_ban_period));
+ ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
+ }
+
/* get relevant tunables */
ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
if (ret != 0) {
@@ -1643,6 +1714,29 @@ again:
goto again;
}
+ /* check that we (recovery daemon) and the local ctdb daemon
+ agree on whether we are banned or not
+ */
+ if (nodemap->nodes[pnn].flags & NODE_FLAGS_BANNED) {
+ if (rec->banned_nodes[pnn] == NULL) {
+ DEBUG(0,("Local ctdb daemon thinks this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
+
+ ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
+ ctdb_set_culprit(rec, pnn);
+
+ goto again;
+ }
+ } else {
+ if (rec->banned_nodes[pnn] != NULL) {
+ DEBUG(0,("Local ctdb daemon does not think this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
+
+ ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
+ ctdb_set_culprit(rec, pnn);
+
+ goto again;
+ }
+ }
+
/* remember our own node flags */
rec->node_flags = nodemap->nodes[pnn].flags;
@@ -1764,8 +1858,13 @@ again:
/* ensure our local copies of flags are right */
- ret = update_local_flags(ctdb, nodemap);
- if (ret != 0) {
+ ret = update_local_flags(rec, nodemap);
+ if (ret == MONITOR_ELECTION_NEEDED) {
+ DEBUG(0,("update_local_flags() called for a re-election.\n"));
+ force_election(rec, mem_ctx, pnn, nodemap);
+ goto again;
+ }
+ if (ret != MONITOR_OK) {
DEBUG(0,("Unable to update local flags\n"));
goto again;
}
diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c
index dddf90753b..2a80798dd9 100644
--- a/ctdb/server/ctdb_server.c
+++ b/ctdb/server/ctdb_server.c
@@ -35,6 +35,23 @@ int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport)
}
/*
+ Check whether an ip is a valid node ip
+ Returns the node id for this ip address or -1
+*/
+int ctdb_ip_to_nodeid(struct ctdb_context *ctdb, const char *nodeip)
+{
+ int nodeid;
+
+ for (nodeid=0;nodeid<ctdb->num_nodes;nodeid++) {
+ if (!strcmp(ctdb->nodes[nodeid]->address.address, nodeip)) {
+ return nodeid;
+ }
+ }
+
+ return -1;
+}
+
+/*
choose the recovery lock file
*/
int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file)
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
index ec3455e4c0..a452da6424 100644
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@ -131,7 +131,7 @@ static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
char *ip = inet_ntoa(state->sin->sin_addr);
struct ctdb_tcp_array *tcparray;
- ctdb_start_monitoring(ctdb);
+ ctdb_enable_monitoring(ctdb);
if (status != 0) {
DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
@@ -238,7 +238,7 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits,
vnn->iface));
- ctdb_stop_monitoring(ctdb);
+ ctdb_disable_monitoring(ctdb);
ret = ctdb_event_script_callback(ctdb,
timeval_current_ofs(ctdb->tunable.script_timeout, 0),
@@ -247,7 +247,9 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
vnn->iface,
inet_ntoa(pip->sin.sin_addr),
vnn->public_netmask_bits);
+
if (ret != 0) {
+ ctdb_enable_monitoring(ctdb);
DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
inet_ntoa(pip->sin.sin_addr), vnn->iface));
talloc_free(state);
@@ -299,7 +301,7 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status,
char *ip = inet_ntoa(state->sin->sin_addr);
TDB_DATA data;
- ctdb_start_monitoring(ctdb);
+ ctdb_enable_monitoring(ctdb);
/* send a message to all clients of this node telling them
that the cluster has been reconfigured and they should
@@ -364,7 +366,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
state->vnn = vnn;
- ctdb_stop_monitoring(ctdb);
+ ctdb_disable_monitoring(ctdb);
ret = ctdb_event_script_callback(ctdb,
timeval_current_ofs(ctdb->tunable.script_timeout, 0),
@@ -374,6 +376,8 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
inet_ntoa(pip->sin.sin_addr),
vnn->public_netmask_bits);
if (ret != 0) {
+ ctdb_enable_monitoring(ctdb);
+
DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
inet_ntoa(pip->sin.sin_addr), vnn->iface));
talloc_free(state);
diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c
index eaf79a0ed7..5613ea91b7 100644
--- a/ctdb/server/ctdbd.c
+++ b/ctdb/server/ctdbd.c
@@ -50,6 +50,7 @@ static struct {
const char *db_dir_persistent;
const char *public_interface;
const char *single_public_ip;
+ const char *node_ip;
int no_setsched;
} options = {
.nlist = ETCDIR "/ctdb/nodes",
@@ -110,6 +111,7 @@ int main(int argc, const char *argv[])
{ "event-script-dir", 0, POPT_ARG_STRING, &options.event_script_dir, 0, "event script directory", "dirname" },
{ "logfile", 0, POPT_ARG_STRING, &options.logfile, 0, "log file location", "filename" },
{ "nlist", 0, POPT_ARG_STRING, &options.nlist, 0, "node list file", "filename" },
+ { "node-ip", 0, POPT_ARG_STRING, &options.node_ip, 0, "node ip", "ip-address"},
{ "listen", 0, POPT_ARG_STRING, &options.myaddress, 0, "address to listen on", "address" },
{ "transport", 0, POPT_ARG_STRING, &options.transport, 0, "protocol transport", NULL },
{ "dbdir", 0, POPT_ARG_STRING, &options.db_dir, 0, "directory for the tdb files", NULL },
@@ -166,7 +168,7 @@ int main(int argc, const char *argv[])
ctdb->upcalls = &ctdb_upcalls;
ctdb->idr = idr_init(ctdb);
ctdb->recovery_lock_fd = -1;
- ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
+ ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
ctdb_tunables_set_defaults(ctdb);
@@ -198,6 +200,20 @@ int main(int argc, const char *argv[])
exit(1);
}
+ /* if a node-ip was specified, verify that it exists in the
+ nodes file
+ */
+ if (options.node_ip != NULL) {
+ DEBUG(0,("IP for this node is %s\n", options.node_ip));
+ ret = ctdb_ip_to_nodeid(ctdb, options.node_ip);
+ if (ret == -1) {
+ DEBUG(0,("The specified node-ip:%s is not a valid node address. Exiting.\n", options.node_ip));
+ exit(1);
+ }
+ ctdb->node_ip = options.node_ip;
+ DEBUG(0,("This is node %d\n", ret));
+ }
+
if (options.db_dir) {
ret = ctdb_set_tdb_dir(ctdb, options.db_dir);
if (ret == -1) {
diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c
index 3548f82ed7..3c4e7bfb10 100644
--- a/ctdb/tcp/tcp_connect.c
+++ b/ctdb/tcp/tcp_connect.c
@@ -214,13 +214,9 @@ static void ctdb_listen_event(struct event_context *ev, struct fd_event *fde,
if (fd == -1) return;
incoming_node = inet_ntoa(addr.sin_addr);
- for (nodeid=0;nodeid<ctdb->num_nodes;nodeid++) {
- if (!strcmp(incoming_node, ctdb->nodes[nodeid]->address.address)) {
- DEBUG(0, ("Incoming connection from node:%d %s\n",nodeid,incoming_node));
- break;
- }
- }
- if (nodeid>=ctdb->num_nodes) {
+ nodeid = ctdb_ip_to_nodeid(ctdb, incoming_node);
+
+ if (nodeid == -1) {
DEBUG(0, ("Refused connection from unknown node %s\n", incoming_node));
close(fd);
return;
@@ -275,17 +271,27 @@ static int ctdb_tcp_listen_automatic(struct ctdb_context *ctdb)
}
for (i=0;i<ctdb->num_nodes;i++) {
+ /* if node_ip is specified we will only try to bind to that
+ ip.
+ */
+ if (ctdb->node_ip != NULL) {
+ if (strcmp(ctdb->node_ip, ctdb->nodes[i]->address.address)) {
+ continue;
+ }
+ }
+
ZERO_STRUCT(sock);
#ifdef HAVE_SOCK_SIN_LEN
sock.sin_len = sizeof(sock);
#endif
sock.sin_port = htons(ctdb->nodes[i]->address.port);
sock.sin_family = PF_INET;
- if (ctdb_tcp_get_address(ctdb, ctdb->nodes[i]->address.address,
- &sock.sin_addr) != 0) {
+ if (ctdb_tcp_get_address(ctdb,
+ ctdb->nodes[i]->address.address,
+ &sock.sin_addr) != 0) {
continue;
}
-
+
if (bind(ctcp->listen_fd, (struct sockaddr * )&sock,
sizeof(sock)) == 0) {
break;
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
index e0c6621429..0a4f370697 100644
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@@ -754,29 +754,6 @@ static int control_getmonmode(struct ctdb_context *ctdb, int argc, const char **
}
/*
- set the monitoring mode of a remote node
- */
-static int control_setmonmode(struct ctdb_context *ctdb, int argc, const char **argv)
-{
- uint32_t monmode;
- int ret;
-
- if (argc < 1) {
- usage();
- }
-
- monmode = strtoul(argv[0], NULL, 0);
-
- ret = ctdb_ctrl_setmonmode(ctdb, TIMELIMIT(), options.pnn, monmode);
- if (ret != 0) {
- DEBUG(0, ("Unable to set monmode on node %u\n", options.pnn));
- return ret;
- }
-
- return 0;
-}
-
-/*
display remote list of keys/data for a db
*/
static int control_catdb(struct ctdb_context *ctdb, int argc, const char **argv)
@@ -1082,7 +1059,6 @@ static const struct {
{ "getdbmap", control_getdbmap, true, "show the database map" },
{ "catdb", control_catdb, true, "dump a database" , "<dbname>"},
{ "getmonmode", control_getmonmode, true, "show monitoring mode" },
- { "setmonmode", control_setmonmode, true, "set monitoring mode", "<0|1>" },
{ "setdebug", control_setdebug, true, "set debug level", "<debuglevel>" },
{ "getdebug", control_getdebug, true, "get debug level" },
{ "attach", control_attach, true, "attach to a database", "<dbname>" },