summaryrefslogtreecommitdiffstats
path: root/ctdb/tools
diff options
context:
space:
mode:
author Martin Schwenke <martin@meltin.net> 2014-05-23 21:58:55 +1000
committer Amitay Isaacs <amitay@samba.org> 2014-05-29 05:59:37 +0200
commit 4dd382296d3e78000713ab0ac1f8e531e25857cc (patch)
tree 30f8b27375004c4c28bfbecdb341c4d713ecd3eb /ctdb/tools
parent f2ef23cd5f227d2da2f032a2a56fbdd4d105b137 (diff)
downloadsamba-4dd382296d3e78000713ab0ac1f8e531e25857cc.tar.gz
samba-4dd382296d3e78000713ab0ac1f8e531e25857cc.tar.xz
samba-4dd382296d3e78000713ab0ac1f8e531e25857cc.zip
ctdb-tools-ctdb: Make natgwlist and lvsmaster more resilient
Recent changes have caused these commands to attempt to get capabilities from all nodes before doing further filtering. This means that capabilities are unnecessarily fetched from nodes that are unlikely to be the master. If such a node does not answer the control then many nodes can fail to calculate the master node. In the case of natgwlist this will cause "monitor" events to fail, resulting in unhealthy nodes. Restore the behaviour where capabilities are only fetched for a node that will be the master if it has the desired flags. Although this masks a problem where a connected node is not replying, it can help to avoid an outage in some cases. Add supporting tests and infrastructure. The infrastructure only lets a timeout be faked, and so far only for ctdb_ctrl_getcapabilities_stub(). The first test checks that this infrastructure works if the first node times out in natgwlist. The second test checks the case worked around by the above fix - that is, there is no failure when a node with a PNN beyond that of the NATGW master times out. Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com> Autobuild-User(master): Amitay Isaacs <amitay@samba.org> Autobuild-Date(master): Thu May 29 05:59:37 CEST 2014 on sn-devel-104
Diffstat (limited to 'ctdb/tools')
-rw-r--r-- ctdb/tools/ctdb.c 80
1 files changed, 47 insertions, 33 deletions
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
index 8033fcb7ae2..4704726de7d 100644
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@@ -1179,7 +1179,8 @@ filter_nodemap_by_addrs(struct ctdb_context *ctdb,
static struct ctdb_node_map *
filter_nodemap_by_capabilities(struct ctdb_context *ctdb,
struct ctdb_node_map *nodemap,
- uint32_t required_capabilities)
+ uint32_t required_capabilities,
+ bool first_only)
{
int i;
uint32_t capabilities;
@@ -1213,6 +1214,9 @@ filter_nodemap_by_capabilities(struct ctdb_context *ctdb,
ret->nodes[ret->num] = nodemap->nodes[i];
ret->num++;
+ if (first_only) {
+ break;
+ }
}
return ret;
@@ -1252,7 +1256,7 @@ static int control_natgwlist(struct ctdb_context *ctdb, int argc, const char **a
int i, ret;
struct pnn_node *natgw_nodes = NULL;
struct ctdb_node_map *orig_nodemap=NULL;
- struct ctdb_node_map *cnodemap, *nodemap;
+ struct ctdb_node_map *nodemap;
uint32_t mypnn, pnn;
const char *ip;
@@ -1293,21 +1297,15 @@ static int control_natgwlist(struct ctdb_context *ctdb, int argc, const char **a
goto done;
}
- /* Get a nodemap that includes only the nodes with the NATGW
- * capability */
- cnodemap = filter_nodemap_by_capabilities(ctdb, nodemap,
- CTDB_CAP_NATGW);
- if (cnodemap == NULL) {
- ret = -1;
- goto done;
- }
-
ret = 2; /* matches ENOENT */
pnn = -1;
ip = "0.0.0.0";
+ /* For each flag mask... */
for (i = 0; exclude_flags[i] != 0; i++) {
+ /* ... get a nodemap that excludes nodes with
+ * masked flags... */
struct ctdb_node_map *t =
- filter_nodemap_by_flags(ctdb, cnodemap,
+ filter_nodemap_by_flags(ctdb, nodemap,
exclude_flags[i]);
if (t == NULL) {
/* No memory */
@@ -1315,10 +1313,23 @@ static int control_natgwlist(struct ctdb_context *ctdb, int argc, const char **a
goto done;
}
if (t->num > 0) {
- ret = 0;
- pnn = t->nodes[0].pnn;
- ip = ctdb_addr_to_str(&t->nodes[0].addr);
- break;
+ /* ... and find the first node with the NATGW
+ * capability */
+ struct ctdb_node_map *n;
+ n = filter_nodemap_by_capabilities(ctdb, t,
+ CTDB_CAP_NATGW,
+ true);
+ if (n == NULL) {
+ /* No memory */
+ ret = -1;
+ goto done;
+ }
+ if (n->num > 0) {
+ ret = 0;
+ pnn = n->nodes[0].pnn;
+ ip = ctdb_addr_to_str(&n->nodes[0].addr);
+ break;
+ }
}
talloc_free(t);
}
@@ -3569,7 +3580,7 @@ static int control_lvs(struct ctdb_context *ctdb, int argc, const char **argv)
}
nodemap = filter_nodemap_by_capabilities(ctdb, orig_nodemap,
- CTDB_CAP_LVS);
+ CTDB_CAP_LVS, false);
if (nodemap == NULL) {
/* No memory */
ret = -1;
@@ -3609,26 +3620,17 @@ done:
static int control_lvsmaster(struct ctdb_context *ctdb, int argc, const char **argv)
{
TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
- struct ctdb_node_map *orig_nodemap=NULL;
- struct ctdb_node_map *nodemap;
+ struct ctdb_node_map *nodemap=NULL;
int i, ret;
ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn,
- tmp_ctx, &orig_nodemap);
+ tmp_ctx, &nodemap);
if (ret != 0) {
DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
talloc_free(tmp_ctx);
return -1;
}
- nodemap = filter_nodemap_by_capabilities(ctdb, orig_nodemap,
- CTDB_CAP_LVS);
- if (nodemap == NULL) {
- /* No memory */
- ret = -1;
- goto done;
- }
-
for (i = 0; lvs_exclude_flags[i] != 0; i++) {
struct ctdb_node_map *t =
filter_nodemap_by_flags(ctdb, nodemap,
@@ -3639,11 +3641,23 @@ static int control_lvsmaster(struct ctdb_context *ctdb, int argc, const char **a
goto done;
}
if (t->num > 0) {
- ret = 0;
- printf(options.machinereadable ?
- "%d\n" : "Node %d is LVS master\n",
- t->nodes[0].pnn);
- goto done;
+ struct ctdb_node_map *n;
+ n = filter_nodemap_by_capabilities(ctdb,
+ t,
+ CTDB_CAP_LVS,
+ true);
+ if (n == NULL) {
+ /* No memory */
+ ret = -1;
+ goto done;
+ }
+ if (n->num > 0) {
+ ret = 0;
+ printf(options.machinereadable ?
+ "%d\n" : "Node %d is LVS master\n",
+ n->nodes[0].pnn);
+ goto done;
+ }
}
talloc_free(t);
}