diff options
author | Stefan Metzmacher <metze@samba.org> | 2009-10-09 15:47:49 +0200 |
---|---|---|
committer | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2009-10-26 14:21:45 +1100 |
commit | 198866d82d4487f6d45104e051ca94b8072a21f2 (patch) | |
tree | 0dcc052af97fedb4f8ceeef9a62ae5ab0977e4d2 /ctdb/server/ctdb_recoverd.c | |
parent | 7a616a0d7b5c0b83822a3c0084c0dc82060b546a (diff) | |
download | samba-198866d82d4487f6d45104e051ca94b8072a21f2.tar.gz samba-198866d82d4487f6d45104e051ca94b8072a21f2.tar.xz samba-198866d82d4487f6d45104e051ca94b8072a21f2.zip |
server: fix takeover runs when the recovery master becomes unhealthy
The problem was this:
When the monitor event fails, the node->flags get updated,
and an update (containing the old and new flags) is sent to
the recovery master.
If the recovery master sends the update to itself (the same process),
it was comparing the node->flags variable with the received new flags.
This check always found both flag values to be equal
and never set the rec->need_takeover_run variable to true.
There were two problems. First, the push_flags_handler() function
didn't pass the received old flags.
Second, the ctdb_control_modflags() function ignored the received old flags.
metze
(This used to be ctdb commit 8ec633b64a05a2d903c2b9639909f15f6375548f)
Diffstat (limited to 'ctdb/server/ctdb_recoverd.c')
-rw-r--r-- | ctdb/server/ctdb_recoverd.c | 40 |
1 files changed, 38 insertions, 2 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 6a453f98d8b..ecdcd99dd18 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2056,11 +2056,47 @@ static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid, { int ret; struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr; + struct ctdb_node_map *nodemap=NULL; + TALLOC_CTX *tmp_ctx = talloc_new(ctdb); + uint32_t recmaster; + uint32_t *nodes; - ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags); + /* find the recovery master */ + ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster); if (ret != 0) { - DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n")); + DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n")); + talloc_free(tmp_ctx); + return; + } + + /* read the node flags from the recmaster */ + ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn)); + talloc_free(tmp_ctx); + return; } + if (c->pnn >= nodemap->num) { + DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn)); + talloc_free(tmp_ctx); + return; + } + + /* send the flags update to all connected nodes */ + nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true); + + if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS, + nodes, 0, CONTROL_TIMEOUT(), + false, data, + NULL, NULL, + NULL) != 0) { + DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n")); + + talloc_free(tmp_ctx); + return; + } + + talloc_free(tmp_ctx); } |