diff options
author | Martin Schwenke <martin@meltin.net> | 2014-12-09 13:51:27 +1100 |
---|---|---|
committer | Amitay Isaacs <amitay@samba.org> | 2015-02-13 07:19:07 +0100 |
commit | 48c91407abd5e34463d3a10cb6fce47ec4a0d5f6 (patch) | |
tree | cd80f4aa58bb2af4046358ae0a51b54284dcca9e /ctdb | |
parent | 1d6ed91f5518d462ba368bca03be923428710157 (diff) | |
download | samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.tar.gz samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.tar.xz samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.zip |
ctdb-recoverd: Don't release and re-take the recovery lock
Just continue to hold it; otherwise, a broken node might win an
election and grab the lock.
Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Diffstat (limited to 'ctdb')
-rw-r--r-- | ctdb/server/ctdb_recoverd.c | 46 |
1 files changed, 26 insertions, 20 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 2045413ca0..99018be8d3 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -1809,29 +1809,35 @@ static int do_recovery(struct ctdb_recoverd *rec, } if (ctdb->recovery_lock_file != NULL) { - DEBUG(DEBUG_ERR, ("Taking out recovery lock from recovery daemon (%s)\n", ctdb->recovery_lock_file)); - start_time = timeval_current(); - ctdb_recovery_unlock(ctdb); - DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock\n")); - if (!ctdb_recovery_lock(ctdb)) { - if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) { - /* If ctdb is trying first recovery, it's - * possible that current node does not know yet - * who the recmaster is. - */ - DEBUG(DEBUG_ERR, ("Unable to get recovery lock" - " - retrying recovery\n")); + if (ctdb_recovery_have_lock(ctdb)) { + DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n")); + } else { + start_time = timeval_current(); + DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n", + ctdb->recovery_lock_file)); + if (!ctdb_recovery_lock(ctdb)) { + if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) { + /* If ctdb is trying first recovery, it's + * possible that current node does not know + * yet who the recmaster is. 
+ */ + DEBUG(DEBUG_ERR, ("Unable to get recovery lock" + " - retrying recovery\n")); + return -1; + } + + DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery " + "and ban ourself for %u seconds\n", + ctdb->tunable.recovery_ban_period)); + ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period); return -1; } - - DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery " - "and ban ourself for %u seconds\n", - ctdb->tunable.recovery_ban_period)); - ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period); - return -1; + ctdb_ctrl_report_recd_lock_latency(ctdb, + CONTROL_TIMEOUT(), + timeval_elapsed(&start_time)); + DEBUG(DEBUG_NOTICE, + ("Recovery lock taken successfully by recovery daemon\n")); } - ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time)); - DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n")); } DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node)); |