summary | refs | log | tree | commit | diff | stats
path: root/ctdb
diff options
context:
space:
mode:
author Martin Schwenke <martin@meltin.net> 2014-12-09 13:51:27 +1100
committer Amitay Isaacs <amitay@samba.org> 2015-02-13 07:19:07 +0100
commit 48c91407abd5e34463d3a10cb6fce47ec4a0d5f6 (patch)
tree cd80f4aa58bb2af4046358ae0a51b54284dcca9e /ctdb
parent 1d6ed91f5518d462ba368bca03be923428710157 (diff)
downloadsamba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.tar.gz
samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.tar.xz
samba-48c91407abd5e34463d3a10cb6fce47ec4a0d5f6.zip
ctdb-recoverd: Don't release and re-take the recovery lock
Just continue to hold it; otherwise, a broken node might win an election and grab the lock.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Diffstat (limited to 'ctdb')
-rw-r--r-- ctdb/server/ctdb_recoverd.c 46
1 files changed, 26 insertions, 20 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 2045413ca0..99018be8d3 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1809,29 +1809,35 @@ static int do_recovery(struct ctdb_recoverd *rec,
}
if (ctdb->recovery_lock_file != NULL) {
- DEBUG(DEBUG_ERR, ("Taking out recovery lock from recovery daemon (%s)\n", ctdb->recovery_lock_file));
- start_time = timeval_current();
- ctdb_recovery_unlock(ctdb);
- DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock\n"));
- if (!ctdb_recovery_lock(ctdb)) {
- if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
- /* If ctdb is trying first recovery, it's
- * possible that current node does not know yet
- * who the recmaster is.
- */
- DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
- " - retrying recovery\n"));
+ if (ctdb_recovery_have_lock(ctdb)) {
+ DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
+ } else {
+ start_time = timeval_current();
+ DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
+ ctdb->recovery_lock_file));
+ if (!ctdb_recovery_lock(ctdb)) {
+ if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
+ /* If ctdb is trying first recovery, it's
+ * possible that current node does not know
+ * yet who the recmaster is.
+ */
+ DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
+ " - retrying recovery\n"));
+ return -1;
+ }
+
+ DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
+ "and ban ourself for %u seconds\n",
+ ctdb->tunable.recovery_ban_period));
+ ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
return -1;
}
-
- DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
- "and ban ourself for %u seconds\n",
- ctdb->tunable.recovery_ban_period));
- ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
- return -1;
+ ctdb_ctrl_report_recd_lock_latency(ctdb,
+ CONTROL_TIMEOUT(),
+ timeval_elapsed(&start_time));
+ DEBUG(DEBUG_NOTICE,
+ ("Recovery lock taken successfully by recovery daemon\n"));
}
- ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
- DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
}
DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));