summary | refs | log | tree | commit | diff | stats
path: root/ctdb
diff options
context:
space:
mode:
author: Martin Schwenke <martin@meltin.net> 2014-12-17 20:33:19 +1100
committer: Amitay Isaacs <amitay@samba.org> 2015-02-13 07:19:07 +0100
commit: 432d6774891eba30a959cd2d8ee8469d189c7872 (patch)
tree: 97fc2622a4550d014a8286500557ed7a1f9bd0bf /ctdb
parent: 48c91407abd5e34463d3a10cb6fce47ec4a0d5f6 (diff)
download: samba-432d6774891eba30a959cd2d8ee8469d189c7872.tar.gz
samba-432d6774891eba30a959cd2d8ee8469d189c7872.tar.xz
samba-432d6774891eba30a959cd2d8ee8469d189c7872.zip
ctdb-recoverd: Improve error messages on recovery lock coherence fail
When the daemon is able to take the recovery lock during recovery, we might as well guess that the cluster filesystem has a lock coherence problem and print a more useful message. This will be more helpful to those trying out cluster filesystems that don't have lock coherence or that are difficult to set up.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Diffstat (limited to 'ctdb')
-rw-r--r-- ctdb/server/ctdb_recover.c 26
1 files changed, 14 insertions, 12 deletions
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
index 2c300ee090..4b9407f6c8 100644
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@ -528,16 +528,17 @@ static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
state->te = NULL;
- /* read the childs status when trying to lock the reclock file.
- child wrote 0 if everything is fine and 1 if it did manage
- to lock the file, which would be a problem since that means
- we got a request to exit from recovery but we could still lock
- the file which at this time SHOULD be locked by the recovery
- daemon on the recmaster
- */
+ /* If, as expected, the child was unable to take the recovery
+ * lock then it will have written 0 into the pipe, so
+ * continue. However, any other value (e.g. 1) indicates that
+ * it was able to take the recovery lock when it should have
+ * been held by the recovery daemon on the recovery master.
+ */
ret = sys_read(state->fd[0], &c, 1);
if (ret != 1 || c != 0) {
- ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
+ ctdb_request_control_reply(
+ state->ctdb, state->c, NULL, -1,
+ "Took recovery lock from daemon during recovery - probably a cluster filesystem lock coherence problem");
talloc_free(state);
return;
}
@@ -672,11 +673,12 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
ctdb_set_process_name("ctdb_recmode");
debug_extra = talloc_asprintf(NULL, "set_recmode:");
- /* we should not be able to get the lock on the reclock file,
- as it should be held by the recovery master
- */
+ /* Daemon should not be able to get the recover lock,
+ * as it should be held by the recovery master */
if (ctdb_recovery_lock(ctdb)) {
- DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
+ DEBUG(DEBUG_ERR,
+ ("ERROR: Daemon able to take recovery lock on \"%s\" during recovery\n",
+ ctdb->recovery_lock_file));
ctdb_recovery_unlock(ctdb);
cc = 1;
}