author | Martin Schwenke <martin@meltin.net> | 2014-06-10 15:16:44 +1000
---|---|---
committer | Amitay Isaacs <amitay@samba.org> | 2014-06-19 23:41:13 +0200
commit | 6a552f1a12ebe43f946bbbee2a3846b5a640ae4f (patch) |
tree | 48a7da00070e52f9516dc2b756652f3d8af85d09 |
parent | 364bdadde3159dde1ddcc8c5fa4be981448f6833 (diff) |
ctdb-tests: Try harder to avoid failures due to repeated recoveries
About a year ago a check was added to _cluster_is_healthy() to make
sure that node 0 isn't in recovery. The intent was to stop unexpected
recoveries from causing test failures. However, this was misguided:
every test calls cluster_is_healthy() when it starts, so a test will
now fail outright if an unexpected recovery happens to be in progress
at that point.

Instead, have cluster_is_healthy() warn if the cluster is in recovery.

Also:

* Rename wait_until_healthy() to wait_until_ready(), because it now
  waits until the cluster is both healthy and out of recovery.
* Change the post-recovery sleep in restart_ctdb() to 2 seconds and
add a loop to wait (for 2 seconds at a time) if the cluster is back
in recovery. The logic here is that the re-recovery timeout has
been set to 1 second, so sleeping for just 1 second might race
against the next recovery.
* Use reverse logic in node_has_status() so that it works for "all":
  match the RECOVERY pattern and negate the result, so the check
  succeeds only when no node reports recovery mode (see the sketch
  below).
* Tweak wait_until() so that the timeout can be given as T/I, where I
  is the interval (in seconds) between rechecks of the command (see
  the usage sketch below).

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
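
The following is an illustrative sketch only, not part of the patch: it
condenses the new T/I timeout handling into a standalone helper. The name
wait_until_sketch and the probe command are made up for the example; the
real logic lives in wait_until() in the diff below.

```bash
# Standalone sketch of the T/I timeout parsing; wait_until_sketch is a
# hypothetical name, the real helper is wait_until() in integration.bash.
wait_until_sketch ()
{
    local timeout="$1" ; shift        # "$@" is the command to retry

    local interval=1
    case "$timeout" in
        */*)
            interval="${timeout#*/}"  # part after the slash: recheck interval
            timeout="${timeout%/*}"   # part before the slash: total timeout
            ;;
    esac

    local t="$timeout"
    while [ "$t" -gt 0 ] ; do
        if "$@" ; then
            return 0                  # command succeeded within the timeout
        fi
        sleep "$interval"
        t=$((t - interval))
    done
    return 1                          # timed out
}

# Retry for up to 30 seconds, rechecking every 2 seconds:
#   wait_until_sketch 30/2 some_probe_command
```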
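
Likewise illustrative, not the test code, and assuming that a status query
against "all" concatenates one "Recovery mode" line per node (which is what
the reverse-logic change implies): grepping for the "good" NORMAL pattern
succeeds as soon as any node matches, but "recovered" for "all" must mean
that no node is still in recovery, which is what the negated RECOVERY match
gives.

```bash
# Two nodes' worth of assumed status output; the second node is still
# in recovery.
status_output='Recovery mode:NORMAL (0)
Recovery mode:RECOVERY (1)'

# Matching the "good" pattern is wrong for "all": it succeeds because at
# least one node is NORMAL.
if echo "$status_output" | egrep -q '^Recovery mode:NORMAL \(0\)$' ; then
    echo "looks recovered (wrong)"
fi

# Reverse logic: succeed only if NO node reports RECOVERY.
if ! echo "$status_output" | egrep -q '^Recovery mode:RECOVERY \(1\)$' ; then
    echo "all nodes recovered"
else
    echo "still recovering (correct)"
fi
```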
-rwxr-xr-x | ctdb/tests/complex/34_nfs_tickle_restart.sh | 2
-rw-r--r-- | ctdb/tests/scripts/integration.bash | 57

2 files changed, 44 insertions, 15 deletions
```diff
diff --git a/ctdb/tests/complex/34_nfs_tickle_restart.sh b/ctdb/tests/complex/34_nfs_tickle_restart.sh
index 93587e2f31..b7eea4ca21 100755
--- a/ctdb/tests/complex/34_nfs_tickle_restart.sh
+++ b/ctdb/tests/complex/34_nfs_tickle_restart.sh
@@ -79,7 +79,7 @@ try_command_on_node $rn $CTDB_TEST_WRAPPER restart_ctdb_1
 echo "Setting NoIPTakeover on node ${rn}"
 try_command_on_node $rn $CTDB setvar NoIPTakeover 1
-wait_until_healthy
+wait_until_ready
 echo "Getting TickleUpdateInterval..."
 try_command_on_node $test_node $CTDB getvar TickleUpdateInterval
diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash
index 1ff02d5568..dec60a2990 100644
--- a/ctdb/tests/scripts/integration.bash
+++ b/ctdb/tests/scripts/integration.bash
@@ -205,11 +205,19 @@ select_test_node_and_ips ()
 #######################################
 # Wait until either timeout expires or command succeeds.  The command
-# will be tried once per second.
+# will be tried once per second, unless timeout has format T/I, where
+# I is the recheck interval.
 wait_until ()
 {
     local timeout="$1" ; shift # "$@" is the command...
+    local interval=1
+    case "$timeout" in
+        */*)
+            interval="${timeout#*/}"
+            timeout="${timeout%/*}"
+    esac
+
     local negate=false
     if [ "$1" = "!" ] ; then
         negate=true
@@ -227,9 +235,12 @@ wait_until ()
             echo "OK"
             return 0
         fi
-        echo -n .
-        t=$(($t - 1))
-        sleep 1
+        local i
+        for i in $(seq 1 $interval) ; do
+            echo -n .
+        done
+        t=$(($t - $interval))
+        sleep $interval
     done
     echo "*TIMEOUT*"
@@ -249,14 +260,26 @@ sleep_for ()
 _cluster_is_healthy ()
 {
-    $CTDB nodestatus all >/dev/null && \
-        node_has_status 0 recovered
+    $CTDB nodestatus all >/dev/null
+}
+
+_cluster_is_recovered ()
+{
+    node_has_status all recovered
+}
+
+_cluster_is_ready ()
+{
+    _cluster_is_healthy && _cluster_is_recovered
 }
 cluster_is_healthy ()
 {
     if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
         echo "Cluster is HEALTHY"
+        if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
+            echo "WARNING: cluster in recovery mode!"
+        fi
         return 0
     else
         echo "Cluster is UNHEALTHY"
@@ -272,13 +295,13 @@ cluster_is_healthy ()
     fi
 }
-wait_until_healthy ()
+wait_until_ready ()
 {
     local timeout="${1:-120}"
-    echo "Waiting for cluster to become healthy..."
+    echo "Waiting for cluster to become ready..."
-    wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_healthy
+    wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_ready
 }
 # This function is becoming nicely overloaded.  Soon it will collapse!  :-)
@@ -303,7 +326,7 @@ node_has_status ()
         (unfrozen)     fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
         (monon)        mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
         (monoff)       mpat='^Monitoring mode:DISABLED \(1\)$' ;;
-        (recovered)    rpat='^Recovery mode:NORMAL \(0\)$' ;;
+        (recovered)    rpat='^Recovery mode:RECOVERY \(1\)$' ;;
         *)
             echo "node_has_status: unknown status \"$status\""
             return 1
@@ -329,7 +352,7 @@ node_has_status ()
     elif [ -n "$mpat" ] ; then
         $CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
     elif [ -n "$rpat" ] ; then
-        $CTDB status -n "$pnn" | egrep -q "$rpat"
+        ! $CTDB status -n "$pnn" | egrep -q "$rpat"
     else
         echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
         return 1
@@ -479,8 +502,8 @@ restart_ctdb ()
             continue
         }
-        wait_until_healthy || {
-            echo "Cluster didn't become healthy.  Restarting..."
+        wait_until_ready || {
+            echo "Cluster didn't become ready.  Restarting..."
             continue
         }
@@ -492,7 +515,13 @@ restart_ctdb ()
         # help the cluster to stabilise before a subsequent test.
         echo "Forcing a recovery..."
         onnode -q 0 $CTDB recover
-        sleep_for 1
+        sleep_for 2
+
+        if ! onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
+            echo "Cluster has gone into recovery again, waiting..."
+            wait_until 30/2 onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered
+        fi
+
         # Cluster is still healthy.  Good, we're done!
         if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
```
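
For reference, the new settle step in restart_ctdb() condensed from the diff
above, with comments spelling out the timing reasoning from the commit
message; the helpers (onnode, sleep_for, wait_until, _cluster_is_recovered)
are the ones defined in integration.bash, nothing here is new beyond the
comments.

```bash
# Condensed view of the post-restart settle logic (helpers as in the diff).
echo "Forcing a recovery..."
onnode -q 0 $CTDB recover      # force one recovery so the cluster stabilises

sleep_for 2                    # re-recovery timeout is 1s, so sleeping only
                               # 1s could race with the next recovery

if ! onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
    echo "Cluster has gone into recovery again, waiting..."
    # Up to 30 seconds, rechecking every 2 seconds (the new T/I timeout form).
    wait_until 30/2 onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered
fi
```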