author | Martin Schwenke <martin@meltin.net> | 2014-06-10 15:16:44 +1000
---|---|---
committer | Amitay Isaacs <amitay@samba.org> | 2014-06-19 23:41:13 +0200
commit | 6a552f1a12ebe43f946bbbee2a3846b5a640ae4f (patch) |
tree | 48a7da00070e52f9516dc2b756652f3d8af85d09 |
parent | 364bdadde3159dde1ddcc8c5fa4be981448f6833 (diff) |
ctdb-tests: Try harder to avoid failures due to repeated recoveries
About a year ago a check was added to _cluster_is_healthy() to make
sure that node 0 isn't in recovery. The intent was to stop unexpected
recoveries from causing test failures. However, this was misguided:
every test calls cluster_is_healthy() when it starts, so a test will
now fail outright if an unexpected recovery happens to be in progress
at that point.

Instead, have cluster_is_healthy() warn if the cluster is in recovery.

Also:

* Rename wait_until_healthy() to wait_until_ready(), because it now
  waits until the cluster is both healthy and out of recovery.
* Change the post-recovery sleep in restart_ctdb() to 2 seconds and
add a loop to wait (for 2 seconds at a time) if the cluster is back
in recovery. The logic here is that the re-recovery timeout has
been set to 1 second, so sleeping for just 1 second might race
against the next recovery.
* Use reverse logic in node_has_status() so that it works for "all":
  match the RECOVERY pattern and negate the result, so the check
  succeeds only when no node reports recovery mode (see the sketch
  below).
* Tweak wait_until() so that the timeout can be given as T/I, where I
  is the interval (in seconds) between rechecks of the command (see
  the usage sketch below).

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
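
The following is an illustrative sketch only, not part of the patch: it
condenses the new T/I timeout handling into a standalone helper. The name
wait_until_sketch and the probe command are made up for the example; the
real logic lives in wait_until() in the diff below.

```bash
# Standalone sketch of the T/I timeout parsing; wait_until_sketch is a
# hypothetical name, the real helper is wait_until() in integration.bash.
wait_until_sketch ()
{
    local timeout="$1" ; shift        # "$@" is the command to retry

    local interval=1
    case "$timeout" in
        */*)
            interval="${timeout#*/}"  # part after the slash: recheck interval
            timeout="${timeout%/*}"   # part before the slash: total timeout
            ;;
    esac

    local t="$timeout"
    while [ "$t" -gt 0 ] ; do
        if "$@" ; then
            return 0                  # command succeeded within the timeout
        fi
        sleep "$interval"
        t=$((t - interval))
    done
    return 1                          # timed out
}

# Retry for up to 30 seconds, rechecking every 2 seconds:
#   wait_until_sketch 30/2 some_probe_command
```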
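
Likewise illustrative, not the test code, and assuming that a status query
against "all" concatenates one "Recovery mode" line per node (which is what
the reverse-logic change implies): grepping for the "good" NORMAL pattern
succeeds as soon as any node matches, but "recovered" for "all" must mean
that no node is still in recovery, which is what the negated RECOVERY match
gives.

```bash
# Two nodes' worth of assumed status output; the second node is still
# in recovery.
status_output='Recovery mode:NORMAL (0)
Recovery mode:RECOVERY (1)'

# Matching the "good" pattern is wrong for "all": it succeeds because at
# least one node is NORMAL.
if echo "$status_output" | egrep -q '^Recovery mode:NORMAL \(0\)$' ; then
    echo "looks recovered (wrong)"
fi

# Reverse logic: succeed only if NO node reports RECOVERY.
if ! echo "$status_output" | egrep -q '^Recovery mode:RECOVERY \(1\)$' ; then
    echo "all nodes recovered"
else
    echo "still recovering (correct)"
fi
```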
-rwxr-xr-x | ctdb/tests/complex/34_nfs_tickle_restart.sh | 2
-rw-r--r-- | ctdb/tests/scripts/integration.bash | 57

2 files changed, 44 insertions, 15 deletions
```diff
diff --git a/ctdb/tests/complex/34_nfs_tickle_restart.sh b/ctdb/tests/complex/34_nfs_tickle_restart.sh
index 93587e2f31..b7eea4ca21 100755
--- a/ctdb/tests/complex/34_nfs_tickle_restart.sh
+++ b/ctdb/tests/complex/34_nfs_tickle_restart.sh
@@ -79,7 +79,7 @@ try_command_on_node $rn $CTDB_TEST_WRAPPER restart_ctdb_1
 echo "Setting NoIPTakeover on node ${rn}"
 try_command_on_node $rn $CTDB setvar NoIPTakeover 1
-wait_until_healthy
+wait_until_ready
 echo "Getting TickleUpdateInterval..."
 try_command_on_node $test_node $CTDB getvar TickleUpdateInterval
diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash
index 1ff02d5568..dec60a2990 100644
--- a/ctdb/tests/scripts/integration.bash
+++ b/ctdb/tests/scripts/integration.bash
@@ -205,11 +205,19 @@ select_test_node_and_ips ()
 #######################################
 # Wait until either timeout expires or command succeeds.  The command
-# will be tried once per second.
+# will be tried once per second, unless timeout has format T/I, where
+# I is the recheck interval.
 wait_until ()
 {
     local timeout="$1" ; shift # "$@" is the command...
+    local interval=1
+    case "$timeout" in
+        */*)
+            interval="${timeout#*/}"
+            timeout="${timeout%/*}"
+    esac
+
     local negate=false
     if [ "$1" = "!" ] ; then
         negate=true
@@ -227,9 +235,12 @@ wait_until ()
             echo "OK"
             return 0
         fi
-        echo -n .
-        t=$(($t - 1))
-        sleep 1
+        local i
+        for i in $(seq 1 $interval) ; do
+            echo -n .
+        done
+        t=$(($t - $interval))
+        sleep $interval
     done
     echo "*TIMEOUT*"
@@ -249,14 +260,26 @@ sleep_for ()
 _cluster_is_healthy ()
 {
-    $CTDB nodestatus all >/dev/null && \
-        node_has_status 0 recovered
+    $CTDB nodestatus all >/dev/null
+}
+
+_cluster_is_recovered ()
+{
+    node_has_status all recovered
+}
+
+_cluster_is_ready ()
+{
+    _cluster_is_healthy && _cluster_is_recovered
 }
 cluster_is_healthy ()
 {
     if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
         echo "Cluster is HEALTHY"
+        if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
+            echo "WARNING: cluster in recovery mode!"
+        fi
         return 0
     else
         echo "Cluster is UNHEALTHY"
@@ -272,13 +295,13 @@ cluster_is_healthy ()
     fi
 }
-wait_until_healthy ()
+wait_until_ready ()
 {
     local timeout="${1:-120}"
-    echo "Waiting for cluster to become healthy..."
+    echo "Waiting for cluster to become ready..."
-    wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_healthy
+    wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_ready
 }
 # This function is becoming nicely overloaded.  Soon it will collapse!  :-)
@@ -303,7 +326,7 @@ node_has_status ()
         (unfrozen)     fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
         (monon)        mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
         (monoff)       mpat='^Monitoring mode:DISABLED \(1\)$' ;;
-        (recovered)    rpat='^Recovery mode:NORMAL \(0\)$' ;;
+        (recovered)    rpat='^Recovery mode:RECOVERY \(1\)$' ;;
         *)
             echo "node_has_status: unknown status \"$status\""
             return 1
@@ -329,7 +352,7 @@ node_has_status ()
     elif [ -n "$mpat" ] ; then
         $CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
     elif [ -n "$rpat" ] ; then
-        $CTDB status -n "$pnn" | egrep -q "$rpat"
+        ! $CTDB status -n "$pnn" | egrep -q "$rpat"
     else
         echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
         return 1
@@ -479,8 +502,8 @@ restart_ctdb ()
             continue
         }
-        wait_until_healthy || {
-            echo "Cluster didn't become healthy.  Restarting..."
+        wait_until_ready || {
+            echo "Cluster didn't become ready.  Restarting..."
             continue
         }
@@ -492,7 +515,13 @@ restart_ctdb ()
         # help the cluster to stabilise before a subsequent test.
         echo "Forcing a recovery..."
         onnode -q 0 $CTDB recover
-        sleep_for 1
+        sleep_for 2
+
+        if ! onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
+            echo "Cluster has gone into recovery again, waiting..."
+            wait_until 30/2 onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered
+        fi
+
         # Cluster is still healthy.  Good, we're done!
         if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
```
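
For reference, the new settle step in restart_ctdb() condensed from the diff
above, with comments spelling out the timing reasoning from the commit
message; the helpers (onnode, sleep_for, wait_until, _cluster_is_recovered)
are the ones defined in integration.bash, nothing here is new beyond the
comments.

```bash
# Condensed view of the post-restart settle logic (helpers as in the diff).
echo "Forcing a recovery..."
onnode -q 0 $CTDB recover      # force one recovery so the cluster stabilises

sleep_for 2                    # re-recovery timeout is 1s, so sleeping only
                               # 1s could race with the next recovery

if ! onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
    echo "Cluster has gone into recovery again, waiting..."
    # Up to 30 seconds, rechecking every 2 seconds (the new T/I timeout form).
    wait_until 30/2 onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered
fi
```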