From 2532149f8f9bbe6d3c8f5ac6e5e4bc2ad1681e27 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 7 Feb 2014 17:37:00 +1100 Subject: ctdb-scripts: Enhancements to hung script debugging * Add stack dumps for "interesting" processes that sometimes get stuck, so try to print stack traces for them if they appear in the pstree output. * Add new configuration variables CTDB_DEBUG_HUNG_SCRIPT_LOGFILE and CTDB_DEBUG_HUNG_SCRIPT_STACKPAT. These are primarily for testing but the latter may be useful for live debugging. * Load CTDB configuration so that above configuration variables can be set/changed without restarting ctdbd. Add a test that tries to ensure that all of this is working. Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/config/debug-hung-script.sh | 34 ++++++++++- ctdb/doc/ctdbd.conf.5.xml | 30 ++++++++++ ctdb/tests/complex/90_debug_hung_script.sh | 91 ++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 2 deletions(-) create mode 100755 ctdb/tests/complex/90_debug_hung_script.sh diff --git a/ctdb/config/debug-hung-script.sh b/ctdb/config/debug-hung-script.sh index 1984242418..63d695f01b 100755 --- a/ctdb/config/debug-hung-script.sh +++ b/ctdb/config/debug-hung-script.sh @@ -1,18 +1,48 @@ #!/bin/sh +[ -n "$CTDB_BASE" ] || \ + export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD") + +. "$CTDB_BASE/functions" + +loadconfig ctdb + +# Testing hook +if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then + exec >>"$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" 2>&1 +fi + ( flock --wait 2 9 || exit 1 echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" =====" echo "pstree -p -a ${1}:" - pstree -p -a $1 + out=$(pstree -p -a $1) + echo "$out" + + # Check for processes matching a regular expression and print + # stack traces. This could help confirm that certain processes + # are stuck in certain places such as the cluster filesystem. The + # regexp should separate items with "\|" and should not contain + # parentheses. 
The default pattern can be replaced for testing. + default_pat='exportfs\|rpcinfo' + pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}" + echo "$out" | + sed -n "s@.*-\(.*${pat}.*\),\([0-9]*\).*@\2 \1@p" | + while read pid name ; do + trace=$(cat "/proc/${pid}/stack" 2>/dev/null) + if [ $? -eq 0 ] ; then + echo "---- Stack trace of interesting process ${pid}[${name}] ----" + echo "$trace" + fi + done if [ "$2" = "init" ] ; then exit 0 fi - echo "ctdb scriptstatus ${2}:" + echo "---- ctdb scriptstatus ${2}: ----" # No use running several of these in parallel if, say, "releaseip" # event hangs for multiple IPs. In that case the output would be # interleaved in the log and would just be confusing. diff --git a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml index a1f6db5ef6..37b1cf94cb 100644 --- a/ctdb/doc/ctdbd.conf.5.xml +++ b/ctdb/doc/ctdbd.conf.5.xml @@ -1374,6 +1374,36 @@ CTDB_SET_MonitorInterval=20 + + CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=FILENAME + + + FILENAME specifies where log messages should go when + debugging hung eventscripts. This is a testing option. + See also CTDB_DEBUG_HUNG_SCRIPT. + + + No default. Messages go to stdout/stderr and are logged + to the same place as other CTDB log messages. + + + + + + CTDB_DEBUG_HUNG_SCRIPT_STACKPAT=REGEXP + + + REGEXP specifies interesting processes for which stack + traces should be logged when debugging hung eventscripts + and those processes are matched in pstree output. See + also CTDB_DEBUG_HUNG_SCRIPT. + + + Default is "exportfs\|rpcinfo". 
+ + + + CTDB_DEBUG_LOCKS=FILENAME diff --git a/ctdb/tests/complex/90_debug_hung_script.sh b/ctdb/tests/complex/90_debug_hung_script.sh new file mode 100755 index 0000000000..ef6216cf94 --- /dev/null +++ b/ctdb/tests/complex/90_debug_hung_script.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +test_info() +{ + cat </dev/null ; then + echo "GOOD: output contains \"$pattern\"" + else + echo "BAD: output does not contain \"$pattern\"" + exit 1 + fi +done <<'EOF' +===== Start of hung script debug for PID=".*", event="monitor" ===== +===== End of hung script debug for PID=".*", event="monitor" ===== +pstree -p -a .*: + *\`-99\\.timeout,.* /etc/ctdb/events.d/99.timeout monitor + *\`-sleep,.* +---- Stack trace of interesting process [0-9]*\\[sleep\\] ---- +[<[0-9a-f]*>] .*sleep+.* +---- ctdb scriptstatus monitor: ---- +[0-9]* scripts were executed last monitor cycle +99\\.timeout *Status:TIMEDOUT.* + *OUTPUT:sleeping for [0-9]* seconds\\.\\.\\. +EOF -- cgit