diff options
author | Martin Schwenke <martin@meltin.net> | 2014-02-07 17:37:00 +1100 |
---|---|---|
committer | Amitay Isaacs <amitay@gmail.com> | 2014-02-19 12:04:47 +1100 |
commit | 2532149f8f9bbe6d3c8f5ac6e5e4bc2ad1681e27 (patch) | |
tree | 9d23e4a4a0a3621cd511f5c77b45ed3247a0af86 | |
parent | 79e2725f339e7c5336b4053348c4266268de6ca3 (diff) | |
download | samba-2532149f8f9bbe6d3c8f5ac6e5e4bc2ad1681e27.tar.gz samba-2532149f8f9bbe6d3c8f5ac6e5e4bc2ad1681e27.tar.xz samba-2532149f8f9bbe6d3c8f5ac6e5e4bc2ad1681e27.zip |
ctdb-scripts: Enhancements to hung script debugging
* Add stack dumps for "interesting" processes that sometimes get
stuck, so try to print stack traces for them if they appear in the
pstree output.
* Add new configuration variables CTDB_DEBUG_HUNG_SCRIPT_LOGFILE and
CTDB_DEBUG_HUNG_SCRIPT_STACKPAT. These are primarily for testing
but the latter may be useful for live debugging.
* Load CTDB configuration so that above configuration variables can be
set/changed without restarting ctdbd.
Add a test that tries to ensure that all of this is working.
Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
-rwxr-xr-x | ctdb/config/debug-hung-script.sh | 34 | ||||
-rw-r--r-- | ctdb/doc/ctdbd.conf.5.xml | 30 | ||||
-rwxr-xr-x | ctdb/tests/complex/90_debug_hung_script.sh | 91 |
3 files changed, 153 insertions, 2 deletions
diff --git a/ctdb/config/debug-hung-script.sh b/ctdb/config/debug-hung-script.sh index 1984242418..63d695f01b 100755 --- a/ctdb/config/debug-hung-script.sh +++ b/ctdb/config/debug-hung-script.sh @@ -1,18 +1,48 @@ #!/bin/sh +[ -n "$CTDB_BASE" ] || \ + export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD") + +. "$CTDB_BASE/functions" + +loadconfig ctdb + +# Testing hook +if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then + exec >>"$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" 2>&1 +fi + ( flock --wait 2 9 || exit 1 echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" =====" echo "pstree -p -a ${1}:" - pstree -p -a $1 + out=$(pstree -p -a $1) + echo "$out" + + # Check for processes matching a regular expression and print + # stack traces. This could help confirm that certain processes + # are stuck in certain places such as the cluster filesystem. The + # regexp should separate items with "\|" and should not contain + # parentheses. The default pattern can be replaced for testing. + default_pat='exportfs\|rpcinfo' + pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}" + echo "$out" | + sed -n "s@.*-\(.*${pat}.*\),\([0-9]*\).*@\2 \1@p" | + while read pid name ; do + trace=$(cat "/proc/${pid}/stack" 2>/dev/null) + if [ $? -eq 0 ] ; then + echo "---- Stack trace of interesting process ${pid}[${name}] ----" + echo "$trace" + fi + done if [ "$2" = "init" ] ; then exit 0 fi - echo "ctdb scriptstatus ${2}:" + echo "---- ctdb scriptstatus ${2}: ----" # No use running several of these in parallel if, say, "releaseip" # event hangs for multiple IPs. In that case the output would be # interleaved in the log and would just be confusing. 
diff --git a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml index a1f6db5ef6..37b1cf94cb 100644 --- a/ctdb/doc/ctdbd.conf.5.xml +++ b/ctdb/doc/ctdbd.conf.5.xml @@ -1375,6 +1375,36 @@ CTDB_SET_MonitorInterval=20 </varlistentry> <varlistentry> + <term>CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=<parameter>FILENAME</parameter></term> + <listitem> + <para> + FILENAME specifies where log messages should go when + debugging hung eventscripts. This is a testing option. + See also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>. + </para> + <para> + No default. Messages go to stdout/stderr and are logged + to the same place as other CTDB log messages. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>CTDB_DEBUG_HUNG_SCRIPT_STACKPAT=<parameter>REGEXP</parameter></term> + <listitem> + <para> + REGEXP specifies interesting processes for which stack + traces should be logged when debugging hung eventscripts + and those processes are matched in pstree output. See + also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>. + </para> + <para> + Default is "exportfs\|rpcinfo". + </para> + </listitem> + </varlistentry> + + <varlistentry> <term>CTDB_DEBUG_LOCKS=<parameter>FILENAME</parameter></term> <listitem> <para> diff --git a/ctdb/tests/complex/90_debug_hung_script.sh b/ctdb/tests/complex/90_debug_hung_script.sh new file mode 100755 index 0000000000..ef6216cf94 --- /dev/null +++ b/ctdb/tests/complex/90_debug_hung_script.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +test_info() +{ + cat <<EOF +Verify CTDB's debugging of timed out eventscripts + +Prerequisites: + +* An active CTDB cluster with monitoring enabled + +Expected results: + +* When an eventscript times out the correct debugging is executed. +EOF +} + +. 
"${TEST_SCRIPTS_DIR}/integration.bash" + +set -e + +ctdb_test_init "$@" + +ctdb_test_check_real_cluster + +cluster_is_healthy + +# No need for restart when done + +# This is overkill but it at least provides a valid test node +select_test_node_and_ips + +#################### + +# Set this if CTDB is installed in a non-standard location on cluster +# nodes +[ -n "$CTDB_BASE" ] || CTDB_BASE="/etc/ctdb" + +#################### + +echo "Enable eventscript for testing timeouts..." +ctdb_test_exit_hook_add "onnode -q $test_node $CTDB disablescript 99.timeout" +try_command_on_node $test_node $CTDB enablescript "99.timeout" + +#################### + +echo "Setting monitor events to time out..." +rc_local_d="${CTDB_BASE}/rc.local.d" +try_command_on_node $test_node mkdir -p "$rc_local_d" + +rc_local_f="${rc_local_d}/timeout_config.$$" +ctdb_test_exit_hook_add "onnode $test_node rm -f $rc_local_f" + +try_command_on_node $test_node mktemp +debug_output="$out" +ctdb_test_exit_hook_add "onnode $test_node rm -f $debug_output" + +try_command_on_node -i $test_node tee "$rc_local_f" <<<"\ +CTDB_RUN_TIMEOUT_MONITOR=yes +CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=\"$debug_output\" +CTDB_DEBUG_HUNG_SCRIPT_STACKPAT='exportfs\|rpcinfo\|sleep'" + +try_command_on_node $test_node chmod +x "$rc_local_f" + +#################### + +wait_for_monitor_event $test_node + +echo "Checking output of hung script debugging..." 
+try_command_on_node -v $test_node cat "$debug_output" + +while IFS="" read pattern ; do + if grep -- "^${pattern}\$" <<<"$out" >/dev/null ; then + echo "GOOD: output contains \"$pattern\"" + else + echo "BAD: output does not contain \"$pattern\"" + exit 1 + fi +done <<'EOF' +===== Start of hung script debug for PID=".*", event="monitor" ===== +===== End of hung script debug for PID=".*", event="monitor" ===== +pstree -p -a .*: + *\`-99\\.timeout,.* /etc/ctdb/events.d/99.timeout monitor + *\`-sleep,.* +---- Stack trace of interesting process [0-9]*\\[sleep\\] ---- +[<[0-9a-f]*>] .*sleep+.* +---- ctdb scriptstatus monitor: ---- +[0-9]* scripts were executed last monitor cycle +99\\.timeout *Status:TIMEDOUT.* + *OUTPUT:sleeping for [0-9]* seconds\\.\\.\\. +EOF |