diff options
| author | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2011-08-15 15:20:18 +1000 |
|---|---|---|
| committer | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2011-08-15 15:20:18 +1000 |
| commit | 2c5f1d7cccdd45f80b6fd110673d41e1408b2f27 (patch) | |
| tree | ed076a754b0440871fc185f1c4a0b73e6ccc6683 | |
| parent | 775e188cb7fe89ed5ca8f46f33f2eeb7165f6311 (diff) | |
| parent | 3b43805a318c2c51f783db83e963e69f6751ad3d (diff) | |
| download | samba-2c5f1d7cccdd45f80b6fd110673d41e1408b2f27.tar.gz samba-2c5f1d7cccdd45f80b6fd110673d41e1408b2f27.tar.xz samba-2c5f1d7cccdd45f80b6fd110673d41e1408b2f27.zip | |
Merge remote branch 'martins/eventscript.60.nfs.rpc'
(This used to be ctdb commit 2e30a2bb4371a846c7a768affa15883211642d5c)
16 files changed, 154 insertions, 115 deletions
diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs index e77804836b..2b4c158d68 100755 --- a/ctdb/config/events.d/60.nfs +++ b/ctdb/config/events.d/60.nfs @@ -59,7 +59,7 @@ case "$1" in ;; monitor) - # and that its directories are available + # Check that directories for shares actually exist. [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || { exportfs | grep -v '^#' | grep '^/' | sed -e 's/[[:space:]]\+[^[:space:]]*$//' | @@ -73,118 +73,35 @@ case "$1" in # we only do this IF we have a rpc.statd command. # For platforms where rpc.statd does not exist, we skip # the check completely - p="rpc.statd" - which $p >/dev/null 2>/dev/null && { - if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then - (service_name="nfs_statd"; ctdb_counter_init) - else - cmd="$p" - cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}" - cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}" - cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}" - ( - service_name="nfs_statd" - ctdb_counter_incr - ctdb_check_counter_limit 10 quiet >/dev/null - ) || { - echo "$ctdb_check_rpc_out" - echo "Trying to restart STATD [$cmd]" - $cmd - } - fi - } + p="rpc.statd" + which $p >/dev/null 2>/dev/null && \ + nfs_check_rpc_service "statd" \ + -ge 6 "verbose unhealthy" \ + -eq 4 "verbose restart" \ + -eq 2 "restart:bs" # check that NFS responds to rpc requests - [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || { - if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then - (service_name="nfs_knfsd"; ctdb_counter_init) - else - ( - service_name="nfs_knfsd" - ctdb_counter_incr - - ctdb_check_counter_equal 2 || { - echo "Trying to restart NFS service" - startstop_nfs restart >/dev/null 2>&1 & - exit 0 - } - - ctdb_check_counter_limit 5 quiet >/dev/null - ) || { - echo "$ctdb_check_rpc_out" - echo "Trying to restart NFS service" - startstop_nfs restart - exit 1 - } - fi - } - - # check that lockd responds to rpc requests - if ctdb_check_rpc "LOCKD" 100021 1 >/dev/null ; then - (service_name="lockd"; ctdb_counter_init) - else - ( - service_name="lockd" - ctdb_counter_incr - - ctdb_check_counter_equal 10 || { - echo "Trying to restart NFS lock service" - startstop_nfs restart >/dev/null 2>&1 & - startstop_nfslock restart >/dev/null 2>&1 & - exit 0 - } - - ctdb_check_counter_limit 15 quiet >/dev/null - ) || { - echo "$ctdb_check_rpc_out" - echo "Trying to restart NFS lock service" - startstop_nfs restart - startstop_nfslock restart - exit 1 - } - fi - - # mount needs special handling since it is sometimes not started - # correctly on RHEL5 - if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then - (service_name="nfs_mountd"; ctdb_counter_init) - else - ( - service_name="nfs_mountd" - ctdb_counter_incr - - ctdb_check_counter_equal 5 || { - p="rpc.mountd" - cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}" - echo "Trying to restart MOUNTD [${cmd}]" - killall -q -9 $p - $cmd & - exit 0 - } - - ctdb_check_counter_limit 10 quiet >/dev/null - ) || { - echo "$ctdb_check_rpc_out" - p="rpc.mountd" - cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}" - echo "Trying to restart MOUNTD [${cmd}]" - killall -q -9 $p - $cmd & - exit 1 - } + if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then + nfs_check_rpc_service "knfsd" \ + -ge 6 "verbose unhealthy" \ + -eq 4 "verbose restart" \ + -eq 2 "restart:bs" fi - - # rquotad needs special handling since it is sometimes not started - # correctly on RHEL5 - # this is not a critical service so we dont flag the node as unhealthy - ctdb_check_rpc "RQUOTAD" 100011 1 || { - p="rpc.rquotad" - cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}" - echo "Trying to restart RQUOTAD [${cmd}]" - killall -q -9 $p - $cmd & - } + # check that lockd responds to rpc requests + nfs_check_rpc_service "lockd" \ + -ge 15 "verbose restart unhealthy" \ + -eq 10 "restart:bs" + + # mountd is sometimes not started correctly on RHEL5 + nfs_check_rpc_service "mountd" \ + -ge 10 "verbose restart:b unhealthy" \ + -eq 5 "restart:b" + + # rquotad is sometimes not started correctly on RHEL5 + # not a critical service so we dont flag the node as unhealthy + nfs_check_rpc_service "rquotad" \ + -gt 0 "verbose restart:b" # once every 600 seconds, update the statd state database for which # clients need notifications diff --git a/ctdb/config/functions b/ctdb/config/functions index 2668531ca8..b04965281d 100755 --- a/ctdb/config/functions +++ b/ctdb/config/functions @@ -106,17 +106,139 @@ get_proc () } ###################################################### +# Check that an RPC service is healthy - +# this includes allowing a certain number of failures +# before marking the NFS service unhealthy. +# +# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...] +# +# each triple is a set of 3 arguments: an operator, a +# fail count limit and an action string. +# +# For example: +# +# nfs_check_rpc_service "lockd" \ +# -ge 15 "verbose restart unhealthy" \ +# -eq 10 "restart:bs" +# +# says that if lockd is down for 15 iterations then do +# a verbose restart of lockd and mark the node unhealthy. +# Before this, after 10 iterations of failure, the +# service is restarted silently in the background. +# Order is important: the number of failures need to be +# specified in reverse order because processing stops +# after the first condition that is true. +###################################################### +nfs_check_rpc_service () +{ + _prog_name="$1" ; shift + + _version=1 + _rpc_prog="$_prog_name" + _restart="" + _opts="" + case "$_prog_name" in + knfsd) + _rpc_prog=nfs + _version=3 + _restart="echo 'Trying to restart NFS service'" + _restart="${_restart}; startstop_nfs restart" + ;; + mountd) + _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}" + ;; + rquotad) + _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}" + ;; + lockd) + _rpc_prog=nlockmgr + _version=4 + _restart="echo 'Trying to restart lock manager service'" + _restart="${_restart}; startstop_nfslock restart" + ;; + statd) + _rpc_prog=status + _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}" + _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}" + _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}" + ;; + *) + echo "Internal error: unknown RPC program \"$_prog_name\"." + exit 1 + esac + + _service_name="nfs_${_prog_name}" + + if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then + ctdb_counter_init "$_service_name" + return 0 + fi + + ctdb_counter_incr "$_service_name" + + while [ -n "$3" ] ; do + ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || { + for _action in $3 ; do + case "$_action" in + verbose) + echo "$ctdb_check_rpc_out" + ;; + restart|restart:*) + # No explicit command specified, construct rpc command. + if [ -z "$_restart" ] ; then + _p="rpc.${_prog_name}" + _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'" + _restart="${_restart}; killall -q -9 $_p" + _restart="${_restart}; $_p $_opts" + fi + + # Process restart flags... + _flags="${_action#restart:}" + # There may not have been a colon... + [ "$_flags" != "$_action" ] || _flags="" + # q=quiet - everything to /dev/null + if [ "${_flags#*q}" != "$_flags" ] ; then + _restart="{ ${_restart} ; } >/dev/null 2>&1" + fi + # s=stealthy - last command to /dev/null + if [ "${_flags#*s}" != "$_flags" ] ; then + _restart="${_restart} >/dev/null 2>&1" + fi + # b=background - the whole thing, easy and reliable + if [ "${_flags#*b}" != "$_flags" ] ; then + _restart="{ ${_restart} ; } &" + fi + + # Do it! + eval "${_restart}" + ;; + unhealthy) + exit 1 + ;; + *) + echo "Internal error: unknown action \"$_action\"." + exit 1 + esac + done + + # Only process the first action group. + break + } + shift 3 + done +} + +###################################################### # check that a rpc server is registered with portmap # and responding to requests -# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION +# usage: ctdb_check_rpc SERVICE_NAME VERSION ###################################################### -ctdb_check_rpc() { +ctdb_check_rpc () +{ progname="$1" - prognum="$2" - version="$3" + version="$2" - ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1) - if [ $? -ne 0 ] ; then + if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then ctdb_check_rpc_out="ERROR: $progname failed RPC check: $ctdb_check_rpc_out" echo "$ctdb_check_rpc_out" diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh index 5b260ac6fb..5b260ac6fb 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh index 95f4dc6fb6..95f4dc6fb6 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh index 657db97a07..657db97a07 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh index 8dcde029dd..8dcde029dd 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh index de796ebd07..de796ebd07 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh index 062c3f6f5c..062c3f6f5c 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh index b93dba7b97..b93dba7b97 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh index 84d20b761c..84d20b761c 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh index fad9a4c590..fad9a4c590 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh index 377de6e229..377de6e229 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh index 3ca3cf64a7..3ca3cf64a7 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh index af2dd26f5d..af2dd26f5d 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh index 9aad819b74..9aad819b74 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh index 6b2750c82c..6b2750c82c 100644..100755 --- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh +++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh |
