summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Ronnie Sahlberg <ronniesahlberg@gmail.com> 2011-08-15 15:20:18 +1000
committer Ronnie Sahlberg <ronniesahlberg@gmail.com> 2011-08-15 15:20:18 +1000
commit 2c5f1d7cccdd45f80b6fd110673d41e1408b2f27 (patch)
tree ed076a754b0440871fc185f1c4a0b73e6ccc6683
parent 775e188cb7fe89ed5ca8f46f33f2eeb7165f6311 (diff)
parent 3b43805a318c2c51f783db83e963e69f6751ad3d (diff)
download samba-2c5f1d7cccdd45f80b6fd110673d41e1408b2f27.tar.gz
samba-2c5f1d7cccdd45f80b6fd110673d41e1408b2f27.tar.xz
samba-2c5f1d7cccdd45f80b6fd110673d41e1408b2f27.zip
Merge remote branch 'martins/eventscript.60.nfs.rpc'
(This used to be ctdb commit 2e30a2bb4371a846c7a768affa15883211642d5c)
-rwxr-xr-x ctdb/config/events.d/60.nfs 135
-rwxr-xr-x ctdb/config/functions 134
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh 0
-rwxr-xr-x [-rw-r--r--] ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh 0
16 files changed, 154 insertions, 115 deletions
diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs
index e77804836b..2b4c158d68 100755
--- a/ctdb/config/events.d/60.nfs
+++ b/ctdb/config/events.d/60.nfs
@@ -59,7 +59,7 @@ case "$1" in
;;
monitor)
- # and that its directories are available
+ # Check that directories for shares actually exist.
[ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
exportfs | grep -v '^#' | grep '^/' |
sed -e 's/[[:space:]]\+[^[:space:]]*$//' |
@@ -73,118 +73,35 @@ case "$1" in
# we only do this IF we have a rpc.statd command.
# For platforms where rpc.statd does not exist, we skip
# the check completely
- p="rpc.statd"
- which $p >/dev/null 2>/dev/null && {
- if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
- (service_name="nfs_statd"; ctdb_counter_init)
- else
- cmd="$p"
- cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
- cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
- cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
- (
- service_name="nfs_statd"
- ctdb_counter_incr
- ctdb_check_counter_limit 10 quiet >/dev/null
- ) || {
- echo "$ctdb_check_rpc_out"
- echo "Trying to restart STATD [$cmd]"
- $cmd
- }
- fi
- }
+ p="rpc.statd"
+ which $p >/dev/null 2>/dev/null && \
+ nfs_check_rpc_service "statd" \
+ -ge 6 "verbose unhealthy" \
+ -eq 4 "verbose restart" \
+ -eq 2 "restart:bs"
# check that NFS responds to rpc requests
- [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
- if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
- (service_name="nfs_knfsd"; ctdb_counter_init)
- else
- (
- service_name="nfs_knfsd"
- ctdb_counter_incr
-
- ctdb_check_counter_equal 2 || {
- echo "Trying to restart NFS service"
- startstop_nfs restart >/dev/null 2>&1 &
- exit 0
- }
-
- ctdb_check_counter_limit 5 quiet >/dev/null
- ) || {
- echo "$ctdb_check_rpc_out"
- echo "Trying to restart NFS service"
- startstop_nfs restart
- exit 1
- }
- fi
- }
-
- # check that lockd responds to rpc requests
- if ctdb_check_rpc "LOCKD" 100021 1 >/dev/null ; then
- (service_name="lockd"; ctdb_counter_init)
- else
- (
- service_name="lockd"
- ctdb_counter_incr
-
- ctdb_check_counter_equal 10 || {
- echo "Trying to restart NFS lock service"
- startstop_nfs restart >/dev/null 2>&1 &
- startstop_nfslock restart >/dev/null 2>&1 &
- exit 0
- }
-
- ctdb_check_counter_limit 15 quiet >/dev/null
- ) || {
- echo "$ctdb_check_rpc_out"
- echo "Trying to restart NFS lock service"
- startstop_nfs restart
- startstop_nfslock restart
- exit 1
- }
- fi
-
- # mount needs special handling since it is sometimes not started
- # correctly on RHEL5
- if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then
- (service_name="nfs_mountd"; ctdb_counter_init)
- else
- (
- service_name="nfs_mountd"
- ctdb_counter_incr
-
- ctdb_check_counter_equal 5 || {
- p="rpc.mountd"
- cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
- echo "Trying to restart MOUNTD [${cmd}]"
- killall -q -9 $p
- $cmd &
- exit 0
- }
-
- ctdb_check_counter_limit 10 quiet >/dev/null
- ) || {
- echo "$ctdb_check_rpc_out"
- p="rpc.mountd"
- cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
- echo "Trying to restart MOUNTD [${cmd}]"
- killall -q -9 $p
- $cmd &
- exit 1
- }
+ if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then
+ nfs_check_rpc_service "knfsd" \
+ -ge 6 "verbose unhealthy" \
+ -eq 4 "verbose restart" \
+ -eq 2 "restart:bs"
fi
-
- # rquotad needs special handling since it is sometimes not started
- # correctly on RHEL5
- # this is not a critical service so we dont flag the node as unhealthy
- ctdb_check_rpc "RQUOTAD" 100011 1 || {
- p="rpc.rquotad"
- cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
- echo "Trying to restart RQUOTAD [${cmd}]"
- killall -q -9 $p
- $cmd &
- }
+ # check that lockd responds to rpc requests
+ nfs_check_rpc_service "lockd" \
+ -ge 15 "verbose restart unhealthy" \
+ -eq 10 "restart:bs"
+
+ # mountd is sometimes not started correctly on RHEL5
+ nfs_check_rpc_service "mountd" \
+ -ge 10 "verbose restart:b unhealthy" \
+ -eq 5 "restart:b"
+
+ # rquotad is sometimes not started correctly on RHEL5
+ # not a critical service so we don't flag the node as unhealthy
+ nfs_check_rpc_service "rquotad" \
+ -gt 0 "verbose restart:b"
# once every 600 seconds, update the statd state database for which
# clients need notifications
diff --git a/ctdb/config/functions b/ctdb/config/functions
index 2668531ca8..b04965281d 100755
--- a/ctdb/config/functions
+++ b/ctdb/config/functions
@@ -106,17 +106,139 @@ get_proc ()
}
######################################################
+# Check that an RPC service is healthy -
+# this includes allowing a certain number of failures
+# before marking the NFS service unhealthy.
+#
+# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
+#
+# each triple is a set of 3 arguments: an operator, a
+# fail count limit and an action string.
+#
+# For example:
+#
+# nfs_check_rpc_service "lockd" \
+# -ge 15 "verbose restart unhealthy" \
+# -eq 10 "restart:bs"
+#
+# says that if lockd is down for 15 iterations then do
+# a verbose restart of lockd and mark the node unhealthy.
+# Before this, after 10 iterations of failure, the
+# service is restarted silently in the background.
+# Order is important: the failure counts need to be
+# specified in descending order because processing stops
+# after the first condition that is true.
+######################################################
+nfs_check_rpc_service ()
+{
+ _prog_name="$1" ; shift
+
+ _version=1
+ _rpc_prog="$_prog_name"
+ _restart=""
+ _opts=""
+ case "$_prog_name" in
+ knfsd)
+ _rpc_prog=nfs
+ _version=3
+ _restart="echo 'Trying to restart NFS service'"
+ _restart="${_restart}; startstop_nfs restart"
+ ;;
+ mountd)
+ _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+ ;;
+ rquotad)
+ _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+ ;;
+ lockd)
+ _rpc_prog=nlockmgr
+ _version=4
+ _restart="echo 'Trying to restart lock manager service'"
+ _restart="${_restart}; startstop_nfslock restart"
+ ;;
+ statd)
+ _rpc_prog=status
+ _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+ _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
+ _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+ ;;
+ *)
+ echo "Internal error: unknown RPC program \"$_prog_name\"."
+ exit 1
+ esac
+
+ _service_name="nfs_${_prog_name}"
+
+ if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
+ ctdb_counter_init "$_service_name"
+ return 0
+ fi
+
+ ctdb_counter_incr "$_service_name"
+
+ while [ -n "$3" ] ; do
+ ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {
+ for _action in $3 ; do
+ case "$_action" in
+ verbose)
+ echo "$ctdb_check_rpc_out"
+ ;;
+ restart|restart:*)
+ # No explicit command specified, construct rpc command.
+ if [ -z "$_restart" ] ; then
+ _p="rpc.${_prog_name}"
+ _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
+ _restart="${_restart}; killall -q -9 $_p"
+ _restart="${_restart}; $_p $_opts"
+ fi
+
+ # Process restart flags...
+ _flags="${_action#restart:}"
+ # There may not have been a colon...
+ [ "$_flags" != "$_action" ] || _flags=""
+ # q=quiet - everything to /dev/null
+ if [ "${_flags#*q}" != "$_flags" ] ; then
+ _restart="{ ${_restart} ; } >/dev/null 2>&1"
+ fi
+ # s=stealthy - last command to /dev/null
+ if [ "${_flags#*s}" != "$_flags" ] ; then
+ _restart="${_restart} >/dev/null 2>&1"
+ fi
+ # b=background - the whole thing, easy and reliable
+ if [ "${_flags#*b}" != "$_flags" ] ; then
+ _restart="{ ${_restart} ; } &"
+ fi
+
+ # Do it!
+ eval "${_restart}"
+ ;;
+ unhealthy)
+ exit 1
+ ;;
+ *)
+ echo "Internal error: unknown action \"$_action\"."
+ exit 1
+ esac
+ done
+
+ # Only process the first action group.
+ break
+ }
+ shift 3
+ done
+}
+
+######################################################
# check that a rpc server is registered with portmap
# and responding to requests
-# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION
+# usage: ctdb_check_rpc SERVICE_NAME VERSION
######################################################
-ctdb_check_rpc() {
+ctdb_check_rpc ()
+{
progname="$1"
- prognum="$2"
- version="$3"
+ version="$2"
- ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
- if [ $? -ne 0 ] ; then
+ if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
echo "$ctdb_check_rpc_out"
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh
index 5b260ac6fb..5b260ac6fb 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh
index 95f4dc6fb6..95f4dc6fb6 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh
index 657db97a07..657db97a07 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh
index 8dcde029dd..8dcde029dd 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh
index de796ebd07..de796ebd07 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh
index 062c3f6f5c..062c3f6f5c 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh
index b93dba7b97..b93dba7b97 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh
index 84d20b761c..84d20b761c 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh
index fad9a4c590..fad9a4c590 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh
index 377de6e229..377de6e229 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh
index 3ca3cf64a7..3ca3cf64a7 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh
index af2dd26f5d..af2dd26f5d 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh
index 9aad819b74..9aad819b74 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh
index 6b2750c82c..6b2750c82c 100644..100755
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh