eventscripts: Add new option $CTDB_MONITOR_NFS_THREAD_COUNT

Consider the following example: 1. There are 256 nfsd threads configured. 2. 200 threads are "stuck" in system calls, perhaps waiting for the underlying filesystem when an attempt is made to restart NFS. 3. 56 threads exit when NFS is stopped. 4. 56 new threads are started when NFS is started. 5. 200 "stuck" threads exit leaving only 56 threads running. Setting this option to "yes" makes the 60.nfs monitor event look for this situation and try to correct it. Signed-off-by: Martin Schwenke <martin@meltin.net> (This used to be ctdb commit 99b0d8b8ecc36dfc493775b9ebced54539c182d2)
author: Martin Schwenke <martin@meltin.net> 2013-06-13 10:17:20 +1000
committer: Martin Schwenke <martin@meltin.net> 2013-06-13 20:01:22 +1000
commit: f408caea2ad46dab3f8f19b97eda8c8652bd5ff4 (patch)
tree: a847cc6fe3577543023bc9982328828bee6bd77a
parent: 7513f0ba6110c160e3f207db0d233c94609ee45f (diff)
2 files changed, 38 insertions, 0 deletions
diff --git a/ctdb/config/ctdb.sysconfig b/ctdb/config/ctdb.sysconfig
index 6f58e8fc8c..25ad2aef95 100644
--- a/ctdb/config/ctdb.sysconfig
+++ b/ctdb/config/ctdb.sysconfig
@@ -129,6 +129,19 @@ CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
 # CTDB_MONITOR_FREE_MEMORY_WARN=100
 # CTDB_MONITOR_FREE_MEMORY=10
 
+# Should the 60.nfs monitor event try to correct the number of nfsd
+# threads?  This works around a limitation in some NFS initscripts
+# where some threads can be stuck in host filesystem calls (perhaps
+# due to slow storage), a restart occurs, some threads don't exit, the
+# start only adds the missing number of threads, the stuck threads
+# exit, and the result is a lower than expected thread count.  Note
+# that you must also set $RPCNFSDCOUNT (RedHat/Debian) or
+# $USE_KERNEL_NFSD_NUMBER (SUSE) in your NFS configuration so the
+# monitoring code knows how many threads there should be - if neither
+# of these is set then this option will be ignored.  The default is
+# to not do this check.
+# CTDB_MONITOR_NFS_THREAD_COUNT="yes"
+
 # When set to yes, the CTDB node will start in DISABLED mode and not host
 # any public ip addresses. The administrator needs to explicitely enable
 # the node with "ctdb enable"
diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs
index eb98ee1241..53f78dfe3d 100755
--- a/ctdb/config/events.d/60.nfs
+++ b/ctdb/config/events.d/60.nfs
@@ -26,6 +26,29 @@ service_reconfigure ()
     } >/dev/null 2>&1
 }
 
+nfs_check_thread_count ()
+{
+    [ "$CTDB_MONITOR_NFS_THREAD_COUNT" = "yes" ] || return 0 # opt-in: do nothing unless enabled
+
+    # The expected count comes from $RPCNFSDCOUNT, falling back to
+    # $USE_KERNEL_NFSD_NUMBER.  If neither is set we could guess the
+    # initscript default, but instead assume the admin doesn't care
+    # about the thread count and has enabled this feature in error.
+    _configured_threads="${RPCNFSDCOUNT:-${USE_KERNEL_NFSD_NUMBER}}"
+    [ -n "$_configured_threads" ] || return 0
+
+    # nfsd should be running the configured number of threads.  If
+    # the running count differs, log the mismatch and write the
+    # configured count to fs/nfsd/threads via set_proc().
+    _running_threads=$(get_proc "fs/nfsd/threads")
+    # Intentionally a string comparison, not arithmetic - avoids extra
+    # errors when get_proc() fails...
+    if [ "$_running_threads" != "$_configured_threads" ] ; then
+	echo "Attempting to correct number of nfsd threads from ${_running_threads} to ${_configured_threads}"
+	set_proc "fs/nfsd/threads" "$_configured_threads"
+    fi
+}
+
 loadconfig
 
 [ "$NFS_SERVER_MODE" != "ganesha" ] || exit 0
@@ -71,6 +94,8 @@ case "$1" in
 
 	nfs_check_rpc_services
 
+	nfs_check_thread_count
+
 	# Every 10 minutes, update the statd state database for which
 	# clients need notifications
 	nfs_statd_update 600
author	Martin Schwenke <martin@meltin.net>	2013-06-13 10:17:20 +1000
committer	Martin Schwenke <martin@meltin.net>	2013-06-13 20:01:22 +1000
commit	f408caea2ad46dab3f8f19b97eda8c8652bd5ff4 (patch)
tree	a847cc6fe3577543023bc9982328828bee6bd77a
parent	7513f0ba6110c160e3f207db0d233c94609ee45f (diff)