diff options
author | Srikrishan Malik <srimalik@in.ibm.com> | 2013-01-09 16:11:39 +0530 |
---|---|---|
committer | Amitay Isaacs <amitay@gmail.com> | 2013-01-11 17:16:46 +1100 |
commit | 28cbe527d47822f870e8252495ab2a1c8fddd12f (patch) | |
tree | 7a57d9be9ad620cc5bbeef83ca316f19c4205055 /ctdb | |
parent | 96ba39669776fc0931571cb361d11982d079c016 (diff) | |
download | samba-28cbe527d47822f870e8252495ab2a1c8fddd12f.tar.gz samba-28cbe527d47822f870e8252495ab2a1c8fddd12f.tar.xz samba-28cbe527d47822f870e8252495ab2a1c8fddd12f.zip |
Changes for unobtrusive recovery and a new method for health checks.
Unobtrusive recovery: Ganesha will not be restarted on failovers.
Ganesha health: Use the counters in /var/lib/nfs/ganesha_local to track progress
instead of the null call, which can time out if the server is too busy.
Signed-off-by: Srikrishan Malik <srimalik@in.ibm.com>
Signed-off-by: Lance Russell <lancerus@us.ibm.com>
(This used to be ctdb commit 0e651e9da0f1f3c836b4474612ab13d0ccd272d9)
Diffstat (limited to 'ctdb')
-rwxr-xr-x | ctdb/config/events.d/60.ganesha | 217 | ||||
-rwxr-xr-x | ctdb/config/functions | 8 | ||||
-rwxr-xr-x | ctdb/config/statd-callout | 6 |
3 files changed, 156 insertions, 75 deletions
diff --git a/ctdb/config/events.d/60.ganesha b/ctdb/config/events.d/60.ganesha index 4d8736e2db9..7e8d48d49d1 100755 --- a/ctdb/config/events.d/60.ganesha +++ b/ctdb/config/events.d/60.ganesha @@ -6,7 +6,10 @@ . $CTDB_BASE/functions -service_name="nfs-ganesha-gpfs" +GANRECDIR="/var/lib/nfs/ganesha" +GANRECDIR2="/var/lib/nfs/ganesha/recevents" +GPFS_STATE="/usr/lpp/mmfs/bin/mmgetstate" +GANRECDIR3="/var/lib/nfs/ganesha_local" service_start () @@ -29,9 +32,15 @@ service_reconfigure () $CTDB_BASE/statd-callout notify & } >/dev/null 2>&1 } + loadconfig "nfs" -[ "$NFS_SERVER_MODE" = "ganesha" ] || exit 0 + +[ -n "$CTDB_CLUSTER_FILESYSTEM_TYPE" ] || CTDB_CLUSTER_FILESYSTEM_TYPE="gpfs" + +service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE" + +[ "$NFS_SERVER_MODE" == "ganesha" ] || exit 0 ctdb_setup_service_state_dir @@ -43,100 +52,182 @@ statd_update_trigger="$service_state_dir/update-trigger" # when we get to a monitor event. touch "$statd_update_trigger" - ctdb_start_stop_service is_ctdb_managed_service || exit 0 ctdb_service_check_reconfigure +get_cluster_fs_state () +{ + case $CTDB_CLUSTER_FILESYSTEM_TYPE in + gpfs) + STATE=`$GPFS_STATE | awk 'NR <= 3 {next} {printf "%-6s", $3}'` + echo $STATE + ;; + *) + die "File system $CTDB_CLUSTER_FILESYSTEM_TYPE not supported" + ;; + esac +} + +create_ganesha_recdirs () +{ + if [ -z "$(mount -t $CTDB_CLUSTER_FILESYSTEM_TYPE)" ]; then + echo "startup $CTDB_CLUSTER_FILESYSTEM_TYPE not ready" + exit 1 + fi + MNTPT=`mount -t $CTDB_CLUSTER_FILESYSTEM_TYPE | sort | awk '{print $3}' | head -n 1` + mkdir -p $MNTPT/.ganesha + if [ -e $GANRECDIR ]; then + if [ ! -L $GANRECDIR ] ; then + rm -rf $GANRECDIR + if ! ln -s $MNTPT/.ganesha $GANRECDIR ; then + echo "ln failed" + fi + fi + else + if ! 
ln -s $MNTPT/.ganesha $GANRECDIR ; then + echo "ln failed" + fi + fi + + mkdir -p $GANRECDIR2 + mkdir -p $GANRECDIR3 +} + case "$1" in init) - # read statd from persistent database - ;; + # read statd from persistent database + ;; startup) - ctdb_service_start - ;; + create_ganesha_recdirs + ctdb_service_start + ;; shutdown) - ctdb_service_stop - ;; + ctdb_service_stop + ;; takeip) - ctdb_service_set_reconfigure - ;; + if [ -n "$2" ] ; then + case $CTDB_CLUSTER_FILESYSTEM_TYPE in + gpfs) + NNUM=`/usr/lpp/mmfs/bin/mmlsconfig myNodeConfigNumber | awk '{print $2}'` + TDATE=`date +"%s"` + TOUCHTGT=$1"_"$TDATE"_"$NNUM"_"$3"_"$4"_"$2 + touch $GANRECDIR2/$TOUCHTGT + ;; + esac + fi + ctdb_service_set_reconfigure + ;; releaseip) - ctdb_service_set_reconfigure - ;; - - monitor) + if [ -n "$2" ] ; then + case $CTDB_CLUSTER_FILESYSTEM_TYPE in + gpfs) + NNUM=`/usr/lpp/mmfs/bin/mmlsconfig myNodeConfigNumber | awk '{print $2}'` + TDATE=`date +"%s"` + TOUCHTGT=$1"_"$TDATE"_"$NNUM"_"$3"_"$4"_"$2 + touch $GANRECDIR2/$TOUCHTGT + ;; + esac + fi + ctdb_service_set_reconfigure + ;; - update_tickles 2049 + monitor) + update_tickles 2049 + create_ganesha_recdirs + service_name=${service_name}_process # check that statd responds to rpc requests # if statd is not running we try to restart it # we only do this IF we have a rpc.statd command. 
# For platforms where rpc.statd does not exist, we skip - # the check completely - p="rpc.statd" - which $p >/dev/null 2>/dev/null && \ + # the check completely + p="rpc.statd" + which $p >/dev/null 2>/dev/null && \ nfs_check_rpc_service "statd" 1 \ - -ge 6 "verbose unhealthy" \ - -eq 4 "verbose restart" \ - -eq 2 "restart:bs" - - PIDFILE="/var/run/ganesha.pid" - RUNNING=0 - if [ -e $PIDFILE ] - then - PID=`cat $PIDFILE` - GANESHA="/usr/bin/gpfs.ganesha.nfsd" - RUNNING=`cat /proc/$PID/cmdline | grep $GANESHA | wc -l` - fi - if [ $RUNNING != 1 ] - then - echo "Trying fast restart of NFS service" - startstop_ganesha restart - fi - - # check that NFS responds to rpc requests - if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then - nfs_check_rpc_service "ganesha" \ -ge 6 "verbose unhealthy" \ -eq 4 "verbose restart" \ -eq 2 "restart:bs" + + PIDFILE="/var/run/ganesha.pid" + CUR_STATE=`get_cluster_fs_state` + GANESHA="/usr/bin/$CTDB_CLUSTER_FILESYSTEM_TYPE.ganesha.nfsd" + if { read PID < $PIDFILE && \ + grep "$GANESHA" "/proc/$PID/cmdline" ; } >/dev/null 2>&1 ; then + ctdb_counter_init "$service_name" + else + if [ $CUR_STATE = "active" ]; then + echo "Trying fast restart of NFS service" + startstop_ganesha restart + ctdb_counter_incr "$service_name" + ctdb_check_counter "error" "-ge" "6" "$service_name" + fi + fi + + service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"_service + # check that NFS is posting forward progress + if [ $CUR_STATE = "active" -a "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then + MAXREDS=2 + MAXSTALL=120 + RESTART=0 + + NUMREDS=`ls $GANRECDIR3 | grep "red" | wc -l` + LASTONE=`ls -t $GANRECDIR3 | sed 's/_/ /' | awk 'NR > 1 {next} {printf $1} '` + # Beware of startup + if [ -z $LASTONE ] ; then + LASTONE=`date +"%s"` + fi + TNOW=$(date +"%s") + TSTALL=$(($TNOW - $LASTONE)) + if [ $NUMREDS -ge $MAXREDS ] ; then + echo restarting because of $NUMREDS red conditions + RESTART=1 + ctdb_counter_incr "$service_name" + ctdb_check_counter 
"error" "-ge" "6" "$service_name" + fi + if [ $TSTALL -ge $MAXSTALL ] ; then + echo restarting because of $TSTALL second stall + RESTART=1 + ctdb_counter_incr "$service_name" + ctdb_check_counter "error" "-ge" "6" "$service_name" + fi + if [ $RESTART -gt 0 ] ; then + startstop_ganesha restart + else + ctdb_counter_init "$service_name" + fi fi # rquotad is sometimes not started correctly on RHEL5 # not a critical service so we dont flag the node as unhealthy - nfs_check_rpc_service "rquotad" 1\ + nfs_check_rpc_service "rquotad" 1 \ -gt 0 "verbose restart:b" - - # Check that directories for shares actually exist. - [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || { - grep Path /etc/ganesha/gpfs.ganesha.exports.conf | - cut -f2 -d\" | ctdb_check_directories - } || exit $? - - # once every 60 seconds, update the statd state database for which - # clients need notifications - LAST_UPDATE=`stat --printf="%Y" "$statd_update_trigger" 2>/dev/null` - CURRENT_TIME=`date +"%s"` - [ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && { + # Check that directories for shares actually exist. + [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || { + grep Path /etc/ganesha/$CTDB_CLUSTER_FILESYSTEM_TYPE.ganesha.exports.conf | + cut -f2 -d\" | ctdb_check_directories + } || exit $? 
+ + # once every 60 seconds, update the statd state database for which + # clients need notifications + LAST_UPDATE=`stat --printf="%Y" "$statd_update_trigger" 2>/dev/null` + CURRENT_TIME=`date +"%s"` + [ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && { touch "$statd_update_trigger" - $CTDB_BASE/statd-callout updatelocal & - $CTDB_BASE/statd-callout updateremote & - } - ;; + $CTDB_BASE/statd-callout updatelocal & + $CTDB_BASE/statd-callout updateremote & + } + ;; - ipreallocated) - ctdb_service_set_reconfigure - ;; - *) - ctdb_standard_event_handler "$@" - ;; + *) + ctdb_standard_event_handler "$@" + ;; esac exit 0 diff --git a/ctdb/config/functions b/ctdb/config/functions index 8fbc0791197..8d186a0e41d 100755 --- a/ctdb/config/functions +++ b/ctdb/config/functions @@ -727,13 +727,7 @@ tickle_tcp_connections() { ######################################################## startstop_ganesha() { - _ganesha_fsal_list="gpfs" - for _fsal in $_ganesha_fsal_list ; do - _service_name="nfs-ganesha-${_fsal}" - if [ -x /etc/init.d/$_service_name ] ; then - break - fi - done + _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE" case "$1" in start) service "$_service_name" start diff --git a/ctdb/config/statd-callout b/ctdb/config/statd-callout index 72a2ca96401..a9c2e3dfba5 100755 --- a/ctdb/config/statd-callout +++ b/ctdb/config/statd-callout @@ -136,11 +136,7 @@ case "$1" in # will respond "strangely" immediately after restarting it, which # causes clients to fail to reclaim the locks. # - if [ "$NFS_SERVER_MODE" = "ganesha" ] ; then - startstop_ganesha stop >/dev/null 2>&1 - sleep 2 - startstop_ganesha start >/dev/null 2>&1 - else + if [ "$NFS_SERVER_MODE" != "ganesha" ] ; then startstop_nfslock stop >/dev/null 2>&1 sleep 2 startstop_nfslock start >/dev/null 2>&1 |