summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSimo Sorce <simo@redhat.com>2016-01-12 20:07:59 -0500
committerSimo Sorce <simo@redhat.com>2016-04-05 13:00:12 -0400
commite215e5534bb56f3887521443ce6c77d13ea3518d (patch)
tree4d1f8ee865c3ff12e84dd58811ca983c194adb5d
parentdad416a9b0095e1c423b7da65db7c636fa69e614 (diff)
downloadsssd-e215e5534bb56f3887521443ce6c77d13ea3518d.tar.gz
sssd-e215e5534bb56f3887521443ce6c77d13ea3518d.tar.xz
sssd-e215e5534bb56f3887521443ce6c77d13ea3518d.zip
Util: Add watchdog helper
The watchdog uses a kernel timer to issue a signal to the process. It checks if the ticker is not being reset by the main event loop, which would indicate that the process got stuck. At the same time it sets a tevent timer to clear the watchdog ticker, so that the watchdog handler is kept happy. If the watchdog detects that the timer event failed to reset the watchdog three times in a row then the process is killed. Normally the monitor will detect that the child terminated and will reschedule it. Related: https://fedorahosted.org/sssd/ticket/2921
-rw-r--r--Makefile.am1
-rw-r--r--src/util/util.h4
-rw-r--r--src/util/util_watchdog.c142
3 files changed, 147 insertions, 0 deletions
diff --git a/Makefile.am b/Makefile.am
index e2d2c38a..1559548d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -867,6 +867,7 @@ libsss_util_la_SOURCES = \
src/util/well_known_sids.c \
src/util/string_utils.c \
src/util/become_user.c \
+ src/util/util_watchdog.c \
$(NULL)
libsss_util_la_CFLAGS = \
$(AM_CFLAGS) \
diff --git a/src/util/util.h b/src/util/util.h
index 05ee8758..dd394514 100644
--- a/src/util/util.h
+++ b/src/util/util.h
@@ -587,4 +587,8 @@ int sss_unique_file(TALLOC_CTX *owner,
*/
int sss_unique_filename(TALLOC_CTX *owner, char *path_tmpl);
+/* from util_watchdog.c
+ *
+ * setup_watchdog() arms a timer-signal based watchdog that terminates the
+ * process when the event loop running on ev stops resetting it. interval
+ * is the check period in seconds; 0 selects the built-in default. It
+ * returns EOK on success or an errno value on failure.
+ *
+ * teardown_watchdog() deletes the watchdog timer and frees the pending
+ * watchdog event.
+ */
+int setup_watchdog(struct tevent_context *ev, int interval);
+void teardown_watchdog(void);
+
#endif /* __SSSD_UTIL_H__ */
diff --git a/src/util/util_watchdog.c b/src/util/util_watchdog.c
new file mode 100644
index 00000000..9fef8427
--- /dev/null
+++ b/src/util/util_watchdog.c
@@ -0,0 +1,142 @@
+/*
+ SSSD
+
+ Timer Watchdog routines
+
+ Copyright (C) Simo Sorce 2016
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "util/util.h"
+
+#define WATCHDOG_DEF_INTERVAL 10
+
+/* Watchdog state. This is intentionally a global object: the signal
+ * handler only receives the signal number, so it has to reach its state
+ * through a global. */
+struct watchdog_ctx {
+    /* kernel timer created by setup_watchdog() */
+    timer_t timerid;
+    /* period of the watchdog; only tv_sec is used when re-arming */
+    struct timeval interval;
+    /* most recently scheduled tevent timer that resets the ticks */
+    struct tevent_timer *te;
+    /* timer signals received since the last reset */
+    volatile int ticks;
+};
+
+struct watchdog_ctx watchdog_ctx;
+
+/* the watchdog is purposefully *not* handled by the tevent
+ * signal handler as it is meant to check if the daemon is
+ * still processing the event queue itself. A stuck process
+ * may not handle the event queue at all and thus not handle
+ * signals either */
+static void watchdog_handler(int sig)
+{
+ /* if 3 ticks passed by kills itself */
+
+ if (__sync_add_and_fetch(&watchdog_ctx.ticks, 1) > 3) {
+ DEBUG(SSSDBG_FATAL_FAILURE,
+ "Watchdog timer overflow, killing process!\n");
+ orderly_shutdown(1);
+ }
+}
+
+/* Atomically zero the tick counter. Called by watchdog_event_handler()
+ * to show the signal handler that the event loop is still running. */
+static void watchdog_reset(void)
+{
+    (void)__sync_fetch_and_and(&watchdog_ctx.ticks, 0);
+}
+
+/* tevent timer callback that keeps the watchdog quiet.
+ *
+ * Clears the tick counter and then schedules itself again on ev,
+ * watchdog_ctx.interval.tv_sec seconds from now. te, current_time and
+ * private_data are not used.
+ */
+static void watchdog_event_handler(struct tevent_context *ev,
+                                   struct tevent_timer *te,
+                                   struct timeval current_time,
+                                   void *private_data)
+{
+    struct timeval next_run;
+
+    /* clearing the ticks comes before anything else */
+    watchdog_reset();
+
+    /* schedule the next watchdog event */
+    next_run = tevent_timeval_current_ofs(watchdog_ctx.interval.tv_sec, 0);
+    watchdog_ctx.te = tevent_add_timer(ev, ev, next_run,
+                                       watchdog_event_handler, NULL);
+    if (watchdog_ctx.te != NULL) {
+        return;
+    }
+
+    /* only warn: with no event left to reset the ticks, the watchdog
+     * will kill the process soon enough */
+    DEBUG(SSSDBG_FATAL_FAILURE,
+          "Failed to create a watchdog timer event!\n");
+}
+
+/* Arm the process watchdog.
+ *
+ * Installs watchdog_handler() for SIGRTMIN, creates a CLOCK_MONOTONIC
+ * kernel timer that raises that signal every interval seconds (first
+ * expiry after interval + 1 seconds, giving the event loop a head start)
+ * and schedules the tevent timer that resets the tick counter.
+ *
+ * ev        event context the reset timer is scheduled on
+ * interval  period in seconds; 0 selects WATCHDOG_DEF_INTERVAL
+ *
+ * Returns EOK on success. On failure returns the errno of the failing
+ * timer call, or ENOMEM if the tevent timer could not be registered; in
+ * both cases where the kernel timer had already been created it is
+ * deleted again before returning.
+ */
+int setup_watchdog(struct tevent_context *ev, int interval)
+{
+    struct sigevent sev = { 0 };
+    struct itimerspec its;
+    int signum = SIGRTMIN;
+    int ret;
+
+    CatchSignal(signum, watchdog_handler);
+
+    sev.sigev_notify = SIGEV_SIGNAL;
+    sev.sigev_signo = signum;
+    sev.sigev_value.sival_ptr = &watchdog_ctx.timerid;
+    errno = 0;
+    ret = timer_create(CLOCK_MONOTONIC, &sev, &watchdog_ctx.timerid);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to create watchdog timer (%d) [%s]\n",
+              ret, strerror(ret));
+        return ret;
+    }
+
+    if (interval == 0) {
+        interval = WATCHDOG_DEF_INTERVAL;
+    }
+    watchdog_ctx.interval.tv_sec = interval;
+    watchdog_ctx.interval.tv_usec = 0;
+
+    /* Start the timer */
+    /* we give 1 second head start to the watchdog event */
+    its.it_value.tv_sec = interval + 1;
+    its.it_value.tv_nsec = 0;
+    its.it_interval.tv_sec = interval;
+    its.it_interval.tv_nsec = 0;
+    errno = 0;
+    ret = timer_settime(watchdog_ctx.timerid, 0, &its, NULL);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to start watchdog timer (%d) [%s]\n",
+              ret, strerror(ret));
+        /* do not leak the timer that was created above */
+        (void)timer_delete(watchdog_ctx.timerid);
+        return ret;
+    }
+
+    /* Add the watchdog event and make it fire as fast as the timer */
+    watchdog_event_handler(ev, NULL, tevent_timeval_zero(), NULL);
+    if (watchdog_ctx.te == NULL) {
+        /* An armed timer whose ticks are never reset would kill a healthy
+         * process; undo the timer and report the failure instead (the
+         * event handler has already logged it). */
+        (void)timer_delete(watchdog_ctx.timerid);
+        return ENOMEM;
+    }
+
+    return EOK;
+}
+
+/* Stop the watchdog.
+ *
+ * Deletes the kernel timer so that no further watchdog signals are
+ * generated, then frees the pending tevent timer. A timer_delete()
+ * failure is logged but does not prevent the event from being freed.
+ */
+void teardown_watchdog(void)
+{
+    int ret;
+
+    /* Disarm the timer */
+    errno = 0;
+    ret = timer_delete(watchdog_ctx.timerid);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to destroy watchdog timer (%d) [%s]\n",
+              ret, strerror(ret));
+    }
+
+    /* and kill the watchdog event; clear the pointer so that a repeated
+     * teardown never frees the same event twice */
+    talloc_free(watchdog_ctx.te);
+    watchdog_ctx.te = NULL;
+}
+