summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSimo Sorce <simo@redhat.com>2016-01-12 20:07:59 -0500
committerJakub Hrozek <jhrozek@redhat.com>2016-06-29 21:45:30 +0200
commit75ba524d356fed615a9c92152f64aebf0bdaf9c2 (patch)
tree9fa8b5f4331ec182921eb43f629c3b6880910dd6
parent96a624877512ac352736047023b65b8688039ae1 (diff)
downloadsssd-75ba524d356fed615a9c92152f64aebf0bdaf9c2.tar.gz
sssd-75ba524d356fed615a9c92152f64aebf0bdaf9c2.tar.xz
sssd-75ba524d356fed615a9c92152f64aebf0bdaf9c2.zip
Util: Add watchdog helper
The watchdog uses a kernel timer to issue a signal to the process. It checks whether the ticker is still being reset by the main event loop; a ticker that is not reset indicates that the process got stuck. At the same time it sets a tevent timer to clear the watchdog ticker, so that the watchdog handler is kept happy. If the watchdog detects that the timer event failed to reset the watchdog three times in a row, the process is killed. Normally the monitor will detect that the child terminated and will reschedule it. Related: https://fedorahosted.org/sssd/ticket/2921 Reviewed-by: Jakub Hrozek <jhrozek@redhat.com> Reviewed-by: Pavel Březina <pbrezina@redhat.com>
-rw-r--r--Makefile.am1
-rw-r--r--src/util/util.h4
-rw-r--r--src/util/util_watchdog.c141
3 files changed, 146 insertions, 0 deletions
diff --git a/Makefile.am b/Makefile.am
index 241086355..273842ccb 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -923,6 +923,7 @@ libsss_util_la_SOURCES = \
src/util/well_known_sids.c \
src/util/string_utils.c \
src/util/become_user.c \
+ src/util/util_watchdog.c \
$(NULL)
libsss_util_la_CFLAGS = \
$(AM_CFLAGS) \
diff --git a/src/util/util.h b/src/util/util.h
index a8b4776be..96f154f6a 100644
--- a/src/util/util.h
+++ b/src/util/util.h
@@ -617,4 +617,8 @@ int sss_unique_file(TALLOC_CTX *owner,
*/
int sss_unique_filename(TALLOC_CTX *owner, char *path_tmpl);
+/* from util_watchdog.c */
+/* Start a watchdog that terminates the process when the timer event it
+ * schedules on ev stops being run. interval is the check period in seconds;
+ * 0 selects the built-in default. Returns EOK on success or an errno value
+ * if the underlying POSIX timer cannot be set up. */
+int setup_watchdog(struct tevent_context *ev, int interval);
+/* Stop the watchdog: delete its POSIX timer and free its pending tevent
+ * timer. */
+void teardown_watchdog(void);
+
#endif /* __SSSD_UTIL_H__ */
diff --git a/src/util/util_watchdog.c b/src/util/util_watchdog.c
new file mode 100644
index 000000000..cdb379653
--- /dev/null
+++ b/src/util/util_watchdog.c
@@ -0,0 +1,141 @@
+/*
+ SSSD
+
+ Timer Watchdog routines
+
+ Copyright (C) Simo Sorce 2016
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "util/util.h"
+
+#define WATCHDOG_DEF_INTERVAL 10
+
+/*
+ * State shared between the watchdog signal handler and the tevent timer
+ * callback.
+ */
+struct watchdog_ctx {
+    /* POSIX timer whose expirations deliver the watchdog signal */
+    timer_t timerid;
+    /* period of the resetting tevent timer; only tv_sec is read back */
+    struct timeval interval;
+    /* the most recently scheduled tevent timer that resets the ticks */
+    struct tevent_timer *te;
+    /* timer expirations counted since the last reset */
+    volatile int ticks;
+};
+
+/* this is intentionally a global variable */
+struct watchdog_ctx watchdog_ctx;
+
+/*
+ * Signal handler for the watchdog timer.
+ *
+ * The watchdog is purposefully *not* handled by the tevent signal handler,
+ * as it is meant to check if the daemon is still processing the event queue
+ * itself. A stuck process may not handle the event queue at all and thus
+ * not handle signals either.
+ *
+ * Every delivery of the signal bumps the tick counter, which is cleared
+ * again by watchdog_reset(). Once the counter exceeds 3, i.e. on the fourth
+ * delivery in a row without a reset, the process terminates itself with
+ * exit status 1.
+ *
+ * Only async-signal-safe calls are used here. The signal can interrupt the
+ * main code at any point, including inside the allocator or the logger, so
+ * formatted logging through DEBUG() or a regular shutdown path could
+ * deadlock and leave the stuck process alive.
+ *
+ * sig: number of the delivered signal (unused)
+ */
+static void watchdog_handler(int sig)
+{
+    if (__sync_add_and_fetch(&watchdog_ctx.ticks, 1) > 3) {
+        /* NOTE(review): this replaces orderly_shutdown(1), whose body is
+         * not visible from this file. It is assumed to send SIGTERM to the
+         * process group when called by the group leader before exiting;
+         * confirm, and port any other required cleanup using
+         * async-signal-safe calls only. */
+        if (getpid() == getpgrp()) {
+            kill(-getpgrp(), SIGTERM);
+        }
+        _exit(1);
+    }
+}
+
+/*
+ * Clear the watchdog tick counter.
+ *
+ * Called by watchdog_event_handler() to record that the event loop is still
+ * running its timers. An atomic builtin is used because the counter is also
+ * modified from the watchdog signal handler.
+ */
+static void watchdog_reset(void)
+{
+    /* AND-ing with 0 atomically stores 0; the old value is not needed */
+    (void) __sync_fetch_and_and(&watchdog_ctx.ticks, 0);
+}
+
+/*
+ * tevent timer callback that keeps the watchdog from firing.
+ *
+ * Clears the tick counter and then schedules itself again on ev, one
+ * watchdog interval from now. It is also called directly by
+ * setup_watchdog() to schedule the first timer.
+ *
+ * ev:            event context the next timer is added to; it is also
+ *                passed as the memory context of that timer
+ * te:            timer that fired (unused; NULL on the direct call)
+ * current_time:  time of the call (unused)
+ * private_data:  unused; the timer is always registered with NULL
+ */
+static void watchdog_event_handler(struct tevent_context *ev,
+                                   struct tevent_timer *te,
+                                   struct timeval current_time,
+                                   void *private_data)
+{
+    struct timeval next_run;
+
+    /* reset the watchdog ticks before anything else */
+    watchdog_reset();
+
+    /* then queue the next watchdog event */
+    next_run = tevent_timeval_current_ofs(watchdog_ctx.interval.tv_sec, 0);
+    watchdog_ctx.te = tevent_add_timer(ev, ev, next_run,
+                                       watchdog_event_handler, NULL);
+    if (watchdog_ctx.te != NULL) {
+        return;
+    }
+
+    /* Nothing more to do on failure: with no timer left to reset the
+     * ticks, the watchdog will kill the process soon enough, so we just
+     * warn. */
+    DEBUG(SSSDBG_FATAL_FAILURE,
+          "Failed to create a watchdog timer event!\n");
+}
+
+/*
+ * Start the watchdog for the calling process.
+ *
+ * Registers watchdog_handler() for SIGRTMIN through CatchSignal(), creates
+ * a CLOCK_MONOTONIC POSIX timer that raises that signal every interval
+ * seconds (the first expiration is delayed by one extra second to give the
+ * resetting event a head start), and schedules the tevent timer that keeps
+ * clearing the tick counter.
+ *
+ * ev:        event context the resetting timer is added to
+ * interval:  watchdog period in seconds; 0 selects WATCHDOG_DEF_INTERVAL
+ *
+ * Returns EOK on success, EINVAL for a negative interval, or the errno
+ * value set by timer_create()/timer_settime(). A timer that was created
+ * but could not be armed is deleted again before returning.
+ */
+int setup_watchdog(struct tevent_context *ev, int interval)
+{
+    struct sigevent sev = { 0 };
+    struct itimerspec its;
+    int signum = SIGRTMIN;
+    int ret;
+
+    /* A negative period can never be armed; reject it before touching the
+     * signal disposition or creating a timer. */
+    if (interval < 0) {
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Invalid watchdog interval (%d)\n", interval);
+        return EINVAL;
+    }
+    if (interval == 0) {
+        interval = WATCHDOG_DEF_INTERVAL;
+    }
+
+    CatchSignal(signum, watchdog_handler);
+
+    sev.sigev_notify = SIGEV_SIGNAL;
+    sev.sigev_signo = signum;
+    sev.sigev_value.sival_ptr = &watchdog_ctx.timerid;
+    ret = timer_create(CLOCK_MONOTONIC, &sev, &watchdog_ctx.timerid);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to create watchdog timer (%d) [%s]\n",
+              ret, strerror(ret));
+        return ret;
+    }
+
+    watchdog_ctx.interval.tv_sec = interval;
+    watchdog_ctx.interval.tv_usec = 0;
+
+    /* Start the timer */
+    /* we give 1 second head start to the watchdog event; the addition is
+     * done in time_t rather than in int */
+    its.it_value.tv_sec = (time_t) interval + 1;
+    its.it_value.tv_nsec = 0;
+    its.it_interval.tv_sec = interval;
+    its.it_interval.tv_nsec = 0;
+    ret = timer_settime(watchdog_ctx.timerid, 0, &its, NULL);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to start watchdog timer (%d) [%s]\n",
+              ret, strerror(ret));
+        /* do not leak the timer created above */
+        timer_delete(watchdog_ctx.timerid);
+        return ret;
+    }
+
+    /* Add the watchdog event and make it fire as fast as the timer */
+    watchdog_event_handler(ev, NULL, tevent_timeval_zero(), NULL);
+
+    return EOK;
+}
+
+/*
+ * Stop the watchdog: delete the POSIX timer and free the pending tevent
+ * timer. A failure to delete the timer is only logged.
+ */
+void teardown_watchdog(void)
+{
+    int ret;
+
+    /* Disarm the timer */
+    errno = 0;
+    ret = timer_delete(watchdog_ctx.timerid);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to destroy watchdog timer (%d) [%s]\n",
+              ret, strerror(ret));
+    }
+
+    /* and kill the watchdog event; forget the pointer afterwards so that a
+     * repeated teardown does not free the same timer twice */
+    talloc_free(watchdog_ctx.te);
+    watchdog_ctx.te = NULL;
+}