From 152251b13a99c88054055d46600e0478c4f7bd05 Mon Sep 17 00:00:00 2001 From: Stephen Gallagher Date: Wed, 10 Dec 2014 14:16:49 -0500 Subject: monitor: Service restart fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are actually two bugs here: 1) When either the kill(SIGTERM) or kill(SIGKILL) commands returned failure (for any reason), we would talloc_free(svc) which removed it from being eligible for restart, resulting in the service never starting again without an SSSD service restart. 2) There is a fairly wide race condition where it's possible for a SIGKILL timer to "catch up" to the child exit handler between us noticing the termination and actually restarting it. The race happens because we re-enter the mainloop and add a restart timeout to avoid a quick failure if we keep restarting due to a transitory issue (the mt_svc object, and therefore the SIGKILL timer, were never freed until we got to the actual service restart). We can minimize this race by recording the timer_event for the SIGKILL timeout in the mt_svc object. This way, if the process exits via SIGTERM, we will immediately remove the timer for the SIGKILL. Additionally, we'll catch the special-case of an ESRCH response from the kill(SIGKILL) and assume that it means that the process has exited. The only other two possible errors are * EINVAL: (an invalid signal was specified) - This should be impossible, obviously. * EPERM: This process doesn't have permission to send signals to this PID. If this happens, it's either an SELinux bug or else the process has terminated and a new process that SSSD doesn't control has taken the ID over. So in the incredibly unlikely case that one of those occurs, we'll just go ahead and try to start a new process. This patch also removes the incorrect talloc_free(svc) calls on the kill() failures and replaces them with an attempt to just start up the service again and hope for the best. Resolves: https://fedorahosted.org/sssd/ticket/2525 Reviewed-by: Pavel Březina --- src/monitor/monitor.c | 94 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 20 deletions(-) (limited to 'src/monitor') diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index c63206b78..afefe7f11 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -119,6 +119,8 @@ struct mt_svc { int ping_time; int kill_time; + struct tevent_timer *kill_timer; + bool svc_started; int restarts; @@ -588,6 +590,7 @@ static void set_tasks_checker(struct mt_svc *svc) svc->ping_ev = te; } +static void monitor_restart_service(struct mt_svc *svc); static void mt_svc_sigkill(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *ptr); @@ -595,7 +598,6 @@ static int monitor_kill_service (struct mt_svc *svc) { int ret; struct timeval tv; - struct tevent_timer *te; ret = kill(svc->pid, SIGTERM); if (ret == -1) { @@ -604,29 +606,30 @@ static int monitor_kill_service (struct mt_svc *svc) "Sending signal to child (%s:%d) failed: [%d]: %s! " "Ignore and pretend child is dead.\n", svc->name, svc->pid, ret, strerror(ret)); - goto done; + /* The only thing we can try here is to launch a new process + * and hope that it works. + */ + monitor_restart_service(svc); + return EOK; } /* Set up a timer to send SIGKILL if this process * doesn't exit within sixty seconds */ tv = tevent_timeval_current_ofs(svc->kill_time, 0); - te = tevent_add_timer(svc->mt_ctx->ev, svc, tv, mt_svc_sigkill, svc); - if (te == NULL) { + svc->kill_timer = tevent_add_timer(svc->mt_ctx->ev, + svc, + tv, + mt_svc_sigkill, + svc); + if (svc->kill_timer == NULL) { /* Nothing much we can do */ - ret = ENOMEM; DEBUG(SSSDBG_CRIT_FAILURE, "Failed to allocate timed event: mt_svc_sigkill.\n"); - goto done; + /* We'll just have to hope that the SIGTERM succeeds */ } - ret = EOK; - -done: - if (ret != EOK) { - talloc_free(svc); - } - return ret; + return EOK; } static void mt_svc_sigkill(struct tevent_context *ev, @@ -645,12 +648,35 @@ static void mt_svc_sigkill(struct tevent_context *ev, ret = kill(svc->pid, SIGKILL); if (ret != EOK) { + ret = errno; DEBUG(SSSDBG_FATAL_FAILURE, "Sending signal to child (%s:%d) failed! " "Ignore and pretend child is dead.\n", svc->name, svc->pid); - talloc_free(svc); + + if (ret == ESRCH) { + /* The process doesn't exist + * This most likely means we hit a race where + * the SIGTERM concluded just after the timer + * fired but before we called kill() here. + * We'll just do nothing, since the + * mt_svc_exit_handler() should be doing the + * necessary work. + */ + return; + } + + /* Something went really wrong. + * The only thing we can try here is to launch a new process + * and hope that it works. + */ + monitor_restart_service(svc); } + + /* The process should terminate immediately and then be + * restarted by the mt_svc_exit_handler() + */ + return; } static void reload_reply(DBusPendingCall *pending, void *data) @@ -2705,11 +2731,7 @@ static void mt_svc_restart(struct tevent_context *ev, static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) { struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc); - struct mt_ctx *mt_ctx = svc->mt_ctx; - time_t now = time(NULL); - struct tevent_timer *te; - struct timeval tv; - int restart_delay; + if (WIFEXITED(wait_status)) { DEBUG(SSSDBG_OP_FAILURE, @@ -2731,6 +2753,28 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) return; } + /* Clear the kill_timer so we don't try to SIGKILL it after it's + * already gone. + */ + talloc_zfree(svc->kill_timer); + + /* Check the number of restart tries and relaunch the service */ + monitor_restart_service(svc); + + return; +} + +static void monitor_restart_service(struct mt_svc *svc) +{ + struct mt_ctx *mt_ctx = svc->mt_ctx; + int restart_delay; + time_t now = time(NULL); + struct tevent_timer *te; + struct timeval tv; + + /* Handle the actual checks for how many times to restart this + * service before giving up. + */ if ((now - svc->last_restart) > MONITOR_RESTART_CNT_INTERVAL_RESET) { svc->restarts = 0; } @@ -2739,9 +2783,19 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) if (svc->restarts > MONITOR_MAX_SVC_RESTARTS) { DEBUG(SSSDBG_FATAL_FAILURE, "Process [%s], definitely stopped!\n", svc->name); + + sss_log(SSS_LOG_ERR, + "Exiting the SSSD. Could not restart critical service [%s].", + svc->name); + talloc_free(svc); - /* exit with error */ + /* exit the SSSD with an error, shutting down all + * services and domains. + * We do this because if one of the responders is down + * and can't come back up, this is the only way to + * guarantee admin intervention. + */ monitor_quit(mt_ctx, 1); return; } -- cgit