diff options
author | Stephen Gallagher <sgallagh@redhat.com> | 2014-12-10 14:16:49 -0500 |
---|---|---|
committer | Jakub Hrozek <jhrozek@redhat.com> | 2015-01-07 12:09:32 +0100 |
commit | 152251b13a99c88054055d46600e0478c4f7bd05 (patch) | |
tree | a1f841a86c1d991cf2fa5782b579248a291fa19a /src | |
parent | ad1bc5e129a9a2128851aa028247f8e5fab54cc8 (diff) | |
download | sssd-152251b13a99c88054055d46600e0478c4f7bd05.tar.gz sssd-152251b13a99c88054055d46600e0478c4f7bd05.tar.xz sssd-152251b13a99c88054055d46600e0478c4f7bd05.zip |
monitor: Service restart fixes
There are actually two bugs here:
1) When either the kill(SIGTERM) or kill(SIGKILL) commands returned
failure (for any reason), we would talloc_free(svc) which removed it
from being eligible for restart, resulting in the service never
starting again without an SSSD service restart.
2) There is a fairly wide race condition where it's possible for a
SIGKILL timer to "catch up" to the child exit handler between us
noticing the termination and actually restarting it. The race
happens because we re-enter the mainloop and add a restart
timeout to avoid a quick failure if we keep restarting due to a
transitory issue (the mt_svc object, and therefore the SIGKILL
timer, were never freed until we got to the actual service
restart).
We can minimize this race by recording the timer_event for the
SIGKILL timeout in the mt_svc object. This way, if the process
exits via SIGTERM, we will immediately remove the timer for the
SIGKILL. Additionally, we'll catch the special-case of an ESRCH
response from the kill(SIGKILL) and assume that it means that the
process has exited. The only other two possible errors are
* EINVAL: (an invalid signal was specified) - This should be
impossible, obviously.
* EPERM: This process doesn't have permission to send signals to
this PID. If this happens, it's either an SELinux bug or
else the process has terminated and a new process that
SSSD doesn't control has taken the ID over.
So in the incredibly unlikely case that one of those occurs, we'll
just go ahead and try to start a new process.
This patch also removes the incorrect talloc_free(svc) calls on the
kill() failures and replaces them with an attempt to just start up
the service again and hope for the best.
Resolves:
https://fedorahosted.org/sssd/ticket/2525
Reviewed-by: Pavel Březina <pbrezina@redhat.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/monitor/monitor.c | 94 |
1 files changed, 74 insertions, 20 deletions
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index c63206b78..afefe7f11 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -119,6 +119,8 @@ struct mt_svc { int ping_time; int kill_time; + struct tevent_timer *kill_timer; + bool svc_started; int restarts; @@ -588,6 +590,7 @@ static void set_tasks_checker(struct mt_svc *svc) svc->ping_ev = te; } +static void monitor_restart_service(struct mt_svc *svc); static void mt_svc_sigkill(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *ptr); @@ -595,7 +598,6 @@ static int monitor_kill_service (struct mt_svc *svc) { int ret; struct timeval tv; - struct tevent_timer *te; ret = kill(svc->pid, SIGTERM); if (ret == -1) { @@ -604,29 +606,30 @@ static int monitor_kill_service (struct mt_svc *svc) "Sending signal to child (%s:%d) failed: [%d]: %s! " "Ignore and pretend child is dead.\n", svc->name, svc->pid, ret, strerror(ret)); - goto done; + /* The only thing we can try here is to launch a new process + * and hope that it works. + */ + monitor_restart_service(svc); + return EOK; } /* Set up a timer to send SIGKILL if this process * doesn't exit within sixty seconds */ tv = tevent_timeval_current_ofs(svc->kill_time, 0); - te = tevent_add_timer(svc->mt_ctx->ev, svc, tv, mt_svc_sigkill, svc); - if (te == NULL) { + svc->kill_timer = tevent_add_timer(svc->mt_ctx->ev, + svc, + tv, + mt_svc_sigkill, + svc); + if (svc->kill_timer == NULL) { /* Nothing much we can do */ - ret = ENOMEM; DEBUG(SSSDBG_CRIT_FAILURE, "Failed to allocate timed event: mt_svc_sigkill.\n"); - goto done; + /* We'll just have to hope that the SIGTERM succeeds */ } - ret = EOK; - -done: - if (ret != EOK) { - talloc_free(svc); - } - return ret; + return EOK; } static void mt_svc_sigkill(struct tevent_context *ev, @@ -645,12 +648,35 @@ static void mt_svc_sigkill(struct tevent_context *ev, ret = kill(svc->pid, SIGKILL); if (ret != EOK) { + ret = errno; DEBUG(SSSDBG_FATAL_FAILURE, "Sending signal to child (%s:%d) failed! " "Ignore and pretend child is dead.\n", svc->name, svc->pid); - talloc_free(svc); + + if (ret == ESRCH) { + /* The process doesn't exist + * This most likely means we hit a race where + * the SIGTERM concluded just after the timer + * fired but before we called kill() here. + * We'll just do nothing, since the + * mt_svc_exit_handler() should be doing the + * necessary work. + */ + return; + } + + /* Something went really wrong. + * The only thing we can try here is to launch a new process + * and hope that it works. + */ + monitor_restart_service(svc); } + + /* The process should terminate immediately and then be + * restarted by the mt_svc_exit_handler() + */ + return; } static void reload_reply(DBusPendingCall *pending, void *data) @@ -2705,11 +2731,7 @@ static void mt_svc_restart(struct tevent_context *ev, static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) { struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc); - struct mt_ctx *mt_ctx = svc->mt_ctx; - time_t now = time(NULL); - struct tevent_timer *te; - struct timeval tv; - int restart_delay; + if (WIFEXITED(wait_status)) { DEBUG(SSSDBG_OP_FAILURE, @@ -2731,6 +2753,28 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) return; } + /* Clear the kill_timer so we don't try to SIGKILL it after it's + * already gone. + */ + talloc_zfree(svc->kill_timer); + + /* Check the number of restart tries and relaunch the service */ + monitor_restart_service(svc); + + return; +} + +static void monitor_restart_service(struct mt_svc *svc) +{ + struct mt_ctx *mt_ctx = svc->mt_ctx; + int restart_delay; + time_t now = time(NULL); + struct tevent_timer *te; + struct timeval tv; + + /* Handle the actual checks for how many times to restart this + * service before giving up. + */ if ((now - svc->last_restart) > MONITOR_RESTART_CNT_INTERVAL_RESET) { svc->restarts = 0; } @@ -2739,9 +2783,19 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) if (svc->restarts > MONITOR_MAX_SVC_RESTARTS) { DEBUG(SSSDBG_FATAL_FAILURE, "Process [%s], definitely stopped!\n", svc->name); + + sss_log(SSS_LOG_ERR, + "Exiting the SSSD. Could not restart critical service [%s].", + svc->name); + talloc_free(svc); - /* exit with error */ + /* exit the SSSD with an error, shutting down all + * services and domains. + * We do this because if one of the responders is down + * and can't come back up, this is the only way to + * guarantee admin intervention. + */ monitor_quit(mt_ctx, 1); return; } |