summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Gallagher <sgallagh@redhat.com>2014-12-10 14:16:49 -0500
committerJakub Hrozek <jhrozek@redhat.com>2015-01-07 12:09:32 +0100
commit152251b13a99c88054055d46600e0478c4f7bd05 (patch)
treea1f841a86c1d991cf2fa5782b579248a291fa19a
parentad1bc5e129a9a2128851aa028247f8e5fab54cc8 (diff)
downloadsssd-152251b13a99c88054055d46600e0478c4f7bd05.tar.gz
sssd-152251b13a99c88054055d46600e0478c4f7bd05.tar.xz
sssd-152251b13a99c88054055d46600e0478c4f7bd05.zip
monitor: Service restart fixes
There are actually two bugs here: 1) When either the kill(SIGTERM) or kill(SIGKILL) commands returned failure (for any reason), we would talloc_free(svc) which removed it from being eligible for restart, resulting in the service never starting again without an SSSD service restart. 2) There is a fairly wide race condition where it's possible for a SIGKILL timer to "catch up" to the child exit handler between us noticing the termination and actually restarting it. The race happens because we re-enter the mainloop and add a restart timeout to avoid a quick failure if we keep restarting due to a transitory issue (the mt_svc object, and therefore the SIGKILL timer, were never freed until we got to the actual service restart). We can minimize this race by recording the timer_event for the SIGKILL timeout in the mt_svc object. This way, if the process exits via SIGTERM, we will immediately remove the timer for the SIGKILL. Additionally, we'll catch the special-case of an ESRCH response from the kill(SIGKILL) and assume that it means that the process has exited. The only other two possible errors are * EINVAL: (an invalid signal was specified) - This should be impossible, obviously. * EPERM: This process doesn't have permission to send signals to this PID. If this happens, it's either an SELinux bug or else the process has terminated and a new process that SSSD doesn't control has taken the ID over. So in the incredibly unlikely case that one of those occurs, we'll just go ahead and try to start a new process. This patch also removes the incorrect talloc_free(svc) calls on the kill() failures and replaces them with an attempt to just start up the service again and hope for the best. Resolves: https://fedorahosted.org/sssd/ticket/2525 Reviewed-by: Pavel Březina <pbrezina@redhat.com>
-rw-r--r--src/monitor/monitor.c94
1 files changed, 74 insertions, 20 deletions
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c
index c63206b78..afefe7f11 100644
--- a/src/monitor/monitor.c
+++ b/src/monitor/monitor.c
@@ -119,6 +119,8 @@ struct mt_svc {
int ping_time;
int kill_time;
+ struct tevent_timer *kill_timer;
+
bool svc_started;
int restarts;
@@ -588,6 +590,7 @@ static void set_tasks_checker(struct mt_svc *svc)
svc->ping_ev = te;
}
+static void monitor_restart_service(struct mt_svc *svc);
static void mt_svc_sigkill(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr);
@@ -595,7 +598,6 @@ static int monitor_kill_service (struct mt_svc *svc)
{
int ret;
struct timeval tv;
- struct tevent_timer *te;
ret = kill(svc->pid, SIGTERM);
if (ret == -1) {
@@ -604,29 +606,30 @@ static int monitor_kill_service (struct mt_svc *svc)
"Sending signal to child (%s:%d) failed: [%d]: %s! "
"Ignore and pretend child is dead.\n",
svc->name, svc->pid, ret, strerror(ret));
- goto done;
+ /* The only thing we can try here is to launch a new process
+ * and hope that it works.
+ */
+ monitor_restart_service(svc);
+ return EOK;
}
/* Set up a timer to send SIGKILL if this process
* doesn't exit within sixty seconds
*/
tv = tevent_timeval_current_ofs(svc->kill_time, 0);
- te = tevent_add_timer(svc->mt_ctx->ev, svc, tv, mt_svc_sigkill, svc);
- if (te == NULL) {
+ svc->kill_timer = tevent_add_timer(svc->mt_ctx->ev,
+ svc,
+ tv,
+ mt_svc_sigkill,
+ svc);
+ if (svc->kill_timer == NULL) {
/* Nothing much we can do */
- ret = ENOMEM;
DEBUG(SSSDBG_CRIT_FAILURE,
"Failed to allocate timed event: mt_svc_sigkill.\n");
- goto done;
+ /* We'll just have to hope that the SIGTERM succeeds */
}
- ret = EOK;
-
-done:
- if (ret != EOK) {
- talloc_free(svc);
- }
- return ret;
+ return EOK;
}
static void mt_svc_sigkill(struct tevent_context *ev,
@@ -645,12 +648,35 @@ static void mt_svc_sigkill(struct tevent_context *ev,
ret = kill(svc->pid, SIGKILL);
if (ret != EOK) {
+ ret = errno;
DEBUG(SSSDBG_FATAL_FAILURE,
"Sending signal to child (%s:%d) failed! "
"Ignore and pretend child is dead.\n",
svc->name, svc->pid);
- talloc_free(svc);
+
+ if (ret == ESRCH) {
+ /* The process doesn't exist
+ * This most likely means we hit a race where
+ * the SIGTERM concluded just after the timer
+ * fired but before we called kill() here.
+ * We'll just do nothing, since the
+ * mt_svc_exit_handler() should be doing the
+ * necessary work.
+ */
+ return;
+ }
+
+ /* Something went really wrong.
+ * The only thing we can try here is to launch a new process
+ * and hope that it works.
+ */
+ monitor_restart_service(svc);
}
+
+ /* The process should terminate immediately and then be
+ * restarted by the mt_svc_exit_handler()
+ */
+ return;
}
static void reload_reply(DBusPendingCall *pending, void *data)
@@ -2705,11 +2731,7 @@ static void mt_svc_restart(struct tevent_context *ev,
static void mt_svc_exit_handler(int pid, int wait_status, void *pvt)
{
struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc);
- struct mt_ctx *mt_ctx = svc->mt_ctx;
- time_t now = time(NULL);
- struct tevent_timer *te;
- struct timeval tv;
- int restart_delay;
+
if (WIFEXITED(wait_status)) {
DEBUG(SSSDBG_OP_FAILURE,
@@ -2731,6 +2753,28 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt)
return;
}
+ /* Clear the kill_timer so we don't try to SIGKILL it after it's
+ * already gone.
+ */
+ talloc_zfree(svc->kill_timer);
+
+ /* Check the number of restart tries and relaunch the service */
+ monitor_restart_service(svc);
+
+ return;
+}
+
+static void monitor_restart_service(struct mt_svc *svc)
+{
+ struct mt_ctx *mt_ctx = svc->mt_ctx;
+ int restart_delay;
+ time_t now = time(NULL);
+ struct tevent_timer *te;
+ struct timeval tv;
+
+ /* Handle the actual checks for how many times to restart this
+ * service before giving up.
+ */
if ((now - svc->last_restart) > MONITOR_RESTART_CNT_INTERVAL_RESET) {
svc->restarts = 0;
}
@@ -2739,9 +2783,19 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt)
if (svc->restarts > MONITOR_MAX_SVC_RESTARTS) {
DEBUG(SSSDBG_FATAL_FAILURE,
"Process [%s], definitely stopped!\n", svc->name);
+
+ sss_log(SSS_LOG_ERR,
+ "Exiting the SSSD. Could not restart critical service [%s].",
+ svc->name);
+
talloc_free(svc);
- /* exit with error */
+ /* exit the SSSD with an error, shutting down all
+ * services and domains.
+ * We do this because if one of the responders is down
+ * and can't come back up, this is the only way to
+ * guarantee admin intervention.
+ */
monitor_quit(mt_ctx, 1);
return;
}