From 89530c830ded58c6140cdb34c9de07bf77bb5bc0 Mon Sep 17 00:00:00 2001 From: Jakub Hrozek Date: Mon, 2 Nov 2015 11:41:31 +0100 Subject: SSSD: Add a new option diag_cmd This option is an optional one that is run when a sbus ping times out and before a SIGKILL signal is sent. It is undocumented by default. diag_cmd (string): A command that should be run for diagnostic purpose when an sbus timeout fails. The option value may contain %p which would be expanded for the process ID of the process that timed out Example: pstack %p This setting would print the stackstrace of the service whose ping timed out. Default: not set. Reviewed-by: Petr Cech --- src/monitor/monitor.c | 216 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 194 insertions(+), 22 deletions(-) (limited to 'src/monitor') diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index 89ac882d3..ac3af282d 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -116,6 +116,7 @@ struct mt_svc { char *identity; pid_t pid; + char *diag_cmd; int ping_time; int kill_time; @@ -383,6 +384,176 @@ static int add_svc_conn_spy(struct mt_svc *svc) return EOK; } +static char *expand_diag_cmd(struct mt_svc *svc, + const char *template) +{ + TALLOC_CTX *tmp_ctx = NULL; + char *copy; + char *p_copy; + char *n; + char *result = NULL; + char action; + char *res = NULL; + + if (template == NULL) { + DEBUG(SSSDBG_CRIT_FAILURE, "Missing template.\n"); + return NULL; + } + + tmp_ctx = talloc_new(NULL); + if (!tmp_ctx) return NULL; + + copy = talloc_strdup(tmp_ctx, template); + if (copy == NULL) { + DEBUG(SSSDBG_CRIT_FAILURE, "talloc_strdup failed.\n"); + goto done; + } + + result = talloc_strdup(tmp_ctx, ""); + if (result == NULL) { + DEBUG(SSSDBG_CRIT_FAILURE, "talloc_strdup failed.\n"); + goto done; + } + + p_copy = copy; + while ((n = strchr(p_copy, '%')) != NULL) { + *n = '\0'; + n++; + if ( *n == '\0' ) { + DEBUG(SSSDBG_CRIT_FAILURE, + "format error, single %% at the end of the template.\n"); + goto done; + } + + action = *n; + switch (action) { + case 'p': + result = talloc_asprintf_append(result, "%s%d", p_copy, svc->pid); + break; + default: + DEBUG(SSSDBG_CRIT_FAILURE, + "format error, unknown template [%%%c].\n", *n); + goto done; + } + + if (result == NULL) { + DEBUG(SSSDBG_CRIT_FAILURE, "talloc_asprintf_append failed.\n"); + goto done; + } + + p_copy = n + 1; + } + + result = talloc_asprintf_append(result, "%s", p_copy); + if (result == NULL) { + DEBUG(SSSDBG_CRIT_FAILURE, "talloc_asprintf_append failed.\n"); + goto done; + } + + res = talloc_move(svc, &result); +done: + talloc_zfree(tmp_ctx); + return res; +} + +static void svc_child_info(struct mt_svc *svc, int wait_status) +{ + if (WIFEXITED(wait_status)) { + DEBUG(SSSDBG_OP_FAILURE, + "Child [%d] exited with code [%d]\n", + svc->pid, WEXITSTATUS(wait_status)); + } else if (WIFSIGNALED(wait_status)) { + DEBUG(SSSDBG_OP_FAILURE, + "Child [%d] terminated with signal [%d]\n", + svc->pid, WTERMSIG(wait_status)); + } else { + DEBUG(SSSDBG_FATAL_FAILURE, + "Child [%d] did not exit cleanly\n", svc->pid); + /* Forcibly kill this child, just in case */ + kill(svc->pid, SIGKILL); + + /* Let us get caught by another + * call to the SIGCHLD handler + */ + } +} + +static void svc_diag_cmd_exit_handler(int pid, int wait_status, void *pvt) +{ + struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc); + + svc_child_info(svc, wait_status); +} + +static void svc_run_diag_cmd(struct mt_svc *svc) +{ + pid_t pkc_pid; + char **args; + int ret; + int debug_fd; + char *diag_cmd; + struct sss_child_ctx *diag_child_ctx; + + if (svc->diag_cmd == NULL) { + return; + } + + pkc_pid = fork(); + if (pkc_pid != 0) { + /* parent, schedule SIGKILL */ + + ret = sss_child_register(svc, + svc->mt_ctx->sigchld_ctx, + pkc_pid, + svc_diag_cmd_exit_handler, + svc, + &diag_child_ctx); + if (ret != EOK) { + DEBUG(SSSDBG_CRIT_FAILURE, "Cannot register child %d\n", pkc_pid); + /* Try to go on ... */ + } + + return; + } + + /* child, execute diagnostics */ + diag_cmd = expand_diag_cmd(svc, svc->diag_cmd); + if (diag_cmd == NULL) { + DEBUG(SSSDBG_CRIT_FAILURE, + "Failed to expand [%s]\n", svc->diag_cmd); + _exit(1); + } + + if (debug_level >= SSSDBG_TRACE_LIBS) { + debug_fd = get_fd_from_debug_file(); + ret = dup2(debug_fd, STDERR_FILENO); + if (ret == -1) { + ret = errno; + DEBUG(SSSDBG_MINOR_FAILURE, + "dup2 failed for stderr [%d][%s].\n", ret, sss_strerror(ret)); + /* failure to redirect stderr is not fatal */ + } + + ret = dup2(debug_fd, STDOUT_FILENO); + if (ret == -1) { + ret = errno; + DEBUG(SSSDBG_MINOR_FAILURE, + "dup2 failed for stdout [%d][%s].\n", ret, sss_strerror(ret)); + /* failure to redirect stdout is not fatal */ + } + } + + args = parse_args(diag_cmd); + execvp(args[0], args); + + /* If we are here, exec() has failed + * Print errno and abort quickly */ + ret = errno; + DEBUG(SSSDBG_FATAL_FAILURE, + "Could not exec %s, reason: %s\n", svc->diag_cmd, strerror(ret)); + _exit(1); +} + static int mark_service_as_started(struct mt_svc *svc) { struct mt_ctx *ctx = svc->mt_ctx; @@ -613,8 +784,10 @@ static int monitor_kill_service (struct mt_svc *svc) return EOK; } + svc_run_diag_cmd(svc); + /* Set up a timer to send SIGKILL if this process - * doesn't exit within sixty seconds + * doesn't exit within the configured interval */ tv = tevent_timeval_current_ofs(svc->kill_time, 0); svc->kill_timer = tevent_add_timer(svc->mt_ctx->ev, @@ -628,7 +801,6 @@ static int monitor_kill_service (struct mt_svc *svc) "Failed to allocate timed event: mt_svc_sigkill.\n"); /* We'll just have to hope that the SIGTERM succeeds */ } - return EOK; } @@ -1065,6 +1237,19 @@ static errno_t get_ping_config(struct mt_ctx *ctx, const char *path, "Time between service pings for [%s]: [%d]\n", svc->name, svc->ping_time); + ret = confdb_get_string(ctx->cdb, svc, path, + CONFDB_MONITOR_PRE_KILL_CMD, + NULL, &svc->diag_cmd); + if (ret != EOK) { + DEBUG(SSSDBG_CRIT_FAILURE, + "Failed to get diagnostics command for %s\n", svc->name); + return ret; + } + if (svc->diag_cmd) { + DEBUG(SSSDBG_CONF_SETTINGS, + "Diagnostics command: [%s]\n", svc->diag_cmd); + } + ret = confdb_get_int(ctx->cdb, path, CONFDB_SERVICE_FORCE_TIMEOUT, MONITOR_DEF_FORCE_TIME, &svc->kill_time); @@ -2615,6 +2800,10 @@ static void ping_check(DBusPendingCall *pending, void *data) "Attempt [%d]\n", svc->name, svc->failed_pongs); svc->failed_pongs++; + + if (debug_level & SSSDBG_TRACE_LIBS) { + svc_run_diag_cmd(svc); + } break; } @@ -2765,26 +2954,9 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) { struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc); - - if (WIFEXITED(wait_status)) { - DEBUG(SSSDBG_OP_FAILURE, - "Child [%s] exited with code [%d]\n", - svc->name, WEXITSTATUS(wait_status)); - } else if (WIFSIGNALED(wait_status)) { - DEBUG(SSSDBG_OP_FAILURE, - "Child [%s] terminated with signal [%d]\n", - svc->name, WTERMSIG(wait_status)); - } else { - DEBUG(SSSDBG_FATAL_FAILURE, - "Child [%s] did not exit cleanly\n", svc->name); - /* Forcibly kill this child, just in case */ - kill(svc->pid, SIGKILL); - - /* Return and let us get caught by another - * call to the SIGCHLD handler - */ - return; - } + DEBUG(SSSDBG_TRACE_LIBS, + "SIGCHLD handler of service %s called\n", svc->name); + svc_child_info(svc, wait_status); /* Clear the kill_timer so we don't try to SIGKILL it after it's * already gone. -- cgit