summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJakub Hrozek <jhrozek@redhat.com>2015-11-02 11:41:31 +0100
committerJakub Hrozek <jhrozek@redhat.com>2015-11-13 10:55:50 +0100
commit89530c830ded58c6140cdb34c9de07bf77bb5bc0 (patch)
tree3377c5ab067444cd127ea145b4753612ff6cae12
parent499b60f44ecf7124e1906157bd4fca141f48e8d9 (diff)
downloadsssd-89530c830ded58c6140cdb34c9de07bf77bb5bc0.tar.gz
sssd-89530c830ded58c6140cdb34c9de07bf77bb5bc0.tar.xz
sssd-89530c830ded58c6140cdb34c9de07bf77bb5bc0.zip
SSSD: Add a new option diag_cmd
This optional setting specifies a command that is run when an sbus ping times out, before a SIGKILL signal is sent. It is undocumented by default. diag_cmd (string): A command that should be run for diagnostic purposes when an sbus ping times out. The option value may contain %p, which is expanded to the process ID of the process that timed out. Example: pstack %p (this setting would print the stack trace of the service whose ping timed out). Default: not set. Reviewed-by: Petr Cech <pcech@redhat.com>
-rw-r--r--src/confdb/confdb.h1
-rw-r--r--src/config/SSSDConfig/__init__.py.in1
-rwxr-xr-xsrc/config/SSSDConfigTest.py1
-rw-r--r--src/config/etc/sssd.api.conf1
-rw-r--r--src/monitor/monitor.c216
5 files changed, 198 insertions, 22 deletions
diff --git a/src/confdb/confdb.h b/src/confdb/confdb.h
index 37b5fd7c7..0ef7268f9 100644
--- a/src/confdb/confdb.h
+++ b/src/confdb/confdb.h
@@ -71,6 +71,7 @@
#define CONFDB_MONITOR_DEFAULT_DOMAIN "default_domain_suffix"
#define CONFDB_MONITOR_OVERRIDE_SPACE "override_space"
#define CONFDB_MONITOR_USER_RUNAS "user"
+#define CONFDB_MONITOR_PRE_KILL_CMD "diag_cmd"
/* Both monitor and domains */
#define CONFDB_NAME_REGEX "re_expression"
diff --git a/src/config/SSSDConfig/__init__.py.in b/src/config/SSSDConfig/__init__.py.in
index bf61c4027..60129e6e7 100644
--- a/src/config/SSSDConfig/__init__.py.in
+++ b/src/config/SSSDConfig/__init__.py.in
@@ -50,6 +50,7 @@ option_strings = {
'reconnection_retries' : _('Number of times to attempt connection to Data Providers'),
'fd_limit' : _('The number of file descriptors that may be opened by this responder'),
'client_idle_timeout' : _('Idle time before automatic disconnection of a client'),
+ 'diag_cmd' : _('The command to run when a service ping times out'),
# [sssd]
'services' : _('SSSD Services to start'),
diff --git a/src/config/SSSDConfigTest.py b/src/config/SSSDConfigTest.py
index 45562214d..abd4a3925 100755
--- a/src/config/SSSDConfigTest.py
+++ b/src/config/SSSDConfigTest.py
@@ -307,6 +307,7 @@ class SSSDConfigTestSSSDService(unittest.TestCase):
'reconnection_retries',
'fd_limit',
'client_idle_timeout',
+ 'diag_cmd',
'description']
self.assertTrue(type(options) == dict,
diff --git a/src/config/etc/sssd.api.conf b/src/config/etc/sssd.api.conf
index 72abb8b3f..0c03625bd 100644
--- a/src/config/etc/sssd.api.conf
+++ b/src/config/etc/sssd.api.conf
@@ -13,6 +13,7 @@ fd_limit = int, None, false
client_idle_timeout = int, None, false
force_timeout = int, None, false
description = str, None, false
+diag_cmd = str, None, false
[sssd]
# Monitor service
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c
index 89ac882d3..ac3af282d 100644
--- a/src/monitor/monitor.c
+++ b/src/monitor/monitor.c
@@ -116,6 +116,7 @@ struct mt_svc {
char *identity;
pid_t pid;
+ char *diag_cmd;
int ping_time;
int kill_time;
@@ -383,6 +384,176 @@ static int add_svc_conn_spy(struct mt_svc *svc)
return EOK;
}
+/*
+ * Build the diagnostics command line for a service from a template.
+ *
+ * The template is copied verbatim except for "%p", which is replaced by
+ * svc->pid. Any other character following a '%' (including a '%' that is
+ * the last character of the template) is treated as a format error.
+ *
+ * svc       service whose pid is substituted; also becomes the talloc
+ *           parent of the returned string
+ * template  command template; NULL is rejected
+ *
+ * Returns the expanded string on success, or NULL on a missing template,
+ * a format error or an allocation failure.
+ */
+static char *expand_diag_cmd(struct mt_svc *svc,
+                             const char *template)
+{
+    TALLOC_CTX *tmp_ctx;
+    char *work;
+    char *segment;
+    char *percent;
+    char *expanded;
+    char *ret_cmd = NULL;
+
+    if (template == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "Missing template.\n");
+        return NULL;
+    }
+
+    tmp_ctx = talloc_new(NULL);
+    if (tmp_ctx == NULL) {
+        return NULL;
+    }
+
+    /* Writable copy: every '%' found is overwritten with a terminator so
+     * the text in front of it can be appended as a plain C string. */
+    work = talloc_strdup(tmp_ctx, template);
+    if (work == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "talloc_strdup failed.\n");
+        goto done;
+    }
+
+    expanded = talloc_strdup(tmp_ctx, "");
+    if (expanded == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "talloc_strdup failed.\n");
+        goto done;
+    }
+
+    /* Each pass consumes the literal text up to the next '%' plus the
+     * two-character directive; scanning resumes right after it. */
+    for (segment = work;
+         (percent = strchr(segment, '%')) != NULL;
+         segment = percent + 2) {
+        char spec = percent[1];
+
+        *percent = '\0';
+
+        if (spec == '\0') {
+            DEBUG(SSSDBG_CRIT_FAILURE,
+                  "format error, single %% at the end of the template.\n");
+            goto done;
+        }
+
+        if (spec != 'p') {
+            DEBUG(SSSDBG_CRIT_FAILURE,
+                  "format error, unknown template [%%%c].\n", spec);
+            goto done;
+        }
+
+        expanded = talloc_asprintf_append(expanded, "%s%d",
+                                          segment, svc->pid);
+        if (expanded == NULL) {
+            DEBUG(SSSDBG_CRIT_FAILURE, "talloc_asprintf_append failed.\n");
+            goto done;
+        }
+    }
+
+    /* Whatever follows the last directive is copied unchanged. */
+    expanded = talloc_asprintf_append(expanded, "%s", segment);
+    if (expanded == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "talloc_asprintf_append failed.\n");
+        goto done;
+    }
+
+    /* Re-parent the result onto svc so it survives freeing tmp_ctx. */
+    ret_cmd = talloc_move(svc, &expanded);
+
+done:
+    talloc_zfree(tmp_ctx);
+    return ret_cmd;
+}
+
+/*
+ * Log how a child process ended, based on its wait status.
+ *
+ * svc          service whose pid is reported in the log messages
+ * wait_status  status value to decode with the WIF* macros
+ *
+ * A normal exit or a termination by signal is only logged. For any other
+ * status, svc->pid is sent SIGKILL.
+ */
+static void svc_child_info(struct mt_svc *svc, int wait_status)
+{
+    if (WIFEXITED(wait_status)) {
+        DEBUG(SSSDBG_OP_FAILURE,
+              "Child [%d] exited with code [%d]\n",
+              svc->pid, WEXITSTATUS(wait_status));
+        return;
+    }
+
+    if (WIFSIGNALED(wait_status)) {
+        DEBUG(SSSDBG_OP_FAILURE,
+              "Child [%d] terminated with signal [%d]\n",
+              svc->pid, WTERMSIG(wait_status));
+        return;
+    }
+
+    /* Neither a regular exit nor a fatal signal. */
+    DEBUG(SSSDBG_FATAL_FAILURE,
+          "Child [%d] did not exit cleanly\n", svc->pid);
+
+    /* Forcibly kill this child, just in case; the resulting termination
+     * is expected to be picked up by a later SIGCHLD handler call. */
+    kill(svc->pid, SIGKILL);
+}
+
+/*
+ * Child-exit callback for the diagnostics command started by
+ * svc_run_diag_cmd().
+ *
+ * pid          PID of the child this callback is invoked for
+ *              NOTE(review): assumed to be the PID that was passed to
+ *              sss_child_register(), i.e. the diagnostics child - confirm
+ * wait_status  status value to decode with the WIF* macros
+ * pvt          the struct mt_svc the diagnostics command was run for
+ *
+ * All reporting (and the last-resort SIGKILL) is done against pid.
+ * svc->pid identifies the monitored service, not the diagnostics child,
+ * so it must not be logged or signalled from here.
+ */
+static void svc_diag_cmd_exit_handler(int pid, int wait_status, void *pvt)
+{
+    struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc);
+    const char *svc_name = "unknown";
+
+    /* talloc_get_type() yields NULL on a type mismatch; stay usable. */
+    if (svc != NULL && svc->name != NULL) {
+        svc_name = svc->name;
+    }
+
+    if (WIFEXITED(wait_status)) {
+        DEBUG(SSSDBG_OP_FAILURE,
+              "Diagnostics command [%d] for service [%s] "
+              "exited with code [%d]\n",
+              pid, svc_name, WEXITSTATUS(wait_status));
+    } else if (WIFSIGNALED(wait_status)) {
+        DEBUG(SSSDBG_OP_FAILURE,
+              "Diagnostics command [%d] for service [%s] "
+              "terminated with signal [%d]\n",
+              pid, svc_name, WTERMSIG(wait_status));
+    } else {
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Diagnostics command [%d] for service [%s] "
+              "did not exit cleanly\n",
+              pid, svc_name);
+
+        /* Kill only the diagnostics child. Non-positive PIDs address
+         * process groups or every process, so never pass them on. */
+        if (pid > 0) {
+            kill(pid, SIGKILL);
+        }
+    }
+}
+
+/*
+ * Fork a child that executes the diagnostics command configured for svc.
+ *
+ * Does nothing when svc->diag_cmd is not set. In the parent, the new
+ * child is registered with the monitor's SIGCHLD context so that
+ * svc_diag_cmd_exit_handler() is called for it; the parent then returns
+ * without waiting. The child expands the command template, optionally
+ * redirects stdout/stderr to the debug file, and calls execvp(). The
+ * child never returns from this function: it either execs or _exit(1)s.
+ *
+ * svc  service the diagnostics are run for
+ */
+static void svc_run_diag_cmd(struct mt_svc *svc)
+{
+    pid_t pkc_pid;
+    char **args;
+    int ret;
+    int debug_fd;
+    char *diag_cmd;
+    struct sss_child_ctx *diag_child_ctx;
+
+    if (svc->diag_cmd == NULL) {
+        return;
+    }
+
+    pkc_pid = fork();
+    if (pkc_pid == -1) {
+        /* No child was created; there is nothing to register or run. */
+        ret = errno;
+        DEBUG(SSSDBG_CRIT_FAILURE,
+              "fork failed [%d][%s], cannot run [%s]\n",
+              ret, sss_strerror(ret), svc->diag_cmd);
+        return;
+    }
+
+    if (pkc_pid != 0) {
+        /* parent, register a handler that reports how the child ended */
+        ret = sss_child_register(svc,
+                                 svc->mt_ctx->sigchld_ctx,
+                                 pkc_pid,
+                                 svc_diag_cmd_exit_handler,
+                                 svc,
+                                 &diag_child_ctx);
+        if (ret != EOK) {
+            DEBUG(SSSDBG_CRIT_FAILURE, "Cannot register child %d\n", pkc_pid);
+            /* Try to go on ... */
+        }
+
+        return;
+    }
+
+    /* child, execute diagnostics */
+    diag_cmd = expand_diag_cmd(svc, svc->diag_cmd);
+    if (diag_cmd == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE,
+              "Failed to expand [%s]\n", svc->diag_cmd);
+        _exit(1);
+    }
+
+    if (debug_level >= SSSDBG_TRACE_LIBS) {
+        /* Send the command's output to the debug file. */
+        debug_fd = get_fd_from_debug_file();
+        ret = dup2(debug_fd, STDERR_FILENO);
+        if (ret == -1) {
+            ret = errno;
+            DEBUG(SSSDBG_MINOR_FAILURE,
+                  "dup2 failed for stderr [%d][%s].\n", ret, sss_strerror(ret));
+            /* failure to redirect stderr is not fatal */
+        }
+
+        ret = dup2(debug_fd, STDOUT_FILENO);
+        if (ret == -1) {
+            ret = errno;
+            DEBUG(SSSDBG_MINOR_FAILURE,
+                  "dup2 failed for stdout [%d][%s].\n", ret, sss_strerror(ret));
+            /* failure to redirect stdout is not fatal */
+        }
+    }
+
+    /* NOTE(review): parse_args() is presumed to return NULL on failure
+     * and a NULL-terminated vector otherwise - confirm. Either way,
+     * args[0] must exist before it is handed to execvp(). */
+    args = parse_args(diag_cmd);
+    if (args == NULL || args[0] == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE,
+              "Failed to parse [%s] into arguments\n", diag_cmd);
+        _exit(1);
+    }
+
+    execvp(args[0], args);
+
+    /* If we are here, exec() has failed
+     * Print errno and abort quickly */
+    ret = errno;
+    DEBUG(SSSDBG_FATAL_FAILURE,
+          "Could not exec %s, reason: %s\n", svc->diag_cmd, strerror(ret));
+    _exit(1);
+}
+
static int mark_service_as_started(struct mt_svc *svc)
{
struct mt_ctx *ctx = svc->mt_ctx;
@@ -613,8 +784,10 @@ static int monitor_kill_service (struct mt_svc *svc)
return EOK;
}
+ svc_run_diag_cmd(svc);
+
/* Set up a timer to send SIGKILL if this process
- * doesn't exit within sixty seconds
+ * doesn't exit within the configured interval
*/
tv = tevent_timeval_current_ofs(svc->kill_time, 0);
svc->kill_timer = tevent_add_timer(svc->mt_ctx->ev,
@@ -628,7 +801,6 @@ static int monitor_kill_service (struct mt_svc *svc)
"Failed to allocate timed event: mt_svc_sigkill.\n");
/* We'll just have to hope that the SIGTERM succeeds */
}
-
return EOK;
}
@@ -1065,6 +1237,19 @@ static errno_t get_ping_config(struct mt_ctx *ctx, const char *path,
"Time between service pings for [%s]: [%d]\n",
svc->name, svc->ping_time);
+ ret = confdb_get_string(ctx->cdb, svc, path,
+ CONFDB_MONITOR_PRE_KILL_CMD,
+ NULL, &svc->diag_cmd);
+ if (ret != EOK) {
+ DEBUG(SSSDBG_CRIT_FAILURE,
+ "Failed to get diagnostics command for %s\n", svc->name);
+ return ret;
+ }
+ if (svc->diag_cmd) {
+ DEBUG(SSSDBG_CONF_SETTINGS,
+ "Diagnostics command: [%s]\n", svc->diag_cmd);
+ }
+
ret = confdb_get_int(ctx->cdb, path,
CONFDB_SERVICE_FORCE_TIMEOUT,
MONITOR_DEF_FORCE_TIME, &svc->kill_time);
@@ -2615,6 +2800,10 @@ static void ping_check(DBusPendingCall *pending, void *data)
"Attempt [%d]\n",
svc->name, svc->failed_pongs);
svc->failed_pongs++;
+
+ if (debug_level & SSSDBG_TRACE_LIBS) {
+ svc_run_diag_cmd(svc);
+ }
break;
}
@@ -2765,26 +2954,9 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt)
{
struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc);
-
- if (WIFEXITED(wait_status)) {
- DEBUG(SSSDBG_OP_FAILURE,
- "Child [%s] exited with code [%d]\n",
- svc->name, WEXITSTATUS(wait_status));
- } else if (WIFSIGNALED(wait_status)) {
- DEBUG(SSSDBG_OP_FAILURE,
- "Child [%s] terminated with signal [%d]\n",
- svc->name, WTERMSIG(wait_status));
- } else {
- DEBUG(SSSDBG_FATAL_FAILURE,
- "Child [%s] did not exit cleanly\n", svc->name);
- /* Forcibly kill this child, just in case */
- kill(svc->pid, SIGKILL);
-
- /* Return and let us get caught by another
- * call to the SIGCHLD handler
- */
- return;
- }
+ DEBUG(SSSDBG_TRACE_LIBS,
+ "SIGCHLD handler of service %s called\n", svc->name);
+ svc_child_info(svc, wait_status);
/* Clear the kill_timer so we don't try to SIGKILL it after it's
* already gone.