summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOndrej Kos <okos@redhat.com>2013-01-14 13:37:12 +0100
committerJakub Hrozek <jhrozek@redhat.com>2013-02-26 14:14:52 +0100
commit3a3d0b722ea1401497bf834382150077375ff52a (patch)
tree7a0a08cc2a6b2feeb36119eb1be4054a90954547
parentdb9cff72c35757ce1fbe3b528c4fa40325225a60 (diff)
downloadsssd-3a3d0b722ea1401497bf834382150077375ff52a.tar.gz
sssd-3a3d0b722ea1401497bf834382150077375ff52a.tar.xz
sssd-3a3d0b722ea1401497bf834382150077375ff52a.zip
MONITOR: use sigchld handler for monitoring SSSD services
backport of https://fedorahosted.org/sssd/changeset/20e53344fbdfa215ff7633630feb10458a0274b9
-rw-r--r--Makefile.am1
-rw-r--r--src/monitor/monitor.c329
2 files changed, 152 insertions, 178 deletions
diff --git a/Makefile.am b/Makefile.am
index 7623ed166..9bb3f86c6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -395,6 +395,7 @@ sssd_SOURCES = \
src/monitor/monitor.c \
src/monitor/monitor_netlink.c \
src/confdb/confdb_setup.c \
+ src/providers/child_common.c \
$(SSSD_UTIL_OBJ)
sssd_LDADD = \
$(SSSD_LIBS) \
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c
index cfe571281..0db3cea8d 100644
--- a/src/monitor/monitor.c
+++ b/src/monitor/monitor.c
@@ -52,6 +52,7 @@
#include "dbus/dbus.h"
#include "sbus/sssd_dbus.h"
#include "monitor/monitor_interfaces.h"
+#include "providers/child_common.h"
/* ping time cannot be less then once every few seconds or the
* monitor will get crazy hammering children with messages */
@@ -64,9 +65,16 @@
struct svc_spy;
+enum mt_svc_type {
+ MT_SVC_SERVICE,
+ MT_SVC_PROVIDER
+};
+
struct mt_svc {
struct mt_svc *prev;
struct mt_svc *next;
+ enum mt_svc_type type;
+
struct sbus_connection *conn;
struct svc_spy *conn_spy;
@@ -90,6 +98,10 @@ struct mt_svc {
int debug_level;
struct tevent_timer *ping_ev;
+
+ struct sss_child_ctx *child_ctx;
+
+ struct tevent_timer *sigkill_ev;
};
struct config_file_callback {
@@ -125,6 +137,7 @@ struct mt_ctx {
bool check_children;
bool services_started;
struct netlink_ctx *nlctx;
+ struct sss_sigchild_ctx *sigchld_ctx;
};
static int start_service(struct mt_svc *mt_svc);
@@ -135,18 +148,19 @@ static int service_send_ping(struct mt_svc *svc);
static int service_signal_reset_offline(struct mt_svc *svc);
static void ping_check(DBusPendingCall *pending, void *data);
-static int service_check_alive(struct mt_svc *svc);
-
static void set_tasks_checker(struct mt_svc *srv);
-static void set_global_checker(struct mt_ctx *ctx);
static int monitor_kill_service (struct mt_svc *svc);
static int get_service_config(struct mt_ctx *ctx, const char *name,
struct mt_svc **svc_cfg);
static int get_provider_config(struct mt_ctx *ctx, const char *name,
struct mt_svc **svc_cfg);
-static int add_new_service(struct mt_ctx *ctx, const char *name);
-static int add_new_provider(struct mt_ctx *ctx, const char *name);
+static int add_new_service(struct mt_ctx *ctx,
+ const char *name,
+ int restarts);
+static int add_new_provider(struct mt_ctx *ctx,
+ const char *name,
+ int restarts);
static int mark_service_as_started(struct mt_svc *svc);
@@ -391,7 +405,7 @@ static int mark_service_as_started(struct mt_svc *svc)
DEBUG(4, ("Now starting services!\n"));
/* then start all services */
for (i = 0; ctx->services[i]; i++) {
- add_new_service(ctx, ctx->services[i]);
+ add_new_service(ctx, ctx->services[i], 0);
}
}
@@ -418,7 +432,7 @@ static void services_startup_timeout(struct tevent_context *ev,
DEBUG(4, ("Now starting services!\n"));
/* then start all services */
for (i = 0; ctx->services[i]; i++) {
- add_new_service(ctx, ctx->services[i]);
+ add_new_service(ctx, ctx->services[i], 0);
}
}
}
@@ -474,98 +488,35 @@ static int monitor_dbus_init(struct mt_ctx *ctx)
return ret;
}
-static void svc_try_restart(struct mt_svc *svc, time_t now)
-{
- int ret;
-
- DLIST_REMOVE(svc->mt_ctx->svc_list, svc);
- if (svc->last_restart != 0) {
- if ((now - svc->last_restart) > 30) { /* TODO: get val from config */
- /* it was long ago reset restart threshold */
- svc->restarts = 0;
- }
- }
-
- /* restart the process */
- if (svc->restarts > 3) { /* TODO: get val from config */
- DEBUG(0, ("Process [%s], definitely stopped!\n", svc->name));
- talloc_free(svc);
- return;
- }
-
- /* Shut down the current ping timer so it will restart
- * cleanly in start_service()
- */
- talloc_free(svc->ping_ev);
-
- ret = start_service(svc);
- if (ret != EOK) {
- DEBUG(0,("Failed to restart service '%s'\n", svc->name));
- talloc_free(svc);
- return;
- }
-
- svc->restarts++;
- svc->last_restart = now;
- return;
-}
-
static void tasks_check_handler(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr)
{
struct mt_svc *svc = talloc_get_type(ptr, struct mt_svc);
- time_t now = time(NULL);
- bool process_alive = true;
int ret;
- ret = service_check_alive(svc);
+ ret = service_send_ping(svc);
switch (ret) {
case EOK:
/* all fine */
break;
- case ECHILD:
- DEBUG(1,("Process (%s) is stopped!\n", svc->name));
- process_alive = false;
- break;
+ case ENXIO:
+ DEBUG(1,("Child (%s) not responding! (yet)\n", svc->name));
default:
- /* TODO: should we tear down it ? */
- DEBUG(1,("Checking for service %s(%d) failed!!\n",
- svc->name, svc->pid));
+ /* TODO: should we tear it down ? */
+ DEBUG(1,("Sending a message to service (%s) failed!!\n", svc->name));
break;
}
- if (process_alive) {
- ret = service_send_ping(svc);
- switch (ret) {
- case EOK:
- /* all fine */
- break;
-
- case ENXIO:
- DEBUG(1,("Child (%s) not responding! (yet)\n", svc->name));
- break;
+ if (svc->failed_pongs >= 3) {
+ /* too long since we last heard of this process */
+ DEBUG(1, ("Killing service [%s], not responding to pings!\n",
+ svc->name));
- default:
- /* TODO: should we tear it down ? */
- DEBUG(1,("Sending a message to service (%s) failed!!\n", svc->name));
- break;
- }
-
- if (svc->failed_pongs >= 3) {
- /* too long since we last heard of this process */
- DEBUG(1,
- ("Killing service [%s], not responding to pings!\n",
- svc->name));
- monitor_kill_service(svc);
- process_alive = false;
- }
- }
-
- if (!process_alive) {
- svc_try_restart(svc, now);
+ /* Kill the service. The SIGCHLD handler will restart it */
+ monitor_kill_service(svc);
return;
}
@@ -590,75 +541,49 @@ static void set_tasks_checker(struct mt_svc *svc)
svc->ping_ev = te;
}
-static void global_checks_handler(struct tevent_context *ev,
- struct tevent_timer *te,
- struct timeval t, void *ptr)
+static void mt_svc_sigkill(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *ptr);
+static int monitor_kill_service (struct mt_svc *svc)
{
- struct mt_ctx *ctx = talloc_get_type(ptr, struct mt_ctx);
- struct mt_svc *svc;
- int status;
- pid_t pid;
-
- if (!ctx->check_children) {
- goto done;
- }
-
- errno = 0;
- pid = waitpid(0, &status, WNOHANG);
- if (pid == 0) {
- goto done;
- }
+ int ret;
+ struct timeval tv;
- if (pid == -1) {
- DEBUG(0, ("waitpid returned -1 (errno:%d[%s])\n",
- errno, strerror(errno)));
- goto done;
+ ret = kill(svc->pid, SIGTERM);
+ if (ret != EOK) {
+ DEBUG(0, ("Sending signal to child (%s:%d) failed! "
+ "Ignore and pretend child is dead.\n",
+ svc->name, svc->pid));
+ talloc_free(svc);
}
- /* let's see if it is a known service, and try to restart it */
- for (svc = ctx->svc_list; svc; svc = svc->next) {
- if (svc->pid == pid) {
- time_t now = time(NULL);
- DEBUG(1, ("Service [%s] did exit\n", svc->name));
- svc_try_restart(svc, now);
- goto done;
- }
- }
- if (svc == NULL) {
- DEBUG(0, ("Unknown child (%d) did exit\n", pid));
- }
+ /* Set up a timer to send SIGKILL if this process
+ * doesn't exit within sixty seconds
+ */
+ tv = tevent_timeval_current_ofs(60, 0);
+ svc->sigkill_ev = tevent_add_timer(svc->mt_ctx->ev, svc, tv,
+ mt_svc_sigkill, svc);
-done:
- set_global_checker(ctx);
+ return ret;
}
-static void set_global_checker(struct mt_ctx *ctx)
+static void mt_svc_sigkill(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *ptr)
{
- struct tevent_timer *te = NULL;
- struct timeval tv;
+ int ret;
+ struct mt_svc *svc = talloc_get_type(ptr, struct mt_svc);
- gettimeofday(&tv, NULL);
- tv.tv_sec += 1; /* once a second */
- tv.tv_usec = 0;
- te = tevent_add_timer(ctx->ev, ctx, tv, global_checks_handler, ctx);
- if (te == NULL) {
- DEBUG(0, ("failed to add global checker event! PANIC TIME!\n"));
- /* FIXME: is this right ? shoulkd we try to clean up first ?*/
- exit(-1);
- }
-}
+ DEBUG(0, ("[%s][%d] is not responding to SIGTERM. Sending SIGKILL.\n",
+ svc->name, svc->pid));
-static int monitor_kill_service (struct mt_svc *svc)
-{
- int ret;
- ret = kill(svc->pid, SIGTERM);
+ ret = kill(svc->pid, SIGKILL);
if (ret != EOK) {
- DEBUG(0,("Sending signal to child (%s:%d) failed! "
- "Ignore and pretend child is dead.\n",
- svc->name, svc->pid));
+ DEBUG(0, ("Sending signal to child (%s:%d) failed! "
+ "Ignore and pretend child is dead.\n",
+ svc->name, svc->pid));
+ talloc_free(svc);
}
-
- return ret;
}
static void reload_reply(DBusPendingCall *pending, void *data)
@@ -905,6 +830,7 @@ static int get_service_config(struct mt_ctx *ctx, const char *name,
int ret;
char *path;
struct mt_svc *svc;
+ time_t now = time(NULL);
*svc_cfg = NULL;
@@ -913,6 +839,7 @@ static int get_service_config(struct mt_ctx *ctx, const char *name,
return ENOMEM;
}
svc->mt_ctx = ctx;
+ svc->type = MT_SVC_SERVICE;
talloc_set_destructor((TALLOC_CTX *)svc, svc_destructor);
@@ -971,13 +898,17 @@ static int get_service_config(struct mt_ctx *ctx, const char *name,
svc->ping_time = MONITOR_DEF_PING_TIME;
}
+ svc->last_restart = now;
+
*svc_cfg = svc;
talloc_free(path);
return EOK;
}
-static int add_new_service(struct mt_ctx *ctx, const char *name)
+static int add_new_service(struct mt_ctx *ctx,
+ const char *name,
+ int restarts)
{
int ret;
struct mt_svc *svc;
@@ -987,6 +918,8 @@ static int add_new_service(struct mt_ctx *ctx, const char *name)
return ret;
}
+ svc->restarts = restarts;
+
ret = start_service(svc);
if (ret != EOK) {
DEBUG(0,("Failed to start service '%s'\n", svc->name));
@@ -1002,6 +935,7 @@ static int get_provider_config(struct mt_ctx *ctx, const char *name,
int ret;
char *path;
struct mt_svc *svc;
+ time_t now = time(NULL);
*svc_cfg = NULL;
@@ -1010,6 +944,7 @@ static int get_provider_config(struct mt_ctx *ctx, const char *name,
return ENOMEM;
}
svc->mt_ctx = ctx;
+ svc->type = MT_SVC_PROVIDER;
talloc_set_destructor((TALLOC_CTX *)svc, svc_destructor);
@@ -1084,12 +1019,16 @@ static int get_provider_config(struct mt_ctx *ctx, const char *name,
return ENOMEM;
}
}
+ svc->last_restart = now;
+
*svc_cfg = svc;
return EOK;
}
-static int add_new_provider(struct mt_ctx *ctx, const char *name)
+static int add_new_provider(struct mt_ctx *ctx,
+ const char *name,
+ int restarts)
{
int ret;
struct mt_svc *svc;
@@ -1100,6 +1039,7 @@ static int add_new_provider(struct mt_ctx *ctx, const char *name)
name));
return ret;
}
+ svc->restarts = restarts;
if (strcasecmp(svc->provider, "local") == 0) {
/* The LOCAL provider requires no back-end currently
@@ -1940,6 +1880,10 @@ int monitor_process_init(struct mt_ctx *ctx,
return EIO;
}
+ /* Set up the SIGCHLD handler */
+ ret = sss_sigchld_init(ctx, ctx->ev, &ctx->sigchld_ctx);
+ if (ret != EOK) return ret;
+
#if 0
This feature is incomplete and can leave the SSSD in a bad state if the
config file is changed while the SSSD is running.
@@ -1991,7 +1935,7 @@ int monitor_process_init(struct mt_ctx *ctx,
/* start providers */
num_providers = 0;
for (dom = ctx->domains; dom; dom = dom->next) {
- ret = add_new_provider(ctx, dom->name);
+ ret = add_new_provider(ctx, dom->name, 0);
if (ret != EOK && ret != ENOENT) {
return ret;
}
@@ -2017,13 +1961,10 @@ int monitor_process_init(struct mt_ctx *ctx,
/* No providers start services immediately
* Normally this means only LOCAL is configured */
for (i = 0; ctx->services[i]; i++) {
- add_new_service(ctx, ctx->services[i]);
+ add_new_service(ctx, ctx->services[i], 0);
}
}
- /* now start checking for global events */
- set_global_checker(ctx);
-
return EOK;
}
@@ -2204,39 +2145,6 @@ done:
dbus_message_unref(reply);
}
-
-
-/* service_check_alive
- * This function checks if the service child is still alive
- */
-static int service_check_alive(struct mt_svc *svc)
-{
- int status;
- pid_t pid;
-
- DEBUG(4,("Checking service %s(%d) is still alive\n", svc->name, svc->pid));
-
- pid = waitpid(svc->pid, &status, WNOHANG);
- if (pid == 0) {
- return EOK;
- }
-
- if (pid != svc->pid) {
- DEBUG(1, ("bad return (%d) from waitpid() waiting for %d\n",
- pid, svc->pid));
- /* TODO: what do we do now ? */
- return EINVAL;
- }
-
- if (WIFEXITED(status)) { /* children exited on it's own */
- /* TODO: check configuration to see if it was removed
- * from the list of process to run */
- DEBUG(0,("Process [%s] exited\n", svc->name));
- }
-
- return ECHILD;
-}
-
static void service_startup_handler(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr);
@@ -2270,10 +2178,12 @@ static int start_service(struct mt_svc *svc)
return EOK;
}
+static void mt_svc_exit_handler(int pid, int wait_status, void *pvt);
static void service_startup_handler(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr)
{
+ errno_t ret;
struct mt_svc *mt_svc;
char **args;
@@ -2292,6 +2202,21 @@ static void service_startup_handler(struct tevent_context *ev,
/* Parent */
mt_svc->mt_ctx->check_children = true;
mt_svc->failed_pongs = 0;
+
+ /* Handle process exit */
+ ret = sss_child_register(mt_svc,
+ mt_svc->mt_ctx->sigchld_ctx,
+ mt_svc->pid,
+ mt_svc_exit_handler,
+ mt_svc,
+ &mt_svc->child_ctx);
+ if (ret != EOK) {
+ DEBUG(0, ("Could not register sigchld handler.\n"));
+ /* Should we exit here? For now, we'll hope this
+ * child never dies, because we can't restart it.
+ */
+ }
+
DLIST_ADD(mt_svc->mt_ctx->svc_list, mt_svc);
set_tasks_checker(mt_svc);
@@ -2313,6 +2238,54 @@ static void service_startup_handler(struct tevent_context *ev,
_exit(1);
}
+static void mt_svc_exit_handler(int pid, int wait_status, void *pvt)
+{
+ struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc);
+ time_t now = time(NULL);
+
+ if WIFEXITED(wait_status) {
+ DEBUG(2, ("Child [%s] exited with code [%d]\n",
+ svc->name, WEXITSTATUS(wait_status)));
+ } else if WIFSIGNALED(wait_status) {
+ DEBUG(2, ("Child [%s] terminated with signal [%d]\n",
+ svc->name, WTERMSIG(wait_status)));
+ } else {
+ DEBUG(0, ("Child [%s] did not exit cleanly\n", svc->name));
+ /* Forcibly kill this child, just in case */
+ kill(svc->pid, SIGKILL);
+
+ /* Return and let us get caught by another
+ * call to the SIGCHLD handler
+ */
+ return;
+ }
+
+ if ((now - svc->last_restart) > 30) { /* TODO: get val from config */
+ svc->restarts = 0;
+ }
+
+ /* Restart the service */
+ if (svc->restarts > 2) { /* TODO: get val from config */
+ DEBUG(0, ("Process [%s], definitely stopped!\n", svc->name));
+ talloc_free(svc);
+ return;
+ }
+
+ if (svc->type == MT_SVC_SERVICE) {
+ add_new_service(svc->mt_ctx, svc->name, svc->restarts + 1);
+ } else if (svc->type == MT_SVC_PROVIDER) {
+ add_new_provider(svc->mt_ctx, svc->name, svc->restarts + 1);
+ } else {
+ /* Invalid type? */
+ DEBUG(1, ("BUG: Invalid child process type [%d]\n", svc->type));
+ }
+
+ /* Free the old service (which will also remove it
+ * from the child list)
+ */
+ talloc_free(svc);
+}
+
int main(int argc, const char *argv[])
{
int opt;