From 6806d6836053c6117a30ab8ccd171f879e73efd0 Mon Sep 17 00:00:00 2001 From: Simo Sorce Date: Thu, 12 Feb 2009 15:54:47 -0500 Subject: Add a separate global checker that does not depend on individual services ping time. --- server/monitor/monitor.c | 119 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 91 insertions(+), 28 deletions(-) (limited to 'server/monitor') diff --git a/server/monitor/monitor.c b/server/monitor/monitor.c index f441403d1..87f5ca730 100644 --- a/server/monitor/monitor.c +++ b/server/monitor/monitor.c @@ -83,6 +83,7 @@ static void ping_check(DBusPendingCall *pending, void *data); static int service_check_alive(struct mt_svc *svc); static void set_tasks_checker(struct mt_svc *srv); +static void set_global_checker(struct mt_ctx *ctx); /* dbus_get_monitor_version * Return the monitor version over D-BUS */ @@ -164,6 +165,37 @@ static int monitor_dbus_init(struct mt_ctx *ctx) return ret; } +static void svc_try_restart(struct mt_svc *svc, time_t now) +{ + int ret; + + DLIST_REMOVE(svc->mt_ctx->svc_list, svc); + if (svc->last_restart != 0) { + if ((now - svc->last_restart) > 30) { /* TODO: get val from config */ + /* it was long ago reset restart threshold */ + svc->restarts = 0; + } + } + + /* restart the process */ + if (svc->restarts > 3) { /* TODO: get val from config */ + DEBUG(0, ("Process [%s], definitely stopped!\n", svc->name)); + talloc_free(svc); + return; + } + + ret = start_service(svc); + if (ret != EOK) { + DEBUG(0,("Failed to restart service '%s'\n", svc->name)); + talloc_free(svc); + return; + } + + svc->restarts++; + svc->last_restart = now; + return; +} + static void tasks_check_handler(struct event_context *ev, struct timed_event *te, struct timeval t, void *ptr) @@ -224,30 +256,7 @@ static void tasks_check_handler(struct event_context *ev, } if (!process_alive) { - DLIST_REMOVE(svc->mt_ctx->svc_list, svc); - if (svc->last_restart != 0) { - if ((now - svc->last_restart) > 30) { /* TODO: get val from config */ - /* it 
was long ago reset restart threshold */ - svc->restarts = 0; - } - } - - /* restart the process */ - if (svc->restarts > 3) { /* TODO: get val from config */ - DEBUG(0, ("Process [%s], definitely stopped!\n", svc->name)); - talloc_free(svc); - return; - } - - ret = start_service(svc); - if (ret != EOK) { - DEBUG(0,("Failed to restart service '%s'\n", svc->name)); - talloc_free(svc); - return; - } - - svc->restarts++; - svc->last_restart = now; + svc_try_restart(svc, now); return; } @@ -271,6 +280,59 @@ static void set_tasks_checker(struct mt_svc *svc) } } +static void global_checks_handler(struct event_context *ev, + struct timed_event *te, + struct timeval t, void *ptr) +{ + struct mt_ctx *ctx = talloc_get_type(ptr, struct mt_ctx); + struct mt_svc *svc; + int status; + pid_t pid; + + errno = 0; + pid = waitpid(0, &status, WNOHANG); + if (pid == 0) { + goto done; + } + + if (pid == -1) { + DEBUG(0, ("waitpid returned -1 (errno:%d[%s])\n", + errno, strerror(errno))); + goto done; + } + + /* let's see if it is a known service, and try to restart it */ + for (svc = ctx->svc_list; svc; svc = svc->next) { + if (svc->pid == pid) { + time_t now = time(NULL); + DEBUG(1, ("Service [%s] did exit\n", svc->name)); + svc_try_restart(svc, now); + } + } + if (svc == NULL) { + DEBUG(0, ("Unknown child (%d) did exit\n", pid)); + } + +done: + set_global_checker(ctx); +} + +static void set_global_checker(struct mt_ctx *ctx) +{ + struct timed_event *te = NULL; + struct timeval tv; + + gettimeofday(&tv, NULL); + tv.tv_sec += 1; /* once a second */ + tv.tv_usec = 0; + te = event_add_timed(ctx->ev, ctx, tv, global_checks_handler, ctx); + if (te == NULL) { + DEBUG(0, ("failed to add global checker event! PANIC TIME!\n")); + /* FIXME: is this right ? 
should we try to clean up first ?*/ + exit(-1); + } +} + int get_monitor_config(struct mt_ctx *ctx) { int ret; @@ -424,12 +486,11 @@ int monitor_process_init(TALLOC_CTX *mem_ctx, talloc_free(svc); continue; } - - DLIST_ADD(ctx->svc_list, svc); - - set_tasks_checker(svc); } + /* now start checking for global events */ + set_global_checker(ctx); + return EOK; } @@ -744,6 +805,8 @@ done: dbus_message_unref(reply); } + + /* service_check_alive * This function checks if the service child is still alive */ -- cgit