/*
SSSD
Service monitor
Copyright (C) Simo Sorce 2008
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/param.h>
#include <time.h>
#include <string.h>
#include "config.h"
#ifdef HAVE_SYS_INOTIFY_H
#include <sys/inotify.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
/* Needed for res_init() */
#include <netinet/in.h>
#include <arpa/nameser.h>
#include <resolv.h>
#include "util/util.h"
#include "popt.h"
#include "tevent.h"
#include "confdb/confdb.h"
#include "confdb/confdb_setup.h"
#include "collection.h"
#include "ini_config.h"
#include "db/sysdb.h"
#include "monitor/monitor.h"
#include "dbus/dbus.h"
#include "sbus/sssd_dbus.h"
#include "monitor/monitor_interfaces.h"
#include "util/sssd-i18n.h"
/* ping time cannot be less then once every few seconds or the
* monitor will get crazy hammering children with messages */
#define MONITOR_DEF_PING_TIME 10
struct svc_spy;
struct mt_svc {
struct mt_svc *prev;
struct mt_svc *next;
struct sbus_connection *conn;
struct svc_spy *conn_spy;
struct mt_ctx *mt_ctx;
char *provider;
char *command;
char *name;
char *identity;
pid_t pid;
int ping_time;
bool svc_started;
int restarts;
time_t last_restart;
time_t last_ping;
int failed_pongs;
int debug_level;
struct tevent_timer *ping_ev;
};
struct config_file_callback {
int wd;
int retries;
monitor_reconf_fn fn;
char *filename;
time_t modified;
struct config_file_callback *next;
struct config_file_callback *prev;
};
struct config_file_ctx {
TALLOC_CTX *parent_ctx;
struct tevent_timer *timer;
bool needs_update;
struct mt_ctx *mt_ctx;
struct config_file_callback *callbacks;
};
struct mt_ctx {
struct tevent_context *ev;
struct confdb_ctx *cdb;
TALLOC_CTX *domain_ctx; /* Memory context for domain list */
struct sss_domain_info *domains;
TALLOC_CTX *service_ctx; /* Memory context for services */
char **services;
struct mt_svc *svc_list;
struct sbus_connection *sbus_srv;
struct config_file_ctx *file_ctx;
int inotify_fd;
int service_id_timeout;
bool check_children;
bool services_started;
};
static int start_service(struct mt_svc *mt_svc);
static int monitor_service_init(struct sbus_connection *conn, void *data);
static int service_send_ping(struct mt_svc *svc);
static void ping_check(DBusPendingCall *pending, void *data);
static int service_check_alive(struct mt_svc *svc);
static void set_tasks_checker(struct mt_svc *srv);
static void set_global_checker(struct mt_ctx *ctx);
static int monitor_kill_service (struct mt_svc *svc);
static int get_service_config(struct mt_ctx *ctx, const char *name,
struct mt_svc **svc_cfg);
static int get_provider_config(struct mt_ctx *ctx, const char *name,
struct mt_svc **svc_cfg);
static int add_new_service(struct mt_ctx *ctx, const char *name);
static int add_new_provider(struct mt_ctx *ctx, const char *name);
static int mark_service_as_started(struct mt_svc *svc);
static int monitor_signal_reconf(struct config_file_ctx *file_ctx,
const char *filename);
static int update_monitor_config(struct mt_ctx *ctx);
static int monitor_cleanup(void);
/* dbus_get_monitor_version
* Return the monitor version over D-BUS */
static int get_monitor_version(DBusMessage *message,
struct sbus_connection *conn)
{
dbus_uint16_t version = MONITOR_VERSION;
DBusMessage *reply;
dbus_bool_t ret;
reply = dbus_message_new_method_return(message);
if (!reply) return ENOMEM;
ret = dbus_message_append_args(reply,
DBUS_TYPE_UINT16, &version,
DBUS_TYPE_INVALID);
if (!ret) {
dbus_message_unref(reply);
return EIO;
}
/* send reply back */
sbus_conn_send_reply(conn, reply);
dbus_message_unref(reply);
return EOK;
}
struct mon_init_conn {
struct mt_ctx *ctx;
struct sbus_connection *conn;
struct tevent_timer *timeout;
};
static int add_svc_conn_spy(struct mt_svc *svc);
/* registers a new client.
* if operation is successful also sends back the Monitor version */
static int client_registration(DBusMessage *message,
struct sbus_connection *conn)
{
dbus_uint16_t version = MONITOR_VERSION;
struct mon_init_conn *mini;
struct mt_svc *svc;
void *data;
DBusMessage *reply;
DBusError dbus_error;
dbus_uint16_t svc_ver;
char *svc_name;
dbus_bool_t dbret;
int ret;
data = sbus_conn_get_private_data(conn);
mini = talloc_get_type(data, struct mon_init_conn);
if (!mini) {
DEBUG(0, ("Connection holds no valid init data\n"));
return EINVAL;
}
/* First thing, cancel the timeout */
talloc_zfree(mini->timeout);
dbus_error_init(&dbus_error);
dbret = dbus_message_get_args(message, &dbus_error,
DBUS_TYPE_STRING, &svc_name,
DBUS_TYPE_UINT16, &svc_ver,
DBUS_TYPE_INVALID);
if (!dbret) {
DEBUG(1, ("Failed to parse message, killing connection\n"));
if (dbus_error_is_set(&dbus_error)) dbus_error_free(&dbus_error);
sbus_disconnect(conn);
/* FIXME: should we just talloc_zfree(conn) ? */
goto done;
}
DEBUG(4, ("Received ID registration: (%s,%d)\n", svc_name, svc_ver));
/* search this service in the list */
svc = mini->ctx->svc_list;
while (svc) {
ret = strcasecmp(svc->identity, svc_name);
if (ret == 0) {
break;
}
svc = svc->next;
}
if (!svc) {
DEBUG(0, ("Unable to find peer [%s] in list of services,"
" killing connection!\n", svc_name));
sbus_disconnect(conn);
/* FIXME: should we just talloc_zfree(conn) ? */
goto done;
}
/* Fill in svc structure with connection data */
svc->conn = mini->conn;
ret = mark_service_as_started(svc);
if (ret) {
DEBUG(1, ("Failed to mark service [%s]!\n", svc_name));
goto done;
}
/* reply that all is ok */
reply = dbus_message_new_method_return(message);
if (!reply) return ENOMEM;
dbret = dbus_message_append_args(reply,
DBUS_TYPE_UINT16, &version,
DBUS_TYPE_INVALID);
if (!dbret) {
dbus_message_unref(reply);
return EIO;
}
/* send reply back */
sbus_conn_send_reply(conn, reply);
dbus_message_unref(reply);
done:
/* init complete, get rid of temp init context */
talloc_zfree(mini);
return EOK;
}
struct svc_spy {
struct mt_svc *svc;
};
static int svc_destructor(void *mem)
{
struct mt_svc *svc = talloc_get_type(mem, struct mt_svc);
if (!svc) {
/* ?!?!? */
return 0;
}
/* always try to delist service */
DLIST_REMOVE(svc->mt_ctx->svc_list, svc);
/* svc is beeing freed, neutralize the spy */
if (svc->conn_spy) {
talloc_set_destructor((TALLOC_CTX *)svc->conn_spy, NULL);
talloc_zfree(svc->conn_spy);
}
return 0;
}
static int svc_spy_destructor(void *mem)
{
struct svc_spy *spy = talloc_get_type(mem, struct svc_spy);
if (!spy) {
/* ?!?!? */
return 0;
}
/* svc->conn has been freed, NULL the pointer in svc */
spy->svc->conn = NULL;
return 0;
}
static int add_svc_conn_spy(struct mt_svc *svc)
{
struct svc_spy *spy;
spy = talloc(svc->conn, struct svc_spy);
if (!spy) return ENOMEM;
spy->svc = svc;
talloc_set_destructor((TALLOC_CTX *)spy, svc_spy_destructor);
svc->conn_spy = spy;
return EOK;
}
static int mark_service_as_started(struct mt_svc *svc)
{
struct mt_ctx *ctx = svc->mt_ctx;
struct mt_svc *iter;
int ret;
int i;
DEBUG(5, ("Marking %s as started.\n", svc->name));
svc->svc_started = true;
/* we need to attach a spy to the connection structure so that if some code
* frees it we can zero it out in the service structure. Otherwise we may
* try to access or even free, freed memory. */
ret = add_svc_conn_spy(svc);
if (ret) {
DEBUG(0, ("Failed to attch spy\n"));
goto done;
}
if (!ctx->services_started) {
/* check if all providers are up */
for (iter = ctx->svc_list; iter; iter = iter->next) {
if (iter->provider && !iter->svc_started) {
DEBUG(5, ("Still waiting on %s provider.", iter->name));
break;
}
}
if (iter) {
/* there are still unstarted providers */
goto done;
}
ctx->services_started = true;
DEBUG(4, ("Now starting services!\n"));
/* then start all services */
for (i = 0; ctx->services[i]; i++) {
add_new_service(ctx, ctx->services[i]);
}
}
done:
return ret;
}
static void services_startup_timeout(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr)
{
struct mt_ctx *ctx = talloc_get_type(ptr, struct mt_ctx);
int i;
DEBUG(6, ("Handling timeout\n"));
if (!ctx->services_started) {
DEBUG(1, ("Providers did not start in time, "
"forcing services startup!\n"));
ctx->services_started = true;
DEBUG(4, ("Now starting services!\n"));
/* then start all services */
for (i = 0; ctx->services[i]; i++) {
add_new_service(ctx, ctx->services[i]);
}
}
}
static int add_services_startup_timeout(struct mt_ctx *ctx)
{
struct tevent_timer *to;
struct timeval tv;
/* 5 seconds should be plenty */
tv = tevent_timeval_current_ofs(5, 0);
to = tevent_add_timer(ctx->ev, ctx, tv, services_startup_timeout, ctx);
if (!to) {
DEBUG(0,("Out of memory?!\n"));
return ENOMEM;
}
return EOK;
}
struct sbus_method monitor_methods[] = {
{ MON_SRV_METHOD_VERSION, get_monitor_version },
{ MON_SRV_METHOD_REGISTER, client_registration },
{ NULL, NULL }
};
struct sbus_interface monitor_server_interface = {
MON_SRV_INTERFACE,
MON_SRV_PATH,
SBUS_DEFAULT_VTABLE,
monitor_methods,
NULL
};
/* monitor_dbus_init
* Set up the monitor service as a D-BUS Server */
static int monitor_dbus_init(struct mt_ctx *ctx)
{
char *monitor_address;
int ret;
ret = monitor_get_sbus_address(ctx, &monitor_address);
if (ret != EOK) {
return ret;
}
ret = sbus_new_server(ctx, ctx->ev,
monitor_address, &monitor_server_interface,
&ctx->sbus_srv, monitor_service_init, ctx);
talloc_free(monitor_address);
return ret;
}
static void svc_try_restart(struct mt_svc *svc, time_t now)
{
int ret;
DLIST_REMOVE(svc->mt_ctx->svc_list, svc);
if (svc->last_restart != 0) {
if ((now - svc->last_restart) > 30) { /* TODO: get val from config */
/* it was long ago reset restart threshold */
svc->restarts = 0;
}
}
/* restart the process */
if (svc->restarts > 3) { /* TODO: get val from config */
DEBUG(0, ("Process [%s], definitely stopped!\n", svc->name));
talloc_free(svc);
return;
}
/* Shut down the current ping timer so it will restart
* cleanly in start_service()
*/
talloc_free(svc->ping_ev);
ret = start_service(svc);
if (ret != EOK) {
DEBUG(0,("Failed to restart service '%s'\n", svc->name));
talloc_free(svc);
return;
}
svc->restarts++;
svc->last_restart = now;
return;
}
static void tasks_check_handler(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr)
{
struct mt_svc *svc = talloc_get_type(ptr, struct mt_svc);
time_t now = time(NULL);
bool process_alive = true;
int ret;
ret = service_check_alive(svc);
switch (ret) {
case EOK:
/* all fine */
break;
case ECHILD:
DEBUG(1,("Process (%s) is stopped!\n", svc->name));
process_alive = false;
break;
default:
/* TODO: should we tear down it ? */
DEBUG(1,("Checking for service %s(%d) failed!!\n",
svc->name, svc->pid));
break;
}
if (process_alive) {
ret = service_send_ping(svc);
switch (ret) {
case EOK:
/* all fine */
break;
case ENXIO:
DEBUG(1,("Child (%s) not responding! (yet)\n", svc->name));
break;
default:
/* TODO: should we tear it down ? */
DEBUG(1,("Sending a message to service (%s) failed!!\n", svc->name));
break;
}
if (svc->last_ping != 0) {
if ((now - svc->last_ping) > (svc->ping_time)) {
svc->failed_pongs++;
} else {
svc->failed_pongs = 0;
}
if (svc->failed_pongs > 3) {
/* too long since we last heard of this process */
DEBUG(1, ("Killing service [%s], not responding to pings!\n",
svc->name));
monitor_kill_service(svc);
process_alive = false;
}
}
svc->last_ping = now;
}
if (!process_alive) {
svc_try_restart(svc, now);
return;
}
/* all fine, set up the task checker again */
set_tasks_checker(svc);
}
static void set_tasks_checker(struct mt_svc *svc)
{
struct tevent_timer *te = NULL;
struct timeval tv;
gettimeofday(&tv, NULL);
tv.tv_sec += svc->ping_time;
tv.tv_usec = 0;
te = tevent_add_timer(svc->mt_ctx->ev, svc, tv, tasks_check_handler, svc);
if (te == NULL) {
DEBUG(0, ("failed to add event, monitor offline for [%s]!\n",
svc->name));
/* FIXME: shutdown ? */
}
svc->ping_ev = te;
}
static void global_checks_handler(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr)
{
struct mt_ctx *ctx = talloc_get_type(ptr, struct mt_ctx);
struct mt_svc *svc;
int status;
pid_t pid;
if (!ctx->check_children) {
goto done;
}
errno = 0;
pid = waitpid(0, &status, WNOHANG);
if (pid == 0) {
goto done;
}
if (pid == -1) {
DEBUG(0, ("waitpid returned -1 (errno:%d[%s])\n",
errno, strerror(errno)));
goto done;
}
/* let's see if it is a known service, and try to restart it */
for (svc = ctx->svc_list; svc; svc = svc->next) {
if (svc->pid == pid) {
time_t now = time(NULL);
DEBUG(1, ("Service [%s] did exit\n", svc->name));
svc_try_restart(svc, now);
goto done;
}
}
if (svc == NULL) {
DEBUG(0, ("Unknown child (%d) did exit\n", pid));
}
done:
set_global_checker(ctx);
}
static void set_global_checker(struct mt_ctx *ctx)
{
struct tevent_timer *te = NULL;
struct timeval tv;
gettimeofday(&tv, NULL);
tv.tv_sec += 1; /* once a second */
tv.tv_usec = 0;
te = tevent_add_timer(ctx->ev, ctx, tv, global_checks_handler, ctx);
if (te == NULL) {
DEBUG(0, ("failed to add global checker event! PANIC TIME!\n"));
/* FIXME: is this right ? shoulkd we try to clean up first ?*/
exit(-1);
}
}
static int monitor_kill_service (struct mt_svc *svc)
{
int ret;
ret = kill(svc->pid, SIGTERM);
if (ret != EOK) {
DEBUG(0,("Sending signal to child (%s:%d) failed! "
"Ignore and pretend child is dead.\n",
svc->name, svc->pid));
}
return ret;
}
static void shutdown_reply(DBusPendingCall *pending, void *data)
{
DBusMessage *reply;
int type;
|