diff options
author | Stephen Gallagher <sgallagh@redhat.com> | 2011-09-14 13:12:41 -0400 |
---|---|---|
committer | Stephen Gallagher <sgallagh@redhat.com> | 2011-10-26 10:29:39 -0400 |
commit | 2bfdeeb723d990404f89b6d7e9cb23db4a967158 (patch) | |
tree | 018a0a874a938afdb18c7deea602b76c20150a6a /src/monitor/monitor.c | |
parent | fb54ed5823a4481318d680fcc0da57fb854c241f (diff) | |
download | sssd-2bfdeeb723d990404f89b6d7e9cb23db4a967158.tar.gz sssd-2bfdeeb723d990404f89b6d7e9cb23db4a967158.tar.xz sssd-2bfdeeb723d990404f89b6d7e9cb23db4a967158.zip |
MONITOR: Correctly detect lack of response from services
We were incorrectly using DBUS_ERROR_TIMEOUT here. The correct
behaviour is to check for DBUS_ERROR_NO_REPLY. This way we will
properly handle the three-tries in the tasks_check_handler().
Additionally, we weren't properly handling failure counts
correctly, meaning we weren't restarting stuck services in a
timely manner.
Diffstat (limited to 'src/monitor/monitor.c')
-rw-r--r-- | src/monitor/monitor.c | 47 |
1 files changed, 26 insertions, 21 deletions
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index 431b875cc..26e0799f7 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -84,7 +84,6 @@ struct mt_svc { int restarts; time_t last_restart; - time_t last_ping; int failed_pongs; int debug_level; @@ -549,22 +548,14 @@ static void tasks_check_handler(struct tevent_context *ev, break; } - if (svc->last_ping != 0) { - if ((now - svc->last_ping) > (svc->ping_time)) { - svc->failed_pongs++; - } else { - svc->failed_pongs = 0; - } - if (svc->failed_pongs > 3) { - /* too long since we last heard of this process */ - DEBUG(1, ("Killing service [%s], not responding to pings!\n", - svc->name)); - monitor_kill_service(svc); - process_alive = false; - } + if (svc->failed_pongs >= 3) { + /* too long since we last heard of this process */ + DEBUG(1, + ("Killing service [%s], not responding to pings!\n", + svc->name)); + monitor_kill_service(svc); + process_alive = false; } - - svc->last_ping = now; } if (!process_alive) { @@ -2118,7 +2109,7 @@ static int service_send_ping(struct mt_svc *svc) } ret = sbus_conn_send(svc->conn, msg, - svc->mt_ctx->service_id_timeout, + svc->ping_time, ping_check, svc, NULL); dbus_message_unref(msg); return ret; @@ -2129,6 +2120,7 @@ static void ping_check(DBusPendingCall *pending, void *data) struct mt_svc *svc; DBusMessage *reply; const char *dbus_error_name; + size_t len; int type; svc = talloc_get_type(data, struct mt_svc); @@ -2161,13 +2153,26 @@ static void ping_check(DBusPendingCall *pending, void *data) case DBUS_MESSAGE_TYPE_ERROR: dbus_error_name = dbus_message_get_error_name(reply); + if (!dbus_error_name) { + dbus_error_name = "<UNKNOWN>"; + } + + len = strlen(DBUS_ERROR_NO_REPLY); - /* timeouts are handled in the main service check function */ - if (strcmp(dbus_error_name, DBUS_ERROR_TIMEOUT) == 0) + /* Increase failed pong count */ + if (strnlen(dbus_error_name, len + 1) == len + && strncmp(dbus_error_name, DBUS_ERROR_NO_REPLY, len) == 0) { + DEBUG(1, + ("A service PING timed out on [%s]. " + "Attempt [%d]\n", + svc->name, svc->failed_pongs)); + svc->failed_pongs++; break; + } - DEBUG(0,("A service PING returned an error [%s], closing connection.\n", - dbus_error_name)); + DEBUG(0, + ("A service PING returned an error [%s], closing connection.\n", + dbus_error_name)); /* Falling through to default intentionally*/ default: /* |