From 0195667b8d2a39040b69d1ca627439ea9317f66a Mon Sep 17 00:00:00 2001 From: Stephen Gallagher Date: Wed, 14 Sep 2011 13:12:41 -0400 Subject: MONITOR: Correctly detect lack of response from services We were incorrectly using DBUS_ERROR_TIMEOUT here. The correct behaviour is to check for DBUS_ERROR_NO_REPLY. This way we will properly handle the three-tries in the tasks_check_handler(). Additionally, we weren't properly handling failure counts correctly, meaning we weren't restarting stuck services in a timely manner. --- src/monitor/monitor.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index 431b875cc..26e0799f7 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -84,7 +84,6 @@ struct mt_svc { int restarts; time_t last_restart; - time_t last_ping; int failed_pongs; int debug_level; @@ -549,22 +548,14 @@ static void tasks_check_handler(struct tevent_context *ev, break; } - if (svc->last_ping != 0) { - if ((now - svc->last_ping) > (svc->ping_time)) { - svc->failed_pongs++; - } else { - svc->failed_pongs = 0; - } - if (svc->failed_pongs > 3) { - /* too long since we last heard of this process */ - DEBUG(1, ("Killing service [%s], not responding to pings!\n", - svc->name)); - monitor_kill_service(svc); - process_alive = false; - } + if (svc->failed_pongs >= 3) { + /* too long since we last heard of this process */ + DEBUG(1, + ("Killing service [%s], not responding to pings!\n", + svc->name)); + monitor_kill_service(svc); + process_alive = false; } - - svc->last_ping = now; } if (!process_alive) { @@ -2118,7 +2109,7 @@ static int service_send_ping(struct mt_svc *svc) } ret = sbus_conn_send(svc->conn, msg, - svc->mt_ctx->service_id_timeout, + svc->ping_time, ping_check, svc, NULL); dbus_message_unref(msg); return ret; @@ -2129,6 +2120,7 @@ static void ping_check(DBusPendingCall *pending, void *data) struct mt_svc *svc; DBusMessage *reply; const char *dbus_error_name; + size_t len; int type; svc = talloc_get_type(data, struct mt_svc); @@ -2161,13 +2153,26 @@ static void ping_check(DBusPendingCall *pending, void *data) case DBUS_MESSAGE_TYPE_ERROR: dbus_error_name = dbus_message_get_error_name(reply); + if (!dbus_error_name) { + dbus_error_name = ""; + } + + len = strlen(DBUS_ERROR_NO_REPLY); - /* timeouts are handled in the main service check function */ - if (strcmp(dbus_error_name, DBUS_ERROR_TIMEOUT) == 0) + /* Increase failed pong count */ + if (strnlen(dbus_error_name, len + 1) == len + && strncmp(dbus_error_name, DBUS_ERROR_NO_REPLY, len) == 0) { + DEBUG(1, + ("A service PING timed out on [%s]. " + "Attempt [%d]\n", + svc->name, svc->failed_pongs)); + svc->failed_pongs++; break; + } - DEBUG(0,("A service PING returned an error [%s], closing connection.\n", - dbus_error_name)); + DEBUG(0, + ("A service PING returned an error [%s], closing connection.\n", + dbus_error_name)); /* Falling through to default intentionally*/ default: /* -- cgit