summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Stephen Gallagher <sgallagh@redhat.com> 2011-09-14 13:12:41 -0400
committer: Stephen Gallagher <sgallagh@redhat.com> 2011-09-15 15:34:11 -0400
commit: bbee583b6600d1263a66c395ac8929374900d7cb (patch)
tree: cba117edbf9664efe44733e1e2e34f38881f0cec
parent: a769f9bb2293cbb972da6a4f5604ed1fb7252e7b (diff)
downloadsssd-bbee583b6600d1263a66c395ac8929374900d7cb.tar.gz
sssd-bbee583b6600d1263a66c395ac8929374900d7cb.tar.xz
sssd-bbee583b6600d1263a66c395ac8929374900d7cb.zip
MONITOR: Correctly detect lack of response from services
We were incorrectly using DBUS_ERROR_TIMEOUT here. The correct behaviour is to check for DBUS_ERROR_NO_REPLY. This way we will properly handle the three tries in tasks_check_handler(). Additionally, we weren't handling failure counts correctly, meaning we weren't restarting stuck services in a timely manner.
-rw-r--r--src/monitor/monitor.c47
1 files changed, 26 insertions, 21 deletions
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c
index 799b7c69a..67dc7751c 100644
--- a/src/monitor/monitor.c
+++ b/src/monitor/monitor.c
@@ -86,7 +86,6 @@ struct mt_svc {
int restarts;
time_t last_restart;
- time_t last_ping;
int failed_pongs;
int debug_level;
@@ -550,22 +549,14 @@ static void tasks_check_handler(struct tevent_context *ev,
break;
}
- if (svc->last_ping != 0) {
- if ((now - svc->last_ping) > (svc->ping_time)) {
- svc->failed_pongs++;
- } else {
- svc->failed_pongs = 0;
- }
- if (svc->failed_pongs > 3) {
- /* too long since we last heard of this process */
- DEBUG(1, ("Killing service [%s], not responding to pings!\n",
- svc->name));
- monitor_kill_service(svc);
- process_alive = false;
- }
+ if (svc->failed_pongs >= 3) {
+ /* too long since we last heard of this process */
+ DEBUG(SSSDBG_CRIT_FAILURE,
+ ("Killing service [%s], not responding to pings!\n",
+ svc->name));
+ monitor_kill_service(svc);
+ process_alive = false;
}
-
- svc->last_ping = now;
}
if (!process_alive) {
@@ -2194,7 +2185,7 @@ static int service_send_ping(struct mt_svc *svc)
}
ret = sbus_conn_send(svc->conn, msg,
- svc->mt_ctx->service_id_timeout,
+ svc->ping_time,
ping_check, svc, NULL);
dbus_message_unref(msg);
return ret;
@@ -2205,6 +2196,7 @@ static void ping_check(DBusPendingCall *pending, void *data)
struct mt_svc *svc;
DBusMessage *reply;
const char *dbus_error_name;
+ size_t len;
int type;
svc = talloc_get_type(data, struct mt_svc);
@@ -2237,13 +2229,26 @@ static void ping_check(DBusPendingCall *pending, void *data)
case DBUS_MESSAGE_TYPE_ERROR:
dbus_error_name = dbus_message_get_error_name(reply);
+ if (!dbus_error_name) {
+ dbus_error_name = "<UNKNOWN>";
+ }
+
+ len = strlen(DBUS_ERROR_NO_REPLY);
- /* timeouts are handled in the main service check function */
- if (strcmp(dbus_error_name, DBUS_ERROR_TIMEOUT) == 0)
+ /* Increase failed pong count */
+ if (strnlen(dbus_error_name, len + 1) == len
+ && strncmp(dbus_error_name, DBUS_ERROR_NO_REPLY, len) == 0) {
+ DEBUG(SSSDBG_CRIT_FAILURE,
+ ("A service PING timed out on [%s]. "
+ "Attempt [%d]\n",
+ svc->name, svc->failed_pongs));
+ svc->failed_pongs++;
break;
+ }
- DEBUG(0,("A service PING returned an error [%s], closing connection.\n",
- dbus_error_name));
+ DEBUG(SSSDBG_FATAL_FAILURE,
+ ("A service PING returned an error [%s], closing connection.\n",
+ dbus_error_name));
/* Falling through to default intentionally*/
default:
/*