summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Gallagher <sgallagh@redhat.com>2011-09-14 13:12:41 -0400
committerStephen Gallagher <sgallagh@redhat.com>2011-09-20 11:25:37 -0400
commit0195667b8d2a39040b69d1ca627439ea9317f66a (patch)
tree699a00cad95b19f5be344fbc7db03ae55946fcc7
parent4f1ce08f55806e51a013e5a9824c23c5a65c5f48 (diff)
downloadsssd-0195667b8d2a39040b69d1ca627439ea9317f66a.tar.gz
sssd-0195667b8d2a39040b69d1ca627439ea9317f66a.tar.xz
sssd-0195667b8d2a39040b69d1ca627439ea9317f66a.zip
MONITOR: Correctly detect lack of response from services
We were incorrectly using DBUS_ERROR_TIMEOUT here. The correct behaviour is to check for DBUS_ERROR_NO_REPLY. This way we will properly handle the three-tries in the tasks_check_handler(). Additionally, we weren't properly handling failure counts correctly, meaning we weren't restarting stuck services in a timely manner.
-rw-r--r--src/monitor/monitor.c47
1 files changed, 26 insertions, 21 deletions
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c
index 431b875cc..26e0799f7 100644
--- a/src/monitor/monitor.c
+++ b/src/monitor/monitor.c
@@ -84,7 +84,6 @@ struct mt_svc {
int restarts;
time_t last_restart;
- time_t last_ping;
int failed_pongs;
int debug_level;
@@ -549,22 +548,14 @@ static void tasks_check_handler(struct tevent_context *ev,
break;
}
- if (svc->last_ping != 0) {
- if ((now - svc->last_ping) > (svc->ping_time)) {
- svc->failed_pongs++;
- } else {
- svc->failed_pongs = 0;
- }
- if (svc->failed_pongs > 3) {
- /* too long since we last heard of this process */
- DEBUG(1, ("Killing service [%s], not responding to pings!\n",
- svc->name));
- monitor_kill_service(svc);
- process_alive = false;
- }
+ if (svc->failed_pongs >= 3) {
+ /* too long since we last heard of this process */
+ DEBUG(1,
+ ("Killing service [%s], not responding to pings!\n",
+ svc->name));
+ monitor_kill_service(svc);
+ process_alive = false;
}
-
- svc->last_ping = now;
}
if (!process_alive) {
@@ -2118,7 +2109,7 @@ static int service_send_ping(struct mt_svc *svc)
}
ret = sbus_conn_send(svc->conn, msg,
- svc->mt_ctx->service_id_timeout,
+ svc->ping_time,
ping_check, svc, NULL);
dbus_message_unref(msg);
return ret;
@@ -2129,6 +2120,7 @@ static void ping_check(DBusPendingCall *pending, void *data)
struct mt_svc *svc;
DBusMessage *reply;
const char *dbus_error_name;
+ size_t len;
int type;
svc = talloc_get_type(data, struct mt_svc);
@@ -2161,13 +2153,26 @@ static void ping_check(DBusPendingCall *pending, void *data)
case DBUS_MESSAGE_TYPE_ERROR:
dbus_error_name = dbus_message_get_error_name(reply);
+ if (!dbus_error_name) {
+ dbus_error_name = "<UNKNOWN>";
+ }
+
+ len = strlen(DBUS_ERROR_NO_REPLY);
- /* timeouts are handled in the main service check function */
- if (strcmp(dbus_error_name, DBUS_ERROR_TIMEOUT) == 0)
+ /* Increase failed pong count */
+ if (strnlen(dbus_error_name, len + 1) == len
+ && strncmp(dbus_error_name, DBUS_ERROR_NO_REPLY, len) == 0) {
+ DEBUG(1,
+ ("A service PING timed out on [%s]. "
+ "Attempt [%d]\n",
+ svc->name, svc->failed_pongs));
+ svc->failed_pongs++;
break;
+ }
- DEBUG(0,("A service PING returned an error [%s], closing connection.\n",
- dbus_error_name));
+ DEBUG(0,
+ ("A service PING returned an error [%s], closing connection.\n",
+ dbus_error_name));
/* Falling through to default intentionally*/
default:
/*