summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJakub Hrozek <jhrozek@redhat.com>2012-11-15 19:26:18 +0100
committerJakub Hrozek <jhrozek@redhat.com>2012-11-19 18:04:40 +0100
commit3c922410f0b92a9b8556e28ff5d46ee7a59709c6 (patch)
treed2b9b2c1bfe31ff544ead436a58ff10ac6f56f2e
parent9a0e490c945db007c71ddded49bfe8a408989eab (diff)
downloadsssd-3c922410f0b92a9b8556e28ff5d46ee7a59709c6.tar.gz
sssd-3c922410f0b92a9b8556e28ff5d46ee7a59709c6.tar.xz
sssd-3c922410f0b92a9b8556e28ff5d46ee7a59709c6.zip
Restart services with a delay in case they are restarted too often
If a service is restarted while the DP is not ready yet, it gets restarted again immediately, which means the DP might still not be ready. The allowed number of restarts is then depleted quickly. This patch changes the restart mechanism so that the first restart happens immediately, the second is scheduled after 2 seconds, then 4, etc. https://fedorahosted.org/sssd/ticket/1528
-rw-r--r--src/monitor/monitor.c73
1 files changed, 59 insertions, 14 deletions
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c
index 7f7bbd0fa..8e73ff66d 100644
--- a/src/monitor/monitor.c
+++ b/src/monitor/monitor.c
@@ -64,6 +64,19 @@
* doesn't shutdown on receiving SIGTERM */
#define MONITOR_DEF_FORCE_TIME 60
+/* TODO: get the restart related values from config */
+#define MONITOR_RESTART_CNT_INTERVAL_RESET 30
+/* maximum allowed number of service restarts if the restarts
+ * were less than MONITOR_RESTART_CNT_INTERVAL_RESET apart, which would
+ * indicate a crash after startup or after every request */
+#define MONITOR_MAX_SVC_RESTARTS 2
+/* The services are restarted with a delay in case the restart was
+ * hitting a race condition where the DP is not ready yet either.
+ * The MONITOR_MAX_RESTART_DELAY defines the maximum delay between
+ * restarts.
+ */
+#define MONITOR_MAX_RESTART_DELAY 4
+
/* name of the monitor server instance */
#define MONITOR_NAME "sssd"
#define SSSD_PIDFILE_PATH PID_PATH"/"MONITOR_NAME".pid"
@@ -2507,11 +2520,44 @@ static void service_startup_handler(struct tevent_context *ev,
_exit(1);
}
+static void mt_svc_restart(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *ptr)
+{
+ struct mt_svc *svc;
+
+ svc = talloc_get_type(ptr, struct mt_svc);
+ if (svc == NULL) {
+ return;
+ }
+
+ DEBUG(SSSDBG_TRACE_FUNC, ("Scheduling service %s for restart %d\n",
+ svc->name, svc->restarts+1));
+
+ if (svc->type == MT_SVC_SERVICE) {
+ add_new_service(svc->mt_ctx, svc->name, svc->restarts + 1);
+ } else if (svc->type == MT_SVC_PROVIDER) {
+ add_new_provider(svc->mt_ctx, svc->name, svc->restarts + 1);
+ } else {
+ /* Invalid type? */
+ DEBUG(SSSDBG_CRIT_FAILURE,
+ ("BUG: Invalid child process type [%d]\n", svc->type));
+ }
+
+ /* Free the old service (which will also remove it
+ * from the child list)
+ */
+ talloc_free(svc);
+}
+
static void mt_svc_exit_handler(int pid, int wait_status, void *pvt)
{
struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc);
struct mt_ctx *mt_ctx = svc->mt_ctx;
time_t now = time(NULL);
+ struct tevent_timer *te;
+ struct timeval tv;
+ int restart_delay;
if WIFEXITED(wait_status) {
DEBUG(SSSDBG_OP_FAILURE,
@@ -2532,12 +2578,12 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt)
return;
}
- if ((now - svc->last_restart) > 30) { /* TODO: get val from config */
+ if ((now - svc->last_restart) > MONITOR_RESTART_CNT_INTERVAL_RESET) {
svc->restarts = 0;
}
/* Restart the service */
- if (svc->restarts > 2) { /* TODO: get val from config */
+ if (svc->restarts > MONITOR_MAX_SVC_RESTARTS) {
DEBUG(SSSDBG_FATAL_FAILURE,
("Process [%s], definitely stopped!\n", svc->name));
talloc_free(svc);
@@ -2547,20 +2593,19 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt)
return;
}
- if (svc->type == MT_SVC_SERVICE) {
- add_new_service(svc->mt_ctx, svc->name, svc->restarts + 1);
- } else if (svc->type == MT_SVC_PROVIDER) {
- add_new_provider(svc->mt_ctx, svc->name, svc->restarts + 1);
- } else {
- /* Invalid type? */
- DEBUG(SSSDBG_CRIT_FAILURE,
- ("BUG: Invalid child process type [%d]\n", svc->type));
+ /* restarts are scheduled after 0, 2, 4 seconds */
+ restart_delay = svc->restarts << 1;
+ if (restart_delay > MONITOR_MAX_RESTART_DELAY) {
+ restart_delay = MONITOR_MAX_RESTART_DELAY;
}
- /* Free the old service (which will also remove it
- * from the child list)
- */
- talloc_free(svc);
+ tv = tevent_timeval_current_ofs(restart_delay, 0);
+ te = tevent_add_timer(svc->mt_ctx->ev, svc, tv, mt_svc_restart, svc);
+ if (!te) {
+ /* Nothing much we can do */
+ DEBUG(SSSDBG_CRIT_FAILURE, ("Out of memory?!\n"));
+ return;
+ }
}
int main(int argc, const char *argv[])