diff options
author | Dean Jansa <djansa@redhat.com> | 2005-09-23 20:55:41 +0000 |
---|---|---|
committer | Nathan Straz <nstraz@redhat.com> | 2008-09-23 09:37:45 -0400 |
commit | d58e13af244d292b4a50cb5176be21c112f8aae0 (patch) | |
tree | cb33f4cde1487c85c6023bc1ae92055ccfd3c571 /qarsh.c | |
parent | 94887f1190ccad5682e7adce7b4e24cd09052736 (diff) | |
download | qarsh-d58e13af244d292b4a50cb5176be21c112f8aae0.tar.gz qarsh-d58e13af244d292b4a50cb5176be21c112f8aae0.tar.xz qarsh-d58e13af244d292b4a50cb5176be21c112f8aae0.zip |
Rework heartbeating to run a small state machine, and replace the
-m misscount command-line option with -t timeout (in seconds).
A new environment variable, QARSH_TIMEOUT, can be set to change the default timeout.
The command-line -t option overrides this environment variable.
Diffstat (limited to 'qarsh.c')
-rw-r--r-- | qarsh.c | 107 |
1 file changed, 77 insertions, 30 deletions
@@ -20,6 +20,7 @@ #include <netdb.h> #include <syslog.h> #include <pwd.h> +#include <time.h> #include "sockutil.h" #include "qarsh_packet.h" @@ -27,10 +28,18 @@ #define QARSH_MINPORT 5010 +struct qarsh_hbeat_s { + int max_timeout; + enum {HOST_ALIVE, HOST_QUIET, HOST_TIMEOUT, HOST_REBOOT} rhost_state; + unsigned int last_rhost_btime; + time_t start_quiet_time; +}; + + /* Globals */ int qarsh_fd = -1; /* The control connection to qarshd */ char *qarshd_host; /* hostname of remote host */ -int qarsh_allowed_hbeatmisses = 100; +struct qarsh_hbeat_s qarsh_hbeat; int signal_to_send = 0; int sigs_to_propogate[] = { SIGINT, SIGTERM, SIGHUP, SIGUSR1, SIGUSR2 }; sigset_t pselect_sigmask; @@ -42,11 +51,12 @@ usage(const char *pname) "-l user Run cmdline using this user name.\n" "-g group Run cmdline using this group name.\n" "-p port Use this port to contact qarshd.\n" - "-m misscount Number of missed heartbeats allowed.\n" - " A value of 0 disables heartbeats.\n" - " Default is value is <%d>.\n" + "-t timeout Number of seconds a remote host can be\n" + " silent before we give up and exit\n" + " A value of 0 disables heartbeating.\n" + " Default is value is 120.\n" - ,pname, qarsh_allowed_hbeatmisses); + ,pname); return; } @@ -81,6 +91,13 @@ sig_handler(int sig) signal_to_send = sig; } +void +sig_alrm_handler(int sig) +{ + fprintf(stderr, "No heartbeat from %s\n", qarshd_host); + exit(1); +} + void setup_signals(void) { @@ -125,24 +142,42 @@ set_remote_user(char *user, char *group) unsigned int heartbeat(const char *host) { - int retry; unsigned int hbeat; + time_t current_time; /* User disabled heart beating */ - if (qarsh_allowed_hbeatmisses == 0) { + if (!qarsh_hbeat.max_timeout) { return 1; } - for (retry = 0; retry < qarsh_allowed_hbeatmisses; retry++) { - if ((hbeat = btime(host)) == 0) { - fprintf(stderr, "qarsh: INFO -- missed heartbeat %d\n", retry); - sleep(retry); + hbeat = btime(host); + current_time = time(NULL); + + if (hbeat == 0) { + qarsh_hbeat.rhost_state = 
HOST_QUIET; + if (qarsh_hbeat.start_quiet_time == 0) { + qarsh_hbeat.start_quiet_time = time(NULL); } else { - break; + if (current_time - qarsh_hbeat.start_quiet_time + > qarsh_hbeat.max_timeout) { + qarsh_hbeat.rhost_state = HOST_TIMEOUT; + return 0; + } + } + } else { + if (qarsh_hbeat.last_rhost_btime == 0) { + qarsh_hbeat.last_rhost_btime = hbeat; + qarsh_hbeat.rhost_state = HOST_ALIVE; + qarsh_hbeat.start_quiet_time = 0; } + + if (abs(hbeat - qarsh_hbeat.last_rhost_btime) > 5) { + qarsh_hbeat.rhost_state = HOST_REBOOT; + return 0; + } } - - return hbeat; + + return 1; } int @@ -160,15 +195,6 @@ run_remote_cmd(char *cmdline) struct sockaddr_in caddr; socklen_t clen; struct timespec timeout; - unsigned int start_hbeat; - unsigned int hbeat; - - /* Use remote node boot time as hearbeat */ - start_hbeat = heartbeat(qarshd_host); - if (!start_hbeat) { - fprintf(stderr, "Can not initialize heartbeat from %s\n", qarshd_host); - return 1; - } l_in = bind_any(QARSH_MINPORT); p_in = getsockport(l_in); @@ -257,9 +283,7 @@ run_remote_cmd(char *cmdline) &pselect_sigmask); if (nset == 0) { - hbeat = heartbeat(qarshd_host); - if ((!hbeat) || - (abs(hbeat - start_hbeat)) > 5) { + if (!heartbeat(qarshd_host)) { fprintf(stderr, "No heartbeat from %s\n", qarshd_host); /* Set our return packet as NULL so we exit * with unknown error. 
*/ @@ -348,7 +372,6 @@ run_remote_cmd(char *cmdline) int main(int argc, char *argv[]) { - int c; int port = 5008; char *host; @@ -357,10 +380,23 @@ main(int argc, char *argv[]) char *args; struct passwd *pw; int ret; + struct sigaction sa; + sigset_t sigmask; + char *cp; openlog("qarsh", LOG_PID, LOG_DAEMON); - while ((c = getopt(argc, argv, "+p:l:g:m:")) != -1) { + /* Init our heartbeat info */ + qarsh_hbeat.max_timeout = 120; + qarsh_hbeat.rhost_state = HOST_ALIVE; + qarsh_hbeat.last_rhost_btime = 0; + qarsh_hbeat.start_quiet_time = 0; + + if ((cp = getenv("QARSH_TIMEOUT")) != NULL) { + qarsh_hbeat.max_timeout = atoi(cp); + } + + while ((c = getopt(argc, argv, "+p:l:g:t:")) != -1) { switch (c) { case 'l': remuser = strdup(optarg); @@ -371,8 +407,8 @@ main(int argc, char *argv[]) case 'p': port = atoi(optarg); break; - case 'm': - qarsh_allowed_hbeatmisses = atoi(optarg); + case 't': + qarsh_hbeat.max_timeout = atoi(optarg); break; case '?': default: @@ -416,7 +452,18 @@ main(int argc, char *argv[]) } qarshd_host = strdup(host); + + memset(&sa, 0, sizeof sa); + sigemptyset(&sigmask); + sa.sa_mask = sigmask; + sa.sa_flags = SA_ONESHOT; + sa.sa_handler = sig_alrm_handler; + sigaction(SIGALRM, &sa, NULL); + + alarm(qarsh_hbeat.max_timeout); qarsh_fd = connect_to_host(host, port); + alarm(0); + if (qarsh_fd == -1) { if (errno == 0) { fprintf(stderr, "Could not connect to %s:%d, %d: %s\n", |