summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Dean Jansa <djansa@redhat.com> 2005-09-23 20:55:41 +0000
committer: Nathan Straz <nstraz@redhat.com> 2008-09-23 09:37:45 -0400
commit: d58e13af244d292b4a50cb5176be21c112f8aae0 (patch)
tree: cb33f4cde1487c85c6023bc1ae92055ccfd3c571
parent: 94887f1190ccad5682e7adce7b4e24cd09052736 (diff)
download: qarsh-d58e13af244d292b4a50cb5176be21c112f8aae0.zip
qarsh-d58e13af244d292b4a50cb5176be21c112f8aae0.tar.gz
qarsh-d58e13af244d292b4a50cb5176be21c112f8aae0.tar.xz
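Rework heartbeating to run a small state machine, and change the command-line
option from -m <misscount> to -t <timeout> (seconds the remote host may stay silent; default 120, 0 disables heartbeating). A new environment variable, QARSH_TIMEOUT, can be set to change the default timeout; the command-line -t option overrides this environment variable.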
-rw-r--r-- qarsh.c 107
1 files changed, 77 insertions, 30 deletions
diff --git a/qarsh.c b/qarsh.c
index 5521311..3f7e5a7 100644
--- a/qarsh.c
+++ b/qarsh.c
@@ -20,6 +20,7 @@
#include <netdb.h>
#include <syslog.h>
#include <pwd.h>
+#include <time.h>
#include "sockutil.h"
#include "qarsh_packet.h"
@@ -27,10 +28,18 @@
#define QARSH_MINPORT 5010
+struct qarsh_hbeat_s {
+ int max_timeout;
+ enum {HOST_ALIVE, HOST_QUIET, HOST_TIMEOUT, HOST_REBOOT} rhost_state;
+ unsigned int last_rhost_btime;
+ time_t start_quiet_time;
+};
+
+
/* Globals */
int qarsh_fd = -1; /* The control connection to qarshd */
char *qarshd_host; /* hostname of remote host */
-int qarsh_allowed_hbeatmisses = 100;
+struct qarsh_hbeat_s qarsh_hbeat;
int signal_to_send = 0;
int sigs_to_propogate[] = { SIGINT, SIGTERM, SIGHUP, SIGUSR1, SIGUSR2 };
sigset_t pselect_sigmask;
@@ -42,11 +51,12 @@ usage(const char *pname)
"-l user Run cmdline using this user name.\n"
"-g group Run cmdline using this group name.\n"
"-p port Use this port to contact qarshd.\n"
- "-m misscount Number of missed heartbeats allowed.\n"
- " A value of 0 disables heartbeats.\n"
- " Default is value is <%d>.\n"
+ "-t timeout Number of seconds a remote host can be\n"
+ " silent before we give up and exit\n"
+ " A value of 0 disables heartbeating.\n"
+ " Default is value is 120.\n"
- ,pname, qarsh_allowed_hbeatmisses);
+ ,pname);
return;
}
@@ -81,6 +91,13 @@ sig_handler(int sig)
signal_to_send = sig;
}
+void
+sig_alrm_handler(int sig)
+{
+ fprintf(stderr, "No heartbeat from %s\n", qarshd_host);
+ exit(1);
+}
+
void
setup_signals(void)
{
@@ -125,24 +142,42 @@ set_remote_user(char *user, char *group)
unsigned int
heartbeat(const char *host)
{
- int retry;
unsigned int hbeat;
+ time_t current_time;
/* User disabled heart beating */
- if (qarsh_allowed_hbeatmisses == 0) {
+ if (!qarsh_hbeat.max_timeout) {
return 1;
}
- for (retry = 0; retry < qarsh_allowed_hbeatmisses; retry++) {
- if ((hbeat = btime(host)) == 0) {
- fprintf(stderr, "qarsh: INFO -- missed heartbeat %d\n", retry);
- sleep(retry);
+ hbeat = btime(host);
+ current_time = time(NULL);
+
+ if (hbeat == 0) {
+ qarsh_hbeat.rhost_state = HOST_QUIET;
+ if (qarsh_hbeat.start_quiet_time == 0) {
+ qarsh_hbeat.start_quiet_time = time(NULL);
} else {
- break;
+ if (current_time - qarsh_hbeat.start_quiet_time
+ > qarsh_hbeat.max_timeout) {
+ qarsh_hbeat.rhost_state = HOST_TIMEOUT;
+ return 0;
+ }
+ }
+ } else {
+ if (qarsh_hbeat.last_rhost_btime == 0) {
+ qarsh_hbeat.last_rhost_btime = hbeat;
+ qarsh_hbeat.rhost_state = HOST_ALIVE;
+ qarsh_hbeat.start_quiet_time = 0;
}
+
+ if (abs(hbeat - qarsh_hbeat.last_rhost_btime) > 5) {
+ qarsh_hbeat.rhost_state = HOST_REBOOT;
+ return 0;
+ }
}
-
- return hbeat;
+
+ return 1;
}
int
@@ -160,15 +195,6 @@ run_remote_cmd(char *cmdline)
struct sockaddr_in caddr;
socklen_t clen;
struct timespec timeout;
- unsigned int start_hbeat;
- unsigned int hbeat;
-
- /* Use remote node boot time as hearbeat */
- start_hbeat = heartbeat(qarshd_host);
- if (!start_hbeat) {
- fprintf(stderr, "Can not initialize heartbeat from %s\n", qarshd_host);
- return 1;
- }
l_in = bind_any(QARSH_MINPORT);
p_in = getsockport(l_in);
@@ -257,9 +283,7 @@ run_remote_cmd(char *cmdline)
&pselect_sigmask);
if (nset == 0) {
- hbeat = heartbeat(qarshd_host);
- if ((!hbeat) ||
- (abs(hbeat - start_hbeat)) > 5) {
+ if (!heartbeat(qarshd_host)) {
fprintf(stderr, "No heartbeat from %s\n", qarshd_host);
/* Set our return packet as NULL so we exit
* with unknown error. */
@@ -348,7 +372,6 @@ run_remote_cmd(char *cmdline)
int
main(int argc, char *argv[])
{
-
int c;
int port = 5008;
char *host;
@@ -357,10 +380,23 @@ main(int argc, char *argv[])
char *args;
struct passwd *pw;
int ret;
+ struct sigaction sa;
+ sigset_t sigmask;
+ char *cp;
openlog("qarsh", LOG_PID, LOG_DAEMON);
- while ((c = getopt(argc, argv, "+p:l:g:m:")) != -1) {
+ /* Init our heartbeat info */
+ qarsh_hbeat.max_timeout = 120;
+ qarsh_hbeat.rhost_state = HOST_ALIVE;
+ qarsh_hbeat.last_rhost_btime = 0;
+ qarsh_hbeat.start_quiet_time = 0;
+
+ if ((cp = getenv("QARSH_TIMEOUT")) != NULL) {
+ qarsh_hbeat.max_timeout = atoi(cp);
+ }
+
+ while ((c = getopt(argc, argv, "+p:l:g:t:")) != -1) {
switch (c) {
case 'l':
remuser = strdup(optarg);
@@ -371,8 +407,8 @@ main(int argc, char *argv[])
case 'p':
port = atoi(optarg);
break;
- case 'm':
- qarsh_allowed_hbeatmisses = atoi(optarg);
+ case 't':
+ qarsh_hbeat.max_timeout = atoi(optarg);
break;
case '?':
default:
@@ -416,7 +452,18 @@ main(int argc, char *argv[])
}
qarshd_host = strdup(host);
+
+ memset(&sa, 0, sizeof sa);
+ sigemptyset(&sigmask);
+ sa.sa_mask = sigmask;
+ sa.sa_flags = SA_ONESHOT;
+ sa.sa_handler = sig_alrm_handler;
+ sigaction(SIGALRM, &sa, NULL);
+
+ alarm(qarsh_hbeat.max_timeout);
qarsh_fd = connect_to_host(host, port);
+ alarm(0);
+
if (qarsh_fd == -1) {
if (errno == 0) {
fprintf(stderr, "Could not connect to %s:%d, %d: %s\n",