summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Teigland <teigland@redhat.com>2009-05-28 16:19:34 -0500
committerDavid Teigland <teigland@redhat.com>2009-05-28 16:19:34 -0500
commitaef54488553beced8e497a39012646ed278aeb37 (patch)
treece75cb1cda08de9a6d21ed6998fd66c15da0184e
parent07755d9dc0c4d4e0ee3d5319cdbf33820709113b (diff)
downloaddct-stuff-aef54488553beced8e497a39012646ed278aeb37.tar.gz
dct-stuff-aef54488553beced8e497a39012646ed278aeb37.tar.xz
dct-stuff-aef54488553beced8e497a39012646ed278aeb37.zip
cpgx: new die option to kill corosync
Test something similar to a node failure, as opposed to just a process failure, by running kill -9 on corosync and then cman_tool join. May want to use iptables instead of killing corosync to better approximate a realistic node failure. Signed-off-by: David Teigland <teigland@redhat.com>
-rw-r--r--cpgx/cpgx.c57
1 files changed, 45 insertions, 12 deletions
diff --git a/cpgx/cpgx.c b/cpgx/cpgx.c
index e083b0d..4ae63aa 100644
--- a/cpgx/cpgx.c
+++ b/cpgx/cpgx.c
@@ -129,6 +129,7 @@ static int prog_quit;
static int cluster_down;
static int opt_leave = 1;
static int opt_fail = 1;
+static int opt_die = 0;
static int got_error = 0;
static int continue_after_error = 0;
static int opt_print_event = 1;
@@ -1274,7 +1275,7 @@ int do_join(void)
{
cpg_error_t error;
cpg_handle_t h;
- int i = 0, fd, ci, rv;
+ int i = 0, fd, ci;
int unused;
uint32_t nodeid;
@@ -1284,20 +1285,18 @@ int do_join(void)
error = cpg_initialize(&h, &cpg_callbacks);
if (error != CPG_OK) {
log_error("cpg_initialize error %d", error);
- rv = -1;
+ log_error("is corosync running?");
goto fail_out;
}
error = cpg_local_get(h, &nodeid);
if (error != CPG_OK) {
log_error("cpg_local_get error %d", error);
- rv = -1;
goto fail_fin;
}
if (nodeid < 1 || nodeid > 255) {
log_error("nodeids must be between 1 and 255");
- rv = -1;
goto fail_fin;
}
our_nodeid = (uint8_t)nodeid;
@@ -1322,7 +1321,6 @@ int do_join(void)
if (error != CPG_OK) {
log_error("cpg_join error %d", error);
cpg_finalize(h);
- rv = -1;
goto fail;
}
@@ -1333,7 +1331,7 @@ int do_join(void)
fail_fin:
cpg_finalize(h);
fail_out:
- return rv;
+ exit(1);
}
int do_leave(void)
@@ -1412,6 +1410,24 @@ int we_should_fail(void)
return 0;
}
+int we_should_die(void)
+{
+ static unsigned int tries;
+ int rv;
+
+ if (!opt_die)
+ return 0;
+
+ tries++;
+
+ rv = rand_int(1, 10000);
+ if (rv == 111) {
+ log_debug("do die %u", tries);
+ return 1;
+ }
+ return 0;
+}
+
#if 0
int we_should_leave(void)
{
@@ -1508,7 +1524,7 @@ void loop(void)
gettimeofday(&now, NULL);
ms = time_diff_ms(&last_dispatch, &now);
- if (ms > 10000)
+ if (ms > 20000)
log_error("no cpg dispatch in %ul ms", ms);
if (got_error) {
@@ -1553,6 +1569,17 @@ void loop(void)
fflush(stderr);
exit(2);
}
+
+ if (we_should_die()) {
+ fflush(stdout);
+ fflush(stderr);
+ log_debug("kill corosync");
+ system("kill -9 `pidof corosync`");
+ sleep(1);
+ log_debug("start corosync");
+ system("cman_tool join -w");
+ exit(2);
+ }
}
out:
return;
@@ -1563,8 +1590,10 @@ void print_usage(void)
printf("Options:\n");
printf(" -H [0|1] event history output [off|on] default 1\n");
printf(" -D [0|1] debug output [off|on] default 1\n");
- printf(" -f [0|1] fail included in test [off|on] default 1\n");
printf(" -l [0|1] leave included in test [off|on] default 1\n");
+ printf(" -f [0|1] fail included in test [off|on] default 1\n");
+ printf(" -d [0|1] die included in test [off|on] default 0\n");
+ printf(" (kills corosync, restarts with cman_tool join)\n");
printf(" -s <num> sync up to num events, default %d\n",
DEFAULT_SYNC_MAX);
printf(" -c continue after error\n");
@@ -1576,7 +1605,7 @@ void print_usage(void)
printf("\n");
printf("Notes:\n");
printf(" - to prevent history from periodically restarting from 0,\n"
- " keep one node from leaving or failing with -f0 -l0\n");
+ " keep one node from leaving/failing/dieing with -f0 -l0 -d0\n");
printf(" - 8 nodes max, nodeids beteen 1 and 255\n");
printf(" - debug dump on error: %s\n", DUMP_WRITE_PATH);
}
@@ -1589,7 +1618,7 @@ int main(int argc, char **argv)
int optchar;
while (cont) {
- optchar = getopt(argc, argv, "H:D:f:l:s:ch");
+ optchar = getopt(argc, argv, "H:D:f:l:d:s:ch");
switch (optchar) {
case 'H':
@@ -1600,12 +1629,16 @@ int main(int argc, char **argv)
opt_print_debug = atoi(optarg);
break;
+ case 'l':
+ opt_leave = atoi(optarg);
+ break;
+
case 'f':
opt_fail = atoi(optarg);
break;
- case 'l':
- opt_leave = atoi(optarg);
+ case 'd':
+ opt_die = atoi(optarg);
break;
case 's':