diff options
author | David Teigland <teigland@redhat.com> | 2009-05-28 16:19:34 -0500 |
---|---|---|
committer | David Teigland <teigland@redhat.com> | 2009-05-28 16:19:34 -0500 |
commit | aef54488553beced8e497a39012646ed278aeb37 (patch) | |
tree | ce75cb1cda08de9a6d21ed6998fd66c15da0184e | |
parent | 07755d9dc0c4d4e0ee3d5319cdbf33820709113b (diff) | |
download | dct-stuff-aef54488553beced8e497a39012646ed278aeb37.tar.gz dct-stuff-aef54488553beced8e497a39012646ed278aeb37.tar.xz dct-stuff-aef54488553beced8e497a39012646ed278aeb37.zip |
cpgx: new die option to kill corosync
Test something similar to a node failure, as opposed to just a process
failure, by running kill -9 on corosync and then rejoining with cman_tool join.
May want to use iptables instead of killing corosync to better approximate
a realistic node failure.
Signed-off-by: David Teigland <teigland@redhat.com>
-rw-r--r-- | cpgx/cpgx.c | 57 |
1 files changed, 45 insertions, 12 deletions
diff --git a/cpgx/cpgx.c b/cpgx/cpgx.c index e083b0d..4ae63aa 100644 --- a/cpgx/cpgx.c +++ b/cpgx/cpgx.c @@ -129,6 +129,7 @@ static int prog_quit; static int cluster_down; static int opt_leave = 1; static int opt_fail = 1; +static int opt_die = 0; static int got_error = 0; static int continue_after_error = 0; static int opt_print_event = 1; @@ -1274,7 +1275,7 @@ int do_join(void) { cpg_error_t error; cpg_handle_t h; - int i = 0, fd, ci, rv; + int i = 0, fd, ci; int unused; uint32_t nodeid; @@ -1284,20 +1285,18 @@ int do_join(void) error = cpg_initialize(&h, &cpg_callbacks); if (error != CPG_OK) { log_error("cpg_initialize error %d", error); - rv = -1; + log_error("is corosync running?"); goto fail_out; } error = cpg_local_get(h, &nodeid); if (error != CPG_OK) { log_error("cpg_local_get error %d", error); - rv = -1; goto fail_fin; } if (nodeid < 1 || nodeid > 255) { log_error("nodeids must be between 1 and 255"); - rv = -1; goto fail_fin; } our_nodeid = (uint8_t)nodeid; @@ -1322,7 +1321,6 @@ int do_join(void) if (error != CPG_OK) { log_error("cpg_join error %d", error); cpg_finalize(h); - rv = -1; goto fail; } @@ -1333,7 +1331,7 @@ int do_join(void) fail_fin: cpg_finalize(h); fail_out: - return rv; + exit(1); } int do_leave(void) @@ -1412,6 +1410,24 @@ int we_should_fail(void) return 0; } +int we_should_die(void) +{ + static unsigned int tries; + int rv; + + if (!opt_die) + return 0; + + tries++; + + rv = rand_int(1, 10000); + if (rv == 111) { + log_debug("do die %u", tries); + return 1; + } + return 0; +} + #if 0 int we_should_leave(void) { @@ -1508,7 +1524,7 @@ void loop(void) gettimeofday(&now, NULL); ms = time_diff_ms(&last_dispatch, &now); - if (ms > 10000) + if (ms > 20000) log_error("no cpg dispatch in %ul ms", ms); if (got_error) { @@ -1553,6 +1569,17 @@ void loop(void) fflush(stderr); exit(2); } + + if (we_should_die()) { + fflush(stdout); + fflush(stderr); + log_debug("kill corosync"); + system("kill -9 `pidof corosync`"); + sleep(1); + log_debug("start 
corosync"); + system("cman_tool join -w"); + exit(2); + } } out: return; @@ -1563,8 +1590,10 @@ void print_usage(void) printf("Options:\n"); printf(" -H [0|1] event history output [off|on] default 1\n"); printf(" -D [0|1] debug output [off|on] default 1\n"); - printf(" -f [0|1] fail included in test [off|on] default 1\n"); printf(" -l [0|1] leave included in test [off|on] default 1\n"); + printf(" -f [0|1] fail included in test [off|on] default 1\n"); + printf(" -d [0|1] die included in test [off|on] default 0\n"); + printf(" (kills corosync, restarts with cman_tool join)\n"); printf(" -s <num> sync up to num events, default %d\n", DEFAULT_SYNC_MAX); printf(" -c continue after error\n"); @@ -1576,7 +1605,7 @@ void print_usage(void) printf("\n"); printf("Notes:\n"); printf(" - to prevent history from periodically restarting from 0,\n" - " keep one node from leaving or failing with -f0 -l0\n"); + " keep one node from leaving/failing/dieing with -f0 -l0 -d0\n"); printf(" - 8 nodes max, nodeids beteen 1 and 255\n"); printf(" - debug dump on error: %s\n", DUMP_WRITE_PATH); } @@ -1589,7 +1618,7 @@ int main(int argc, char **argv) int optchar; while (cont) { - optchar = getopt(argc, argv, "H:D:f:l:s:ch"); + optchar = getopt(argc, argv, "H:D:f:l:d:s:ch"); switch (optchar) { case 'H': @@ -1600,12 +1629,16 @@ int main(int argc, char **argv) opt_print_debug = atoi(optarg); break; + case 'l': + opt_leave = atoi(optarg); + break; + case 'f': opt_fail = atoi(optarg); break; - case 'l': - opt_leave = atoi(optarg); + case 'd': + opt_die = atoi(optarg); break; case 's': |