diff options
author | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2011-07-29 09:04:01 +1000 |
---|---|---|
committer | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2011-07-29 09:04:01 +1000 |
commit | a17ae8a8bed982b9ecd064ccca0be88a61d2ec85 (patch) | |
tree | e87663bc138c73e51acf0835bae13c364aae384b /ctdb | |
parent | a5cd8a3270d1b614157115aa71fd224abcab6be7 (diff) | |
parent | 5ac67504caace31d9d529792ace03fbe133a0d59 (diff) | |
download | samba-a17ae8a8bed982b9ecd064ccca0be88a61d2ec85.tar.gz samba-a17ae8a8bed982b9ecd064ccca0be88a61d2ec85.tar.xz samba-a17ae8a8bed982b9ecd064ccca0be88a61d2ec85.zip |
Merge branch 'master' of 10.1.1.27:/shared/ctdb/ctdb-master
(This used to be ctdb commit 518945e59e2e48f07fcc0955f3aa81cd0d946aea)
Diffstat (limited to 'ctdb')
-rwxr-xr-x | ctdb/Makefile.in | 7 | ||||
-rw-r--r-- | ctdb/include/ctdb_private.h | 34 | ||||
-rw-r--r-- | ctdb/server/ctdb_takeover.c | 648 | ||||
-rw-r--r-- | ctdb/server/ctdb_tunables.c | 1 | ||||
-rw-r--r-- | ctdb/tests/src/ctdb_takeover_tests.c | 378 |
5 files changed, 955 insertions, 113 deletions
diff --git a/ctdb/Makefile.in b/ctdb/Makefile.in index 8fb9ea73cb..d53d3db5e0 100755 --- a/ctdb/Makefile.in +++ b/ctdb/Makefile.in @@ -70,6 +70,7 @@ TEST_BINS=tests/bin/ctdb_bench tests/bin/ctdb_fetch tests/bin/ctdb_fetch_one \ tests/bin/ctdb_fetch_lock_once tests/bin/ctdb_store \ tests/bin/ctdb_randrec tests/bin/ctdb_persistent \ tests/bin/ctdb_traverse tests/bin/rb_test tests/bin/ctdb_transaction \ + tests/bin/ctdb_takeover_tests @INFINIBAND_BINS@ BINS = bin/ctdb @CTDB_SCSI_IO@ bin/smnotify bin/ping_pong bin/ltdbtool @@ -190,6 +191,12 @@ tests/bin/ctdb_transaction: $(CTDB_CLIENT_OBJ) tests/src/ctdb_transaction.o @echo Linking $@ @$(CC) $(CFLAGS) -o $@ tests/src/ctdb_transaction.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS) +CTDB_TAKEOVER_OBJ = $(CTDB_SERVER_OBJ:server/ctdbd.o=) + +tests/bin/ctdb_takeover_tests: $(CTDB_TAKEOVER_OBJ) tests/src/ctdb_takeover_tests.o + @echo Linking $@ + @$(CC) $(CFLAGS) -o $@ tests/src/ctdb_takeover_tests.o $(CTDB_TAKEOVER_OBJ) $(LIB_FLAGS) + tests/bin/ibwrapper_test: $(CTDB_CLIENT_OBJ) ib/ibwrapper_test.o @echo Linking $@ @$(CC) $(CFLAGS) -o $@ ib/ibwrapper_test.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS) diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index 396427bfa6..37f8a7344a 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -120,6 +120,7 @@ struct ctdb_tunable { uint32_t stat_history_interval; uint32_t deferred_attach_timeout; uint32_t vacuum_fast_path_count; + uint32_t lcp2_public_ip_assignment; }; /* @@ -1410,4 +1411,37 @@ int32_t ctdb_local_schedule_for_deletion(struct ctdb_db_context *ctdb_db, struct ctdb_ltdb_header *ctdb_header_from_record_handle(struct ctdb_record_handle *h); +/* For unit testing ctdb_takeover.c.
*/ +struct ctdb_public_ip_list { + struct ctdb_public_ip_list *next; + uint32_t pnn; + ctdb_sock_addr addr; +}; +uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2); +uint32_t ip_distance_2_sum(ctdb_sock_addr *ip, + struct ctdb_public_ip_list *ips, + int pnn); +uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn); +void lcp2_init(struct ctdb_context * tmp_ctx, + struct ctdb_node_map * nodemap, + uint32_t mask, + struct ctdb_public_ip_list *all_ips, + uint32_t **lcp2_imbalances, + bool **newly_healthy); +void lcp2_allocate_unassigned(struct ctdb_context *ctdb, + struct ctdb_node_map *nodemap, + uint32_t mask, + struct ctdb_public_ip_list *all_ips, + uint32_t *lcp2_imbalances); +bool lcp2_failback(struct ctdb_context *ctdb, + struct ctdb_node_map *nodemap, + uint32_t mask, + struct ctdb_public_ip_list *all_ips, + uint32_t *lcp2_imbalances, + bool *newly_healthy); +void ctdb_takeover_run_core(struct ctdb_context *ctdb, + struct ctdb_node_map *nodemap, + struct ctdb_public_ip_list **all_ips_p); + + #endif diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c index a971fdecfb..5512acc379 100644 --- a/ctdb/server/ctdb_takeover.c +++ b/ctdb/server/ctdb_takeover.c @@ -3,6 +3,7 @@ Copyright (C) Ronnie Sahlberg 2007 Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1058,13 +1059,6 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb, return 0; } -struct ctdb_public_ip_list { - struct ctdb_public_ip_list *next; - uint32_t pnn; - ctdb_sock_addr addr; -}; - - /* Given a physical node, return the number of public addresses that is currently assigned to this node. 
*/ @@ -1255,112 +1249,119 @@ create_merged_ip_list(struct ctdb_context *ctdb) return ip_list; } -/* - make any IP alias changes for public addresses that are necessary +/* + * This is the length of the longest common prefix between the IPs. + * It is calculated by XOR-ing the 2 IPs together and counting the + * number of leading zeroes. The implementation means that all + * addresses end up being 128 bits long. + * Not static, so we can easily link it into a unit test. + * + * FIXME? Should we consider IPv4 and IPv6 separately given that the + * 12 bytes of 0 prefix padding will hurt the algorithm if there are + * lots of nodes and IP addresses? */ -int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap) +uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2) { - int i, num_healthy, retries, num_ips; - struct ctdb_public_ip ip; - struct ctdb_public_ipv4 ipv4; - uint32_t mask, *nodes; - struct ctdb_public_ip_list *all_ips, *tmp_ip; - int maxnode, maxnum=0, minnode, minnum=0, num; - TDB_DATA data; - struct timeval timeout; - struct client_async_data *async_data; - struct ctdb_client_control_state *state; - TALLOC_CTX *tmp_ctx = talloc_new(ctdb); - - /* - * ip failover is completely disabled, just send out the - * ipreallocated event. - */ - if (ctdb->tunable.disable_ip_failover != 0) { - goto ipreallocated; - } + uint32_t ip1_k[IP_KEYLEN]; + uint32_t *t; + int i; + uint32_t x; - ZERO_STRUCT(ip); + uint32_t distance = 0; - /* Count how many completely healthy nodes we have */ - num_healthy = 0; - for (i=0;i<nodemap->num;i++) { - if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) { - num_healthy++; + memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k)); + t = ip_key(ip2); + for (i=0; i<IP_KEYLEN; i++) { + x = ip1_k[i] ^ t[i]; + if (x == 0) { + distance += 32; + } else { + /* Count number of leading zeroes. + * FIXME? This could be optimised...
+ */ + while ((x & (1 << 31)) == 0) { + x <<= 1; + distance += 1; + } } } - if (num_healthy > 0) { - /* We have healthy nodes, so only consider them for - serving public addresses - */ - mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED; - } else { - /* We didnt have any completely healthy nodes so - use "disabled" nodes as a fallback - */ - mask = NODE_FLAGS_INACTIVE; - } - - /* since nodes only know about those public addresses that - can be served by that particular node, no single node has - a full list of all public addresses that exist in the cluster. - Walk over all node structures and create a merged list of - all public addresses that exist in the cluster. + return distance; +} - keep the tree of ips around as ctdb->ip_tree - */ - all_ips = create_merged_ip_list(ctdb); +/* Calculate the IP distance for the given IP relative to IPs on the + given node. The ips argument is generally the all_ips variable + used in the main part of the algorithm. + * Not static, so we can easily link it into a unit test. + */ +uint32_t ip_distance_2_sum(ctdb_sock_addr *ip, + struct ctdb_public_ip_list *ips, + int pnn) +{ + struct ctdb_public_ip_list *t; + uint32_t d; - /* Count how many ips we have */ - num_ips = 0; - for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { - num_ips++; - } + uint32_t sum = 0; - /* If we want deterministic ip allocations, i.e. that the ip addresses - will always be allocated the same way for a specific set of - available/unavailable nodes. - */ - if (1 == ctdb->tunable.deterministic_public_ips) { - DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. 
Resetting all ip allocations\n")); - for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) { - tmp_ip->pnn = i%nodemap->num; + for (t=ips; t != NULL; t=t->next) { + if (t->pnn != pnn) { continue; } - } - - /* mark all public addresses with a masked node as being served by - node -1 - */ - for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { - if (tmp_ip->pnn == -1) { + /* Optimisation: We never calculate the distance + * between an address and itself. This allows us to + * calculate the effect of removing an address from a + * node by simply calculating the distance between + * that address and all of the existing addresses. + * Moreover, we assume that we're only ever dealing + * with addresses from all_ips so we can identify an + * address via a pointer rather than doing a more + * expensive address comparison. */ + if (&(t->addr) == ip) { continue; } - if (nodemap->nodes[tmp_ip->pnn].flags & mask) { - tmp_ip->pnn = -1; - } + + d = ip_distance(ip, &(t->addr)); + sum += d * d; /* Cheaper than pulling in math.h :-) */ } - /* verify that the assigned nodes can serve that public ip - and set it to -1 if not - */ - for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { - if (tmp_ip->pnn == -1) { + return sum; +} + +/* Return the LCP2 imbalance metric for addresses currently assigned + to the given node. + * Not static, so we can easily link it into a unit test. + */ +uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn) +{ + struct ctdb_public_ip_list *t; + + uint32_t imbalance = 0; + + for (t=all_ips; t!=NULL; t=t->next) { + if (t->pnn != pnn) { continue; } - if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) { - /* this node can not serve this ip. */ - tmp_ip->pnn = -1; - } + /* Pass the rest of the IPs rather than the whole + all_ips input list. + */ + imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn); } + return imbalance; +} + +/* Allocate any unassigned IPs just by looping through the IPs and + * finding the best node for each.
+ * Not static, so we can easily link it into a unit test. + */ +void basic_allocate_unassigned(struct ctdb_context *ctdb, + struct ctdb_node_map *nodemap, + uint32_t mask, + struct ctdb_public_ip_list *all_ips) +{ + struct ctdb_public_ip_list *tmp_ip; - /* now we must redistribute all public addresses with takeover node - -1 among the nodes available - */ - retries = 0; -try_again: /* loop over all ip's and find a physical node to cover for each unassigned ip. */ @@ -1372,26 +1373,26 @@ try_again: } } } +} - /* If we dont want ips to fail back after a node becomes healthy - again, we wont even try to reallocat the ip addresses so that - they are evenly spread out. - This can NOT be used at the same time as DeterministicIPs ! - */ - if (1 == ctdb->tunable.no_ip_failback) { - if (1 == ctdb->tunable.deterministic_public_ips) { - DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n")); - } - goto finished; - } - +/* Basic non-deterministic rebalancing algorithm. + * Not static, so we can easily link it into a unit test. + */ +bool basic_failback(struct ctdb_context *ctdb, + struct ctdb_node_map *nodemap, + uint32_t mask, + struct ctdb_public_ip_list *all_ips, + int num_ips, + int *retries) +{ + int i; + int maxnode, maxnum=0, minnode, minnum=0, num; + struct ctdb_public_ip_list *tmp_ip; - /* now, try to make sure the ip adresses are evenly distributed - across the node. - for each ip address, loop over all nodes that can serve this - ip and make sure that the difference between the node - serving the most and the node serving the least ip's are not greater - than 1. + /* for each ip address, loop over all nodes that can serve + this ip and make sure that the difference between the node + serving the most and the node serving the least ip's are + not greater than 1. 
*/ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { if (tmp_ip->pnn == -1) { @@ -1455,7 +1456,7 @@ try_again: want to spend too much time balancing the ip coverage. */ if ( (maxnum > minnum+1) - && (retries < (num_ips + 5)) ){ + && (*retries < (num_ips + 5)) ){ struct ctdb_public_ip_list *tmp; /* mark one of maxnode's vnn's as unassigned and try @@ -1464,13 +1465,402 @@ try_again: for (tmp=all_ips;tmp;tmp=tmp->next) { if (tmp->pnn == maxnode) { tmp->pnn = -1; - retries++; - goto try_again; + (*retries)++; + return true; } } } } + return false; +} + +/* Do necessary LCP2 initialisation. Bury it in a function here so + * that we can unit test it. + * Not static, so we can easily link it into a unit test. + */ +void lcp2_init(struct ctdb_context * tmp_ctx, + struct ctdb_node_map * nodemap, + uint32_t mask, + struct ctdb_public_ip_list *all_ips, + uint32_t **lcp2_imbalances, + bool **newly_healthy) +{ + int i; + struct ctdb_public_ip_list *tmp_ip; + + *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num); + CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy); + *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num); + CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances); + + for (i=0;i<nodemap->num;i++) { + (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i); + /* First step: is the node "healthy"? */ + (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask); + } + + /* 2nd step: if a node has IPs assigned then it must have been + * healthy before, so we remove it from consideration... */ + for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { + if (tmp_ip->pnn != -1) { + (*newly_healthy)[tmp_ip->pnn] = false; + } + } +} + +/* Allocate any unassigned addresses using the LCP2 algorithm to find + * the IP/node combination that will cost the least. + * Not static, so we can easily link it into a unit test.
+ */ +void lcp2_allocate_unassigned(struct ctdb_context *ctdb, + struct ctdb_node_map *nodemap, + uint32_t mask, + struct ctdb_public_ip_list *all_ips, + uint32_t *lcp2_imbalances) +{ + struct ctdb_public_ip_list *tmp_ip; + int dstnode; + + int minnode; + uint32_t mindsum, dstdsum, dstimbl, minimbl; + struct ctdb_public_ip_list *minip; + + bool should_loop = true; + bool have_unassigned = true; + + while (have_unassigned && should_loop) { + should_loop = false; + + DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); + DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n")); + + minnode = -1; + mindsum = 0; + minip = NULL; + + /* loop over each unassigned ip. */ + for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { + if (tmp_ip->pnn != -1) { + continue; + } + + for (dstnode=0; dstnode < nodemap->num; dstnode++) { + /* only check nodes that can actually serve this ip */ + if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) { + /* no it couldnt so skip to the next node */ + continue; + } + if (nodemap->nodes[dstnode].flags & mask) { + continue; + } + + dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode); + dstimbl = lcp2_imbalances[dstnode] + dstdsum; + DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n", + ctdb_addr_to_str(&(tmp_ip->addr)), + dstnode, + dstimbl - lcp2_imbalances[dstnode])); + + + if ((minnode == -1) || (dstdsum < mindsum)) { + minnode = dstnode; + minimbl = dstimbl; + mindsum = dstdsum; + minip = tmp_ip; + should_loop = true; + } + } + } + + DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); + + /* If we found one then assign it to the given node. */ + if (minnode != -1) { + minip->pnn = minnode; + lcp2_imbalances[minnode] = minimbl; + DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n", + ctdb_addr_to_str(&(minip->addr)), + minnode, + mindsum)); + } + + /* There might be a better way but at least this is clear. 
*/ + have_unassigned = false; + for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { + if (tmp_ip->pnn == -1) { + have_unassigned = true; + } + } + } + + /* We know if we have an unassigned addresses so we might as + * well optimise. + */ + if (have_unassigned) { + for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { + if (tmp_ip->pnn == -1) { + DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n", + ctdb_addr_to_str(&tmp_ip->addr))); + } + } + } +} + +/* LCP2 algorithm for rebalancing the cluster. This finds the source + * node with the highest LCP2 imbalance, and then determines the best + * IP/destination node combination to move from the source node. + * + * Not static, so we can easily link it into a unit test. + */ +bool lcp2_failback(struct ctdb_context *ctdb, + struct ctdb_node_map *nodemap, + uint32_t mask, + struct ctdb_public_ip_list *all_ips, + uint32_t *lcp2_imbalances, + bool *newly_healthy) +{ + int srcnode, dstnode, mindstnode, i, num_newly_healthy; + uint32_t srcimbl, srcdsum, maximbl, dstimbl, dstdsum; + uint32_t minsrcimbl, mindstimbl, b; + struct ctdb_public_ip_list *minip; + struct ctdb_public_ip_list *tmp_ip; + + /* It is only worth continuing if we have suitable target + * nodes to transfer IPs to. This check is much cheaper than + * continuing on... + */ + num_newly_healthy = 0; + for (i = 0; i < nodemap->num; i++) { + if (newly_healthy[i]) { + num_newly_healthy++; + } + } + if (num_newly_healthy == 0) { + return false; + } + + /* Get the node with the highest imbalance metric. */ + srcnode = -1; + maximbl = 0; + for (i=0; i < nodemap->num; i++) { + b = lcp2_imbalances[i]; + if ((srcnode == -1) || (b > maximbl)) { + srcnode = i; + maximbl = b; + } + } + + /* This means that all nodes had 0 or 1 addresses, so can't be + * imbalanced. + */ + if (maximbl == 0) { + return false; + } + + /* Find an IP and destination node that best reduces imbalance. 
*/ + minip = NULL; + minsrcimbl = 0; + mindstnode = -1; + mindstimbl = 0; + + DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); + DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, maximbl)); + + for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) { + /* Only consider addresses on srcnode. */ + if (tmp_ip->pnn != srcnode) { + continue; + } + + /* What is this IP address costing the source node? */ + srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode); + srcimbl = maximbl - srcdsum; + + /* Consider this IP address would cost each potential + * destination node. Destination nodes are limited to + * those that are newly healthy, since we don't want + * to do gratuitous failover of IPs just to make minor + * balance improvements. + */ + for (dstnode=0; dstnode < nodemap->num; dstnode++) { + if (! newly_healthy[dstnode]) { + continue; + } + /* only check nodes that can actually serve this ip */ + if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) { + /* no it couldnt so skip to the next node */ + continue; + } + + dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode); + dstimbl = lcp2_imbalances[dstnode] + dstdsum; + DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n", + srcnode, srcimbl - lcp2_imbalances[srcnode], + ctdb_addr_to_str(&(tmp_ip->addr)), + dstnode, dstimbl - lcp2_imbalances[dstnode])); + + if ((dstimbl < maximbl) && (dstdsum < srcdsum) && \ + ((mindstnode == -1) || \ + ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) { + + minip = tmp_ip; + minsrcimbl = srcimbl; + mindstnode = dstnode; + mindstimbl = dstimbl; + } + } + } + DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); + + if (mindstnode != -1) { + /* We found a move that makes things better... 
*/ + DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n", + srcnode, minsrcimbl - lcp2_imbalances[srcnode], + ctdb_addr_to_str(&(minip->addr)), + mindstnode, mindstimbl - lcp2_imbalances[mindstnode])); + + + lcp2_imbalances[srcnode] = srcimbl; + lcp2_imbalances[mindstnode] = mindstimbl; + minip->pnn = mindstnode; + + return true; + } + + return false; + +} + +/* The calculation part of the IP allocation algorithm. + * Not static, so we can easily link it into a unit test. + */ +void ctdb_takeover_run_core(struct ctdb_context *ctdb, + struct ctdb_node_map *nodemap, + struct ctdb_public_ip_list **all_ips_p) +{ + int i, num_healthy, retries, num_ips; + uint32_t mask; + struct ctdb_public_ip_list *all_ips, *tmp_ip; + uint32_t *lcp2_imbalances; + bool *newly_healthy; + + TALLOC_CTX *tmp_ctx = talloc_new(ctdb); + + /* Count how many completely healthy nodes we have */ + num_healthy = 0; + for (i=0;i<nodemap->num;i++) { + if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) { + num_healthy++; + } + } + + if (num_healthy > 0) { + /* We have healthy nodes, so only consider them for + serving public addresses + */ + mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED; + } else { + /* We didnt have any completely healthy nodes so + use "disabled" nodes as a fallback + */ + mask = NODE_FLAGS_INACTIVE; + } + + /* since nodes only know about those public addresses that + can be served by that particular node, no single node has + a full list of all public addresses that exist in the cluster. + Walk over all node structures and create a merged list of + all public addresses that exist in the cluster. + + keep the tree of ips around as ctdb->ip_tree + */ + all_ips = create_merged_ip_list(ctdb); + *all_ips_p = all_ips; /* minimal code changes */ + + /* Count how many ips we have */ + num_ips = 0; + for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { + num_ips++; + } + + /* If we want deterministic ip allocations, i.e. 
that the ip addresses + will always be allocated the same way for a specific set of + available/unavailable nodes. + */ + if (1 == ctdb->tunable.deterministic_public_ips) { + DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n")); + for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) { + tmp_ip->pnn = i%nodemap->num; + } + } + + + /* mark all public addresses with a masked node as being served by + node -1 + */ + for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { + if (tmp_ip->pnn == -1) { + continue; + } + if (nodemap->nodes[tmp_ip->pnn].flags & mask) { + tmp_ip->pnn = -1; + } + } + + /* verify that the assigned nodes can serve that public ip + and set it to -1 if not + */ + for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) { + if (tmp_ip->pnn == -1) { + continue; + } + if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) { + /* this node can not serve this ip. */ + tmp_ip->pnn = -1; + } + } + + if (1 == ctdb->tunable.lcp2_public_ip_assignment) { + lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy); + } + + /* now we must redistribute all public addresses with takeover node + -1 among the nodes available + */ + retries = 0; +try_again: + if (1 == ctdb->tunable.lcp2_public_ip_assignment) { + lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances); + } else { + basic_allocate_unassigned(ctdb, nodemap, mask, all_ips); + } + + /* If we dont want ips to fail back after a node becomes healthy + again, we wont even try to reallocat the ip addresses so that + they are evenly spread out. + This can NOT be used at the same time as DeterministicIPs ! + */ + if (1 == ctdb->tunable.no_ip_failback) { + if (1 == ctdb->tunable.deterministic_public_ips) { + DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n")); + } + goto finished; + } + + + /* now, try to make sure the ip adresses are evenly distributed + across the node. 
+ */ + if (1 == ctdb->tunable.lcp2_public_ip_assignment) { + if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) { + goto try_again; + } + } else { + if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) { + goto try_again; + } + } /* finished distributing the public addresses, now just send the info out to the nodes @@ -1481,6 +1871,38 @@ finished: or -1 if there is no node that can cover this ip */ + return; +} + +/* + make any IP alias changes for public addresses that are necessary + */ +int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap) +{ + int i; + struct ctdb_public_ip ip; + struct ctdb_public_ipv4 ipv4; + uint32_t *nodes; + struct ctdb_public_ip_list *all_ips, *tmp_ip; + TDB_DATA data; + struct timeval timeout; + struct client_async_data *async_data; + struct ctdb_client_control_state *state; + TALLOC_CTX *tmp_ctx = talloc_new(ctdb); + + /* + * ip failover is completely disabled, just send out the + * ipreallocated event. + */ + if (ctdb->tunable.disable_ip_failover != 0) { + goto ipreallocated; + } + + ZERO_STRUCT(ip); + + /* Do the IP reassignment calculations */ + ctdb_takeover_run_core(ctdb, nodemap, &all_ips); + /* now tell all nodes to delete any alias that they should not have. 
This will be a NOOP on nodes that don't currently hold the given alias */ diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c index 52fc22a364..bd7834daad 100644 --- a/ctdb/server/ctdb_tunables.c +++ b/ctdb/server/ctdb_tunables.c @@ -46,6 +46,7 @@ static const struct { { "RerecoveryTimeout", 10, offsetof(struct ctdb_tunable, rerecovery_timeout) }, { "EnableBans", 1, offsetof(struct ctdb_tunable, enable_bans) }, { "DeterministicIPs", 1, offsetof(struct ctdb_tunable, deterministic_public_ips) }, + { "LCP2PublicIPs", 0, offsetof(struct ctdb_tunable, lcp2_public_ip_assignment) }, { "ReclockPingPeriod", 60, offsetof(struct ctdb_tunable, reclock_ping_period) }, { "NoIPFailback", 0, offsetof(struct ctdb_tunable, no_ip_failback) }, { "DisableIPFailover", 0, offsetof(struct ctdb_tunable, disable_ip_failover) }, diff --git a/ctdb/tests/src/ctdb_takeover_tests.c b/ctdb/tests/src/ctdb_takeover_tests.c new file mode 100644 index 0000000000..5fd23320a3 --- /dev/null +++ b/ctdb/tests/src/ctdb_takeover_tests.c @@ -0,0 +1,378 @@ +/* + Tests for ctdb_takeover.c + + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "includes.h" +#include "../include/ctdb_private.h" + +/* + * Need these, since they're defined in ctdbd.c but we can't link + * that. 
+ */ +int script_log_level; +bool fast_start; +void ctdb_load_nodes_file(struct ctdb_context *ctdb) {} + +/* Format of each line is "IP pnn" - the separator has to be at least + * 1 space (not a tab or whatever - a space!). + */ +static struct ctdb_public_ip_list * +read_ctdb_public_ip_list(TALLOC_CTX *ctx) +{ + char line[1024]; + ctdb_sock_addr addr; + char *t; + int pnn; + struct ctdb_public_ip_list *last = NULL; + + struct ctdb_public_ip_list *ret = NULL; + + while (fgets(line, sizeof(line), stdin) != NULL) { + + if ((t = strchr(line, ' ')) != NULL) { + /* Make line contain just the address */ + *t = '\0'; + /* Point to PNN or leading whitespace... */ + t++; + pnn = (int) strtol(t, (char **) NULL, 10); + } else { + /* Assume just an IP address, default to PNN -1 */ + if ((t = strchr(line, '\n')) != NULL) { + *t = '\0'; + } + pnn = -1; + } + + if (parse_ip(line, NULL, 0, &addr)) { + if (last == NULL) { + last = talloc(ctx, struct ctdb_public_ip_list); + } else { + last->next = talloc(ctx, struct ctdb_public_ip_list); + last = last->next; + } + last->next = NULL; + last->pnn = pnn; + memcpy(&(last->addr), &addr, sizeof(addr)); + if (ret == NULL) { + ret = last; + } + } else { + DEBUG(DEBUG_ERR, (__location__ " ERROR, bad address :%s\n", line)); + } + } + + return ret; +} + +void print_ctdb_public_ip_list(struct ctdb_public_ip_list * ips) +{ + while (ips) { + printf("%s %d\n", ctdb_addr_to_str(&(ips->addr)), ips->pnn); + ips = ips->next; + } +} + +/* Read some IPs from stdin, 1 per line, parse them and then print + * them back out. */ +void ctdb_test_read_ctdb_public_ip_list(void) +{ + struct ctdb_public_ip_list *l; + + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + + l = read_ctdb_public_ip_list(tmp_ctx); + + print_ctdb_public_ip_list(l); + + talloc_free(tmp_ctx); +} + +/* Read 2 IPs from stdin, calculate the IP distance and print it. 
*/ +void ctdb_test_ip_distance(void) +{ + struct ctdb_public_ip_list *l; + uint32_t distance; + + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + + l = read_ctdb_public_ip_list(tmp_ctx); + + if (l && l->next) { + distance = ip_distance(&(l->addr), &(l->next->addr)); + printf ("%lu\n", (unsigned long) distance); + } + + talloc_free(tmp_ctx); +} + +/* Read some IPs from stdin, calculate the sum of the squares of the + * IP distances between the 1st argument and those read that are on + * the given node. The given IP must be one of the ones in the list. */ +void ctdb_test_ip_distance_2_sum(const char ip[], int pnn) +{ + struct ctdb_public_ip_list *l; + struct ctdb_public_ip_list *t; + ctdb_sock_addr addr; + uint32_t distance; + + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + + + l = read_ctdb_public_ip_list(tmp_ctx); + + if (l && parse_ip(ip, NULL, 0, &addr)) { + /* find the entry for the specified IP */ + for (t=l; t!=NULL; t=t->next) { + if (ctdb_same_ip(&(t->addr), &addr)) { + break; + } + } + + if (t == NULL) { + fprintf(stderr, "IP NOT PRESENT IN LIST"); + exit(1); + } + + distance = ip_distance_2_sum(&(t->addr), l, pnn); + printf ("%lu\n", (unsigned long) distance); + } else { + fprintf(stderr, "BAD INPUT"); + exit(1); + } + + talloc_free(tmp_ctx); +} + +/* Read some IPs from stdin, calculate the sum of the squares of the + * IP distances between the first and the rest, and print it.
*/ +void ctdb_test_lcp2_imbalance(int pnn) +{ + struct ctdb_public_ip_list *l; + uint32_t imbalance; + + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + + l = read_ctdb_public_ip_list(tmp_ctx); + + imbalance = lcp2_imbalance(l, pnn); + printf ("%lu\n", (unsigned long) imbalance); + + talloc_free(tmp_ctx); +} + +void ctdb_test_init(const char nodestates[], + struct ctdb_context **ctdb, + struct ctdb_public_ip_list **all_ips, + struct ctdb_node_map **nodemap) +{ + struct ctdb_public_ip_list *t; + struct ctdb_all_public_ips *available_public_ips; + int i, numips, numnodes; + + numnodes = strlen(nodestates); + + *ctdb = talloc_zero(NULL, struct ctdb_context); + + /* Fake things up... */ + (*ctdb)->num_nodes = numnodes; + + (*ctdb)->tunable.deterministic_public_ips = 0; + (*ctdb)->tunable.disable_ip_failover = 0; + (*ctdb)->tunable.no_ip_failback = 0; + + if (getenv("CTDB_LCP2")) { + if (strcmp(getenv("CTDB_LCP2"), "yes") == 0) { + (*ctdb)->tunable.lcp2_public_ip_assignment = 1; + } else { + (*ctdb)->tunable.lcp2_public_ip_assignment = 0; + } + } + + *nodemap = talloc_array(*ctdb, struct ctdb_node_map, numnodes); + (*nodemap)->num = numnodes; + + for (i=0; i < numnodes; i++) { + (*nodemap)->nodes[i].pnn = i; + (*nodemap)->nodes[i].flags = nodestates[i] - '0'; + /* *nodemap->nodes[i].sockaddr is uninitialised */ + } + + *all_ips = read_ctdb_public_ip_list(*ctdb); + numips = 0; + for (t = *all_ips; t != NULL; t = t->next) { + numips++; + } + + available_public_ips = talloc_array(*ctdb, struct ctdb_all_public_ips, numips); // FIXME: bogus size, overkill + available_public_ips->num = numips; + for (t = *all_ips, i=0; t != NULL && i < numips ; t = t->next, i++) { + available_public_ips->ips[i].pnn = t->pnn; + memcpy(&(available_public_ips->ips[i].addr), &(t->addr), sizeof(t->addr)); + } + + (*ctdb)->nodes = talloc_array(*ctdb, struct ctdb_node *, numnodes); // FIXME: bogus size, overkill + + /* Setup both nodemap and ctdb->nodes. Mark all nodes as + * healthy - change this later. 
*/ + for (i=0; i < numnodes; i++) { + uint32_t flags = nodestates[i] - '0' ? NODE_FLAGS_UNHEALTHY : 0; + (*nodemap)->nodes[i].pnn = i; + (*nodemap)->nodes[i].flags = flags; + /* nodemap->nodes[i].sockaddr is uninitialised */ + + (*ctdb)->nodes[i] = talloc(*ctdb, struct ctdb_node); + (*ctdb)->nodes[i]->pnn = i; + (*ctdb)->nodes[i]->flags = flags; + (*ctdb)->nodes[i]->available_public_ips = available_public_ips; + (*ctdb)->nodes[i]->known_public_ips = available_public_ips; + } +} + +/* IP layout is read from stdin. */ +void ctdb_test_lcp2_allocate_unassigned(const char nodestates[]) +{ + struct ctdb_context *ctdb; + struct ctdb_public_ip_list *all_ips; + struct ctdb_node_map *nodemap; + + uint32_t *lcp2_imbalances; + bool *newly_healthy; + + ctdb_test_init(nodestates, &ctdb, &all_ips, &nodemap); + + lcp2_init(ctdb, nodemap, + NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED, + all_ips, &lcp2_imbalances, &newly_healthy); + + lcp2_allocate_unassigned(ctdb, nodemap, + NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED, + all_ips, lcp2_imbalances); + + print_ctdb_public_ip_list(all_ips); + + talloc_free(ctdb); +} + +/* IP layout is read from stdin. */ +void ctdb_test_lcp2_failback(const char nodestates[]) +{ + struct ctdb_context *ctdb; + struct ctdb_public_ip_list *all_ips; + struct ctdb_node_map *nodemap; + + uint32_t *lcp2_imbalances; + bool *newly_healthy; + + ctdb_test_init(nodestates, &ctdb, &all_ips, &nodemap); + + lcp2_init(ctdb, nodemap, + NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED, + all_ips, &lcp2_imbalances, &newly_healthy); + + lcp2_failback(ctdb, nodemap, + NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED, + all_ips, lcp2_imbalances, newly_healthy); + + print_ctdb_public_ip_list(all_ips); + + talloc_free(ctdb); +} + +/* IP layout is read from stdin. 
*/ +void ctdb_test_lcp2_failback_loop(const char nodestates[]) +{ + struct ctdb_context *ctdb; + struct ctdb_public_ip_list *all_ips; + struct ctdb_node_map *nodemap; + + uint32_t *lcp2_imbalances; + bool *newly_healthy; + + ctdb_test_init(nodestates, &ctdb, &all_ips, &nodemap); + + lcp2_init(ctdb, nodemap, + NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED, + all_ips, &lcp2_imbalances, &newly_healthy); + +try_again: + if (lcp2_failback(ctdb, nodemap, + NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED, + all_ips, lcp2_imbalances, newly_healthy)) { + goto try_again; + } + + print_ctdb_public_ip_list(all_ips); + + talloc_free(ctdb); +} + +/* IP layout is read from stdin. */ +void ctdb_test_ctdb_takeover_run_core(const char nodestates[]) +{ + struct ctdb_context *ctdb; + struct ctdb_public_ip_list *all_ips; + struct ctdb_node_map *nodemap; + + ctdb_test_init(nodestates, &ctdb, &all_ips, &nodemap); + + ctdb_takeover_run_core(ctdb, nodemap, &all_ips); + + print_ctdb_public_ip_list(all_ips); + + talloc_free(ctdb); +} + +void usage(void) +{ + fprintf(stderr, "usage: ctdb_takeover_tests <op>\n"); + exit(1); +} + +int main(int argc, const char *argv[]) +{ + LogLevel = DEBUG_DEBUG; + if (getenv("CTDB_TEST_LOGLEVEL")) { + LogLevel = atoi(getenv("CTDB_TEST_LOGLEVEL")); + } + + if (argc < 2) { + usage(); + } + + if (strcmp(argv[1], "ip_list") == 0) { + ctdb_test_read_ctdb_public_ip_list(); + } else if (strcmp(argv[1], "ip_distance") == 0) { + ctdb_test_ip_distance(); + } else if (argc == 4 && strcmp(argv[1], "ip_distance_2_sum") == 0) { + ctdb_test_ip_distance_2_sum(argv[2], atoi(argv[3])); + } else if (argc >= 3 && strcmp(argv[1], "lcp2_imbalance") == 0) { + ctdb_test_lcp2_imbalance(atoi(argv[2])); + } else if (argc == 3 && strcmp(argv[1], "lcp2_allocate_unassigned") == 0) { + ctdb_test_lcp2_allocate_unassigned(argv[2]); + } else if (argc == 3 && strcmp(argv[1], "lcp2_failback") == 0) { + ctdb_test_lcp2_failback(argv[2]); + } else if (argc == 3 && strcmp(argv[1], "lcp2_failback_loop") == 0) { + 
ctdb_test_lcp2_failback_loop(argv[2]); + } else if (argc == 3 && strcmp(argv[1], "ctdb_takeover_run_core") == 0) { + ctdb_test_ctdb_takeover_run_core(argv[2]); + } else { + usage(); + } + + return 0; +} |