summaryrefslogtreecommitdiffstats
path: root/ctdb
diff options
context:
space:
mode:
authorRonnie Sahlberg <ronniesahlberg@gmail.com>2011-07-29 09:04:01 +1000
committerRonnie Sahlberg <ronniesahlberg@gmail.com>2011-07-29 09:04:01 +1000
commita17ae8a8bed982b9ecd064ccca0be88a61d2ec85 (patch)
treee87663bc138c73e51acf0835bae13c364aae384b /ctdb
parenta5cd8a3270d1b614157115aa71fd224abcab6be7 (diff)
parent5ac67504caace31d9d529792ace03fbe133a0d59 (diff)
downloadsamba-a17ae8a8bed982b9ecd064ccca0be88a61d2ec85.tar.gz
samba-a17ae8a8bed982b9ecd064ccca0be88a61d2ec85.tar.xz
samba-a17ae8a8bed982b9ecd064ccca0be88a61d2ec85.zip
Merge branch 'master' of 10.1.1.27:/shared/ctdb/ctdb-master
(This used to be ctdb commit 518945e59e2e48f07fcc0955f3aa81cd0d946aea)
Diffstat (limited to 'ctdb')
-rwxr-xr-xctdb/Makefile.in7
-rw-r--r--ctdb/include/ctdb_private.h34
-rw-r--r--ctdb/server/ctdb_takeover.c648
-rw-r--r--ctdb/server/ctdb_tunables.c1
-rw-r--r--ctdb/tests/src/ctdb_takeover_tests.c378
5 files changed, 955 insertions, 113 deletions
diff --git a/ctdb/Makefile.in b/ctdb/Makefile.in
index 8fb9ea73cb..d53d3db5e0 100755
--- a/ctdb/Makefile.in
+++ b/ctdb/Makefile.in
@@ -70,6 +70,7 @@ TEST_BINS=tests/bin/ctdb_bench tests/bin/ctdb_fetch tests/bin/ctdb_fetch_one \
tests/bin/ctdb_fetch_lock_once tests/bin/ctdb_store \
tests/bin/ctdb_randrec tests/bin/ctdb_persistent \
tests/bin/ctdb_traverse tests/bin/rb_test tests/bin/ctdb_transaction \
+ tests/bin/ctdb_takeover_tests
@INFINIBAND_BINS@
BINS = bin/ctdb @CTDB_SCSI_IO@ bin/smnotify bin/ping_pong bin/ltdbtool
@@ -190,6 +191,12 @@ tests/bin/ctdb_transaction: $(CTDB_CLIENT_OBJ) tests/src/ctdb_transaction.o
@echo Linking $@
@$(CC) $(CFLAGS) -o $@ tests/src/ctdb_transaction.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+CTDB_TAKEOVER_OBJ = $(CTDB_SERVER_OBJ:server/ctdbd.o=)
+
+tests/bin/ctdb_takeover_tests: $(CTDB_TAKEOVER_OBJ) tests/src/ctdb_takeover_tests.o
+ @echo Linking $@
+ @$(CC) $(CFLAGS) -o $@ tests/src/ctdb_takeover_tests.o $(CTDB_TAKEOVER_OBJ) $(LIB_FLAGS)
+
tests/bin/ibwrapper_test: $(CTDB_CLIENT_OBJ) ib/ibwrapper_test.o
@echo Linking $@
@$(CC) $(CFLAGS) -o $@ ib/ibwrapper_test.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index 396427bfa6..37f8a7344a 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -120,6 +120,7 @@ struct ctdb_tunable {
uint32_t stat_history_interval;
uint32_t deferred_attach_timeout;
uint32_t vacuum_fast_path_count;
+ uint32_t lcp2_public_ip_assignment;
};
/*
@@ -1410,4 +1411,37 @@ int32_t ctdb_local_schedule_for_deletion(struct ctdb_db_context *ctdb_db,
struct ctdb_ltdb_header *ctdb_header_from_record_handle(struct ctdb_record_handle *h);
+/* For unit testing ctdb_takeover.c. */
+/* One element of a singly linked list of public IP addresses. */
+struct ctdb_public_ip_list {
+ /* Next element, or NULL at the end of the list. */
+ struct ctdb_public_ip_list *next;
+ /* Node the address is assigned to; the takeover code compares
+ * this against -1 to mean "not assigned to any node". */
+ uint32_t pnn;
+ /* The public address itself. */
+ ctdb_sock_addr addr;
+};
+uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2);
+uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+ struct ctdb_public_ip_list *ips,
+ int pnn);
+uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn);
+void lcp2_init(struct ctdb_context * tmp_ctx,
+ struct ctdb_node_map * nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t **lcp2_imbalances,
+ bool **newly_healthy);
+void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t *lcp2_imbalances);
+bool lcp2_failback(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t *lcp2_imbalances,
+ bool *newly_healthy);
+void ctdb_takeover_run_core(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ struct ctdb_public_ip_list **all_ips_p);
+
+
#endif
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
index a971fdecfb..5512acc379 100644
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@ -3,6 +3,7 @@
Copyright (C) Ronnie Sahlberg 2007
Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -1058,13 +1059,6 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
return 0;
}
-struct ctdb_public_ip_list {
- struct ctdb_public_ip_list *next;
- uint32_t pnn;
- ctdb_sock_addr addr;
-};
-
-
/* Given a physical node, return the number of
public addresses that is currently assigned to this node.
*/
@@ -1255,112 +1249,119 @@ create_merged_ip_list(struct ctdb_context *ctdb)
return ip_list;
}
-/*
- make any IP alias changes for public addresses that are necessary
+/*
+ * This is the length of the longest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes. The implementation means that all
+ * addresses end up being 128 bits long.
+ * Not static, so we can easily link it into a unit test.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
*/
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
{
- int i, num_healthy, retries, num_ips;
- struct ctdb_public_ip ip;
- struct ctdb_public_ipv4 ipv4;
- uint32_t mask, *nodes;
- struct ctdb_public_ip_list *all_ips, *tmp_ip;
- int maxnode, maxnum=0, minnode, minnum=0, num;
- TDB_DATA data;
- struct timeval timeout;
- struct client_async_data *async_data;
- struct ctdb_client_control_state *state;
- TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
-
- /*
- * ip failover is completely disabled, just send out the
- * ipreallocated event.
- */
- if (ctdb->tunable.disable_ip_failover != 0) {
- goto ipreallocated;
- }
+ uint32_t ip1_k[IP_KEYLEN];
+ uint32_t *t;
+ int i;
+ uint32_t x;
- ZERO_STRUCT(ip);
+ uint32_t distance = 0;
- /* Count how many completely healthy nodes we have */
- num_healthy = 0;
- for (i=0;i<nodemap->num;i++) {
- if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
- num_healthy++;
+ /* Take a private copy of the first key before asking for the
+ * second one - presumably ip_key() hands back a shared buffer;
+ * TODO confirm against ip_key().
+ */
+ memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
+ t = ip_key(ip2);
+ for (i=0; i<IP_KEYLEN; i++) {
+ x = ip1_k[i] ^ t[i];
+ if (x == 0) {
+ distance += 32;
+ } else {
+ /* Count number of leading zeroes.
+ * FIXME? This could be optimised...
+ * The mask is built from an unsigned 1: shifting a
+ * signed int 1 into the sign bit is undefined.
+ * NOTE(review): the outer loop carries on after the
+ * first differing word, so later words still add to
+ * the result.
+ */
+ while ((x & ((uint32_t)1 << 31)) == 0) {
+ x <<= 1;
+ distance += 1;
+ }
}
}
- if (num_healthy > 0) {
- /* We have healthy nodes, so only consider them for
- serving public addresses
- */
- mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
- } else {
- /* We didnt have any completely healthy nodes so
- use "disabled" nodes as a fallback
- */
- mask = NODE_FLAGS_INACTIVE;
- }
-
- /* since nodes only know about those public addresses that
- can be served by that particular node, no single node has
- a full list of all public addresses that exist in the cluster.
- Walk over all node structures and create a merged list of
- all public addresses that exist in the cluster.
+ return distance;
+}
- keep the tree of ips around as ctdb->ip_tree
- */
- all_ips = create_merged_ip_list(ctdb);
+/* Calculate the IP distance for the given IP relative to IPs on the
+ given node. The ips argument is generally the all_ips variable
+ used in the main part of the algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+ struct ctdb_public_ip_list *ips,
+ int pnn)
+{
+ struct ctdb_public_ip_list *t;
+ uint32_t d;
- /* Count how many ips we have */
- num_ips = 0;
- for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
- num_ips++;
- }
+ uint32_t sum = 0;
- /* If we want deterministic ip allocations, i.e. that the ip addresses
- will always be allocated the same way for a specific set of
- available/unavailable nodes.
- */
- if (1 == ctdb->tunable.deterministic_public_ips) {
- DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
- for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
- tmp_ip->pnn = i%nodemap->num;
+ for (t=ips; t != NULL; t=t->next) {
+ if (t->pnn != pnn) {
+ continue;
}
- }
-
- /* mark all public addresses with a masked node as being served by
- node -1
- */
- for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
- if (tmp_ip->pnn == -1) {
+ /* Optimisation: We never calculate the distance
+ * between an address and itself. This allows us to
+ * calculate the effect of removing an address from a
+ * node by simply calculating the distance between
+ * that address and all of the existing addresses.
+ * Moreover, we assume that we're only ever dealing
+ * with addresses from all_ips so we can identify an
+ * address via a pointer rather than doing a more
+ * expensive address comparison. */
+ if (&(t->addr) == ip) {
continue;
}
- if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
- tmp_ip->pnn = -1;
- }
+
+ d = ip_distance(ip, &(t->addr));
+ sum += d * d; /* Cheaper than pulling in math.h :-) */
}
- /* verify that the assigned nodes can serve that public ip
- and set it to -1 if not
- */
- for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
- if (tmp_ip->pnn == -1) {
+ return sum;
+}
+
+/* Return the LCP2 imbalance metric for addresses currently assigned
+ to the given node.
+ * Not static, so we can easily link it into a unit test.
+ */
+uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
+{
+ struct ctdb_public_ip_list *t;
+
+ uint32_t imbalance = 0;
+
+ for (t=all_ips; t!=NULL; t=t->next) {
+ if (t->pnn != pnn) {
continue;
}
- if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
- /* this node can not serve this ip. */
- tmp_ip->pnn = -1;
- }
+ /* Pass the rest of the IPs rather than the whole
+ all_ips input list.
+ */
+ imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
}
+ return imbalance;
+}
+
+/* Allocate any unassigned IPs just by looping through the IPs and
+ * finding the best node for each.
+ * Not static, so we can easily link it into a unit test.
+ */
+void basic_allocate_unassigned(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips)
+{
+ struct ctdb_public_ip_list *tmp_ip;
- /* now we must redistribute all public addresses with takeover node
- -1 among the nodes available
- */
- retries = 0;
-try_again:
/* loop over all ip's and find a physical node to cover for
each unassigned ip.
*/
@@ -1372,26 +1373,26 @@ try_again:
}
}
}
+}
- /* If we dont want ips to fail back after a node becomes healthy
- again, we wont even try to reallocat the ip addresses so that
- they are evenly spread out.
- This can NOT be used at the same time as DeterministicIPs !
- */
- if (1 == ctdb->tunable.no_ip_failback) {
- if (1 == ctdb->tunable.deterministic_public_ips) {
- DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
- }
- goto finished;
- }
-
+/* Basic non-deterministic rebalancing algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+bool basic_failback(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ int num_ips,
+ int *retries)
+{
+ int i;
+ int maxnode, maxnum=0, minnode, minnum=0, num;
+ struct ctdb_public_ip_list *tmp_ip;
- /* now, try to make sure the ip adresses are evenly distributed
- across the node.
- for each ip address, loop over all nodes that can serve this
- ip and make sure that the difference between the node
- serving the most and the node serving the least ip's are not greater
- than 1.
+ /* for each ip address, loop over all nodes that can serve
+ this ip and make sure that the difference between the node
+ serving the most and the node serving the least ip's are
+ not greater than 1.
*/
for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
if (tmp_ip->pnn == -1) {
@@ -1455,7 +1456,7 @@ try_again:
want to spend too much time balancing the ip coverage.
*/
if ( (maxnum > minnum+1)
- && (retries < (num_ips + 5)) ){
+ && (*retries < (num_ips + 5)) ){
struct ctdb_public_ip_list *tmp;
/* mark one of maxnode's vnn's as unassigned and try
@@ -1464,13 +1465,402 @@ try_again:
for (tmp=all_ips;tmp;tmp=tmp->next) {
if (tmp->pnn == maxnode) {
tmp->pnn = -1;
- retries++;
- goto try_again;
+ (*retries)++;
+ return true;
}
}
}
}
+ return false;
+}
+
+/* Do necessary LCP2 initialisation. Bury it in a function here so
+ * that we can unit test it.
+ * Not static, so we can easily link it into a unit test.
+ *
+ * Allocates two arrays of nodemap->num elements as talloc children of
+ * tmp_ctx and hands them back through lcp2_imbalances and
+ * newly_healthy:
+ * - (*lcp2_imbalances)[i] is lcp2_imbalance(all_ips, i);
+ * - (*newly_healthy)[i] ends up true only when node i has none of
+ * the flags in mask set and no entry of all_ips is assigned to it.
+ * Allocation failure is left to CTDB_NO_MEMORY_FATAL().
+ */
+void lcp2_init(struct ctdb_context * tmp_ctx,
+ struct ctdb_node_map * nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t **lcp2_imbalances,
+ bool **newly_healthy)
+{
+ int i;
+ struct ctdb_public_ip_list *tmp_ip;
+
+ *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
+ CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
+ *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
+ CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
+
+ for (i=0;i<nodemap->num;i++) {
+ (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
+ /* First step: is the node "healthy"? */
+ (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
+ }
+
+ /* 2nd step: if a node has IPs assigned then it must have been
+ * healthy before, so we remove it from consideration... */
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn != -1) {
+ (*newly_healthy)[tmp_ip->pnn] = false;
+ }
+ }
+}
+
+/* Allocate any unassigned addresses using the LCP2 algorithm to find
+ * the IP/node combination that will cost the least.
+ * Not static, so we can easily link it into a unit test.
+ *
+ * Each pass of the outer loop looks at every unassigned address
+ * (pnn == -1) against every node that can serve it and has none of
+ * the flags in mask set, picks the pair with the smallest
+ * ip_distance_2_sum(), assigns it and updates lcp2_imbalances for
+ * that node. The loop ends when nothing is left unassigned or when a
+ * pass finds no candidate; in the latter case a warning is logged for
+ * each address still unassigned.
+ */
+void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t *lcp2_imbalances)
+{
+ struct ctdb_public_ip_list *tmp_ip;
+ int dstnode;
+
+ int minnode;
+ uint32_t mindsum, dstdsum, dstimbl, minimbl;
+ struct ctdb_public_ip_list *minip;
+
+ bool should_loop = true;
+ bool have_unassigned = true;
+
+ while (have_unassigned && should_loop) {
+ /* Only set again below if this pass finds a candidate. */
+ should_loop = false;
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+ DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+ minnode = -1;
+ mindsum = 0;
+ minip = NULL;
+
+ /* loop over each unassigned ip. */
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn != -1) {
+ continue;
+ }
+
+ for (dstnode=0; dstnode < nodemap->num; dstnode++) {
+ /* only check nodes that can actually serve this ip */
+ if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+ if (nodemap->nodes[dstnode].flags & mask) {
+ continue;
+ }
+
+ dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
+ dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+ DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
+ ctdb_addr_to_str(&(tmp_ip->addr)),
+ dstnode,
+ dstimbl - lcp2_imbalances[dstnode]));
+
+
+ /* The choice is made on the added cost (dstdsum),
+ * not on the resulting node imbalance (dstimbl). */
+ if ((minnode == -1) || (dstdsum < mindsum)) {
+ minnode = dstnode;
+ minimbl = dstimbl;
+ mindsum = dstdsum;
+ minip = tmp_ip;
+ should_loop = true;
+ }
+ }
+ }
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+ /* If we found one then assign it to the given node. */
+ if (minnode != -1) {
+ minip->pnn = minnode;
+ lcp2_imbalances[minnode] = minimbl;
+ DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+ ctdb_addr_to_str(&(minip->addr)),
+ minnode,
+ mindsum));
+ }
+
+ /* There might be a better way but at least this is clear. */
+ have_unassigned = false;
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn == -1) {
+ have_unassigned = true;
+ }
+ }
+ }
+
+ /* We already know whether any addresses are still unassigned,
+ * so only walk the list again when there is something to report.
+ */
+ if (have_unassigned) {
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn == -1) {
+ DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
+ ctdb_addr_to_str(&tmp_ip->addr)));
+ }
+ }
+ }
+}
+
+/* LCP2 algorithm for rebalancing the cluster. This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ *
+ * Only nodes flagged in newly_healthy are considered as destinations.
+ * When a move is found, the address's pnn and the lcp2_imbalances
+ * entries of both nodes are updated and true is returned so that the
+ * caller can run another pass. Returns false when there is no newly
+ * healthy node, when the highest imbalance is 0, or when no move
+ * improves matters.
+ *
+ * Not static, so we can easily link it into a unit test.
+ */
+bool lcp2_failback(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t *lcp2_imbalances,
+ bool *newly_healthy)
+{
+ int srcnode, dstnode, mindstnode, i, num_newly_healthy;
+ uint32_t srcimbl, srcdsum, maximbl, dstimbl, dstdsum;
+ uint32_t minsrcimbl, mindstimbl, b;
+ struct ctdb_public_ip_list *minip;
+ struct ctdb_public_ip_list *tmp_ip;
+
+ /* It is only worth continuing if we have suitable target
+ * nodes to transfer IPs to. This check is much cheaper than
+ * continuing on...
+ */
+ num_newly_healthy = 0;
+ for (i = 0; i < nodemap->num; i++) {
+ if (newly_healthy[i]) {
+ num_newly_healthy++;
+ }
+ }
+ if (num_newly_healthy == 0) {
+ return false;
+ }
+
+ /* Get the node with the highest imbalance metric. */
+ srcnode = -1;
+ maximbl = 0;
+ for (i=0; i < nodemap->num; i++) {
+ b = lcp2_imbalances[i];
+ if ((srcnode == -1) || (b > maximbl)) {
+ srcnode = i;
+ maximbl = b;
+ }
+ }
+
+ /* This means that all nodes had 0 or 1 addresses, so can't be
+ * imbalanced.
+ */
+ if (maximbl == 0) {
+ return false;
+ }
+
+ /* Find an IP and destination node that best reduces imbalance. */
+ minip = NULL;
+ minsrcimbl = 0;
+ mindstnode = -1;
+ mindstimbl = 0;
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+ DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, maximbl));
+
+ for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
+ /* Only consider addresses on srcnode. */
+ if (tmp_ip->pnn != srcnode) {
+ continue;
+ }
+
+ /* What is this IP address costing the source node? */
+ srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
+ srcimbl = maximbl - srcdsum;
+
+ /* Consider what this IP address would cost each potential
+ * destination node. Destination nodes are limited to
+ * those that are newly healthy, since we don't want
+ * to do gratuitous failover of IPs just to make minor
+ * balance improvements.
+ */
+ for (dstnode=0; dstnode < nodemap->num; dstnode++) {
+ if (! newly_healthy[dstnode]) {
+ continue;
+ }
+ /* only check nodes that can actually serve this ip */
+ if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+
+ dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
+ dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+ DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+ srcnode, srcimbl - lcp2_imbalances[srcnode],
+ ctdb_addr_to_str(&(tmp_ip->addr)),
+ dstnode, dstimbl - lcp2_imbalances[dstnode]));
+
+ /* Accept a move only if it leaves the destination
+ * below the current maximum and costs the destination
+ * less than it costs the source; among those keep the
+ * one with the lowest combined imbalance. */
+ if ((dstimbl < maximbl) && (dstdsum < srcdsum) && \
+ ((mindstnode == -1) || \
+ ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
+
+ minip = tmp_ip;
+ minsrcimbl = srcimbl;
+ mindstnode = dstnode;
+ mindstimbl = dstimbl;
+ }
+ }
+ }
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+ if (mindstnode != -1) {
+ /* We found a move that makes things better... */
+ DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
+ srcnode, minsrcimbl - lcp2_imbalances[srcnode],
+ ctdb_addr_to_str(&(minip->addr)),
+ mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
+
+
+ /* Store the source imbalance recorded for the chosen
+ * address (minsrcimbl); srcimbl only holds the value for
+ * the last address examined on srcnode. */
+ lcp2_imbalances[srcnode] = minsrcimbl;
+ lcp2_imbalances[mindstnode] = mindstimbl;
+ minip->pnn = mindstnode;
+
+ return true;
+ }
+
+ return false;
+
+}
+
+/* The calculation part of the IP allocation algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+void ctdb_takeover_run_core(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ struct ctdb_public_ip_list **all_ips_p)
+{
+ int i, num_healthy, retries, num_ips;
+ uint32_t mask;
+ struct ctdb_public_ip_list *all_ips, *tmp_ip;
+ uint32_t *lcp2_imbalances;
+ bool *newly_healthy;
+
+ TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+ /* Count how many completely healthy nodes we have */
+ num_healthy = 0;
+ for (i=0;i<nodemap->num;i++) {
+ if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
+ num_healthy++;
+ }
+ }
+
+ if (num_healthy > 0) {
+ /* We have healthy nodes, so only consider them for
+ serving public addresses
+ */
+ mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
+ } else {
+ /* We didnt have any completely healthy nodes so
+ use "disabled" nodes as a fallback
+ */
+ mask = NODE_FLAGS_INACTIVE;
+ }
+
+ /* since nodes only know about those public addresses that
+ can be served by that particular node, no single node has
+ a full list of all public addresses that exist in the cluster.
+ Walk over all node structures and create a merged list of
+ all public addresses that exist in the cluster.
+
+ keep the tree of ips around as ctdb->ip_tree
+ */
+ all_ips = create_merged_ip_list(ctdb);
+ *all_ips_p = all_ips; /* minimal code changes */
+
+ /* Count how many ips we have */
+ num_ips = 0;
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ num_ips++;
+ }
+
+ /* If we want deterministic ip allocations, i.e. that the ip addresses
+ will always be allocated the same way for a specific set of
+ available/unavailable nodes.
+ */
+ if (1 == ctdb->tunable.deterministic_public_ips) {
+ DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+ for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
+ tmp_ip->pnn = i%nodemap->num;
+ }
+ }
+
+
+ /* mark all public addresses with a masked node as being served by
+ node -1
+ */
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn == -1) {
+ continue;
+ }
+ if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
+ tmp_ip->pnn = -1;
+ }
+ }
+
+ /* verify that the assigned nodes can serve that public ip
+ and set it to -1 if not
+ */
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn == -1) {
+ continue;
+ }
+ if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
+ /* this node can not serve this ip. */
+ tmp_ip->pnn = -1;
+ }
+ }
+
+ if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+ lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
+ }
+
+ /* now we must redistribute all public addresses with takeover node
+ -1 among the nodes available
+ */
+ retries = 0;
+try_again:
+ if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+ lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
+ } else {
+ basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
+ }
+
+ /* If we dont want ips to fail back after a node becomes healthy
+ again, we wont even try to reallocat the ip addresses so that
+ they are evenly spread out.
+ This can NOT be used at the same time as DeterministicIPs !
+ */
+ if (1 == ctdb->tunable.no_ip_failback) {
+ if (1 == ctdb->tunable.deterministic_public_ips) {
+ DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
+ }
+ goto finished;
+ }
+
+
+ /* now, try to make sure the ip adresses are evenly distributed
+ across the node.
+ */
+ if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+ if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
+ goto try_again;
+ }
+ } else {
+ if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
+ goto try_again;
+ }
+ }
/* finished distributing the public addresses, now just send the
info out to the nodes
@@ -1481,6 +1871,38 @@ finished:
or -1 if there is no node that can cover this ip
*/
+ return;
+}
+
+/*
+ make any IP alias changes for public addresses that are necessary
+ */
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+{
+ int i;
+ struct ctdb_public_ip ip;
+ struct ctdb_public_ipv4 ipv4;
+ uint32_t *nodes;
+ struct ctdb_public_ip_list *all_ips, *tmp_ip;
+ TDB_DATA data;
+ struct timeval timeout;
+ struct client_async_data *async_data;
+ struct ctdb_client_control_state *state;
+ TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+ /*
+ * ip failover is completely disabled, just send out the
+ * ipreallocated event.
+ */
+ if (ctdb->tunable.disable_ip_failover != 0) {
+ goto ipreallocated;
+ }
+
+ ZERO_STRUCT(ip);
+
+ /* Do the IP reassignment calculations */
+ ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
+
/* now tell all nodes to delete any alias that they should not
have. This will be a NOOP on nodes that don't currently
hold the given alias */
diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c
index 52fc22a364..bd7834daad 100644
--- a/ctdb/server/ctdb_tunables.c
+++ b/ctdb/server/ctdb_tunables.c
@@ -46,6 +46,7 @@ static const struct {
{ "RerecoveryTimeout", 10, offsetof(struct ctdb_tunable, rerecovery_timeout) },
{ "EnableBans", 1, offsetof(struct ctdb_tunable, enable_bans) },
{ "DeterministicIPs", 1, offsetof(struct ctdb_tunable, deterministic_public_ips) },
+ { "LCP2PublicIPs", 0, offsetof(struct ctdb_tunable, lcp2_public_ip_assignment) },
{ "ReclockPingPeriod", 60, offsetof(struct ctdb_tunable, reclock_ping_period) },
{ "NoIPFailback", 0, offsetof(struct ctdb_tunable, no_ip_failback) },
{ "DisableIPFailover", 0, offsetof(struct ctdb_tunable, disable_ip_failover) },
diff --git a/ctdb/tests/src/ctdb_takeover_tests.c b/ctdb/tests/src/ctdb_takeover_tests.c
new file mode 100644
index 0000000000..5fd23320a3
--- /dev/null
+++ b/ctdb/tests/src/ctdb_takeover_tests.c
@@ -0,0 +1,378 @@
+/*
+ Tests for ctdb_takeover.c
+
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "../include/ctdb_private.h"
+
+/*
+ * Need these, since they're defined in ctdbd.c but we can't link
+ * that.
+ */
+int script_log_level;
+bool fast_start;
+void ctdb_load_nodes_file(struct ctdb_context *ctdb) {}
+
+/* Read a list of public IPs from stdin and build a linked list from
+ * them, with every element allocated as a talloc child of ctx.
+ *
+ * Format of each line is "IP pnn" - the separator has to be at least
+ * 1 space (not a tab or whatever - a space!). A line without a space
+ * is taken to be just an address and gets PNN -1.
+ *
+ * Lines whose address part is rejected by parse_ip() are logged and
+ * skipped. Returns the head of the list in input order, or NULL if
+ * no line yielded an address.
+ *
+ * NOTE(review): the talloc() results are used without a NULL check.
+ */
+static struct ctdb_public_ip_list *
+read_ctdb_public_ip_list(TALLOC_CTX *ctx)
+{
+ char line[1024];
+ ctdb_sock_addr addr;
+ char *t;
+ int pnn;
+ struct ctdb_public_ip_list *last = NULL;
+
+ struct ctdb_public_ip_list *ret = NULL;
+
+ while (fgets(line, sizeof(line), stdin) != NULL) {
+
+ if ((t = strchr(line, ' ')) != NULL) {
+ /* Make line contain just the address */
+ *t = '\0';
+ /* Point to PNN or leading whitespace... */
+ t++;
+ pnn = (int) strtol(t, (char **) NULL, 10);
+ } else {
+ /* Assume just an IP address, default to PNN -1 */
+ if ((t = strchr(line, '\n')) != NULL) {
+ *t = '\0';
+ }
+ pnn = -1;
+ }
+
+ if (parse_ip(line, NULL, 0, &addr)) {
+ /* Append a new element to the tail of the list. */
+ if (last == NULL) {
+ last = talloc(ctx, struct ctdb_public_ip_list);
+ } else {
+ last->next = talloc(ctx, struct ctdb_public_ip_list);
+ last = last->next;
+ }
+ last->next = NULL;
+ last->pnn = pnn;
+ memcpy(&(last->addr), &addr, sizeof(addr));
+ /* The first element appended becomes the head. */
+ if (ret == NULL) {
+ ret = last;
+ }
+ } else {
+ DEBUG(DEBUG_ERR, (__location__ " ERROR, bad address :%s\n", line));
+ }
+ }
+
+ return ret;
+}
+
+/* Print each element of ips to stdout as "<address> <pnn>", one per
+ * line, in list order. */
+void print_ctdb_public_ip_list(struct ctdb_public_ip_list * ips)
+{
+ while (ips) {
+ printf("%s %d\n", ctdb_addr_to_str(&(ips->addr)), ips->pnn);
+ ips = ips->next;
+ }
+}
+
+/* Read some IPs from stdin, 1 per line, parse them and then print
+ * them back out. The list lives on a temporary talloc context that
+ * is freed before returning. */
+void ctdb_test_read_ctdb_public_ip_list(void)
+{
+ struct ctdb_public_ip_list *l;
+
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+ l = read_ctdb_public_ip_list(tmp_ctx);
+
+ print_ctdb_public_ip_list(l);
+
+ talloc_free(tmp_ctx);
+}
+
+/* Read 2 IPs from stdin, calculate the IP distance and print it.
+ * Only the first two addresses read are used; nothing is printed if
+ * fewer than two were read. */
+void ctdb_test_ip_distance(void)
+{
+ struct ctdb_public_ip_list *l;
+ uint32_t distance;
+
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+ l = read_ctdb_public_ip_list(tmp_ctx);
+
+ if (l && l->next) {
+ distance = ip_distance(&(l->addr), &(l->next->addr));
+ printf ("%lu\n", (unsigned long) distance);
+ }
+
+ talloc_free(tmp_ctx);
+}
+
+/* Read some IPs from stdin, calculate the sum of the squares of the
+ * IP distances between the 1st argument and those read that are on
+ * the given node, and print it. The given IP must be one of the ones
+ * in the list. Exits with status 1 if the list is empty, the argument
+ * does not parse, or the IP is not in the list. */
+void ctdb_test_ip_distance_2_sum(const char ip[], int pnn)
+{
+ struct ctdb_public_ip_list *l;
+ struct ctdb_public_ip_list *t;
+ ctdb_sock_addr addr;
+ uint32_t distance;
+
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+
+ l = read_ctdb_public_ip_list(tmp_ctx);
+
+ if (l && parse_ip(ip, NULL, 0, &addr)) {
+ /* find the entry for the specified IP */
+ for (t=l; t!=NULL; t=t->next) {
+ if (ctdb_same_ip(&(t->addr), &addr)) {
+ break;
+ }
+ }
+
+ if (t == NULL) {
+ fprintf(stderr, "IP NOT PRESENT IN LIST");
+ exit(1);
+ }
+
+ /* Pass the list's own copy of the address, not the parsed
+ * one: ip_distance_2_sum() skips the entry whose address
+ * pointer equals its first argument. */
+ distance = ip_distance_2_sum(&(t->addr), l, pnn);
+ printf ("%lu\n", (unsigned long) distance);
+ } else {
+ fprintf(stderr, "BAD INPUT");
+ exit(1);
+ }
+
+ talloc_free(tmp_ctx);
+}
+
+/* Read some IPs (with PNNs) from stdin, calculate the LCP2 imbalance
+ * of the given node with lcp2_imbalance(), and print it. */
+void ctdb_test_lcp2_imbalance(int pnn)
+{
+ struct ctdb_public_ip_list *l;
+ uint32_t imbalance;
+
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+ l = read_ctdb_public_ip_list(tmp_ctx);
+
+ imbalance = lcp2_imbalance(l, pnn);
+ printf ("%lu\n", (unsigned long) imbalance);
+
+ talloc_free(tmp_ctx);
+}
+
+/* Build a fake cluster for the tests.
+ *
+ * nodestates has one character per node. *ctdb is a zeroed context
+ * allocated on the NULL talloc context; everything else created here
+ * is a talloc child of it, so talloc_free(*ctdb) releases the lot.
+ * *all_ips is the IP list read from stdin. Every node is given the
+ * same set of available/known public IPs, built from that list.
+ *
+ * The LCP2 tunable is only touched when CTDB_LCP2 is set in the
+ * environment ("yes" enables it, anything else disables it);
+ * otherwise it keeps the zero it got from talloc_zero().
+ */
+void ctdb_test_init(const char nodestates[],
+ struct ctdb_context **ctdb,
+ struct ctdb_public_ip_list **all_ips,
+ struct ctdb_node_map **nodemap)
+{
+ struct ctdb_public_ip_list *t;
+ struct ctdb_all_public_ips *available_public_ips;
+ int i, numips, numnodes;
+
+ numnodes = strlen(nodestates);
+
+ *ctdb = talloc_zero(NULL, struct ctdb_context);
+
+ /* Fake things up... */
+ (*ctdb)->num_nodes = numnodes;
+
+ (*ctdb)->tunable.deterministic_public_ips = 0;
+ (*ctdb)->tunable.disable_ip_failover = 0;
+ (*ctdb)->tunable.no_ip_failback = 0;
+
+ if (getenv("CTDB_LCP2")) {
+ if (strcmp(getenv("CTDB_LCP2"), "yes") == 0) {
+ (*ctdb)->tunable.lcp2_public_ip_assignment = 1;
+ } else {
+ (*ctdb)->tunable.lcp2_public_ip_assignment = 0;
+ }
+ }
+
+ *nodemap = talloc_array(*ctdb, struct ctdb_node_map, numnodes);
+ (*nodemap)->num = numnodes;
+
+ /* The pnn/flags written here are written again by the loop at
+ * the end of this function. */
+ for (i=0; i < numnodes; i++) {
+ (*nodemap)->nodes[i].pnn = i;
+ (*nodemap)->nodes[i].flags = nodestates[i] - '0';
+ /* *nodemap->nodes[i].sockaddr is uninitialised */
+ }
+
+ *all_ips = read_ctdb_public_ip_list(*ctdb);
+ numips = 0;
+ for (t = *all_ips; t != NULL; t = t->next) {
+ numips++;
+ }
+
+ /* NOTE(review): with an empty IP list this asks for a
+ * zero-element array and then writes ->num - confirm that is
+ * safe. */
+ available_public_ips = talloc_array(*ctdb, struct ctdb_all_public_ips, numips); // FIXME: bogus size, overkill
+ available_public_ips->num = numips;
+ for (t = *all_ips, i=0; t != NULL && i < numips ; t = t->next, i++) {
+ available_public_ips->ips[i].pnn = t->pnn;
+ memcpy(&(available_public_ips->ips[i].addr), &(t->addr), sizeof(t->addr));
+ }
+
+ (*ctdb)->nodes = talloc_array(*ctdb, struct ctdb_node *, numnodes); // FIXME: bogus size, overkill
+
+ /* Setup both nodemap and ctdb->nodes. A node whose nodestates
+ * character is '0' gets no flags; any other character flags
+ * it NODE_FLAGS_UNHEALTHY. */
+ for (i=0; i < numnodes; i++) {
+ uint32_t flags = nodestates[i] - '0' ? NODE_FLAGS_UNHEALTHY : 0;
+ (*nodemap)->nodes[i].pnn = i;
+ (*nodemap)->nodes[i].flags = flags;
+ /* nodemap->nodes[i].sockaddr is uninitialised */
+
+ (*ctdb)->nodes[i] = talloc(*ctdb, struct ctdb_node);
+ (*ctdb)->nodes[i]->pnn = i;
+ (*ctdb)->nodes[i]->flags = flags;
+ /* All nodes share the one IP set built above. */
+ (*ctdb)->nodes[i]->available_public_ips = available_public_ips;
+ (*ctdb)->nodes[i]->known_public_ips = available_public_ips;
+ }
+}
+
+/* IP layout is read from stdin.
+ * Sets up a fake cluster from nodestates, runs lcp2_init() followed
+ * by lcp2_allocate_unassigned() with the
+ * NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED mask, and prints the
+ * resulting layout. */
+void ctdb_test_lcp2_allocate_unassigned(const char nodestates[])
+{
+ struct ctdb_context *ctdb;
+ struct ctdb_public_ip_list *all_ips;
+ struct ctdb_node_map *nodemap;
+
+ uint32_t *lcp2_imbalances;
+ bool *newly_healthy;
+
+ ctdb_test_init(nodestates, &ctdb, &all_ips, &nodemap);
+
+ lcp2_init(ctdb, nodemap,
+ NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED,
+ all_ips, &lcp2_imbalances, &newly_healthy);
+
+ lcp2_allocate_unassigned(ctdb, nodemap,
+ NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED,
+ all_ips, lcp2_imbalances);
+
+ print_ctdb_public_ip_list(all_ips);
+
+ talloc_free(ctdb);
+}
+
+/* IP layout is read from stdin.
+ * Sets up a fake cluster from nodestates, runs lcp2_init() and then
+ * a single lcp2_failback() pass (its return value is ignored), and
+ * prints the resulting layout. */
+void ctdb_test_lcp2_failback(const char nodestates[])
+{
+ struct ctdb_context *ctdb;
+ struct ctdb_public_ip_list *all_ips;
+ struct ctdb_node_map *nodemap;
+
+ uint32_t *lcp2_imbalances;
+ bool *newly_healthy;
+
+ ctdb_test_init(nodestates, &ctdb, &all_ips, &nodemap);
+
+ lcp2_init(ctdb, nodemap,
+ NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED,
+ all_ips, &lcp2_imbalances, &newly_healthy);
+
+ lcp2_failback(ctdb, nodemap,
+ NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED,
+ all_ips, lcp2_imbalances, newly_healthy);
+
+ print_ctdb_public_ip_list(all_ips);
+
+ talloc_free(ctdb);
+}
+
+/* IP layout is read from stdin.
+ * Same as ctdb_test_lcp2_failback(), except that lcp2_failback() is
+ * called repeatedly until it returns false before the layout is
+ * printed. */
+void ctdb_test_lcp2_failback_loop(const char nodestates[])
+{
+ struct ctdb_context *ctdb;
+ struct ctdb_public_ip_list *all_ips;
+ struct ctdb_node_map *nodemap;
+
+ uint32_t *lcp2_imbalances;
+ bool *newly_healthy;
+
+ ctdb_test_init(nodestates, &ctdb, &all_ips, &nodemap);
+
+ lcp2_init(ctdb, nodemap,
+ NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED,
+ all_ips, &lcp2_imbalances, &newly_healthy);
+
+try_again:
+ if (lcp2_failback(ctdb, nodemap,
+ NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED,
+ all_ips, lcp2_imbalances, newly_healthy)) {
+ goto try_again;
+ }
+
+ print_ctdb_public_ip_list(all_ips);
+
+ talloc_free(ctdb);
+}
+
+/* IP layout is read from stdin.
+ * Sets up a fake cluster from nodestates, runs the whole calculation
+ * via ctdb_takeover_run_core() and prints the resulting layout.
+ * Note that ctdb_takeover_run_core() stores its own merged IP list
+ * through the pointer it is given, so the list printed is that one
+ * and not the list returned by ctdb_test_init(). */
+void ctdb_test_ctdb_takeover_run_core(const char nodestates[])
+{
+ struct ctdb_context *ctdb;
+ struct ctdb_public_ip_list *all_ips;
+ struct ctdb_node_map *nodemap;
+
+ ctdb_test_init(nodestates, &ctdb, &all_ips, &nodemap);
+
+ ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
+
+ print_ctdb_public_ip_list(all_ips);
+
+ talloc_free(ctdb);
+}
+
+/* Print a one-line usage message to stderr and exit with status 1. */
+void usage(void)
+{
+ fprintf(stderr, "usage: ctdb_takeover_tests <op>\n");
+ exit(1);
+}
+
+/* Dispatch on argv[1] to one of the test drivers above.
+ *
+ * The log level defaults to DEBUG_DEBUG and can be overridden with
+ * the CTDB_TEST_LOGLEVEL environment variable. Operations that take
+ * arguments are only matched when the argument count fits; anything
+ * unmatched ends in usage(), which exits with status 1.
+ */
+int main(int argc, const char *argv[])
+{
+ LogLevel = DEBUG_DEBUG;
+ if (getenv("CTDB_TEST_LOGLEVEL")) {
+ LogLevel = atoi(getenv("CTDB_TEST_LOGLEVEL"));
+ }
+
+ if (argc < 2) {
+ usage();
+ }
+
+ if (strcmp(argv[1], "ip_list") == 0) {
+ ctdb_test_read_ctdb_public_ip_list();
+ } else if (strcmp(argv[1], "ip_distance") == 0) {
+ ctdb_test_ip_distance();
+ } else if (argc == 4 && strcmp(argv[1], "ip_distance_2_sum") == 0) {
+ /* argv[2] is the IP, argv[3] the PNN. */
+ ctdb_test_ip_distance_2_sum(argv[2], atoi(argv[3]));
+ } else if (argc >= 3 && strcmp(argv[1], "lcp2_imbalance") == 0) {
+ /* argv[2] is the PNN. */
+ ctdb_test_lcp2_imbalance(atoi(argv[2]));
+ } else if (argc == 3 && strcmp(argv[1], "lcp2_allocate_unassigned") == 0) {
+ /* For this and the remaining operations argv[2] is the
+ * nodestates string. */
+ ctdb_test_lcp2_allocate_unassigned(argv[2]);
+ } else if (argc == 3 && strcmp(argv[1], "lcp2_failback") == 0) {
+ ctdb_test_lcp2_failback(argv[2]);
+ } else if (argc == 3 && strcmp(argv[1], "lcp2_failback_loop") == 0) {
+ ctdb_test_lcp2_failback_loop(argv[2]);
+ } else if (argc == 3 && strcmp(argv[1], "ctdb_takeover_run_core") == 0) {
+ ctdb_test_ctdb_takeover_run_core(argv[2]);
+ } else {
+ usage();
+ }
+
+ return 0;
+}