summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Andrew Tridgell <tridge@samba.org> 2007-06-01 19:05:41 +1000
committer Andrew Tridgell <tridge@samba.org> 2007-06-01 19:05:41 +1000
commit 7db1d04d5cdc548b65b57bdfca03e80a2d5dc5c5 (patch)
tree c7d8a148b449cfc5121f29a1fbe777f27564c300
parent 869d70d9c9b05fc97102317c88bd02fed9aba421 (diff)
make the running of the takeover and release event scripts async, to prevent outages due to slow scripts
(This used to be ctdb commit 4189be97eee7ab2a50335c860f2fcd9566667d01)
-rw-r--r-- ctdb/common/ctdb.c 2
-rw-r--r-- ctdb/common/ctdb_control.c 4
-rw-r--r-- ctdb/common/ctdb_daemon.c 5
-rw-r--r-- ctdb/include/ctdb_private.h 16
-rw-r--r-- ctdb/takeover/ctdb_takeover.c 167
-rw-r--r-- ctdb/takeover/system.c 24
6 files changed, 164 insertions, 54 deletions
diff --git a/ctdb/common/ctdb.c b/ctdb/common/ctdb.c
index d957e372f9..273d40236c 100644
--- a/ctdb/common/ctdb.c
+++ b/ctdb/common/ctdb.c
@@ -41,7 +41,7 @@ int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport)
int ctdb_set_logfile(struct ctdb_context *ctdb, const char *logfile)
{
ctdb->logfile = talloc_strdup(ctdb, logfile);
- if (ctdb->logfile != NULL) {
+ if (ctdb->logfile != NULL && strcmp(logfile, "-") != 0) {
int fd;
close(1);
close(2);
diff --git a/ctdb/common/ctdb_control.c b/ctdb/common/ctdb_control.c
index ac677ac1c3..319adfc6e1 100644
--- a/ctdb/common/ctdb_control.c
+++ b/ctdb/common/ctdb_control.c
@@ -265,11 +265,11 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
case CTDB_CONTROL_TAKEOVER_IP:
CHECK_CONTROL_DATA_SIZE(sizeof(struct sockaddr));
- return ctdb_control_takeover_ip(ctdb, indata);
+ return ctdb_control_takeover_ip(ctdb, c, indata, async_reply);
case CTDB_CONTROL_RELEASE_IP:
CHECK_CONTROL_DATA_SIZE(sizeof(struct sockaddr));
- return ctdb_control_release_ip(ctdb, indata);
+ return ctdb_control_release_ip(ctdb, c, indata, async_reply);
case CTDB_CONTROL_DELETE_LOW_RSN:
CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_delete_low_rsn));
diff --git a/ctdb/common/ctdb_daemon.c b/ctdb/common/ctdb_daemon.c
index b98981b9dc..3309d375e4 100644
--- a/ctdb/common/ctdb_daemon.c
+++ b/ctdb/common/ctdb_daemon.c
@@ -32,7 +32,7 @@
static void daemon_incoming_packet(void *, struct ctdb_req_header *);
/* called when the "startup" event script has finished */
-static void ctdb_start_transport(struct ctdb_context *ctdb, int status)
+static void ctdb_start_transport(struct ctdb_context *ctdb, int status, void *p)
{
if (status != 0) {
DEBUG(0,("startup event failed!\n"));
@@ -87,7 +87,8 @@ static void ctdb_main_loop(struct ctdb_context *ctdb)
CTDB_CTRL_FLAG_NOREPLY,
tdb_null, NULL, NULL);
- ret = ctdb_event_script_callback(ctdb, ctdb_start_transport, "startup");
+ ret = ctdb_event_script_callback(ctdb, ctdb,
+ ctdb_start_transport, NULL, "startup");
if (ret != 0) {
DEBUG(0,("Failed startup event script\n"));
return;
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index d3320ac163..f2087e8e90 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -925,10 +925,16 @@ int ctdb_ctrl_set_rsn_nonempty(struct ctdb_context *ctdb, struct timeval timeout
int ctdb_ctrl_delete_low_rsn(struct ctdb_context *ctdb, struct timeval timeout,
uint32_t destnode, uint32_t db_id, uint64_t rsn);
void ctdb_set_realtime(void);
-int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ TDB_DATA indata,
+ bool *async_reply);
int ctdb_ctrl_takeover_ip(struct ctdb_context *ctdb, struct timeval timeout,
uint32_t destnode, const char *ip);
-int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ TDB_DATA indata,
+ bool *async_reply);
int ctdb_ctrl_release_ip(struct ctdb_context *ctdb, struct timeval timeout,
uint32_t destnode, const char *ip);
@@ -951,8 +957,10 @@ int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn);
void ctdb_takeover_client_destructor_hook(struct ctdb_client *client);
int ctdb_event_script(struct ctdb_context *ctdb, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
int ctdb_event_script_callback(struct ctdb_context *ctdb,
- void (*callback)(struct ctdb_context *, int),
- const char *fmt, ...) PRINTF_ATTRIBUTE(3,4);
+ TALLOC_CTX *mem_ctx,
+ void (*callback)(struct ctdb_context *, int, void *),
+ void *private_data,
+ const char *fmt, ...) PRINTF_ATTRIBUTE(5,6);
void ctdb_release_all_ips(struct ctdb_context *ctdb);
void set_nonblocking(int fd);
diff --git a/ctdb/takeover/ctdb_takeover.c b/ctdb/takeover/ctdb_takeover.c
index d5fcfcee64..af250f570b 100644
--- a/ctdb/takeover/ctdb_takeover.c
+++ b/ctdb/takeover/ctdb_takeover.c
@@ -91,52 +91,48 @@ static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *
ctdb_control_send_arp, arp);
}
+struct takeover_callback_state {
+ struct ctdb_req_control *c;
+ struct sockaddr_in *sin;
+};
/*
- take over an ip address
+ called when takeip event finishes
*/
-int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, TDB_DATA indata)
+static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
{
- int ret;
- struct sockaddr_in *sin = (struct sockaddr_in *)indata.dptr;
+ struct takeover_callback_state *state =
+ talloc_get_type(private_data, struct takeover_callback_state);
struct ctdb_takeover_arp *arp;
- char *ip = inet_ntoa(sin->sin_addr);
+ char *ip = inet_ntoa(state->sin->sin_addr);
struct ctdb_tcp_list *tcp;
- if (ctdb_sys_have_ip(ip)) {
- return 0;
- }
-
- DEBUG(0,("Takover of IP %s/%u on interface %s\n",
- ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits,
- ctdb->takeover.interface));
- ret = ctdb_event_script(ctdb, "takeip %s %s %u",
- ctdb->takeover.interface,
- ip,
- ctdb->nodes[ctdb->vnn]->public_netmask_bits);
- if (ret != 0) {
+ if (status != 0) {
DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
ip, ctdb->takeover.interface));
- return -1;
+ ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+ talloc_free(state);
+ return;
}
if (!ctdb->takeover.last_ctx) {
ctdb->takeover.last_ctx = talloc_new(ctdb);
- CTDB_NO_MEMORY(ctdb, ctdb->takeover.last_ctx);
+ if (!ctdb->takeover.last_ctx) goto failed;
}
arp = talloc_zero(ctdb->takeover.last_ctx, struct ctdb_takeover_arp);
- CTDB_NO_MEMORY(ctdb, arp);
+ if (!arp) goto failed;
arp->ctdb = ctdb;
- arp->sin = *sin;
+ arp->sin = *state->sin;
/* add all of the known tcp connections for this IP to the
list of tcp connections to send tickle acks for */
for (tcp=ctdb->tcp_list;tcp;tcp=tcp->next) {
- if (sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) {
+ if (state->sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) {
struct ctdb_tcp_list *t2 = talloc(arp, struct ctdb_tcp_list);
- CTDB_NO_MEMORY(ctdb, t2);
+ if (t2 == NULL) goto failed;
*t2 = *tcp;
DLIST_ADD(arp->tcp_list, t2);
}
@@ -145,42 +141,78 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, TDB_DATA indata)
event_add_timed(arp->ctdb->ev, arp->ctdb->takeover.last_ctx,
timeval_zero(), ctdb_control_send_arp, arp);
- return ret;
+ /* the control succeeded */
+ ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+ talloc_free(state);
+ return;
+
+failed:
+ ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+ talloc_free(state);
+ return;
}
/*
- release an ip address
+ take over an ip address
*/
-int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata)
+int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ TDB_DATA indata,
+ bool *async_reply)
{
+ int ret;
+ struct takeover_callback_state *state;
struct sockaddr_in *sin = (struct sockaddr_in *)indata.dptr;
- TDB_DATA data;
char *ip = inet_ntoa(sin->sin_addr);
- int ret;
- struct ctdb_tcp_list *tcp;
- if (!ctdb_sys_have_ip(ip)) {
+ /* if our kernel already has this IP, do nothing */
+ if (ctdb_sys_have_ip(ip)) {
return 0;
}
- DEBUG(0,("Release of IP %s/%u on interface %s\n",
+ state = talloc(ctdb, struct takeover_callback_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ state->c = talloc_steal(ctdb, c);
+ state->sin = talloc(ctdb, struct sockaddr_in);
+ CTDB_NO_MEMORY(ctdb, state->sin);
+ *state->sin = *(struct sockaddr_in *)indata.dptr;
+
+ DEBUG(0,("Takover of IP %s/%u on interface %s\n",
ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits,
ctdb->takeover.interface));
- /* stop any previous arps */
- talloc_free(ctdb->takeover.last_ctx);
- ctdb->takeover.last_ctx = NULL;
-
- ret = ctdb_event_script(ctdb, "releaseip %s %s %u",
- ctdb->takeover.interface,
- ip,
- ctdb->nodes[ctdb->vnn]->public_netmask_bits);
+ ret = ctdb_event_script_callback(ctdb, state, takeover_ip_callback, state,
+ "takeip %s %s %u",
+ ctdb->takeover.interface,
+ ip,
+ ctdb->nodes[ctdb->vnn]->public_netmask_bits);
if (ret != 0) {
- DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
+ DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
ip, ctdb->takeover.interface));
+ talloc_free(state);
return -1;
}
+ /* tell ctdb_control.c that we will be replying asynchronously */
+ *async_reply = true;
+
+ return 0;
+}
+
+
+/*
+ called when releaseip event finishes
+ */
+static void release_ip_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
+{
+ struct takeover_callback_state *state =
+ talloc_get_type(private_data, struct takeover_callback_state);
+ char *ip = inet_ntoa(state->sin->sin_addr);
+ TDB_DATA data;
+ struct ctdb_tcp_list *tcp;
+
/* send a message to all clients of this node telling them
that the cluster has been reconfigured and they should
release any sockets on this IP */
@@ -192,7 +224,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata)
/* tell other nodes about any tcp connections we were holding with this IP */
for (tcp=ctdb->tcp_list;tcp;tcp=tcp->next) {
if (tcp->vnn == ctdb->vnn &&
- sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) {
+ state->sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) {
struct ctdb_control_tcp_vnn t;
t.vnn = ctdb->vnn;
@@ -208,6 +240,59 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata)
}
}
+ /* the control succeeded */
+ ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+ talloc_free(state);
+}
+
+
+/*
+ release an ip address
+ */
+int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ TDB_DATA indata,
+ bool *async_reply)
+{
+ int ret;
+ struct takeover_callback_state *state;
+ struct sockaddr_in *sin = (struct sockaddr_in *)indata.dptr;
+ char *ip = inet_ntoa(sin->sin_addr);
+
+ if (!ctdb_sys_have_ip(ip)) {
+ return 0;
+ }
+
+ DEBUG(0,("Release of IP %s/%u on interface %s\n",
+ ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits,
+ ctdb->takeover.interface));
+
+ /* stop any previous arps */
+ talloc_free(ctdb->takeover.last_ctx);
+ ctdb->takeover.last_ctx = NULL;
+
+ state = talloc(ctdb, struct takeover_callback_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ state->c = talloc_steal(state, c);
+ state->sin = talloc(state, struct sockaddr_in);
+ CTDB_NO_MEMORY(ctdb, state->sin);
+ *state->sin = *(struct sockaddr_in *)indata.dptr;
+
+ ret = ctdb_event_script_callback(ctdb, state, release_ip_callback, state,
+ "releaseip %s %s %u",
+ ctdb->takeover.interface,
+ ip,
+ ctdb->nodes[ctdb->vnn]->public_netmask_bits);
+ if (ret != 0) {
+ DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
+ ip, ctdb->takeover.interface));
+ talloc_free(state);
+ return -1;
+ }
+
+ /* tell the control that we will be replying asynchronously */
+ *async_reply = true;
return 0;
}
diff --git a/ctdb/takeover/system.c b/ctdb/takeover/system.c
index 59016e2c37..cff122f35b 100644
--- a/ctdb/takeover/system.c
+++ b/ctdb/takeover/system.c
@@ -312,8 +312,9 @@ int ctdb_event_script(struct ctdb_context *ctdb, const char *fmt, ...)
struct ctdb_event_script_state {
struct ctdb_context *ctdb;
pid_t child;
- void (*callback)(struct ctdb_context *, int);
+ void (*callback)(struct ctdb_context *, int, void *);
int fd[2];
+ void *private_data;
};
/* called when child is finished */
@@ -327,28 +328,41 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event
if (status != -1) {
status = WEXITSTATUS(status);
}
- state->callback(state->ctdb, status);
+ state->callback(state->ctdb, status, state->private_data);
+ talloc_set_destructor(state, NULL);
talloc_free(state);
}
+/*
+ destroy a running event script
+ */
+static int event_script_destructor(struct ctdb_event_script_state *state)
+{
+ kill(state->child, SIGKILL);
+ waitpid(state->child, NULL, 0);
+ return 0;
+}
/*
run the event script in the background, calling the callback when
finished
*/
int ctdb_event_script_callback(struct ctdb_context *ctdb,
- void (*callback)(struct ctdb_context *, int),
+ TALLOC_CTX *mem_ctx,
+ void (*callback)(struct ctdb_context *, int, void *),
+ void *private_data,
const char *fmt, ...)
{
struct ctdb_event_script_state *state;
va_list ap;
int ret;
- state = talloc(ctdb, struct ctdb_event_script_state);
+ state = talloc(mem_ctx, struct ctdb_event_script_state);
CTDB_NO_MEMORY(ctdb, state);
state->ctdb = ctdb;
state->callback = callback;
+ state->private_data = private_data;
ret = pipe(state->fd);
if (ret != 0) {
@@ -373,6 +387,8 @@ int ctdb_event_script_callback(struct ctdb_context *ctdb,
_exit(ret);
}
+ talloc_set_destructor(state, event_script_destructor);
+
close(state->fd[1]);
event_add_fd(ctdb->ev, state, state->fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,