diff options
-rw-r--r-- | ctdb/.bzrignore | 1 | ||||
-rw-r--r-- | ctdb/Makefile.in | 9 | ||||
-rw-r--r-- | ctdb/common/ctdb.c | 194 | ||||
-rw-r--r-- | ctdb/common/ctdb_call.c | 133 | ||||
-rw-r--r-- | ctdb/common/ctdb_ltdb.c (renamed from ctdb/ctdb_daemon.c) | 24 | ||||
-rw-r--r-- | ctdb/common/ctdb_util.c | 91 | ||||
-rw-r--r-- | ctdb/common/util.c (renamed from ctdb/util.c) | 0 | ||||
-rw-r--r-- | ctdb/ctdb_tcp.c | 373 | ||||
-rw-r--r-- | ctdb/ctdb_tcp_child.c | 70 | ||||
-rw-r--r-- | ctdb/ctdb_test.c | 9 | ||||
-rw-r--r-- | ctdb/include/ctdb.h (renamed from ctdb/ctdb.h) | 6 | ||||
-rw-r--r-- | ctdb/include/ctdb_private.h | 147 | ||||
-rw-r--r-- | ctdb/include/idtree.h | 7 | ||||
-rw-r--r-- | ctdb/include/includes.h (renamed from ctdb/includes.h) | 1 | ||||
-rw-r--r-- | ctdb/lib/util/idtree.c | 374 | ||||
-rw-r--r-- | ctdb/tcp/ctdb_tcp.h (renamed from ctdb/ctdb_private.h) | 62 | ||||
-rw-r--r-- | ctdb/tcp/tcp_connect.c | 187 | ||||
-rw-r--r-- | ctdb/tcp/tcp_init.c | 86 | ||||
-rw-r--r-- | ctdb/tcp/tcp_io.c | 194 | ||||
-rwxr-xr-x | ctdb/tests/test.sh | 9 |
20 files changed, 1493 insertions, 484 deletions
diff --git a/ctdb/.bzrignore b/ctdb/.bzrignore index 71bdda496b..26fa17de5c 100644 --- a/ctdb/.bzrignore +++ b/ctdb/.bzrignore @@ -5,3 +5,4 @@ common config.log push.sh ctdb_test +config.cache diff --git a/ctdb/Makefile.in b/ctdb/Makefile.in index 5ad758af55..4bb29c7440 100644 --- a/ctdb/Makefile.in +++ b/ctdb/Makefile.in @@ -18,9 +18,14 @@ CFLAGS=-g -I$(srcdir)/include -Iinclude -I$(srcdir) \ LIB_FLAGS=@LDFLAGS@ -Llib @LIBS@ -lpopt -EVENTS_OBJ = lib/events/events.o lib/events/events_standard.o +EVENTS_OBJ = lib/events/events.o lib/events/events_standard.o -CTDB_OBJ = ctdb_tcp_child.o ctdb_tcp.o util.o +CTDB_COMMON_OBJ = common/ctdb.o common/util.o common/ctdb_util.o \ + common/ctdb_call.o common/ctdb_ltdb.o lib/util/idtree.o + +CTDB_TCP_OBJ = tcp/tcp_connect.o tcp/tcp_io.o tcp/tcp_init.o + +CTDB_OBJ = $(CTDB_COMMON_OBJ) $(CTDB_TCP_OBJ) OBJS = @TDBOBJ@ @TALLOCOBJ@ @LIBREPLACEOBJ@ $(EXTRA_OBJ) $(EVENTS_OBJ) $(CTDB_OBJ) diff --git a/ctdb/common/ctdb.c b/ctdb/common/ctdb.c new file mode 100644 index 0000000000..6ab4b16702 --- /dev/null +++ b/ctdb/common/ctdb.c @@ -0,0 +1,194 @@ +/* + ctdb main protocol code + + Copyright (C) Andrew Tridgell 2006 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "system/network.h" +#include "system/filesys.h" +#include "ctdb_private.h" + +/* + choose the transport we will use +*/ +int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport) +{ + int ctdb_tcp_init(struct ctdb_context *ctdb); + + if (strcmp(transport, "tcp") == 0) { + return ctdb_tcp_init(ctdb); + } + ctdb_set_error(ctdb, "Unknown transport '%s'\n", transport); + return -1; +} + + +/* + add a node to the list of active nodes +*/ +static int ctdb_add_node(struct ctdb_context *ctdb, char *nstr) +{ + struct ctdb_node *node, **nodep; + + nodep = talloc_realloc(ctdb, ctdb->nodes, struct ctdb_node *, ctdb->num_nodes+1); + CTDB_NO_MEMORY(ctdb, nodep); + + ctdb->nodes = nodep; + nodep = &ctdb->nodes[ctdb->num_nodes]; + (*nodep) = talloc_zero(ctdb->nodes, struct ctdb_node); + CTDB_NO_MEMORY(ctdb, *nodep); + node = *nodep; + + if (ctdb_parse_address(ctdb, node, nstr, &node->address) != 0) { + return -1; + } + node->ctdb = ctdb; + node->name = talloc_asprintf(node, "%s:%u", + node->address.address, + node->address.port); + /* for now we just set the vnn to the line in the file - this + will change! */ + node->vnn = ctdb->num_nodes; + + if (ctdb->methods->add_node(node) != 0) { + talloc_free(node); + return -1; + } + + if (ctdb_same_address(&ctdb->address, &node->address)) { + ctdb->vnn = node->vnn; + } + + ctdb->num_nodes++; + + return 0; +} + +/* + setup the node list from a file +*/ +int ctdb_set_nlist(struct ctdb_context *ctdb, const char *nlist) +{ + char **lines; + int nlines; + int i; + + lines = file_lines_load(nlist, &nlines, ctdb); + if (lines == NULL) { + ctdb_set_error(ctdb, "Failed to load nlist '%s'\n", nlist); + return -1; + } + + for (i=0;i<nlines;i++) { + if (ctdb_add_node(ctdb, lines[i]) != 0) { + talloc_free(lines); + return -1; + } + } + + talloc_free(lines); + return 0; +} + +/* + setup the local node address +*/ +int ctdb_set_address(struct ctdb_context *ctdb, const char *address) +{ + if (ctdb_parse_address(ctdb, ctdb, address, &ctdb->address) != 0) { + return -1; + } + + ctdb->name = talloc_asprintf(ctdb, "%s:%u", + ctdb->address.address, + ctdb->address.port); + return 0; +} + +/* + add a node to the list of active nodes +*/ +int ctdb_set_call(struct ctdb_context *ctdb, ctdb_fn_t fn, int id) +{ + struct ctdb_registered_call *call; + + call = talloc(ctdb, struct ctdb_registered_call); + call->fn = fn; + call->id = id; + + DLIST_ADD(ctdb->calls, call); + return 0; +} + +/* + start the protocol going +*/ +int ctdb_start(struct ctdb_context *ctdb) +{ + return ctdb->methods->start(ctdb); +} + +/* + called by the transport layer when a packet comes in +*/ +static void ctdb_recv_pkt(struct ctdb_context *ctdb, uint8_t *data, uint32_t length) +{ + printf("received pkt of length %d\n", length); +} + +/* + called by the transport layer when a node is dead +*/ +static void ctdb_node_dead(struct ctdb_node *node) +{ + printf("%s: node %s is dead\n", node->ctdb->name, node->name); +} + +/* + called by the transport layer when a node is dead +*/ +static void ctdb_node_connected(struct ctdb_node *node) +{ + printf("%s: connected to %s\n", node->ctdb->name, node->name); +} + +static const struct ctdb_upcalls ctdb_upcalls = { + .recv_pkt = ctdb_recv_pkt, + .node_dead = ctdb_node_dead, + .node_connected = ctdb_node_connected +}; + +/* + initialise the ctdb daemon. + + NOTE: In current code the daemon does not fork. This is for testing purposes only + and to simplify the code. +*/ +struct ctdb_context *ctdb_init(struct event_context *ev) +{ + struct ctdb_context *ctdb; + + ctdb = talloc_zero(ev, struct ctdb_context); + ctdb->ev = ev; + ctdb->upcalls = &ctdb_upcalls; + ctdb->idr = idr_init(ctdb); + + return ctdb; +} + diff --git a/ctdb/common/ctdb_call.c b/ctdb/common/ctdb_call.c new file mode 100644 index 0000000000..f16c5705f4 --- /dev/null +++ b/ctdb/common/ctdb_call.c @@ -0,0 +1,133 @@ +/* + ctdb_call protocol code + + Copyright (C) Andrew Tridgell 2006 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "system/network.h" +#include "system/filesys.h" +#include "ctdb_private.h" + + +/* + local version of ctdb_call +*/ +static int ctdb_call_local(struct ctdb_context *ctdb, TDB_DATA key, int call_id, + TDB_DATA *call_data, TDB_DATA *reply_data) +{ + struct ctdb_call *c; + struct ctdb_registered_call *fn; + TDB_DATA data; + + c = talloc(ctdb, struct ctdb_call); + CTDB_NO_MEMORY(ctdb, c); + + data = tdb_fetch(ctdb->ltdb, key); + + c->key = key; + c->call_data = call_data; + c->record_data.dptr = talloc_memdup(c, data.dptr, data.dsize); + CTDB_NO_MEMORY(ctdb, c->record_data.dptr); + if (data.dptr) free(data.dptr); + c->new_data = NULL; + c->reply_data = NULL; + + for (fn=ctdb->calls;fn;fn=fn->next) { + if (fn->id == call_id) break; + } + if (fn == NULL) { + ctdb_set_error(ctdb, "Unknown call id %u\n", call_id); + return -1; + } + + if (fn->fn(c) != 0) { + free(c->record_data.dptr); + ctdb_set_error(ctdb, "ctdb_call %u failed\n", call_id); + return -1; + } + + if (c->new_data) { + if (tdb_store(ctdb->ltdb, key, *c->new_data, TDB_REPLACE) != 0) { + ctdb_set_error(ctdb, "ctdb_call tdb_store failed\n"); + return -1; + } + } + + if (reply_data) { + if (c->reply_data) { + *reply_data = *c->reply_data; + talloc_steal(ctdb, reply_data->dptr); + } else { + reply_data->dptr = NULL; + reply_data->dsize = 0; + } + } + + talloc_free(c); + + return -1; +} + +/* + make a remote ctdb call +*/ +int ctdb_call(struct ctdb_context *ctdb, TDB_DATA key, int call_id, + TDB_DATA *call_data, TDB_DATA *reply_data) +{ + uint32_t dest; + struct ctdb_req_call *c; + uint32_t len; + struct ctdb_node *node; + + dest = ctdb_hash(&key) % ctdb->num_nodes; + if (dest == ctdb->vnn) { + return ctdb_call_local(ctdb, key, call_id, call_data, reply_data); + } + + len = sizeof(*c) + key.dsize + (call_data?call_data->dsize:0); + c = talloc_size(ctdb, len); + CTDB_NO_MEMORY(ctdb, c); + + c->hdr.operation = CTDB_OP_CALL; + c->hdr.destnode = dest; + c->hdr.srcnode = ctdb->vnn; + /* this limits us to 16k outstanding messages - not unreasonable */ + c->hdr.reqid = idr_get_new(ctdb->idr, c, 0xFFFF); + c->callid = call_id; + c->keylen = key.dsize; + c->calldatalen = call_data?call_data->dsize:0; + memcpy(&c->data[0], key.dptr, key.dsize); + if (call_data) { + memcpy(&c->data[key.dsize], call_data->dptr, call_data->dsize); + } + + node = ctdb->nodes[dest]; + + if (ctdb->methods->queue_pkt(node, (uint8_t *)c, len) != 0) { + talloc_free(c); + return -1; + } + + /* + event_add_timed(ctdb->ev, c, timeval_current_ofs(CTDB_REQ_TIMEOUT, 0), + ctdb_call_timeout, c); + */ + return -1; +} + diff --git a/ctdb/ctdb_daemon.c b/ctdb/common/ctdb_ltdb.c index 5a8db674d5..7238a436b4 100644 --- a/ctdb/ctdb_daemon.c +++ b/ctdb/common/ctdb_ltdb.c @@ -1,5 +1,5 @@ /* - ctdb database library + ctdb ltdb code Copyright (C) Andrew Tridgell 2006 @@ -18,3 +18,25 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include "includes.h" +#include "lib/events/events.h" +#include "system/network.h" +#include "system/filesys.h" +#include "ctdb_private.h" + +/* + attach to a specific database +*/ +int ctdb_attach(struct ctdb_context *ctdb, const char *name, int tdb_flags, + int open_flags, mode_t mode) +{ + /* when we have a separate daemon this will need to be a real + file, not a TDB_INTERNAL, so the parent can access it to + for ltdb bypass */ + ctdb->ltdb = tdb_open(name, 0, TDB_INTERNAL, 0, 0); + if (ctdb->ltdb == NULL) { + ctdb_set_error(ctdb, "Failed to open tdb %s\n", name); + return -1; + } + return 0; +} diff --git a/ctdb/common/ctdb_util.c b/ctdb/common/ctdb_util.c new file mode 100644 index 0000000000..64053a5d99 --- /dev/null +++ b/ctdb/common/ctdb_util.c @@ -0,0 +1,91 @@ +/* + ctdb utility code + + Copyright (C) Andrew Tridgell 2006 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "system/network.h" +#include "system/filesys.h" +#include "ctdb_private.h" + +/* + return error string for last error +*/ +const char *ctdb_errstr(struct ctdb_context *ctdb) +{ + return ctdb->err_msg; +} + + +/* + remember an error message +*/ +void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...) +{ + va_list ap; + talloc_free(ctdb->err_msg); + va_start(ap, fmt); + ctdb->err_msg = talloc_vasprintf(ctdb, fmt, ap); + va_end(ap); +} + + +/* + parse a IP:port pair +*/ +int ctdb_parse_address(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, const char *str, + struct ctdb_address *address) +{ + char *p; + p = strchr(str, ':'); + if (p == NULL) { + ctdb_set_error(ctdb, "Badly formed node '%s'\n", str); + return -1; + } + + address->address = talloc_strndup(mem_ctx, str, p-str); + address->port = strtoul(p+1, NULL, 0); + return 0; +} + + +/* + check if two addresses are the same +*/ +bool ctdb_same_address(struct ctdb_address *a1, struct ctdb_address *a2) +{ + return strcmp(a1->address, a2->address) == 0 && a1->port == a2->port; +} + + +/* + hash function for mapping data to a VNN - taken from tdb +*/ +uint32_t ctdb_hash(TDB_DATA *key) +{ + uint32_t value; /* Used to compute the hash value. */ + uint32_t i; /* Used to cycle through random values. */ + + /* Set the initial value from the key size. */ + for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++) + value = (value + (key->dptr[i] << (i*5 % 24))); + + return (1103515243 * value + 12345); +} diff --git a/ctdb/util.c b/ctdb/common/util.c index a44c7d0ad0..a44c7d0ad0 100644 --- a/ctdb/util.c +++ b/ctdb/common/util.c diff --git a/ctdb/ctdb_tcp.c b/ctdb/ctdb_tcp.c deleted file mode 100644 index 6b423abf35..0000000000 --- a/ctdb/ctdb_tcp.c +++ /dev/null @@ -1,373 +0,0 @@ -/* - ctdb over TCP - - Copyright (C) Andrew Tridgell 2006 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ - -#include "includes.h" -#include "lib/events/events.h" -#include "system/network.h" -#include "system/filesys.h" -#include "ctdb_private.h" - -/* - initialise the ctdb daemon. - - if the ctdb dispatcher daemon has already been started then this - does nothing. Otherwise it forks the ctdb dispatcher daemon and - starts the daemons connecting to each other - - NOTE: In current code the daemon does not fork. This is for testing purposes only - and to simplify the code. -*/ - -struct ctdb_context *ctdb_init(struct event_context *ev) -{ - struct ctdb_context *ctdb; - - ctdb = talloc_zero(ev, struct ctdb_context); - ctdb->ev = ev; - - return ctdb; -} - -const char *ctdb_errstr(struct ctdb_context *ctdb) -{ - return ctdb->err_msg; -} - -/* - remember an error message -*/ -static void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...) -{ - va_list ap; - talloc_free(ctdb->err_msg); - va_start(ap, fmt); - ctdb->err_msg = talloc_vasprintf(ctdb, fmt, ap); - va_end(ap); -} - -/* - called when socket becomes readable -*/ -static void ctdb_node_read(struct event_context *ev, struct fd_event *fde, - uint16_t flags, void *private) -{ - struct ctdb_node *node = talloc_get_type(private, struct ctdb_node); - printf("connection to node %s:%u is readable\n", - node->address.address, node->address.port); - event_set_fd_flags(fde, 0); -} - -static void ctdb_node_connect(struct event_context *ev, struct timed_event *te, - struct timeval t, void *private); - -/* - called when socket becomes writeable on connect -*/ -static void ctdb_node_connect_write(struct event_context *ev, struct fd_event *fde, - uint16_t flags, void *private) -{ - struct ctdb_node *node = talloc_get_type(private, struct ctdb_node); - struct ctdb_context *ctdb = node->ctdb; - int error; - socklen_t len; - - if (getsockopt(node->fd, SOL_SOCKET, SO_ERROR, &error, &len) != 0 || - error != 0) { - if (error == EINPROGRESS) { - printf("connect in progress\n"); - return; - } - printf("getsockopt errno=%s\n", strerror(errno)); - talloc_free(fde); - close(node->fd); - node->fd = -1; - event_add_timed(ctdb->ev, node, timeval_current_ofs(1, 0), - ctdb_node_connect, node); - return; - } - - printf("Established connection to %s:%u\n", - node->address.address, node->address.port); - talloc_free(fde); - event_add_fd(node->ctdb->ev, node, node->fd, EVENT_FD_READ, - ctdb_node_read, node); -} - -/* - called when we should try and establish a tcp connection to a node -*/ -static void ctdb_node_connect(struct event_context *ev, struct timed_event *te, - struct timeval t, void *private) -{ - struct ctdb_node *node = talloc_get_type(private, struct ctdb_node); - struct ctdb_context *ctdb = node->ctdb; - unsigned v; - struct sockaddr_in sock_out; - - node->fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); - - v = fcntl(node->fd, F_GETFL, 0); - fcntl(node->fd, F_SETFL, v | O_NONBLOCK); - - inet_pton(AF_INET, node->address.address, &sock_out.sin_addr); - sock_out.sin_port = htons(node->address.port); - sock_out.sin_family = PF_INET; - - if (connect(node->fd, &sock_out, sizeof(sock_out)) != 0 && - errno != EINPROGRESS) { - /* try again once a second */ - close(node->fd); - event_add_timed(ctdb->ev, node, timeval_current_ofs(1, 0), - ctdb_node_connect, node); - return; - } - - /* non-blocking connect - wait for write event */ - event_add_fd(node->ctdb->ev, node, node->fd, EVENT_FD_WRITE, - ctdb_node_connect_write, node); -} - -/* - parse a IP:port pair -*/ -static int ctdb_parse_address(struct ctdb_context *ctdb, - TALLOC_CTX *mem_ctx, const char *str, - struct ctdb_address *address) -{ - char *p; - p = strchr(str, ':'); - if (p == NULL) { - ctdb_set_error(ctdb, "Badly formed node '%s'\n", str); - return -1; - } - - address->address = talloc_strndup(mem_ctx, str, p-str); - address->port = strtoul(p+1, NULL, 0); - return 0; -} - - -/* - add a node to the list of active nodes -*/ -static int ctdb_add_node(struct ctdb_context *ctdb, char *nstr) -{ - struct ctdb_node *node; - - node = talloc(ctdb, struct ctdb_node); - if (ctdb_parse_address(ctdb, node, nstr, &node->address) != 0) { - return -1; - } - node->fd = -1; - node->ctdb = ctdb; - - DLIST_ADD(ctdb->nodes, node); - return 0; -} - -/* - setup the node list from a file -*/ -int ctdb_set_nlist(struct ctdb_context *ctdb, const char *nlist) -{ - char **lines; - int nlines; - int i; - - lines = file_lines_load(nlist, &nlines, ctdb); - if (lines == NULL) { - ctdb_set_error(ctdb, "Failed to load nlist '%s'\n", nlist); - return -1; - } - - for (i=0;i<nlines;i++) { - if (ctdb_add_node(ctdb, lines[i]) != 0) { - talloc_free(lines); - return -1; - } - } - - talloc_free(lines); - return 0; -} - -/* - setup the node list from a file -*/ -int ctdb_set_address(struct ctdb_context *ctdb, const char *address) -{ - return ctdb_parse_address(ctdb, ctdb, address, &ctdb->address); -} - -/* - add a node to the list of active nodes -*/ -int ctdb_set_call(struct ctdb_context *ctdb, ctdb_fn_t fn, int id) -{ - struct ctdb_registered_call *call; - - call = talloc(ctdb, struct ctdb_registered_call); - call->fn = fn; - call->id = id; - - DLIST_ADD(ctdb->calls, call); - return 0; -} - -/* - attach to a specific database -*/ -int ctdb_attach(struct ctdb_context *ctdb, const char *name, int tdb_flags, - int open_flags, mode_t mode) -{ - ctdb->ltdb = tdb_open(name, 0, TDB_INTERNAL, 0, 0); - if (ctdb->ltdb == NULL) { - ctdb_set_error(ctdb, "Failed to open tdb %s\n", name); - return -1; - } - return 0; -} - -/* - called when an incoming connection is readable -*/ -static void ctdb_incoming_read(struct event_context *ev, struct fd_event *fde, - uint16_t flags, void *private) -{ - struct ctdb_incoming *in = talloc_get_type(private, struct ctdb_incoming); - char c; - printf("Incoming data\n"); - if (read(in->fd, &c, 1) <= 0) { - /* socket is dead */ - close(in->fd); - talloc_free(in); - } -} - - -/* - called when we get contacted by another node - currently makes no attempt to check if the connection is really from a ctdb - node in our cluster -*/ -static void ctdb_listen_event(struct event_context *ev, struct fd_event *fde, - uint16_t flags, void *private) -{ - struct ctdb_context *ctdb; - struct sockaddr_in addr; - socklen_t len; - int fd; - struct ctdb_incoming *in; - - ctdb = talloc_get_type(private, struct ctdb_context); - memset(&addr, 0, sizeof(addr)); - len = sizeof(addr); - fd = accept(ctdb->listen_fd, (struct sockaddr *)&addr, &len); - if (fd == -1) return; - - in = talloc(ctdb, struct ctdb_incoming); - in->fd = fd; - in->ctdb = ctdb; - - event_add_fd(ctdb->ev, in, in->fd, EVENT_FD_READ, - ctdb_incoming_read, in); - - printf("New incoming socket %d\n", in->fd); -} - - -/* - listen on our own address -*/ -static int ctdb_listen(struct ctdb_context *ctdb) -{ - struct sockaddr_in sock; - int one = 1; - - sock.sin_port = htons(ctdb->address.port); - sock.sin_family = PF_INET; - inet_pton(AF_INET, ctdb->address.address, &sock.sin_addr); - - ctdb->listen_fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); - if (ctdb->listen_fd == -1) { - ctdb_set_error(ctdb, "socket failed\n"); - return -1; - } - - setsockopt(ctdb->listen_fd,SOL_SOCKET,SO_REUSEADDR,(char *)&one,sizeof(one)); - - if (bind(ctdb->listen_fd, (struct sockaddr * )&sock, sizeof(sock)) != 0) { - ctdb_set_error(ctdb, "bind failed\n"); - close(ctdb->listen_fd); - ctdb->listen_fd = -1; - return -1; - } - - if (listen(ctdb->listen_fd, 10) == -1) { - ctdb_set_error(ctdb, "listen failed\n"); - close(ctdb->listen_fd); - ctdb->listen_fd = -1; - return -1; - } - - event_add_fd(ctdb->ev, ctdb, ctdb->listen_fd, EVENT_FD_READ, - ctdb_listen_event, ctdb); - - return 0; -} - -/* - check if two addresses are the same -*/ -static bool ctdb_same_address(struct ctdb_address *a1, struct ctdb_address *a2) -{ - return strcmp(a1->address, a2->address) == 0 && a1->port == a2->port; -} - -/* - start the protocol going -*/ -int ctdb_start(struct ctdb_context *ctdb) -{ - struct ctdb_node *node; - - /* listen on our own address */ - if (ctdb_listen(ctdb) != 0) return -1; - - /* startup connections to the other servers - will happen on - next event loop */ - for (node=ctdb->nodes;node;node=node->next) { - if (ctdb_same_address(&ctdb->address, &node->address)) continue; - event_add_timed(ctdb->ev, node, timeval_zero(), - ctdb_node_connect, node); - } - - return 0; -} - -/* - make a remote ctdb call -*/ -int ctdb_call(struct ctdb_context *ctdb, TDB_DATA key, int call_id, - TDB_DATA *call_data, TDB_DATA *reply_data) -{ - printf("ctdb_call not implemented\n"); - return -1; -} diff --git a/ctdb/ctdb_tcp_child.c b/ctdb/ctdb_tcp_child.c deleted file mode 100644 index 27d9282664..0000000000 --- a/ctdb/ctdb_tcp_child.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - ctdb over TCP - - Copyright (C) Andrew Tridgell 2006 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ - -#include "includes.h" -#include "system/network.h" -#include "system/filesys.h" -#include "ctdb_private.h" - -struct ctdb_child_state { - int sock; - struct event_context *ev; -}; - - -/* - create a unix domain socket and bind it - return a file descriptor open on the socket -*/ -static int ux_socket_bind(const char *name) -{ - int fd; - struct sockaddr_un addr; - - /* get rid of any old socket */ - unlink(name); - - fd = socket(AF_UNIX, SOCK_DGRAM, 0); - if (fd == -1) return -1; - - memset(&addr, 0, sizeof(addr)); - addr.sun_family = AF_UNIX; - strncpy(addr.sun_path, name, sizeof(addr.sun_path)); - - if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { - close(fd); - return -1; - } - - return fd; -} - -/* - start the ctdb tcp child daemon -*/ -int ctdb_tcp_child(void) -{ - struct ctdb_child_state *state; - - state = talloc(NULL, struct ctdb_child_state); - state->sock = ux_socket_bind(CTDB_SOCKET); - - return 0; -} diff --git a/ctdb/ctdb_test.c b/ctdb/ctdb_test.c index bb92c02b41..452fa558b1 100644 --- a/ctdb/ctdb_test.c +++ b/ctdb/ctdb_test.c @@ -73,12 +73,14 @@ int main(int argc, const char *argv[]) { struct ctdb_context *ctdb; const char *nlist = NULL; + const char *transport = "tcp"; const char *myaddress = NULL; struct poptOption popt_options[] = { POPT_AUTOHELP { "nlist", 0, POPT_ARG_STRING, &nlist, 0, "node list file", "filename" }, { "listen", 0, POPT_ARG_STRING, &myaddress, 0, "address to listen on", "address" }, + { "transport", 0, POPT_ARG_STRING, &transport, 0, "protocol transport", NULL }, POPT_TABLEEND }; int opt; @@ -121,6 +123,12 @@ int main(int argc, const char *argv[]) exit(1); } + ret = ctdb_set_transport(ctdb, transport); + if (ret == -1) { + printf("ctdb_set_transport failed - %s\n", ctdb_errstr(ctdb)); + exit(1); + } + /* tell ctdb what address to listen on */ ret = ctdb_set_address(ctdb, myaddress); if (ret == -1) { @@ -182,6 +190,7 @@ int main(int argc, const char *argv[]) for (i=0;i<data.dsize/sizeof(int);i++) { printf("%3d\n", ((int *)data.dptr)[i]); } + talloc_free(data.dptr); /* shut it down */ talloc_free(ctdb); diff --git a/ctdb/ctdb.h b/ctdb/include/ctdb.h index 71f6ea74a7..c6bc043266 100644 --- a/ctdb/ctdb.h +++ b/ctdb/include/ctdb.h @@ -41,6 +41,11 @@ struct event_context; struct ctdb_context *ctdb_init(struct event_context *ev); /* + choose the transport +*/ +int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport); + +/* tell ctdb what address to listen on, in transport specific format */ int ctdb_set_address(struct ctdb_context *ctdb, const char *address); @@ -82,3 +87,4 @@ int ctdb_attach(struct ctdb_context *ctdb, const char *name, int tdb_flags, */ int ctdb_call(struct ctdb_context *ctdb, TDB_DATA key, int call_id, TDB_DATA *call_data, TDB_DATA *reply_data); + diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h new file mode 100644 index 0000000000..582739f617 --- /dev/null +++ b/ctdb/include/ctdb_private.h @@ -0,0 +1,147 @@ +/* + ctdb database library + + Copyright (C) Andrew Tridgell 2006 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + + +/* + an installed ctdb remote call +*/ +struct ctdb_registered_call { + struct ctdb_registered_call *next, *prev; + uint32_t id; + ctdb_fn_t fn; +}; + +/* + this address structure might need to be generalised later for some + transports +*/ +struct ctdb_address { + const char *address; + int port; +}; + +/* + state associated with one node +*/ +struct ctdb_node { + struct ctdb_context *ctdb; + struct ctdb_address address; + const char *name; /* for debug messages */ + void *private; /* private to transport */ + uint32_t vnn; +}; + +/* + transport specific methods +*/ +struct ctdb_methods { + int (*start)(struct ctdb_context *); /* start protocol processing */ + int (*add_node)(struct ctdb_node *); /* setup a new node */ + int (*queue_pkt)(struct ctdb_node *, uint8_t *data, uint32_t length); +}; + +/* + transport calls up to the ctdb layer +*/ +struct ctdb_upcalls { + /* recv_pkt is called when a packet comes in */ + void (*recv_pkt)(struct ctdb_context *, uint8_t *data, uint32_t length); + + /* node_dead is called when an attempt to send to a node fails */ + void (*node_dead)(struct ctdb_node *); + + /* node_connected is called when a connection to a node is established */ + void (*node_connected)(struct ctdb_node *); +}; + +/* main state of the ctdb daemon */ +struct ctdb_context { + struct event_context *ev; + struct ctdb_address address; + const char *name; + uint32_t vnn; /* our own vnn */ + uint32_t num_nodes; + struct idr_context *idr; + struct ctdb_node **nodes; /* array of nodes in the cluster - indexed by vnn */ + struct ctdb_registered_call *calls; /* list of registered calls */ + char *err_msg; + struct tdb_context *ltdb; + const struct ctdb_methods *methods; /* transport methods */ + const struct ctdb_upcalls *upcalls; /* transport upcalls */ + void *private; /* private to transport */ +}; + +#define CTDB_NO_MEMORY(ctdb, p) do { if (!(p)) { \ + ctdb_set_error(ctdb, "Out of memory at %s:%d", __FILE__, __LINE__); \ + return -1; }} while (0) + +/* arbitrary maximum timeout for ctdb operations */ +#define CTDB_REQ_TIMEOUT 10 + + +/* + operation IDs +*/ +enum ctdb_operation { + CTDB_OP_CALL = 0 +}; + +/* + packet structures +*/ +struct ctdb_req_header { + uint32_t _length; /* ignored by datagram transports */ + uint32_t operation; + uint32_t destnode; + uint32_t srcnode; + uint32_t reqid; + uint32_t reqtimeout; +}; + +struct ctdb_reply_header { + uint32_t _length; /* ignored by datagram transports */ + uint32_t operation; + uint32_t destnode; + uint32_t srcnode; + uint32_t reqid; +}; + +struct ctdb_req_call { + struct ctdb_req_header hdr; + uint32_t callid; + uint32_t keylen; + uint32_t calldatalen; + uint8_t data[0]; /* key[] followed by calldata[] */ +}; + +struct ctdb_reply_call { + struct ctdb_reply_header hdr; + uint32_t datalen; + uint8_t data[0]; +}; + +/* internal prototypes */ +void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...); +bool ctdb_same_address(struct ctdb_address *a1, struct ctdb_address *a2); +int ctdb_parse_address(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, const char *str, + struct ctdb_address *address); +uint32_t ctdb_hash(TDB_DATA *key); + diff --git a/ctdb/include/idtree.h b/ctdb/include/idtree.h new file mode 100644 index 0000000000..259af91005 --- /dev/null +++ b/ctdb/include/idtree.h @@ -0,0 +1,7 @@ +struct idr_context *idr_init(TALLOC_CTX *mem_ctx); +int idr_get_new(struct idr_context *idp, void *ptr, int limit); +int idr_get_new_above(struct idr_context *idp, void *ptr, int starting_id, int limit); +int idr_get_new_random(struct idr_context *idp, void *ptr, int limit); +void *idr_find(struct idr_context *idp, int id); +int idr_remove(struct idr_context *idp, int id); + diff --git a/ctdb/includes.h b/ctdb/include/includes.h index 70632a7e0d..b160e1f739 100644 --- a/ctdb/includes.h +++ b/ctdb/include/includes.h @@ -3,6 +3,7 @@ #include "replace.h" #include "talloc.h" #include "tdb.h" +#include "idtree.h" #include "ctdb.h" #include "lib/util/dlinklist.h" diff --git a/ctdb/lib/util/idtree.c b/ctdb/lib/util/idtree.c new file mode 100644 index 0000000000..b275ae4519 --- /dev/null +++ b/ctdb/lib/util/idtree.c @@ -0,0 +1,374 @@ +/* + Unix SMB/CIFS implementation. + + very efficient functions to manage mapping a id (such as a fnum) to + a pointer. This is used for fnum and search id allocation. + + Copyright (C) Andrew Tridgell 2004 + + This code is derived from lib/idr.c in the 2.6 Linux kernel, which was + written by Jim Houston jim.houston@ccur.com, and is + Copyright (C) 2002 by Concurrent Computer Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +/* + see the section marked "public interface" below for documentation +*/ + +/** + * @file + */ + +#include "includes.h" + +#define IDR_BITS 5 +#define IDR_FULL 0xfffffffful +#if 0 /* unused */ +#define TOP_LEVEL_FULL (IDR_FULL >> 30) +#endif +#define IDR_SIZE (1 << IDR_BITS) +#define IDR_MASK ((1 << IDR_BITS)-1) +#define MAX_ID_SHIFT (sizeof(int)*8 - 1) +#define MAX_ID_BIT (1U << MAX_ID_SHIFT) +#define MAX_ID_MASK (MAX_ID_BIT - 1) +#define MAX_LEVEL (MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS +#define IDR_FREE_MAX MAX_LEVEL + MAX_LEVEL + +#define set_bit(bit, v) (v) |= (1<<(bit)) +#define clear_bit(bit, v) (v) &= ~(1<<(bit)) +#define test_bit(bit, v) ((v) & (1<<(bit))) + +struct idr_layer { + uint32_t bitmap; + struct idr_layer *ary[IDR_SIZE]; + int count; +}; + +struct idr_context { + struct idr_layer *top; + struct idr_layer *id_free; + int layers; + int id_free_cnt; +}; + +static struct idr_layer *alloc_layer(struct idr_context *idp) +{ + struct idr_layer *p; + + if (!(p = idp->id_free)) + return NULL; + idp->id_free = p->ary[0]; + idp->id_free_cnt--; + p->ary[0] = NULL; + return p; +} + +static int find_next_bit(uint32_t bm, int maxid, int n) +{ + while (n<maxid && !test_bit(n, bm)) n++; + return n; +} + +static void free_layer(struct idr_context *idp, struct idr_layer *p) +{ + p->ary[0] = idp->id_free; + idp->id_free = p; + idp->id_free_cnt++; +} + +static int idr_pre_get(struct idr_context *idp) +{ + while (idp->id_free_cnt < IDR_FREE_MAX) { + struct idr_layer *new = talloc_zero(idp, struct idr_layer); + if(new == NULL) + return (0); + free_layer(idp, new); + } + return 1; +} + +static int sub_alloc(struct idr_context *idp, void *ptr, int *starting_id) +{ + int n, m, sh; + struct idr_layer *p, *new; + struct idr_layer *pa[MAX_LEVEL]; + int l, id; + uint32_t bm; + + memset(pa, 0, sizeof(pa)); + + id = *starting_id; + p = idp->top; + l = idp->layers; + pa[l--] = NULL; + while (1) { + /* + * We run around this while until we reach the leaf node... + */ + n = (id >> (IDR_BITS*l)) & IDR_MASK; + bm = ~p->bitmap; + m = find_next_bit(bm, IDR_SIZE, n); + if (m == IDR_SIZE) { + /* no space available go back to previous layer. */ + l++; + id = (id | ((1 << (IDR_BITS*l))-1)) + 1; + if (!(p = pa[l])) { + *starting_id = id; + return -2; + } + continue; + } + if (m != n) { + sh = IDR_BITS*l; + id = ((id >> sh) ^ n ^ m) << sh; + } + if ((id >= MAX_ID_BIT) || (id < 0)) + return -1; + if (l == 0) + break; + /* + * Create the layer below if it is missing. + */ + if (!p->ary[m]) { + if (!(new = alloc_layer(idp))) + return -1; + p->ary[m] = new; + p->count++; + } + pa[l--] = p; + p = p->ary[m]; + } + /* + * We have reached the leaf node, plant the + * users pointer and return the raw id. + */ + p->ary[m] = (struct idr_layer *)ptr; + set_bit(m, p->bitmap); + p->count++; + /* + * If this layer is full mark the bit in the layer above + * to show that this part of the radix tree is full. + * This may complete the layer above and require walking + * up the radix tree. + */ + n = id; + while (p->bitmap == IDR_FULL) { + if (!(p = pa[++l])) + break; + n = n >> IDR_BITS; + set_bit((n & IDR_MASK), p->bitmap); + } + return(id); +} + +static int idr_get_new_above_int(struct idr_context *idp, void *ptr, int starting_id) +{ + struct idr_layer *p, *new; + int layers, v, id; + + idr_pre_get(idp); + + id = starting_id; +build_up: + p = idp->top; + layers = idp->layers; + if (!p) { + if (!(p = alloc_layer(idp))) + return -1; + layers = 1; + } + /* + * Add a new layer to the top of the tree if the requested + * id is larger than the currently allocated space. + */ + while ((layers < MAX_LEVEL) && (id >= (1 << (layers*IDR_BITS)))) { + layers++; + if (!p->count) + continue; + if (!(new = alloc_layer(idp))) { + /* + * The allocation failed. If we built part of + * the structure tear it down. + */ + for (new = p; p && p != idp->top; new = p) { + p = p->ary[0]; + new->ary[0] = NULL; + new->bitmap = new->count = 0; + free_layer(idp, new); + } + return -1; + } + new->ary[0] = p; + new->count = 1; + if (p->bitmap == IDR_FULL) + set_bit(0, new->bitmap); + p = new; + } + idp->top = p; + idp->layers = layers; + v = sub_alloc(idp, ptr, &id); + if (v == -2) + goto build_up; + return(v); +} + +static int sub_remove(struct idr_context *idp, int shift, int id) +{ + struct idr_layer *p = idp->top; + struct idr_layer **pa[MAX_LEVEL]; + struct idr_layer ***paa = &pa[0]; + int n; + + *paa = NULL; + *++paa = &idp->top; + + while ((shift > 0) && p) { + n = (id >> shift) & IDR_MASK; + clear_bit(n, p->bitmap); + *++paa = &p->ary[n]; + p = p->ary[n]; + shift -= IDR_BITS; + } + n = id & IDR_MASK; + if (p != NULL && test_bit(n, p->bitmap)) { + clear_bit(n, p->bitmap); + p->ary[n] = NULL; + while(*paa && ! --((**paa)->count)){ + free_layer(idp, **paa); + **paa-- = NULL; + } + if ( ! *paa ) + idp->layers = 0; + return 0; + } + return -1; +} + +static void *_idr_find(struct idr_context *idp, int id) +{ + int n; + struct idr_layer *p; + + n = idp->layers * IDR_BITS; + p = idp->top; + /* + * This tests to see if bits outside the current tree are + * present. If so, tain't one of ours! + */ + if ((id & ~(~0 << MAX_ID_SHIFT)) >> (n + IDR_BITS)) + return NULL; + + /* Mask off upper bits we don't use for the search. */ + id &= MAX_ID_MASK; + + while (n >= IDR_BITS && p) { + n -= IDR_BITS; + p = p->ary[(id >> n) & IDR_MASK]; + } + return((void *)p); +} + +static int _idr_remove(struct idr_context *idp, int id) +{ + struct idr_layer *p; + + /* Mask off upper bits we don't use for the search. */ + id &= MAX_ID_MASK; + + if (sub_remove(idp, (idp->layers - 1) * IDR_BITS, id) == -1) { + return -1; + } + + if ( idp->top && idp->top->count == 1 && + (idp->layers > 1) && + idp->top->ary[0]) { + /* We can drop a layer */ + p = idp->top->ary[0]; + idp->top->bitmap = idp->top->count = 0; + free_layer(idp, idp->top); + idp->top = p; + --idp->layers; + } + while (idp->id_free_cnt >= IDR_FREE_MAX) { + p = alloc_layer(idp); + talloc_free(p); + } + return 0; +} + +/************************************************************************ + this is the public interface +**************************************************************************/ + +/** + initialise a idr tree. The context return value must be passed to + all subsequent idr calls. To destroy the idr tree use talloc_free() + on this context + */ +_PUBLIC_ struct idr_context *idr_init(TALLOC_CTX *mem_ctx) +{ + return talloc_zero(mem_ctx, struct idr_context); +} + +/** + allocate the next available id, and assign 'ptr' into its slot. + you can retrieve later this pointer using idr_find() +*/ +_PUBLIC_ int idr_get_new(struct idr_context *idp, void *ptr, int limit) +{ + int ret = idr_get_new_above_int(idp, ptr, 0); + if (ret > limit) { + idr_remove(idp, ret); + return -1; + } + return ret; +} + +/** + allocate a new id, giving the first available value greater than or + equal to the given starting id +*/ +_PUBLIC_ int idr_get_new_above(struct idr_context *idp, void *ptr, int starting_id, int limit) +{ + int ret = idr_get_new_above_int(idp, ptr, starting_id); + if (ret > limit) { + idr_remove(idp, ret); + return -1; + } + return ret; +} + +/** + find a pointer value previously set with idr_get_new given an id +*/ +_PUBLIC_ void *idr_find(struct idr_context *idp, int id) +{ + return _idr_find(idp, id); +} + +/** + remove an id from the idr tree +*/ +_PUBLIC_ int idr_remove(struct idr_context *idp, int id) +{ + int ret; + ret = _idr_remove((struct idr_context *)idp, id); + if (ret != 0) { + DEBUG(0,("WARNING: attempt to remove unset id %d in idtree\n", id)); + } + return ret; +} diff --git a/ctdb/ctdb_private.h b/ctdb/tcp/ctdb_tcp.h index ab3138580b..14e96ea897 100644 --- a/ctdb/ctdb_private.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -19,55 +19,41 @@ */ -/* - a pending ctdb request -*/ -struct ctdb_request { - -}; - -/* - an installed ctdb remote call -*/ -struct ctdb_registered_call { - struct ctdb_registered_call *next, *prev; - uint32_t id; - ctdb_fn_t fn; -}; - -struct ctdb_address { - const char *address; - int port; +/* ctdb_tcp main state */ +struct ctdb_tcp { + int listen_fd; }; /* - state associated with one node + state associated with an incoming connection */ -struct ctdb_node { +struct ctdb_incoming { struct ctdb_context *ctdb; - struct ctdb_node *next, *prev; - struct ctdb_address address; int fd; }; +struct ctdb_tcp_packet { + struct ctdb_tcp_packet *next, *prev; + uint8_t *data; + uint32_t length; +}; + /* - state associated with an incoming connection + state associated with one tcp node */ -struct ctdb_incoming { - struct ctdb_context *ctdb; +struct ctdb_tcp_node { int fd; -}; - -/* main state of the ctdb daemon */ -struct ctdb_context { - struct event_context *ev; - struct ctdb_address address; - int listen_fd; - struct ctdb_node *nodes; /* list of nodes in the cluster */ - struct ctdb_registered_call *calls; /* list of registered calls */ - char *err_msg; - struct tdb_context *ltdb; + struct fd_event *fde; + struct ctdb_tcp_packet *queue; }; -#define CTDB_SOCKET "/tmp/ctdb.sock" +/* prototypes internal to tcp transport */ +void ctdb_tcp_node_write(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private); +void ctdb_tcp_incoming_read(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private); +int ctdb_tcp_queue_pkt(struct ctdb_node *node, uint8_t *data, uint32_t length); +int ctdb_tcp_listen(struct ctdb_context *ctdb); +void ctdb_tcp_node_connect(struct event_context *ev, struct timed_event *te, + struct timeval t, void *private); diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c new file mode 100644 index 0000000000..c7e361f9ca --- /dev/null +++ b/ctdb/tcp/tcp_connect.c @@ -0,0 +1,187 @@ +/* + ctdb over TCP + + Copyright (C) Andrew Tridgell 2006 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "system/network.h" +#include "system/filesys.h" +#include "ctdb_private.h" +#include "ctdb_tcp.h" + +static void set_nonblocking(int fd) +{ + unsigned v; + v = fcntl(fd, F_GETFL, 0); + fcntl(fd, F_SETFL, v | O_NONBLOCK); +} + + +/* + called when socket becomes writeable on connect +*/ +static void ctdb_node_connect_write(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private) +{ + struct ctdb_node *node = talloc_get_type(private, struct ctdb_node); + struct ctdb_tcp_node *tnode = talloc_get_type(node->private, + struct ctdb_tcp_node); + struct ctdb_context *ctdb = node->ctdb; + int error = 0; + socklen_t len; + + if (getsockopt(tnode->fd, SOL_SOCKET, SO_ERROR, &error, &len) != 0 || + error != 0) { + printf("getsockopt error=%s\n", strerror(error)); + talloc_free(fde); + close(tnode->fd); + tnode->fd = -1; + event_add_timed(ctdb->ev, node, timeval_current_ofs(1, 0), + ctdb_tcp_node_connect, node); + return; + } + + talloc_free(fde); + tnode->fde = event_add_fd(node->ctdb->ev, node, tnode->fd, EVENT_FD_READ, + ctdb_tcp_node_write, node); + + /* tell the ctdb layer we are connected */ + node->ctdb->upcalls->node_connected(node); +} + +/* + called when we should try and establish a tcp connection to a node +*/ +void ctdb_tcp_node_connect(struct event_context *ev, struct timed_event *te, + struct timeval t, void *private) +{ + struct ctdb_node *node = talloc_get_type(private, struct ctdb_node); + struct ctdb_tcp_node *tnode = talloc_get_type(node->private, + struct ctdb_tcp_node); + struct ctdb_context *ctdb = node->ctdb; + struct sockaddr_in sock_out; + + tnode->fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + + set_nonblocking(tnode->fd); + + inet_pton(AF_INET, node->address.address, &sock_out.sin_addr); + sock_out.sin_port = htons(node->address.port); + sock_out.sin_family = PF_INET; + + if (connect(tnode->fd, &sock_out, sizeof(sock_out)) != 0 && + errno != EINPROGRESS) { + /* try again once a second */ + close(tnode->fd); + event_add_timed(ctdb->ev, node, timeval_current_ofs(1, 0), + ctdb_tcp_node_connect, node); + return; + } + + /* non-blocking connect - wait for write event */ + event_add_fd(node->ctdb->ev, node, tnode->fd, EVENT_FD_WRITE|EVENT_FD_READ, + ctdb_node_connect_write, node); +} + +/* + destroy a ctdb_incoming structure +*/ +static int ctdb_incoming_destructor(struct ctdb_incoming *in) +{ + close(in->fd); + in->fd = -1; + return 0; +} + +/* + called when we get contacted by another node + currently makes no attempt to check if the connection is really from a ctdb + node in our cluster +*/ +static void ctdb_listen_event(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private) +{ + struct ctdb_context *ctdb; + struct ctdb_tcp *ctcp; + struct sockaddr_in addr; + socklen_t len; + int fd; + struct ctdb_incoming *in; + + ctdb = talloc_get_type(private, struct ctdb_context); + ctcp = talloc_get_type(ctdb->private, struct ctdb_tcp); + memset(&addr, 0, sizeof(addr)); + len = sizeof(addr); + fd = accept(ctcp->listen_fd, (struct sockaddr *)&addr, &len); + if (fd == -1) return; + + in = talloc(ctdb, struct ctdb_incoming); + in->fd = fd; + in->ctdb = ctdb; + + set_nonblocking(in->fd); + + event_add_fd(ctdb->ev, in, in->fd, EVENT_FD_READ, + ctdb_tcp_incoming_read, in); + + talloc_set_destructor(in, ctdb_incoming_destructor); +} + + +/* + listen on our own address +*/ +int ctdb_tcp_listen(struct ctdb_context *ctdb) +{ + struct ctdb_tcp *ctcp = talloc_get_type(ctdb->private, struct ctdb_tcp); + struct sockaddr_in sock; + int one = 1; + + sock.sin_port = htons(ctdb->address.port); + sock.sin_family = PF_INET; + inet_pton(AF_INET, ctdb->address.address, &sock.sin_addr); + + ctcp->listen_fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (ctcp->listen_fd == -1) { + ctdb_set_error(ctdb, "socket failed\n"); + return -1; + } + + setsockopt(ctcp->listen_fd,SOL_SOCKET,SO_REUSEADDR,(char *)&one,sizeof(one)); + + if (bind(ctcp->listen_fd, (struct sockaddr * )&sock, sizeof(sock)) != 0) { + ctdb_set_error(ctdb, "bind failed\n"); + close(ctcp->listen_fd); + ctcp->listen_fd = -1; + return -1; + } + + if (listen(ctcp->listen_fd, 10) == -1) { + ctdb_set_error(ctdb, "listen failed\n"); + close(ctcp->listen_fd); + ctcp->listen_fd = -1; + return -1; + } + + event_add_fd(ctdb->ev, ctdb, ctcp->listen_fd, EVENT_FD_READ, + ctdb_listen_event, ctdb); + + return 0; +} + diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c new file mode 100644 index 0000000000..39ecec4dbd --- /dev/null +++ b/ctdb/tcp/tcp_init.c @@ -0,0 +1,86 @@ +/* + ctdb over TCP + + Copyright (C) Andrew Tridgell 2006 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "system/network.h" +#include "system/filesys.h" +#include "ctdb_private.h" +#include "ctdb_tcp.h" + +/* + start the protocol going +*/ +int ctdb_tcp_start(struct ctdb_context *ctdb) +{ + int i; + + /* listen on our own address */ + if (ctdb_tcp_listen(ctdb) != 0) return -1; + + /* startup connections to the other servers - will happen on + next event loop */ + for (i=0;i<ctdb->num_nodes;i++) { + struct ctdb_node *node = *(ctdb->nodes + i); + if (ctdb_same_address(&ctdb->address, &node->address)) continue; + event_add_timed(ctdb->ev, node, timeval_zero(), + ctdb_tcp_node_connect, node); + } + + return 0; +} + + +/* + initialise tcp portion of a ctdb node +*/ +int ctdb_tcp_add_node(struct ctdb_node *node) +{ + struct ctdb_tcp_node *tnode; + tnode = talloc_zero(node, struct ctdb_tcp_node); + CTDB_NO_MEMORY(node->ctdb, tnode); + + tnode->fd = -1; + node->private = tnode; + return 0; +} + + +static const struct ctdb_methods ctdb_tcp_methods = { + .start = ctdb_tcp_start, + .add_node = ctdb_tcp_add_node, + .queue_pkt = ctdb_tcp_queue_pkt +}; + +/* + initialise tcp portion of ctdb +*/ +int ctdb_tcp_init(struct ctdb_context *ctdb) +{ + struct ctdb_tcp *ctcp; + ctcp = talloc_zero(ctdb, struct ctdb_tcp); + CTDB_NO_MEMORY(ctdb, ctcp); + + ctcp->listen_fd = -1; + ctdb->private = ctcp; + ctdb->methods = &ctdb_tcp_methods; + return 0; +} + diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c new file mode 100644 index 0000000000..d6bc2db83e --- /dev/null +++ b/ctdb/tcp/tcp_io.c @@ -0,0 +1,194 @@ +/* + ctdb over TCP + + Copyright (C) Andrew Tridgell 2006 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "system/network.h" +#include "system/filesys.h" +#include "ctdb_private.h" +#include "ctdb_tcp.h" + +/* + called when we fail to send a message to a node +*/ +static void ctdb_tcp_node_dead(struct event_context *ev, struct timed_event *te, + struct timeval t, void *private) +{ + struct ctdb_node *node = talloc_get_type(private, struct ctdb_node); + struct ctdb_tcp_node *tnode = talloc_get_type(node->private, + struct ctdb_tcp_node); + + /* flush the queue */ + while (tnode->queue) { + struct ctdb_tcp_packet *pkt = tnode->queue; + DLIST_REMOVE(tnode->queue, pkt); + talloc_free(pkt); + } + + /* start a new connect cycle to try to re-establish the + link */ + talloc_free(tnode->fde); + close(tnode->fd); + tnode->fd = -1; + event_add_timed(node->ctdb->ev, node, timeval_zero(), + ctdb_tcp_node_connect, node); +} + +/* + called when socket becomes readable +*/ +void ctdb_tcp_node_write(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private) +{ + struct ctdb_node *node = talloc_get_type(private, struct ctdb_node); + struct ctdb_tcp_node *tnode = talloc_get_type(node->private, + struct ctdb_tcp_node); + if (flags & EVENT_FD_READ) { + /* getting a read event on this fd in the current tcp model is + always an error, as we have separate read and write + sockets. In future we may combine them, but for now it must + mean that the socket is dead, so we try to reconnect */ + talloc_free(tnode->fde); + close(tnode->fd); + tnode->fd = -1; + event_add_timed(node->ctdb->ev, node, timeval_zero(), + ctdb_tcp_node_connect, node); + return; + } + + while (tnode->queue) { + struct ctdb_tcp_packet *pkt = tnode->queue; + ssize_t n; + + n = write(tnode->fd, pkt->data, pkt->length); + + if (n == -1 && errno != EAGAIN && errno != EWOULDBLOCK) { + event_add_timed(node->ctdb->ev, node, timeval_zero(), + ctdb_tcp_node_dead, node); + EVENT_FD_NOT_WRITEABLE(tnode->fde); + return; + } + if (n <= 0) return; + + if (n != pkt->length) { + pkt->length -= n; + pkt->data += n; + return; + } + + DLIST_REMOVE(tnode->queue, pkt); + talloc_free(pkt); + } + + EVENT_FD_NOT_WRITEABLE(tnode->fde); +} + + +/* + called when an incoming connection is readable +*/ +void ctdb_tcp_incoming_read(struct event_context *ev, struct fd_event *fde, + uint16_t flags, void *private) +{ + struct ctdb_incoming *in = talloc_get_type(private, struct ctdb_incoming); + int num_ready = 0; + uint8_t *data; + + /* NOTE: we don't yet handle combined packets or partial + packets. Obviously that needed fixing, using a similar + scheme to the Samba4 packet layer */ + + if (ioctl(in->fd, FIONREAD, &num_ready) != 0 || + num_ready == 0) { + /* we've lost the link from another node. We don't + notify the upper layers, as we only want to trigger + a full node reorganisation when a send fails - that + allows nodes to restart without penalty as long as + the network is idle */ + talloc_free(in); + return; + } + + data = talloc_size(in, num_ready); + if (data == NULL) { + /* not much we can do except drop the socket */ + talloc_free(in); + return; + } + + if (read(in->fd, data, num_ready) != num_ready) { + talloc_free(in); + return; + } + + /* tell the ctdb layer above that we have a packet */ + in->ctdb->upcalls->recv_pkt(in->ctdb, data, num_ready); + + talloc_free(data); +} + +/* + queue a packet for sending +*/ +int ctdb_tcp_queue_pkt(struct ctdb_node *node, uint8_t *data, uint32_t length) +{ + struct ctdb_tcp_node *tnode = talloc_get_type(node->private, + struct ctdb_tcp_node); + struct ctdb_tcp_packet *pkt; + + if (tnode->fd == -1) { + ctdb_set_error(node->ctdb, "Sending to dead node %s\n", node->name); + return -1; + } + + /* if the queue is empty then try an immediate write, avoiding + queue overhead. This relies on non-blocking sockets */ + if (tnode->queue == NULL) { + ssize_t n = write(tnode->fd, data, length); + if (n == -1 && errno != EAGAIN && errno != EWOULDBLOCK) { + event_add_timed(node->ctdb->ev, node, timeval_zero(), + ctdb_tcp_node_dead, node); + /* yes, we report success, as the dead node is + handled via a separate event */ + return 0; + } + if (n > 0) { + data += n; + length -= n; + } + if (length == 0) return 0; + } + + pkt = talloc(tnode, struct ctdb_tcp_packet); + CTDB_NO_MEMORY(node->ctdb, pkt); + + pkt->data = talloc_memdup(pkt, data, length); + CTDB_NO_MEMORY(node->ctdb, pkt->data); + + pkt->length = length; + + if (tnode->queue == NULL) { + EVENT_FD_WRITEABLE(tnode->fde); + } + + DLIST_ADD_END(tnode->queue, pkt, struct ctdb_tcp_packet *); + + return 0; +} diff --git a/ctdb/tests/test.sh b/ctdb/tests/test.sh new file mode 100755 index 0000000000..0aa93fbdbd --- /dev/null +++ b/ctdb/tests/test.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +killall -q ctdb_test + +bin/ctdb_test --nlist nodes.txt --listen 127.0.0.1:9001 & +bin/ctdb_test --nlist nodes.txt --listen 127.0.0.2:9001 & + +sleep 3 +killall ctdb_test |