summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Teigland <teigland@redhat.com>2009-05-22 16:06:40 -0500
committerDavid Teigland <teigland@redhat.com>2009-05-22 16:06:40 -0500
commitf32d252bbbe6c63fee2248c0b0667351bdb303bb (patch)
treedcce40287df56b1ba2ba76fae3e5bce055cbd3ba
cpgx: initial import
Signed-off-by: David Teigland <teigland@redhat.com>
-rw-r--r--cpgx/Makefile4
-rw-r--r--cpgx/cpgx.c1610
-rw-r--r--cpgx/list.h336
3 files changed, 1950 insertions, 0 deletions
diff --git a/cpgx/Makefile b/cpgx/Makefile
new file mode 100644
index 0000000..54e3860
--- /dev/null
+++ b/cpgx/Makefile
@@ -0,0 +1,4 @@
+# Build the cpgx test binary against the corosync closed process group (CPG) library.
+cpgx: cpgx.c
+ gcc -L/usr/lib64 cpgx.c -g -o cpgx -lcpg
+
+
diff --git a/cpgx/cpgx.c b/cpgx/cpgx.c
new file mode 100644
index 0000000..bc6887f
--- /dev/null
+++ b/cpgx/cpgx.c
@@ -0,0 +1,1610 @@
+/*
+ * Copyright (c) 2009 David Teigland
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License V2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ Initial outline:
+
+ recv time message
+ - assert from a member
+ if sync_done
+ - history[event_num++] = contents
+ - assert leave_done == 0
+ - assert join_done == 1
+ else
+ - save message
+
+ recv sync message
+ if !sync_done and to us
+ - copy contents into history, set event_num
+ - recv saved time messages
+ - sync_done = 1
+ if !sync_done and not to us
+ - ignore
+ if sync_done
+ - set needs_sync = 0 for node sent to
+
+ confchg
+ - history[event_num++] = members/joined/left
+ - if we joined: join_done = 1, sync_done = 0
+ - if we left: leave_done = 1
+ - for each joined node, set needs_sync = 1
+ - if any nodes need sync, and we are synced and we are low_nodeid,
+ send sync message to each specific node with history from the event_num
+ when they were removed up to the event_num where they were added
+
+ send time message
+ - send buf with our time and last event_num
+
+ New history begins at zero every time a new group is formed (all members go
+ away). Otherwise, when forming a new group we'd have to wait for all
+ members to join and gather the latest history from all so we know where to
+ begin. This also means that a node clears its history before every join,
+ since history may have been restarted while it was away.
+*/
+
+/* Limitations: max 8 nodes, nodeids between 1 and 255 */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <limits.h>
+#include <time.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/poll.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <corosync/cpg.h>
+#include "list.h"
+
+/* one entry in the poll loop: an fd plus its work/dead callbacks */
+struct client {
+ int fd;
+ void *workfn;
+ void *deadfn;
+};
+
+/* grow the client/pollfd arrays this many slots at a time */
+#define CLIENT_NALLOC 2
+
+static int client_maxi;
+static int client_size = 0;
+static struct client *client = NULL;
+static struct pollfd *pollfd = NULL;
+
+/* cpg connection state for the test group */
+struct cpg_name dct_cpg_name;
+cpg_handle_t dct_cpg_handle;
+int dct_cpg_client;
+int dct_cpg_fd;
+
+/* run-time flags and test options */
+int daemon_quit;
+int cluster_down;
+int opt_leave = 1;
+int opt_fail = 1;
+int got_error = 0;
+int continue_after_error = 0;
+int opt_print_event = 1;
+int opt_print_debug = 1;
+
+/* join/sync/leave state machine flags (see outline comment above) */
+int join_wait;
+int join_done;
+int sync_wait;
+int sync_done;
+int leave_wait;
+int leave_done;
+
+uint8_t our_nodeid;
+uint32_t eventid; /* next history slot to fill */
+uint32_t last_config_eventid;
+uint32_t dispatch_count;
+
+#define MAX_NODES 8
+
+/* event/message type codes carried in the first byte of each event */
+#define EV_CONFCHG 1
+#define EV_MSGTIME 2
+#define EV_MSGSYNC 3
+
+/* on-wire header for time and sync messages */
+struct dct_header {
+ uint8_t type; /* EV_MSGTIME, EV_MSGSYNC */
+ uint8_t nodeid; /* sender */
+ uint8_t to_nodeid; /* for MSGSYNC */
+ uint8_t pad;
+ uint8_t synced_nodes[8];/* for MSGSYNC, nodes in sync */
+ uint32_t tv_sec;
+ uint32_t tv_usec;
+ uint32_t last_config; /* last config eventid we've seen */
+ uint32_t event_count; /* this many events after header */
+}; /* 28 bytes */
+
+/* time msg is followed by N event structs from history for run-time checks */
+/* sync msg is followed by M event structs from history to sync new node,
+ the final one is the confchg event that to_nodeid joined in */
+
+/* record of one configuration change (membership/join/leave lists) */
+struct dct_config {
+ uint8_t type; /* EV_CONFCHG */
+ uint8_t memb_count;
+ uint8_t join_count;
+ uint8_t left_count;
+ uint8_t memb[8];
+ uint8_t join[8];
+ uint8_t left[8];
+}; /* 28 bytes */
+
+/* one replicated history entry; the union is keyed by the leading type byte */
+struct event {
+ uint32_t eventid;
+ union {
+ struct dct_header header;
+ struct dct_config config;
+ char buf[28];
+ };
+}; /* 32 bytes */
+
+/* replicated event history, indexed by eventid (allocated outside this view) */
+int history_len;
+struct event *history;
+struct dct_config *last_config;
+
+/* per-node state tracked by every member */
+struct node {
+ struct list_head list;
+ uint8_t nodeid;
+ uint8_t sync_from; /* node that will send the sync to this node */
+ int is_member;
+ int needs_sync; /* when this node joins we set this to 1, and clear
+ it when we see sync message for it */
+ int sent_sync; /* we sent a sync */
+ uint32_t join_eventid;
+ uint32_t last_check_eventid; /* the eventid from the check event in the
+ most recent time message we received
+ from this node. not used for anything */
+};
+
+struct list_head nodes;
+
+/* events queued while we wait for our own sync; buf holds the raw message */
+struct save_event {
+ struct list_head list;
+ int type;
+ int we_join;
+ int len;
+ char buf[0];
+};
+
+struct list_head saved_events;
+
+static void process_message(struct dct_header *hd, int len);
+void send_sync(uint8_t nodeid);
+
+/* errors go to stderr and latch got_error so the main loop can exit */
+#define log_error(fmt, args...) \
+do { \
+ fprintf(stderr, "ERROR: " fmt "\n", ##args); \
+ got_error = 1; \
+} while (0)
+
+/* "H:" lines are the replicated event history, printed as events land */
+#define log_history(fmt, args...) \
+do { \
+ if (opt_print_event) \
+ fprintf(stdout, "H: " fmt "\n", ##args); \
+} while (0)
+
+/* "D:" lines are debug chatter, separate from the history stream */
+#define log_debug(fmt, args...) \
+do { \
+ if (opt_print_debug) \
+ fprintf(stdout, "D: " fmt "\n", ##args); \
+} while (0)
+
+/* format " %u" for each of count nodeids into out (caller zeroed it) */
+
+static void format_nodeids(char *out, uint8_t *ids, int count)
+{
+	int pos = 0, i;
+
+	for (i = 0; i < count; i++)
+		pos += sprintf(out + pos, " %u", ids[i]);
+}
+
+/*
+ * Print one confchg event, as an error or as normal history output.
+ * Buffers must hold 8 nodeids of up to 4 chars (" 255") plus the NUL,
+ * i.e. 33 bytes; the original 32-byte buffers could overflow by one.
+ */
+
+static void _log_config(struct dct_config *c, uint32_t id, int error)
+{
+	char m_buf[64];
+	char j_buf[64];
+	char l_buf[64];
+
+	memset(m_buf, 0, sizeof(m_buf));
+	memset(j_buf, 0, sizeof(j_buf));
+	memset(l_buf, 0, sizeof(l_buf));
+
+	format_nodeids(m_buf, c->memb, c->memb_count);
+	format_nodeids(j_buf, c->join, c->join_count);
+	format_nodeids(l_buf, c->left, c->left_count);
+
+	if (error)
+		log_error("%08u conf %u %u %u memb%s join%s left%s",
+			  id, c->memb_count, c->join_count, c->left_count,
+			  m_buf, j_buf, l_buf);
+	else
+		log_history("%08u conf %u %u %u memb%s join%s left%s",
+			    id, c->memb_count, c->join_count, c->left_count,
+			    m_buf, j_buf, l_buf);
+}
+/* callers that don't track an eventid log the config with id 0 */
+#define log_config(_c, _er) _log_config(_c, 0, _er)
+
+/* print one time/sync message header, as an error or as history output;
+   unknown type bytes are always reported as errors */
+
+static void _log_header(struct dct_header *h, uint32_t id, int error)
+{
+ if (error && h->type == EV_MSGTIME)
+ log_error("%08u time %u tv %u.%06u config %u",
+ id, h->nodeid, h->tv_sec, h->tv_usec,
+ h->last_config);
+
+ else if (error && h->type == EV_MSGSYNC)
+ log_error("%08u sync %u tv %u.%06u config %u to %u count %u",
+ id, h->nodeid, h->tv_sec, h->tv_usec,
+ h->last_config, h->to_nodeid, h->event_count);
+
+ else if (!error && h->type == EV_MSGTIME)
+ log_history("%08u time %u tv %u.%06u config %u",
+ id, h->nodeid, h->tv_sec, h->tv_usec,
+ h->last_config);
+
+ else if (!error && h->type == EV_MSGSYNC)
+ log_history("%08u sync %u tv %u.%06u config %u to %u count %u",
+ id, h->nodeid, h->tv_sec, h->tv_usec,
+ h->last_config, h->to_nodeid, h->event_count);
+
+ else
+ log_error("%08u unknown message type %d", id, h->type);
+}
+/* callers that don't track an eventid log the header with id 0 */
+#define log_header(_h, _er) _log_header(_h, 0, _er)
+
+/* dispatch a history event to the right printer based on its type byte
+   (the type byte occupies the same position in both union members) */
+
+static void log_event(struct event *ev, int error)
+{
+	if (ev->config.type == EV_CONFCHG)
+		_log_config(&ev->config, ev->eventid, error);
+	else
+		_log_header(&ev->header, ev->eventid, error);
+}
+
+/* random integer uniformly-ish distributed in [a, b] inclusive,
+   drawn from random() (seeded with srandom() in loop()) */
+
+static int rand_int(int a, int b)
+{
+ return a + (int) (((float)(b - a + 1)) * random() / (RAND_MAX+1.0));
+}
+
+/* milliseconds elapsed from *begin to *end (end assumed not earlier) */
+
+static unsigned long time_diff_ms(struct timeval *begin, struct timeval *end)
+{
+	struct timeval delta;
+
+	/* timersub normalizes tv_usec into [0, 999999] */
+	timersub(end, begin, &delta);
+	return (delta.tv_sec * 1000) + (delta.tv_usec / 1000);
+}
+
+/*
+ * Grow the client and pollfd arrays by CLIENT_NALLOC slots, initializing
+ * the new slots as free (fd == -1).  realloc(NULL, n) acts as malloc, so
+ * one code path handles both first allocation and growth.  Results go to
+ * temporaries so a failed realloc neither leaks the old array nor leads
+ * to a write through a NULL pointer (the original continued into the
+ * initialization loop after an allocation failure).
+ */
+
+static void client_alloc(void)
+{
+	struct client *new_client;
+	struct pollfd *new_pollfd;
+	int i;
+
+	new_client = realloc(client, (client_size + CLIENT_NALLOC) *
+			     sizeof(struct client));
+	new_pollfd = realloc(pollfd, (client_size + CLIENT_NALLOC) *
+			     sizeof(struct pollfd));
+
+	if (new_client)
+		client = new_client;
+	if (new_pollfd)
+		pollfd = new_pollfd;
+
+	if (!new_client || !new_pollfd) {
+		/* old arrays (if any) are still valid; client_size unchanged */
+		log_error("can't alloc for client array");
+		return;
+	}
+
+	for (i = client_size; i < client_size + CLIENT_NALLOC; i++) {
+		client[i].workfn = NULL;
+		client[i].deadfn = NULL;
+		client[i].fd = -1;
+		pollfd[i].fd = -1;
+		pollfd[i].revents = 0;
+	}
+	client_size += CLIENT_NALLOC;
+}
+
+/* retire a client: close its fd and mark the slot free so client_add
+   can reuse it; pollfd entry is disabled by setting fd to -1 */
+
+static void client_dead(int ci)
+{
+ close(client[ci].fd);
+ client[ci].workfn = NULL;
+ client[ci].fd = -1;
+ pollfd[ci].fd = -1;
+}
+
+/* register fd with the poll loop; workfn runs on POLLIN, deadfn on
+   POLLERR/POLLHUP/POLLNVAL (defaults to client_dead).  Returns the
+   slot index; grows the arrays and retries if no free slot exists. */
+
+static int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci))
+{
+ int i;
+
+ if (!client)
+ client_alloc();
+ again:
+ for (i = 0; i < client_size; i++) {
+ if (client[i].fd == -1) {
+ client[i].workfn = workfn;
+ if (deadfn)
+ client[i].deadfn = deadfn;
+ else
+ client[i].deadfn = client_dead;
+ client[i].fd = fd;
+ pollfd[i].fd = fd;
+ pollfd[i].events = POLLIN;
+ if (i > client_maxi)
+ client_maxi = i;
+ return i;
+ }
+ }
+
+ client_alloc();
+ goto again;
+}
+
+/* SIGTERM handler: ask the main loop to exit at the next opportunity */
+
+static void sigterm_handler(int sig)
+{
+ daemon_quit = 1;
+}
+
+/* deadfn for the cpg fd: the cluster connection dropped, so flag the
+   main loop to quit (logged only on the first occurrence) */
+
+void cluster_dead(int ci)
+{
+ if (!cluster_down)
+ log_error("cluster is down, exiting");
+ daemon_quit = 1;
+ cluster_down = 1;
+}
+
+/* look up a node in the global nodes list; NULL if not present */
+
+struct node *get_node(uint8_t nodeid)
+{
+	struct node *found = NULL;
+	struct node *cur;
+
+	list_for_each_entry(cur, &nodes, list) {
+		if (cur->nodeid != nodeid)
+			continue;
+		found = cur;
+		break;
+	}
+	return found;
+}
+
+/*
+ * Allocate a zeroed node struct for nodeid and append it to the global
+ * nodes list.  Returns NULL if allocation fails (callers here mostly
+ * don't check, so OOM would crash them — same behavior as before).
+ */
+
+struct node *add_node(uint8_t nodeid)
+{
+	struct node *node;
+
+	/* calloc replaces the original malloc+memset pair */
+	node = calloc(1, sizeof(*node));
+	if (!node)
+		return NULL;
+	node->nodeid = nodeid;
+
+	list_add_tail(&node->list, &nodes);
+	return node;
+}
+
+/*
+ * Clear the needs_sync flag for nodeid once its sync message has been
+ * seen.  Guard the lookup: the original dereferenced get_node()'s
+ * result unconditionally and would crash on an unknown nodeid.
+ */
+
+void set_node_synced(uint8_t nodeid)
+{
+	struct node *node;
+
+	node = get_node(nodeid);
+	if (!node) {
+		log_error("set_node_synced no node %u", nodeid);
+		return;
+	}
+	node->needs_sync = 0;
+	log_debug("nodeid %d needs_sync 0", nodeid);
+}
+
+/* nonzero if nodeid appears in the confchg's member list */
+
+int in_memb(struct dct_config *c, uint8_t nodeid)
+{
+	int i = 0;
+
+	while (i < c->memb_count) {
+		if (c->memb[i] == nodeid)
+			return 1;
+		i++;
+	}
+	return 0;
+}
+
+/* nonzero if nodeid appears in the confchg's joined list */
+
+int in_join(struct dct_config *c, uint8_t nodeid)
+{
+	int i = 0;
+
+	while (i < c->join_count) {
+		if (c->join[i] == nodeid)
+			return 1;
+		i++;
+	}
+	return 0;
+}
+
+/* nonzero if nodeid appears in the confchg's left list */
+
+int in_left(struct dct_config *c, uint8_t nodeid)
+{
+	int i = 0;
+
+	while (i < c->left_count) {
+		if (c->left[i] == nodeid)
+			return 1;
+		i++;
+	}
+	return 0;
+}
+
+
+/* dump the per-node state of every node we know about (debug output) */
+
+void print_nodes_list(void)
+{
+ struct node *node;
+
+ list_for_each_entry(node, &nodes, list) {
+ log_debug("nodeid %u is_member %d needs_sync %d join %u check %u",
+ node->nodeid, node->is_member, node->needs_sync,
+ node->join_eventid, node->last_check_eventid);
+ }
+}
+
+/*
+ * Walk history backwards looking for the confchg in which this node
+ * joined, and record its eventid.  (Only referenced from #if 0 code in
+ * init_nodes_list at the moment.)  Guard eventid == 0: the original
+ * started at history[eventid - 1], an out-of-bounds read when history
+ * is empty.
+ */
+
+void init_join_eventid(struct node *node)
+{
+	struct event *ev;
+	struct dct_config *c;
+	int i;
+
+	if (!eventid)
+		return;
+
+	for (i = eventid - 1; i >= 0; i--) {
+		ev = &history[i];
+		c = &ev->config;
+
+		if ((c->type == EV_CONFCHG) && in_join(c, node->nodeid)) {
+			node->join_eventid = ev->eventid;
+			return;
+		}
+	}
+}
+
+/* hd is the header of our sync message, c is the config for our join event */
+
+void init_nodes_list(struct dct_header *hd, struct dct_config *c)
+{
+ struct node *node;
+ uint8_t nodeid;
+ int i;
+
+ if (!list_empty(&nodes)) {
+ log_error("init_nodes_list nodes not empty");
+ return;
+ }
+
+ /* start with every member needing sync...
+ NOTE(review): add_node can return NULL on OOM; the result is
+ dereferenced unchecked here */
+ for (i = 0; i < c->memb_count; i++) {
+ node = add_node(c->memb[i]);
+ node->is_member = 1;
+ node->needs_sync = 1;
+ }
+
+ /* ...then clear needs_sync for nodes the syncing node reported as
+ already in sync (list is zero-terminated) */
+ for (i = 0; i < 8; i++) {
+ nodeid = hd->synced_nodes[i];
+ if (!nodeid)
+ break;
+
+ node = get_node(nodeid);
+ if (!node)
+ log_error("init_nodes_list synced_nodes %u", nodeid);
+ else
+ node->needs_sync = 0;
+ }
+
+ /* I don't think there's any case where we'll need to know the
+ join_eventid of an earlier joining node to send it a sync */
+#if 0
+ list_for_each_entry(node, &nodes, list)
+ init_join_eventid(node);
+#endif
+
+ log_debug("init_nodes_list");
+ print_nodes_list();
+}
+
+/* release every node struct and leave the global nodes list empty */
+
+void free_nodes_list(void)
+{
+	struct node *node;
+
+	while (!list_empty(&nodes)) {
+		node = list_first_entry(&nodes, struct node, list);
+		list_del(&node->list);
+		free(node);
+	}
+}
+
+/*
+ * Reconcile the nodes list with confchg c (recorded as eventid id),
+ * cross-checking that the memb/join/left lists are mutually consistent
+ * with the membership state we already track.  Fix: the "node has
+ * failed/left" test used bitwise & where logical && was meant; the
+ * result happened to match because both operands are 0/1 flags, but the
+ * logical operator states the intent.
+ */
+
+void update_nodes_list(struct dct_config *c, uint32_t id)
+{
+	struct node *node;
+	int i, is_memb, is_join, is_left;
+
+	list_for_each_entry(node, &nodes, list) {
+
+		is_memb = in_memb(c, node->nodeid);
+		is_join = in_join(c, node->nodeid);
+		is_left = in_left(c, node->nodeid);
+
+		/* still a member: must not also be joining or leaving */
+		if (is_memb && node->is_member) {
+			if (is_join || is_left)
+				log_error("member list off %d %d %d %d %u",
+					  is_memb, is_join, is_left,
+					  node->is_member, node->nodeid);
+			continue;
+		}
+
+		/* still a non-member: must not appear in any list */
+		if (!is_memb && !node->is_member) {
+			if (is_join || is_left)
+				log_error("member list off %d %d %d %d %u",
+					  is_memb, is_join, is_left,
+					  node->is_member, node->nodeid);
+			continue;
+		}
+
+		/* node has failed/left */
+
+		if (!is_memb && node->is_member) {
+			if (is_join || !is_left)
+				log_error("member list off %d %d %d %d %u",
+					  is_memb, is_join, is_left,
+					  node->is_member, node->nodeid);
+
+			node->is_member = 0;
+			node->sync_from = 0;
+			node->sent_sync = 0;
+			continue;
+		}
+
+		/* old node has joined again */
+
+		if (is_memb && !node->is_member) {
+			if (!is_join || is_left)
+				log_error("member list off %d %d %d %d %u",
+					  is_memb, is_join, is_left,
+					  node->is_member, node->nodeid);
+
+			node->is_member = 1;
+			node->needs_sync = 1;
+			node->join_eventid = id;
+			continue;
+		}
+
+		log_error("update_nodes_list shouldn't get here %d %d %d %d %u",
+			  is_memb, is_join, is_left,
+			  node->is_member, node->nodeid);
+	}
+
+	/* add new node we've not seen before */
+
+	for (i = 0; i < c->memb_count; i++) {
+		node = get_node(c->memb[i]);
+		if (node)
+			continue;
+
+		node = add_node(c->memb[i]);
+		node->is_member = 1;
+		node->needs_sync = 1;
+		node->join_eventid = id;
+
+		is_join = in_join(c, node->nodeid);
+		if (!is_join)
+			log_error("join list off %d", node->nodeid);
+	}
+
+	log_debug("update_nodes_list");
+	print_nodes_list();
+}
+
+/* a scheme where new nodes are synced from the low synced nodeid needs to
+ consider that the low nodeid may fail before syncing the new nodes */
+
+/* point every unsynced member at the lowest-nodeid synced member, which
+   is responsible for sending it a sync message; if no synced member
+   exists, sync_from is set to 0 (no sender) */
+
+void update_nodes_sync_from(void)
+{
+ struct node *node;
+ uint8_t low = 0;
+
+ /* first pass: lowest nodeid among synced members */
+ list_for_each_entry(node, &nodes, list) {
+ if (!node->is_member)
+ continue;
+ if (node->needs_sync)
+ continue;
+ if (!low || node->nodeid < low)
+ low = node->nodeid;
+ }
+
+ /* second pass: assign that sender to every unsynced member */
+ list_for_each_entry(node, &nodes, list) {
+ if (!node->is_member)
+ continue;
+ if (!node->needs_sync)
+ continue;
+
+ node->sync_from = low;
+ }
+}
+
+/* send a sync message to each member we're responsible for syncing and
+   haven't synced yet; called outside cpg_dispatch (see process_cpg) */
+
+void send_syncs(void)
+{
+ struct node *node;
+
+ list_for_each_entry(node, &nodes, list) {
+ if (!node->is_member)
+ continue;
+ if (!node->needs_sync)
+ continue;
+ if (node->sync_from != our_nodeid)
+ continue;
+ if (node->sent_sync)
+ continue;
+
+ send_sync(node->nodeid);
+ node->sent_sync = 1;
+ }
+}
+
+/* multicast buf to the group with agreed ordering; retries forever on
+   CPG_ERR_TRY_AGAIN (1ms sleep, progress logged every 100 tries).
+   Returns 0 on success, -1 on any other cpg error. */
+
+static int _send_message(cpg_handle_t h, char *buf, int len, int type)
+{
+ struct iovec iov;
+ cpg_error_t error;
+ int retries = 0;
+
+ iov.iov_base = buf;
+ iov.iov_len = len;
+
+ retry:
+ error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1);
+ if (error == CPG_ERR_TRY_AGAIN) {
+ retries++;
+ usleep(1000);
+ if (!(retries % 100))
+ log_debug("cpg_mcast_joined retry %d %d",
+ retries, type);
+ goto retry;
+ }
+ if (error != CPG_OK) {
+ log_error("cpg_mcast_joined error %d handle %llx %d",
+ error, (unsigned long long)h, type);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* number of recent history events carried in each time message */
+#define CHECK_COUNT 8
+
+/* broadcast a time message: our timestamp, last confchg eventid, and up
+   to CHECK_COUNT of our most recent history events so receivers can
+   cross-check that everyone's history agrees */
+
+void send_time(void)
+{
+ char *buf;
+ struct timeval tv;
+ struct dct_header *hd;
+ struct event *ev;
+ uint32_t start_eventid, end_eventid; /* inclusive */
+ int i, count, event_count, len;
+
+ event_count = eventid;
+
+ if (event_count < CHECK_COUNT) {
+ count = event_count;
+ end_eventid = eventid - 1;
+ start_eventid = 0;
+ } else {
+ count = CHECK_COUNT;
+ end_eventid = eventid - 1;
+ start_eventid = end_eventid - count + 1;
+ }
+ /* if eventid is 0, end_eventid wraps to UINT_MAX but count is 0 so
+ the copy loop below does not run */
+ len = sizeof(struct dct_header) + count * sizeof(struct event);
+
+ buf = malloc(len);
+ if (!buf) {
+ log_error("send_time no mem %d", len);
+ return;
+ }
+ memset(buf, 0, len);
+
+ hd = (struct dct_header *)buf;
+ ev = (struct event *)(buf + sizeof(struct dct_header));
+
+ gettimeofday(&tv, NULL);
+
+ hd->type = EV_MSGTIME;
+ hd->nodeid = our_nodeid;
+ hd->tv_sec = tv.tv_sec;
+ hd->tv_usec = tv.tv_usec;
+ hd->last_config = last_config_eventid;
+ hd->event_count = count;
+
+ for (i = start_eventid; i < end_eventid + 1; i++) {
+ memcpy(ev, &history[i], sizeof(struct event));
+ ev++;
+ }
+
+ _send_message(dct_cpg_handle, buf, len, EV_MSGTIME);
+
+ free(buf);
+}
+
+/* TODO: ability to send full history, cpg max message sizes limit us */
+
+#define SYNC_MAX 1000
+
+/* send history events 0..join_eventid (capped at SYNC_MAX) to a newly
+   joined node, plus the list of nodes already in sync, so it can build
+   its own history and nodes list (see receive_sync/read_events) */
+
+void send_sync(uint8_t nodeid)
+{
+ char *buf;
+ struct timeval tv;
+ struct dct_header *hd;
+ struct event *ev;
+ struct node *node;
+ uint32_t start_eventid, end_eventid; /* inclusive */
+ int i, count, event_count, len;
+
+ node = get_node(nodeid);
+ if (!node) {
+ log_error("send_sync no node %u", nodeid);
+ return;
+ }
+
+ if (!node->join_eventid) {
+ log_error("send_sync nodeid %d zero join_eventid", nodeid);
+ return;
+ }
+
+ /* the last event sent is the confchg the target joined in */
+ event_count = node->join_eventid + 1;
+
+ if (event_count > SYNC_MAX) {
+ count = SYNC_MAX;
+ end_eventid = node->join_eventid;
+ start_eventid = end_eventid - count + 1;
+ } else {
+ count = event_count;
+ end_eventid = node->join_eventid;
+ start_eventid = 0;
+ }
+ len = sizeof(struct dct_header) + count * sizeof(struct event);
+
+ buf = malloc(len);
+ if (!buf) {
+ log_error("send_sync no mem %d", len);
+ return;
+ }
+ memset(buf, 0, len);
+
+ hd = (struct dct_header *)buf;
+ ev = (struct event *)(buf + sizeof(struct dct_header));
+
+ gettimeofday(&tv, NULL);
+
+ hd->type = EV_MSGSYNC;
+ hd->nodeid = our_nodeid;
+ hd->to_nodeid = nodeid;
+ hd->tv_sec = tv.tv_sec;
+ hd->tv_usec = tv.tv_usec;
+ hd->last_config = last_config_eventid;
+ hd->event_count = count;
+
+ /* which nodes are synced is part of the replicated state that needs to
+ be synced to new nodes */
+
+ /* note: this loop reuses the node pointer; the target node is not
+ referenced again below */
+ i = 0;
+ list_for_each_entry(node, &nodes, list) {
+ if (!node->is_member)
+ continue;
+ if (!node->needs_sync)
+ hd->synced_nodes[i++] = node->nodeid;
+ }
+
+ for (i = start_eventid; i < end_eventid + 1; i++) {
+ memcpy(ev, &history[i], sizeof(struct event));
+ ev++;
+ }
+
+ log_debug("send_sync to %u len %d count %d events %u-%u",
+ nodeid, len, count, start_eventid, end_eventid);
+
+ _send_message(dct_cpg_handle, buf, len, EV_MSGSYNC);
+
+ free(buf);
+}
+
+/* compare a received event against our copy in history; a mismatch
+   means replicated state has diverged, so log both versions as errors */
+
+void check_event(struct event *ev_buf)
+{
+	struct event *ev_his = &history[ev_buf->eventid];
+
+	if (!memcmp(ev_buf, ev_his, sizeof(struct event)))
+		return;
+
+	log_error("check_event %u", ev_buf->eventid);
+	log_event(ev_buf, 1);
+	log_event(ev_his, 1);
+}
+
+/* last event read should be the one in which we joined,
+ after that, we process saved events, after that we
+ add the sync message these events are being read from */
+
+/*
+ * Read the event structs that follow a sync message header.  With
+ * check_only set, every event is verified against our history; with it
+ * clear (we are the sync target) unseen events are copied into history,
+ * the sync tail is verified against our saved join confchg, and the
+ * nodes list is initialized.  Fix: a zero event_count message passed
+ * the length check but then read start/end_eventid from past the end
+ * of the buffer (end_eventid was also left uninitialized); reject it.
+ */
+
+void read_events(struct dct_header *hd, int len, int check_only)
+{
+	struct event *ev_buf, *ev_his;
+	struct dct_config *c1, *c2;
+	struct save_event *se;
+	uint32_t start_eventid, end_eventid; /* inclusive */
+	int count = hd->event_count;
+	int i;
+
+	if (len != sizeof(struct dct_header) + count * sizeof(struct event)) {
+		log_error("read_events bad len %d count %d", len, count);
+		return;
+	}
+
+	if (!count) {
+		log_error("read_events zero count from %u", hd->nodeid);
+		return;
+	}
+
+	ev_buf = (struct event *)((char *)hd + sizeof(struct dct_header));
+
+	start_eventid = ev_buf->eventid;
+
+	for (i = 0; i < count; i++) {
+		ev_his = &history[ev_buf->eventid];
+
+		/* events we already hold are verified, unseen ones copied
+		   (a zero eventid slot is treated as unseen) */
+		if (check_only || ev_his->eventid)
+			check_event(ev_buf);
+		else
+			memcpy(ev_his, ev_buf, sizeof(struct event));
+
+		end_eventid = ev_buf->eventid;
+		ev_buf++;
+	}
+
+	log_debug("read_events from %u to %u len %d count %d events %u-%u %s",
+		  hd->nodeid, hd->to_nodeid, len, count, start_eventid,
+		  end_eventid, check_only ? "check" : "copy");
+
+	if (check_only)
+		return;
+
+	/* verify last event in sync series is our join. it should match the
+	   first saved event which should be our join config. we remove this
+	   first saved event which overlaps what we got from sync */
+
+	ev_his = &history[end_eventid];
+	c1 = &ev_his->config;
+
+	if (c1->type != EV_CONFCHG) {
+		log_error("read_events history event not confchg");
+		log_event(ev_his, 1);
+	}
+
+	se = list_first_entry(&saved_events, struct save_event, list);
+	c2 = (struct dct_config *)se->buf;
+
+	if (se->type != EV_CONFCHG) {
+		log_error("read_events saved event not confchg");
+		log_event((struct event *)se->buf, 1);
+	}
+
+	if (!se->we_join) {
+		log_error("read_events first entry not our join");
+		log_config(c2, 1);
+	}
+
+	if (memcmp(c1, c2, sizeof(struct dct_config))) {
+		log_error("read_events no config match");
+		log_config(c1, 1);
+		log_config(c2, 1);
+	}
+
+	/* our history now runs through our join confchg */
+	eventid = end_eventid + 1;
+	last_config_eventid = end_eventid;
+	last_config = c1;
+
+	list_del(&se->list);
+	free(se);
+
+	init_nodes_list(hd, c1);
+}
+
+/* append a confchg to history at the next eventid and remember it as
+   the current configuration.
+   NOTE(review): no bounds check against history_len — assumes the
+   history array is sized for the run; confirm against main() */
+
+void add_history_confchg(struct dct_config *c)
+{
+ struct event *ev;
+
+ ev = &history[eventid];
+ ev->eventid = eventid++;
+
+ memcpy(&ev->config, c, sizeof(struct dct_config));
+
+ _log_config(c, ev->eventid, 0);
+
+ last_config_eventid = ev->eventid;
+ last_config = &ev->config;
+}
+
+/* append a message header to history at the next eventid; only the
+   fixed header is stored, so len is currently unused */
+
+void add_history_message(struct dct_header *h, int len)
+{
+ struct event *ev;
+
+ ev = &history[eventid];
+ ev->eventid = eventid++;
+
+ memcpy(&ev->header, h, sizeof(struct dct_header));
+
+ _log_header(h, ev->eventid, 0);
+}
+
+/* process events that occurred between our join event (which is processed in
+ receive_sync and removed from saved_events) and our sync event */
+
+void read_saved_events(void)
+{
+ struct save_event *se, *safe;
+ int count_c = 0, count_m = 0;
+
+ log_debug("read_saved_events");
+
+ /* replay, in arrival order, everything queued while we waited for
+ our sync; read_events already consumed our own join confchg */
+ list_for_each_entry_safe(se, safe, &saved_events, list) {
+ if (se->type == EV_CONFCHG) {
+ add_history_confchg((struct dct_config *)&se->buf);
+ update_nodes_list(last_config, last_config_eventid);
+ count_c++;
+ } else {
+ process_message((struct dct_header *)&se->buf, se->len);
+ count_m++;
+ }
+
+ list_del(&se->list);
+ free(se);
+ }
+
+ log_debug("read_saved_events confchg %d message %d", count_c, count_m);
+}
+
+/* queue a full message (header plus trailing events) for replay once
+   our own sync has completed; type is EV_MSGTIME or EV_MSGSYNC */
+
+void save_message(struct dct_header *hd, int len, int type)
+{
+	struct save_event *saved;
+	int size = sizeof(struct save_event) + len;
+
+	saved = malloc(size);
+	if (!saved) {
+		log_error("save_message no mem %d", len);
+		return;
+	}
+	memset(saved, 0, size);
+
+	saved->type = type;
+	saved->len = len;
+	memcpy(&saved->buf, hd, len);
+
+	list_add_tail(&saved->list, &saved_events);
+}
+
+/* queue a confchg for replay once our own sync has completed; we_join
+   marks the confchg in which we ourselves joined (len stays 0 since
+   confchg entries have a fixed size) */
+
+void save_confchg(struct dct_config *c, int we_join)
+{
+	struct save_event *saved;
+	int size = sizeof(struct save_event) + sizeof(struct dct_config);
+
+	saved = malloc(size);
+	if (!saved) {
+		log_error("save_confchg no mem %d", size);
+		return;
+	}
+	memset(saved, 0, size);
+
+	saved->type = EV_CONFCHG;
+	saved->we_join = we_join;
+	memcpy(&saved->buf, c, sizeof(struct dct_config));
+
+	list_add_tail(&saved->list, &saved_events);
+}
+
+/* cpg configuration-change callback: convert the lists to a dct_config,
+   drive the join/sync/leave state machine, and either record the event
+   in history or queue it until our sync arrives */
+
+static void confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name,
+ const struct cpg_address *memb_list,
+ size_t memb_list_entries,
+ const struct cpg_address *left_list,
+ size_t left_list_entries,
+ const struct cpg_address *join_list,
+ size_t join_list_entries)
+{
+ struct dct_config c;
+ struct node *node;
+ int we_left, we_join;
+ int i;
+
+ dispatch_count++;
+
+ memset(&c, 0, sizeof(struct dct_config));
+
+ c.type = EV_CONFCHG;
+
+ c.memb_count = memb_list_entries;
+ c.left_count = left_list_entries;
+ c.join_count = join_list_entries;
+
+ /* nodeids are truncated to one byte (valid range enforced in do_join) */
+ for (i = 0; i < memb_list_entries; i++)
+ c.memb[i] = (uint8_t)memb_list[i].nodeid;
+ for (i = 0; i < left_list_entries; i++)
+ c.left[i] = (uint8_t)left_list[i].nodeid;
+ for (i = 0; i < join_list_entries; i++)
+ c.join[i] = (uint8_t)join_list[i].nodeid;
+
+ if (leave_done) {
+ /* the we_left confchg should be the very last event we see */
+ log_error("confchg_cb after leave_done");
+ log_config(&c, 1);
+ return;
+ }
+
+ we_left = leave_wait && in_left(&c, our_nodeid);
+ we_join = join_wait && in_memb(&c, our_nodeid);
+
+ if (we_left) {
+ leave_wait = 0;
+ leave_done = 1;
+ cpg_finalize(dct_cpg_handle);
+ client_dead(dct_cpg_client);
+ }
+
+ if (!we_left && !in_memb(&c, our_nodeid)) {
+ log_error("confchg_cb without our_nodeid %u", our_nodeid);
+ log_config(&c, 1);
+ return;
+ }
+
+ if (we_join) {
+ join_wait = 0;
+ join_done = 1;
+ sync_wait = 1;
+ sync_done = 0;
+ }
+
+ /* shortcut to bootstrap things, doesn't work if more than one node
+ join in the first confchg. If it happens, we'll see the initial
+ members all sitting in sync_wait for an existing member (there is
+ none) to send a sync message. To do this properly we'd probably
+ need to have everyone exchange state messages for each confchg so we
+ could detect when multiple new nodes join in the first confchg of a
+ newly formed cpg. Or, rely on the join list to be the same as the
+ member list? */
+
+ if (we_join && c.memb_count == 1) {
+ sync_wait = 0;
+ sync_done = 1;
+ node = add_node(our_nodeid);
+ node->is_member = 1;
+ node->needs_sync = 0;
+ add_history_confchg(&c);
+ return;
+ }
+
+ /* not yet synced: queue the confchg for replay after our sync */
+ if (sync_wait) {
+ save_confchg(&c, we_join);
+ return;
+ }
+
+ add_history_confchg(&c);
+ update_nodes_list(last_config, last_config_eventid);
+ update_nodes_sync_from();
+}
+
+/* handle an EV_MSGSYNC message: if it is addressed to us while we wait
+   for sync, adopt its history and replay queued events; otherwise use
+   it to verify our history and mark the target node as synced */
+
+static void receive_sync(struct dct_header *hd, int len)
+{
+ if (sync_wait) {
+ if (hd->to_nodeid != our_nodeid) {
+ /* save events to add to history after we're synced */
+ save_message(hd, len, EV_MSGSYNC);
+ } else {
+ sync_wait = 0;
+ sync_done = 1;
+
+ read_events(hd, len, 0);
+ read_saved_events();
+
+ add_history_message(hd, len);
+ set_node_synced(our_nodeid);
+ }
+ } else {
+ if (hd->to_nodeid == our_nodeid)
+ log_debug("receive_sync from %d redundant", hd->nodeid);
+
+ /* check we agree with past events being synced to new node */
+ read_events(hd, len, 1);
+
+ add_history_message(hd, len);
+ set_node_synced(hd->to_nodeid);
+ }
+}
+
+/* handle an EV_MSGTIME message: validate the sender, record the header
+   in history, and verify the piggy-backed history event against ours */
+
+static void receive_time(struct dct_header *hd, int len)
+{
+ struct event *ev_buf;
+ struct node *node;
+
+ if (sync_wait) {
+ /* save events to add to history after we're synced */
+ save_message(hd, len, EV_MSGTIME);
+ return;
+ }
+
+ /* check if the sender is a member of the last configuration */
+
+ if (!in_memb(last_config, hd->nodeid)) {
+ log_error("receive_time from non member");
+ log_header(hd, 1);
+ log_config(last_config, 1);
+ return;
+ }
+
+ /* check our scheme of syncing nodes, where nodes don't send time
+ messages until they're synced */
+
+ node = get_node(hd->nodeid);
+ if (!node) {
+ log_error("receive_time no node %u", hd->nodeid);
+ log_header(hd, 1);
+ return;
+ }
+
+ if (node->needs_sync) {
+ log_error("receive_time from %u needs_sync", hd->nodeid);
+ log_header(hd, 1);
+ return;
+ }
+
+ add_history_message(hd, len);
+
+ /* an event from past history (which everyone should know about) is
+ included in time messages as a way to check that nodes are staying
+ in sync as we go */
+
+ /* NOTE(review): assumes event_count >= 1; a zero-count time message
+ would make this read past the header — confirm send_time never
+ sends count 0 after sync (first confchg gives eventid >= 1) */
+ ev_buf = (struct event *)((char *)hd + sizeof(struct dct_header));
+
+ check_event(ev_buf);
+
+ node->last_check_eventid = ev_buf->eventid;
+
+ /* this check currently fails with corosync */
+#if 0
+ /* Track whether messages sent in configuration C1 are delivered in C1
+ instead of a subsequent C2. Recent reading led me to believe that
+ this was one of the VS guarantees, but I'm not certain. */
+
+ if (hd->last_config != last_config_eventid) {
+ log_error("receive_time in other config");
+ log_header(hd, 1);
+ log_config(last_config, 1);
+ }
+#endif
+}
+
+/* route an incoming message by its type byte; anything else is
+   silently ignored (deliver_cb has already validated the header) */
+
+static void process_message(struct dct_header *hd, int len)
+{
+	if (hd->type == EV_MSGTIME)
+		receive_time(hd, len);
+	else if (hd->type == EV_MSGSYNC)
+		receive_sync(hd, len);
+}
+
+/*
+ * cpg message-delivery callback: sanity check the message against our
+ * join/leave state, then hand it to process_message.  Fixes: len is a
+ * size_t and was printed with %d (undefined format mismatch) — use %zu;
+ * and a short message no longer has its header logged, since the header
+ * fields would be read from beyond the delivered data.
+ */
+
+static void deliver_cb(cpg_handle_t handle, const struct cpg_name *group_name,
+		       uint32_t nodeid, uint32_t pid, void *data, size_t len)
+{
+	struct dct_header *hd = data;
+
+	dispatch_count++;
+
+	if (len < sizeof(struct dct_header)) {
+		log_error("deliver_cb short message %zu", len);
+		return;
+	}
+
+	if (hd->nodeid != (uint8_t)nodeid) {
+		log_error("bad msg nodeid %u %u", hd->nodeid, nodeid);
+		log_header(hd, 1);
+		return;
+	}
+
+	if (join_wait) {
+		/* the we_joined confchg should be the first event we see */
+		log_error("deliver_cb before joined");
+		log_header(hd, 1);
+		return;
+	}
+
+	if (leave_done) {
+		/* the we_left confchg should be the last event we see */
+		log_error("deliver_cb after left");
+		log_header(hd, 1);
+		return;
+	}
+
+	process_message(hd, (int)len);
+}
+
+/* callbacks invoked from within cpg_dispatch() (see process_cpg) */
+static cpg_callbacks_t cpg_callbacks = {
+ .cpg_deliver_fn = deliver_cb,
+ .cpg_confchg_fn = confchg_cb,
+};
+
+/* workfn for the cpg fd: drain all pending callbacks, then send any
+   sync messages that the callbacks flagged as needed */
+
+static void process_cpg(int ci)
+{
+ cpg_error_t error;
+
+ error = cpg_dispatch(dct_cpg_handle, CPG_DISPATCH_ALL);
+ if (error != CPG_OK) {
+ log_error("cpg_dispatch error %d", error);
+ return;
+ }
+
+ /* can't send from dispatch, so just flag the nodes that need
+ syncing during the dispatch and send the sync messages now */
+
+ send_syncs();
+}
+
+/*
+ * Initialize a cpg connection, look up and validate our nodeid, register
+ * the cpg fd with the poll loop, and join the test group.  Returns 0 on
+ * success, -1 on failure.  Fixes: the cpg_join failure path finalized
+ * the handle inline and then again at fail_fin (double finalize) — the
+ * labels now perform each cleanup exactly once; an unused local was
+ * also removed.
+ */
+
+int do_join(void)
+{
+	cpg_error_t error;
+	cpg_handle_t h;
+	int i = 0, fd, ci, rv;
+	uint32_t nodeid;
+
+	sprintf(dct_cpg_name.value, "dct_cpg");
+	dct_cpg_name.length = 8;
+
+	error = cpg_initialize(&h, &cpg_callbacks);
+	if (error != CPG_OK) {
+		log_error("cpg_initialize error %d", error);
+		rv = -1;
+		goto fail_out;
+	}
+
+	error = cpg_local_get(h, &nodeid);
+	if (error != CPG_OK) {
+		log_error("cpg_local_get error %d", error);
+		rv = -1;
+		goto fail_fin;
+	}
+
+	/* nodeids are packed into single bytes in messages */
+	if (nodeid < 1 || nodeid > 255) {
+		log_error("nodeids must be between 1 and 255");
+		rv = -1;
+		goto fail_fin;
+	}
+	our_nodeid = (uint8_t)nodeid;
+
+	cpg_fd_get(h, &fd);
+
+	ci = client_add(fd, process_cpg, cluster_dead);
+
+	dct_cpg_handle = h;
+	dct_cpg_client = ci;
+	dct_cpg_fd = fd;
+
+	log_debug("do join our_nodeid %u", our_nodeid);
+ retry:
+	error = cpg_join(h, &dct_cpg_name);
+	if (error == CPG_ERR_TRY_AGAIN) {
+		sleep(1);
+		if (!(++i % 10))
+			log_debug("cpg_join error retrying");
+		goto retry;
+	}
+	if (error != CPG_OK) {
+		log_error("cpg_join error %d", error);
+		rv = -1;
+		goto fail;
+	}
+
+	return 0;
+
+ fail:
+	client_dead(ci);
+ fail_fin:
+	cpg_finalize(h);
+ fail_out:
+	return rv;
+}
+
+/* ask cpg to remove us from the group; the leave completes when our
+   own confchg arrives (handled in confchg_cb).  Retries forever on
+   CPG_ERR_TRY_AGAIN; always returns 0. */
+
+int do_leave(void)
+{
+ cpg_error_t error;
+ int i = 0;
+
+ retry:
+ error = cpg_leave(dct_cpg_handle, &dct_cpg_name);
+ if (error == CPG_ERR_TRY_AGAIN) {
+ sleep(1);
+ if (!(++i % 10))
+ log_debug("cpg_leave error retrying");
+ goto retry;
+ }
+ if (error != CPG_OK)
+ log_error("cpg_leave error %d", error);
+
+ return 0;
+}
+
+/* time of the last send and the randomized delay before the next one */
+static struct timeval last_send;
+static int wait_send;
+
+/* random interval from 5 to 100 ms between every send */
+
+int we_should_send(void)
+{
+ struct timeval now;
+ unsigned long ms;
+
+ gettimeofday(&now, NULL);
+
+ ms = time_diff_ms(&last_send, &now);
+
+ if (ms >= wait_send) {
+ last_send = now;
+ wait_send = rand_int(5, 100);
+ return 1;
+ }
+ return 0;
+}
+
+/* randomly decide (about 1 in 2000 polls) to leave the group; tries
+   counts polls since the last leave and resets when we decide to go */
+
+int we_should_leave(void)
+{
+	static unsigned int tries;
+
+	if (!opt_leave)
+		return 0;
+
+	tries++;
+
+	if (rand_int(1, 2000) != 200)
+		return 0;
+
+	log_debug("do leave %u", tries);
+	tries = 0;
+	return 1;
+}
+
+/* randomly decide (about 1 in 2000 polls) to simulate a node crash;
+   unlike we_should_leave, tries is never reset (process exits) */
+
+int we_should_fail(void)
+{
+	static unsigned int tries;
+
+	if (!opt_fail)
+		return 0;
+
+	tries++;
+
+	if (rand_int(1, 2000) != 100)
+		return 0;
+
+	log_debug("do fail %u", tries);
+	return 1;
+}
+
+/* main loop for one join/run/leave cycle: reset state, join the group,
+   then poll — dispatching cpg events and, once synced, randomly sending
+   time messages, leaving, or simulating a crash */
+
+void loop(void)
+{
+ void (*workfn) (int ci);
+ void (*deadfn) (int ci);
+ int poll_timeout = 5; /* ms */
+ int rv, i;
+
+ srandom(time(NULL));
+
+ sync_wait = 0;
+ sync_done = 0;
+ leave_wait = 0;
+ leave_done = 0;
+
+ join_wait = 1;
+ join_done = 0;
+
+ eventid = 0;
+
+ /* history restarts every join; it may have been restarted by the
+ group while we were away (see the outline comment at the top) */
+ memset(history, 0, history_len);
+ free_nodes_list();
+
+ do_join();
+
+ for (;;) {
+ rv = poll(pollfd, client_maxi + 1, poll_timeout);
+ if (rv == -1 && errno == EINTR) {
+ if (daemon_quit)
+ goto out;
+ continue;
+ }
+ if (rv < 0) {
+ log_error("poll errno %d", errno);
+ goto out;
+ }
+
+ /*
+ * read events from callbacks
+ */
+
+ for (i = 0; i <= client_maxi; i++) {
+ if (client[i].fd < 0)
+ continue;
+ if (pollfd[i].revents & POLLIN) {
+ workfn = client[i].workfn;
+ workfn(i);
+ }
+ if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)){
+ deadfn = client[i].deadfn;
+ deadfn(i);
+ }
+ }
+
+ /* any log_error anywhere aborts the test run */
+ if (got_error) {
+ fflush(stdout);
+ fflush(stderr);
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * do things that create events (send messages, leave, fail)
+ */
+
+ if (!join_done && dispatch_count) {
+ /* a confchg for our join should be the first event we
+ see, so this shouldn't happen. without the
+ dispatch_count condition this error triggers
+ sometimes, indicating zero dispatch_count */
+ log_error("event before join done dispatches %u",
+ dispatch_count);
+ continue;
+ }
+
+ if (!sync_done) {
+ /* don't do things until we're synced */
+ continue;
+ }
+
+ if (leave_done)
+ break;
+
+ if (leave_wait) {
+ /* don't send messages while waiting for our leave
+ to complete, but we must keep reading events until
+ we've left, which is when a confchg for our leave
+ arrives. */
+ continue;
+ }
+
+ if (we_should_leave()) {
+ leave_wait = 1;
+ leave_done = 0;
+ do_leave();
+ continue;
+ }
+
+ if (we_should_send())
+ send_time();
+
+ /* simulated crash: exit without leaving the group */
+ if (we_should_fail()) {
+ fflush(stdout);
+ fflush(stderr);
+ exit(2);
+ }
+ }
+ out:
+ return;
+}
+
/* Print the output legend and the command-line options to stdout. */
void print_usage(void)
{
	printf("Output:\n"
	       "ERROR: <error string> (stderr)\n"
	       "H: <event string> (stdout)\n"
	       "D: <debug string> (stdout)\n"
	       "\n"
	       "Options:\n"
	       "  -H [0|1]   event history output [off|on] default 1\n"
	       "  -D [0|1]   debug output [off|on] default 1\n"
	       "  -f [0|1]   fail included in test [off|on] default 1\n"
	       "  -l [0|1]   leave included in test [off|on] default 1\n"
	       "  -c         continue after error\n");

	printf("to prevent history from periodically restarting from 0,\n"
	       "keep one node from either leaving or failing with -f0 -l0\n");
}
+
+#define HISTORY_EVENTS (1024 * 1024)
+
+int main(int argc, char **argv)
+{
+ pid_t pid;
+ int status, code;
+ int cont = 1;
+ int optchar;
+
+ while (cont) {
+ optchar = getopt(argc, argv, "H:D:cf:l:h");
+
+ switch (optchar) {
+ case 'H':
+ opt_print_event = atoi(optarg);
+ break;
+
+ case 'D':
+ opt_print_debug = atoi(optarg);
+ break;
+
+ case 'f':
+ opt_fail = atoi(optarg);
+ break;
+
+ case 'l':
+ opt_leave = atoi(optarg);
+ break;
+
+ case 'c':
+ continue_after_error = 1;
+ break;
+
+ case 'h':
+ print_usage();
+ exit(EXIT_SUCCESS);
+
+ case EOF:
+ cont = 0;
+ break;
+ };
+ }
+
+ srandom(time(NULL));
+
+ history_len = HISTORY_EVENTS * sizeof(struct event);
+
+ history = malloc(history_len);
+ if (!history) {
+ log_error("history no mem %d", history_len);
+ exit(-1);
+ }
+
+ INIT_LIST_HEAD(&nodes);
+ INIT_LIST_HEAD(&saved_events);
+
+ while (1) {
+ pid = fork();
+
+ if (!pid) {
+ /*
+ * repeat join/work/leave until exit
+ */
+
+ while (1) {
+ loop();
+ sleep(rand_int(0, 3));
+ }
+ }
+
+ /*
+ * parent waits for exit,
+ * exit 1 is error; stop to see what went wrong
+ * exit 2 is intentional part of test, keep going
+ */
+
+ waitpid(pid, &status, 0);
+
+ if (WIFEXITED(status)) {
+ code = WEXITSTATUS(status);
+ if (code == 1 && !continue_after_error)
+ break;
+ } else {
+ printf("not WIFEXITED\n");
+ break;
+ }
+ sleep(rand_int(0, 3));
+ }
+
+ return 1;
+}
+
diff --git a/cpgx/list.h b/cpgx/list.h
new file mode 100644
index 0000000..8100cbc
--- /dev/null
+++ b/cpgx/list.h
@@ -0,0 +1,336 @@
+/* Copied from include/linux/list.h */
+
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

/**
 * container_of - cast a member of a structure out to the containing structure
 *
 * @ptr: the pointer to the member.
 * @type: the type of the container struct this is embedded in.
 * @member: the name of the member within the struct.
 *
 * NOTE: uses offsetof(), so the including file must provide
 * <stddef.h>; typeof and the ({ ... }) statement expression are GCC
 * extensions (this header is a copy of the kernel's list.h).
 */
#define container_of(ptr, type, member) ({ \
	const typeof( ((type *)0)->member ) *__mptr = (ptr); \
	(type *)( (char *)__mptr - offsetof(type,member) );})
+
+
/*
 * Poison values stored in the links of a deleted entry: non-NULL
 * pointers that will page-fault if followed, so use of a stale entry
 * is caught instead of silently corrupting a list.
 */
#define LIST_POISON1  ((void *) 0x00100100)
#define LIST_POISON2  ((void *) 0x00200200)

/*
 * Simple circular doubly linked list.  A list is represented by a
 * sentinel head whose next/prev point at the first/last entries; an
 * empty list is a head linked to itself.  The "__" helpers work on
 * raw prev/next pairs and are used internally when both neighbours
 * are already known.
 */

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
	struct list_head name = LIST_HEAD_INIT(name)

#define INIT_LIST_HEAD(ptr) do { \
	(ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

/* Link @new between the known-adjacent nodes @prev and @next. */
static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	new->next = next;
	new->prev = prev;
	prev->next = new;
	next->prev = new;
}

/* list_add - insert @new right after @head (stack/LIFO order). */
static inline void list_add(struct list_head *new, struct list_head *head)
{
	__list_add(new, head, head->next);
}

/* list_add_tail - insert @new right before @head (queue/FIFO order). */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
	__list_add(new, head->prev, head);
}

/* Unlink whatever lies between @prev and @next by joining the two. */
static inline void __list_del(struct list_head *prev, struct list_head *next)
{
	prev->next = next;
	next->prev = prev;
}

/*
 * list_del - remove @entry from its list and poison its links.  The
 * entry is afterwards in an undefined state; list_empty() on it does
 * not return true.
 */
static inline void list_del(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	entry->prev = LIST_POISON2;
	entry->next = LIST_POISON1;
}

/* list_del_init - remove @entry and leave it as a valid empty list. */
static inline void list_del_init(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	INIT_LIST_HEAD(entry);
}

/* list_move - remove @list from its list and re-add it after @head. */
static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add(list, head);
}

/* list_move_tail - remove @list and re-add it before @head. */
static inline void list_move_tail(struct list_head *list,
				  struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add_tail(list, head);
}

/* list_empty - true when @head links only to itself. */
static inline int list_empty(const struct list_head *head)
{
	return head->next == head;
}

/*
 * list_empty_careful - emptiness test that also checks both link
 * fields agree.  Without external synchronization this is only safe
 * when the sole concurrent mutation possible is list_del_init().
 */
static inline int list_empty_careful(const struct list_head *head)
{
	const struct list_head *first = head->next;

	return (first == head) && (first == head->prev);
}

/* Insert the entries of non-empty @list right after @head. */
static inline void __list_splice(struct list_head *list,
				 struct list_head *head)
{
	struct list_head *seg_first = list->next;
	struct list_head *seg_last = list->prev;
	struct list_head *after = head->next;

	head->next = seg_first;
	seg_first->prev = head;

	seg_last->next = after;
	after->prev = seg_last;
}

/*
 * list_splice - join two lists: move the entries of @list to sit just
 * after @head.  @list itself is left stale (not reinitialised).
 */
static inline void list_splice(struct list_head *list, struct list_head *head)
{
	if (list_empty(list))
		return;
	__list_splice(list, head);
}

/*
 * list_splice_init - like list_splice(), but @list is reinitialised
 * to a valid empty list afterwards.
 */
static inline void list_splice_init(struct list_head *list,
				    struct list_head *head)
{
	if (list_empty(list))
		return;
	__list_splice(list, head);
	INIT_LIST_HEAD(list);
}
+
/**
 * list_entry - get the struct for this entry
 * @ptr: the &struct list_head pointer.
 * @type: the type of the struct this is embedded in.
 * @member: the name of the list_struct within the struct.
 */
#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)

/**
 * list_first_entry - get the first element from a list
 * @ptr: the list head to take the element from.
 * @type: the type of the struct this is embedded in.
 * @member: the name of the list_struct within the struct.
 *
 * Note, that list is expected to be not empty.
 */
#define list_first_entry(ptr, type, member) \
	list_entry((ptr)->next, type, member)

/**
 * list_for_each - iterate over a list
 * @pos: the &struct list_head to use as a loop counter.
 * @head: the head for your list.
 *
 * NOTE: @head is expanded more than once; pass an expression without
 * side effects.  Not safe if @pos is removed inside the loop body.
 */
#define list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); pos = pos->next)

/**
 * __list_for_each - iterate over a list
 * @pos: the &struct list_head to use as a loop counter.
 * @head: the head for your list.
 *
 * This variant differs from list_for_each() in that it's the
 * simplest possible list iteration code, no prefetching is done.
 * Use this for code that knows the list to be very short (empty
 * or 1 entry) most of the time.
 * (In this copy it is textually identical to list_for_each().)
 */
#define __list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); pos = pos->next)

/**
 * list_for_each_prev - iterate over a list backwards
 * @pos: the &struct list_head to use as a loop counter.
 * @head: the head for your list.
 */
#define list_for_each_prev(pos, head) \
	for (pos = (head)->prev; pos != (head); pos = pos->prev)

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos: the &struct list_head to use as a loop counter.
 * @n: another &struct list_head to use as temporary storage
 * @head: the head for your list.
 *
 * Safe because the successor is fetched into @n before the body runs,
 * so @pos may be deleted inside the loop.
 */
#define list_for_each_safe(pos, n, head) \
	for (pos = (head)->next, n = pos->next; pos != (head); \
		pos = n, n = pos->next)

/**
 * list_for_each_entry - iterate over list of given type
 * @pos: the type * to use as a loop counter.
 * @head: the head for your list.
 * @member: the name of the list_struct within the struct.
 *
 * NOTE: not safe against removal of @pos inside the loop; use
 * list_for_each_entry_safe() for that.
 */
#define list_for_each_entry(pos, head, member) \
	for (pos = list_entry((head)->next, typeof(*pos), member); \
	     &pos->member != (head); \
	     pos = list_entry(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos: the type * to use as a loop counter.
 * @head: the head for your list.
 * @member: the name of the list_struct within the struct.
 */
#define list_for_each_entry_reverse(pos, head, member) \
	for (pos = list_entry((head)->prev, typeof(*pos), member); \
	     &pos->member != (head); \
	     pos = list_entry(pos->member.prev, typeof(*pos), member))

/**
 * list_prepare_entry - prepare a pos entry for use as a start point in
 * list_for_each_entry_continue
 * @pos: the type * to use as a start point
 * @head: the head of the list
 * @member: the name of the list_struct within the struct.
 *
 * Yields @pos unchanged when non-NULL, otherwise the head disguised as
 * an entry so that _continue starts from the beginning.
 */
#define list_prepare_entry(pos, head, member) \
	((pos) ? : list_entry(head, typeof(*pos), member))

/**
 * list_for_each_entry_continue - iterate over list of given type
 * continuing after existing point
 * @pos: the type * to use as a loop counter.
 * @head: the head for your list.
 * @member: the name of the list_struct within the struct.
 */
#define list_for_each_entry_continue(pos, head, member) \
	for (pos = list_entry(pos->member.next, typeof(*pos), member); \
	     &pos->member != (head); \
	     pos = list_entry(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos: the type * to use as a loop counter.
 * @n: another type * to use as temporary storage
 * @head: the head for your list.
 * @member: the name of the list_struct within the struct.
 */
#define list_for_each_entry_safe(pos, n, head, member) \
	for (pos = list_entry((head)->next, typeof(*pos), member), \
		n = list_entry(pos->member.next, typeof(*pos), member); \
	     &pos->member != (head); \
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))


#endif