summaryrefslogtreecommitdiffstats
path: root/net/rds
diff options
context:
space:
mode:
Diffstat (limited to 'net/rds')
-rw-r--r--net/rds/Kconfig26
-rw-r--r--net/rds/Makefile11
-rw-r--r--net/rds/af_rds.c8
-rw-r--r--net/rds/bind.c3
-rw-r--r--net/rds/cong.c1
-rw-r--r--net/rds/connection.c54
-rw-r--r--net/rds/ib.c7
-rw-r--r--net/rds/ib.h18
-rw-r--r--net/rds/ib_cm.c62
-rw-r--r--net/rds/ib_rdma.c12
-rw-r--r--net/rds/ib_recv.c53
-rw-r--r--net/rds/ib_stats.c4
-rw-r--r--net/rds/ib_sysctl.c12
-rw-r--r--net/rds/info.c3
-rw-r--r--net/rds/iw.c16
-rw-r--r--net/rds/iw.h1
-rw-r--r--net/rds/iw_rdma.c28
-rw-r--r--net/rds/iw_send.c2
-rw-r--r--net/rds/iw_stats.c4
-rw-r--r--net/rds/message.c6
-rw-r--r--net/rds/page.c3
-rw-r--r--net/rds/rdma_transport.c16
-rw-r--r--net/rds/rds.h9
-rw-r--r--net/rds/recv.c28
-rw-r--r--net/rds/send.c3
-rw-r--r--net/rds/stats.c6
-rw-r--r--net/rds/tcp.c320
-rw-r--r--net/rds/tcp.h93
-rw-r--r--net/rds/tcp_connect.c153
-rw-r--r--net/rds/tcp_listen.c199
-rw-r--r--net/rds/tcp_recv.c356
-rw-r--r--net/rds/tcp_send.c263
-rw-r--r--net/rds/tcp_stats.c74
-rw-r--r--net/rds/threads.c2
-rw-r--r--net/rds/transport.c31
35 files changed, 1747 insertions, 140 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index 796773b5df9..ec753b3ae72 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -1,14 +1,28 @@
config RDS
- tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
- depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
- depends on INFINIBAND && INFINIBAND_ADDR_TRANS
+ tristate "The RDS Protocol (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
---help---
- RDS provides reliable, sequenced delivery of datagrams
- over Infiniband.
+ The RDS (Reliable Datagram Sockets) protocol provides reliable,
+ sequenced delivery of datagrams over Infiniband, iWARP,
+ or TCP.
+
+config RDS_RDMA
+ tristate "RDS over Infiniband and iWARP"
+ depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
+ ---help---
+ Allow RDS to use Infiniband and iWARP as a transport.
+ This transport supports RDMA operations.
+
+config RDS_TCP
+ tristate "RDS over TCP"
+ depends on RDS
+ ---help---
+ Allow RDS to use TCP as a transport.
+ This transport does not support RDMA operations.
config RDS_DEBUG
- bool "Debugging messages"
+ bool "RDS debugging messages"
depends on RDS
default n
diff --git a/net/rds/Makefile b/net/rds/Makefile
index 51f27585fa0..b46eca10968 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -1,13 +1,20 @@
obj-$(CONFIG_RDS) += rds.o
rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
recv.o send.o stats.o sysctl.o threads.o transport.o \
- loop.o page.o rdma.o \
- rdma_transport.o \
+ loop.o page.o rdma.o
+
+obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
+rds_rdma-objs := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
ib_sysctl.o ib_rdma.o \
iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
iw_sysctl.o iw_rdma.o
+
+obj-$(CONFIG_RDS_TCP) += rds_tcp.o
+rds_tcp-objs := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
+ tcp_send.o tcp_stats.o
+
ifeq ($(CONFIG_RDS_DEBUG), y)
EXTRA_CFLAGS += -DDEBUG
endif
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index b11e7e52786..108ed2e671c 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -39,7 +39,6 @@
#include "rds.h"
#include "rdma.h"
-#include "rdma_transport.h"
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
@@ -509,7 +508,6 @@ out:
static void __exit rds_exit(void)
{
- rds_rdma_exit();
sock_unregister(rds_family_ops.family);
proto_unregister(&rds_proto);
rds_conn_exit();
@@ -549,14 +547,8 @@ static int __init rds_init(void)
rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
- /* ib/iwarp transports currently compiled-in */
- ret = rds_rdma_init();
- if (ret)
- goto out_sock;
goto out;
-out_sock:
- sock_unregister(rds_family_ops.family);
out_proto:
proto_unregister(&rds_proto);
out_stats:
diff --git a/net/rds/bind.c b/net/rds/bind.c
index c17cc39160c..5d95fc007f1 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -187,6 +187,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (trans == NULL) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
+ if (printk_ratelimit())
+ printk(KERN_INFO "RDS: rds_bind() could not find a transport, "
+ "load rds_tcp or rds_rdma?\n");
goto out;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 710e4599d76..dd2711df640 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -254,6 +254,7 @@ void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
}
}
+EXPORT_SYMBOL_GPL(rds_cong_map_updated);
int rds_cong_updated_since(unsigned long *recent)
{
diff --git a/net/rds/connection.c b/net/rds/connection.c
index d14445c4830..cc8b568c0c8 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -126,7 +126,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
- struct rds_connection *conn, *tmp, *parent = NULL;
+ struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
unsigned long flags;
int ret;
@@ -155,7 +155,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
}
INIT_HLIST_NODE(&conn->c_hash_node);
- conn->c_version = RDS_PROTOCOL_3_0;
conn->c_laddr = laddr;
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
@@ -211,26 +210,40 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
trans->t_name ? trans->t_name : "[unknown]",
is_outgoing ? "(outgoing)" : "");
+ /*
+ * Since we ran without holding the conn lock, someone could
+ * have created the same conn (either normal or passive) in the
+ * interim. We check while holding the lock. If we won, we complete
+ * init and return our conn. If we lost, we rollback and return the
+ * other one.
+ */
spin_lock_irqsave(&rds_conn_lock, flags);
- if (parent == NULL) {
- tmp = rds_conn_lookup(head, laddr, faddr, trans);
- if (tmp == NULL)
- hlist_add_head(&conn->c_hash_node, head);
- } else {
- tmp = parent->c_passive;
- if (!tmp)
+ if (parent) {
+ /* Creating passive conn */
+ if (parent->c_passive) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = parent->c_passive;
+ } else {
parent->c_passive = conn;
- }
-
- if (tmp) {
- trans->conn_free(conn->c_transport_data);
- kmem_cache_free(rds_conn_slab, conn);
- conn = tmp;
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
} else {
- rds_cong_add_conn(conn);
- rds_conn_count++;
+ /* Creating normal conn */
+ struct rds_connection *found;
+
+ found = rds_conn_lookup(head, laddr, faddr, trans);
+ if (found) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = found;
+ } else {
+ hlist_add_head(&conn->c_hash_node, head);
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
}
-
spin_unlock_irqrestore(&rds_conn_lock, flags);
out:
@@ -242,12 +255,14 @@ struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
{
return __rds_conn_create(laddr, faddr, trans, gfp, 0);
}
+EXPORT_SYMBOL_GPL(rds_conn_create);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
return __rds_conn_create(laddr, faddr, trans, gfp, 1);
}
+EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
void rds_conn_destroy(struct rds_connection *conn)
{
@@ -290,6 +305,7 @@ void rds_conn_destroy(struct rds_connection *conn)
rds_conn_count--;
}
+EXPORT_SYMBOL_GPL(rds_conn_destroy);
static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
@@ -393,6 +409,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
spin_unlock_irqrestore(&rds_conn_lock, flags);
}
+EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
static int rds_conn_info_visitor(struct rds_connection *conn,
void *buffer)
@@ -468,6 +485,7 @@ void rds_conn_drop(struct rds_connection *conn)
atomic_set(&conn->c_state, RDS_CONN_ERROR);
queue_work(rds_wq, &conn->c_down_w);
}
+EXPORT_SYMBOL_GPL(rds_conn_drop);
/*
* An error occurred on the connection
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b9bcd32431e..536ebe5d3f6 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -43,11 +43,14 @@
unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
module_param(fmr_pool_size, int, 0444);
MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
module_param(fmr_message_size, int, 0444);
MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+module_param(rds_ib_retry_count, int, 0444);
+MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
struct list_head rds_ib_devices;
@@ -82,9 +85,6 @@ void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
- rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
- rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
- rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
rds_ibdev->max_fmrs = dev_attr->max_fmr ?
min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
@@ -282,6 +282,7 @@ struct rds_transport rds_ib_transport = {
.flush_mrs = rds_ib_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "infiniband",
+ .t_type = RDS_TRANS_IB
};
int __init rds_ib_init(void)
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 455ae73047f..1378b854cac 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -15,6 +15,8 @@
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
+#define RDS_IB_DEFAULT_RETRY_COUNT 2
+
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
extern struct list_head rds_ib_devices;
@@ -157,9 +159,6 @@ struct rds_ib_device {
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_ib_mr_pool *mr_pool;
- int fmr_page_shift;
- int fmr_page_size;
- u64 fmr_page_mask;
unsigned int fmr_max_remaps;
unsigned int max_fmrs;
int max_sge;
@@ -247,6 +246,7 @@ extern struct ib_client rds_ib_client;
extern unsigned int fmr_pool_size;
extern unsigned int fmr_message_size;
+extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
extern struct list_head ib_nodev_conns;
@@ -355,17 +355,25 @@ extern ctl_table rds_ib_sysctl_table[];
/*
* Helper functions for getting/setting the header and data SGEs in
* RDS packets (not RDMA)
+ *
+ * From version 3.1 onwards, header is in front of data in the sge.
*/
static inline struct ib_sge *
rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
{
- return &sge[0];
+ if (ic->conn->c_version > RDS_PROTOCOL_3_0)
+ return &sge[0];
+ else
+ return &sge[1];
}
static inline struct ib_sge *
rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
{
- return &sge[1];
+ if (ic->conn->c_version > RDS_PROTOCOL_3_0)
+ return &sge[1];
+ else
+ return &sge[0];
}
#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f8e40e1a603..c2d372f13db 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -98,21 +98,34 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
struct ib_qp_attr qp_attr;
int err;
- if (event->param.conn.private_data_len) {
+ if (event->param.conn.private_data_len >= sizeof(*dp)) {
dp = event->param.conn.private_data;
- rds_ib_set_protocol(conn,
+ /* make sure it isn't empty data */
+ if (dp->dp_protocol_major) {
+ rds_ib_set_protocol(conn,
RDS_PROTOCOL(dp->dp_protocol_major,
- dp->dp_protocol_minor));
- rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ dp->dp_protocol_minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ }
}
printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
- &conn->c_laddr,
+ &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");
+ /*
+ * Init rings and fill recv. this needs to wait until protocol negotiation
+ * is complete, since ring layout is different from 3.0 to 3.1.
+ */
+ rds_ib_send_init_ring(ic);
+ rds_ib_recv_init_ring(ic);
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
@@ -145,7 +158,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
/* XXX tune these? */
conn_param->responder_resources = 1;
conn_param->initiator_depth = 1;
- conn_param->retry_count = 7;
+ conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
conn_param->rnr_retry_count = 7;
if (dp) {
@@ -190,9 +203,9 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
- printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
- "on connection to %pI4\n", event->event,
- &conn->c_faddr);
+ rds_ib_conn_error(conn, "RDS/IB: Fatal QP Event %u "
+ "- connection %pI4->%pI4, reconnecting\n",
+ event->event, &conn->c_laddr, &conn->c_faddr);
break;
}
}
@@ -321,7 +334,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rdsdebug("send allocation failed\n");
goto out;
}
- rds_ib_send_init_ring(ic);
+ memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
if (ic->i_recvs == NULL) {
@@ -329,14 +342,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rdsdebug("recv allocation failed\n");
goto out;
}
+ memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
- rds_ib_recv_init_ring(ic);
rds_ib_recv_init_ack(ic);
- /* Post receive buffers - as a side effect, this will update
- * the posted credit count. */
- rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
-
rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
ic->i_send_cq, ic->i_recv_cq);
@@ -344,19 +353,32 @@ out:
return ret;
}
-static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
{
+ const struct rds_ib_connect_private *dp = event->param.conn.private_data;
u16 common;
u32 version = 0;
- /* rdma_cm private data is odd - when there is any private data in the
+ /*
+ * rdma_cm private data is odd - when there is any private data in the
* request, we will be given a pretty large buffer without telling us the
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
* from an older version. This could could be 3.0 or 2.0 - we can't tell.
- * We really should have changed this for OFED 1.3 :-( */
- if (dp->dp_protocol_major == 0)
+ * We really should have changed this for OFED 1.3 :-(
+ */
+
+ /* Be paranoid. RDS always has privdata */
+ if (!event->param.conn.private_data_len) {
+ printk(KERN_NOTICE "RDS incoming connection has no private data, "
+ "rejecting\n");
+ return 0;
+ }
+
+ /* Even if len is crap *now* I still want to check it. -ASG */
+ if (event->param.conn.private_data_len < sizeof (*dp)
+ || dp->dp_protocol_major == 0)
return RDS_PROTOCOL_3_0;
common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
@@ -388,7 +410,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
int err, destroy = 1;
/* Check whether the remote protocol version matches ours. */
- version = rds_ib_protocol_compatible(dp);
+ version = rds_ib_protocol_compatible(event);
if (!version)
goto out;
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 81033af9302..ef3ab5b7283 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -211,7 +211,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
pool->fmr_attr.max_pages = fmr_message_size;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
- pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
+ pool->fmr_attr.page_shift = PAGE_SHIFT;
pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
/* We never allow more than max_items MRs to be allocated.
@@ -349,13 +349,13 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
- if (dma_addr & ~rds_ibdev->fmr_page_mask) {
+ if (dma_addr & ~PAGE_MASK) {
if (i > 0)
return -EINVAL;
else
++page_cnt;
}
- if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
+ if ((dma_addr + dma_len) & ~PAGE_MASK) {
if (i < sg_dma_len - 1)
return -EINVAL;
else
@@ -365,7 +365,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
len += dma_len;
}
- page_cnt += len >> rds_ibdev->fmr_page_shift;
+ page_cnt += len >> PAGE_SHIFT;
if (page_cnt > fmr_message_size)
return -EINVAL;
@@ -378,9 +378,9 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
- for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
+ for (j = 0; j < dma_len; j += PAGE_SIZE)
dma_pages[page_cnt++] =
- (dma_addr & rds_ibdev->fmr_page_mask) + j;
+ (dma_addr & PAGE_MASK) + j;
}
ret = ib_map_phys_fmr(ibmr->fmr,
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 5709bad2832..cd7a6cfcab0 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -555,6 +555,47 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
return rds_ib_get_ack(ic);
}
+static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv,
+ u32 data_len)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
+ void *addr;
+ u32 misplaced_hdr_bytes;
+
+ /*
+ * Support header at the front (RDS 3.1+) as well as header-at-end.
+ *
+ * Cases:
+ * 1) header all in header buff (great!)
+ * 2) header all in data page (copy all to header buff)
+ * 3) header split across hdr buf + data page
+ * (move bit in hdr buff to end before copying other bit from data page)
+ */
+ if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
+ return hdr_buff;
+
+ if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
+ addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
+ memcpy(hdr_buff,
+ addr + recv->r_frag->f_offset + data_len,
+ sizeof(struct rds_header));
+ kunmap_atomic(addr, KM_SOFTIRQ0);
+ return hdr_buff;
+ }
+
+ misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
+
+ memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
+
+ addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
+ memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
+ sizeof(struct rds_header) - misplaced_hdr_bytes);
+ kunmap_atomic(addr, KM_SOFTIRQ0);
+ return hdr_buff;
+}
+
/*
* It's kind of lame that we're copying from the posted receive pages into
* long-lived bitmaps. We could have posted the bitmaps and rdma written into
@@ -645,7 +686,7 @@ struct rds_ib_ack_state {
};
static void rds_ib_process_recv(struct rds_connection *conn,
- struct rds_ib_recv_work *recv, u32 byte_len,
+ struct rds_ib_recv_work *recv, u32 data_len,
struct rds_ib_ack_state *state)
{
struct rds_ib_connection *ic = conn->c_transport_data;
@@ -655,9 +696,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
/* XXX shut down the connection if port 0,0 are seen? */
rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
- byte_len);
+ data_len);
- if (byte_len < sizeof(struct rds_header)) {
+ if (data_len < sizeof(struct rds_header)) {
rds_ib_conn_error(conn, "incoming message "
"from %pI4 didn't inclue a "
"header, disconnecting and "
@@ -665,9 +706,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
&conn->c_faddr);
return;
}
- byte_len -= sizeof(struct rds_header);
+ data_len -= sizeof(struct rds_header);
- ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+ ihdr = rds_ib_get_header(conn, recv, data_len);
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
@@ -687,7 +728,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
if (ihdr->h_credit)
rds_ib_send_add_credits(conn, ihdr->h_credit);
- if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
/* This is an ACK-only packet. The fact that it gets
* special treatment here is that historically, ACKs
* were rather special beasts.
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 02e3e3d50d4..d2c904dd6fb 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -37,9 +37,9 @@
#include "rds.h"
#include "ib.h"
-DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
-static char *rds_ib_stat_names[] = {
+static const char *const rds_ib_stat_names[] = {
"ib_connect_raced",
"ib_listen_closed_stale",
"ib_tx_cq_call",
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index d87830db93a..84b5ffcb280 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -53,7 +53,17 @@ unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-unsigned int rds_ib_sysctl_flow_control = 1;
+/*
+ * This sysctl does nothing.
+ *
+ * Backwards compatibility with RDS 3.0 wire protocol
+ * disables initial FC credit exchange.
+ * If it's ever possible to drop 3.0 support,
+ * setting this to 1 and moving init/refill of send/recv
+ * rings from ib_cm_connect_complete() back into ib_setup_qp()
+ * will cause credits to be added before protocol negotiation.
+ */
+unsigned int rds_ib_sysctl_flow_control = 0;
ctl_table rds_ib_sysctl_table[] = {
{
diff --git a/net/rds/info.c b/net/rds/info.c
index 62aeef37aef..814a91a6f4a 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -79,6 +79,7 @@ void rds_info_register_func(int optname, rds_info_func func)
rds_info_funcs[offset] = func;
spin_unlock(&rds_info_lock);
}
+EXPORT_SYMBOL_GPL(rds_info_register_func);
void rds_info_deregister_func(int optname, rds_info_func func)
{
@@ -91,6 +92,7 @@ void rds_info_deregister_func(int optname, rds_info_func func)
rds_info_funcs[offset] = NULL;
spin_unlock(&rds_info_lock);
}
+EXPORT_SYMBOL_GPL(rds_info_deregister_func);
/*
* Typically we hold an atomic kmap across multiple rds_info_copy() calls
@@ -137,6 +139,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
}
}
}
+EXPORT_SYMBOL_GPL(rds_info_copy);
/*
* @optval points to the userspace buffer that the information snapshot
diff --git a/net/rds/iw.c b/net/rds/iw.c
index d16e1cbc8e8..db224f7c293 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -83,23 +83,16 @@ void rds_iw_add_one(struct ib_device *device)
rds_iwdev->max_wrs = dev_attr->max_qp_wr;
rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
- rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
-
rds_iwdev->dev = device;
rds_iwdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_iwdev->pd))
goto free_dev;
if (!rds_iwdev->dma_local_lkey) {
- if (device->node_type != RDMA_NODE_RNIC) {
- rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
- IB_ACCESS_LOCAL_WRITE);
- } else {
- rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE);
- }
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(rds_iwdev->mr))
goto err_pd;
} else
@@ -291,6 +284,7 @@ struct rds_transport rds_iw_transport = {
.flush_mrs = rds_iw_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "iwarp",
+ .t_type = RDS_TRANS_IWARP,
.t_prefer_loopback = 1,
};
diff --git a/net/rds/iw.h b/net/rds/iw.h
index 0715dde323e..dd72b62bd50 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -181,7 +181,6 @@ struct rds_iw_device {
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_iw_mr_pool *mr_pool;
- int page_shift;
int max_sge;
unsigned int max_wrs;
unsigned int dma_local_lkey:1;
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index dcdb37da80f..de4a1b16bf7 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -263,18 +263,12 @@ static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
}
static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
- struct rds_iw_scatterlist *sg,
- unsigned int dma_page_shift)
+ struct rds_iw_scatterlist *sg)
{
struct ib_device *dev = rds_iwdev->dev;
u64 *dma_pages = NULL;
- u64 dma_mask;
- unsigned int dma_page_size;
int i, j, ret;
- dma_page_size = 1 << dma_page_shift;
- dma_mask = dma_page_size - 1;
-
WARN_ON(sg->dma_len);
sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
@@ -295,18 +289,18 @@ static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
sg->bytes += dma_len;
end_addr = dma_addr + dma_len;
- if (dma_addr & dma_mask) {
+ if (dma_addr & PAGE_MASK) {
if (i > 0)
goto out_unmap;
- dma_addr &= ~dma_mask;
+ dma_addr &= ~PAGE_MASK;
}
- if (end_addr & dma_mask) {
+ if (end_addr & PAGE_MASK) {
if (i < sg->dma_len - 1)
goto out_unmap;
- end_addr = (end_addr + dma_mask) & ~dma_mask;
+ end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
}
- sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
+ sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
}
/* Now gather the dma addrs into one list */
@@ -325,8 +319,8 @@ static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
u64 end_addr;
end_addr = dma_addr + dma_len;
- dma_addr &= ~dma_mask;
- for (; dma_addr < end_addr; dma_addr += dma_page_size)
+ dma_addr &= ~PAGE_MASK;
+ for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
dma_pages[j++] = dma_addr;
BUG_ON(j > sg->dma_npages);
}
@@ -727,7 +721,7 @@ static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
f_wr.wr.fast_reg.rkey = mapping->m_rkey;
f_wr.wr.fast_reg.page_list = ibmr->page_list;
f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
- f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
+ f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE;
@@ -780,9 +774,7 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
- dma_pages = rds_iw_map_scatterlist(rds_iwdev,
- &mapping->m_sg,
- rds_iwdev->page_shift);
+ dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
if (IS_ERR(dma_pages)) {
ret = PTR_ERR(dma_pages);
dma_pages = NULL;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 44a6a0551f2..1f5abe3cf2b 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -779,7 +779,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
send->s_wr.wr.fast_reg.page_list = send->s_page_list;
send->s_wr.wr.fast_reg.page_list_len = nent;
- send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
+ send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
send->s_wr.wr.fast_reg.iova_start = sg_addr;
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
index ccc7e8f0bf0..5fe67f6a1d8 100644
--- a/net/rds/iw_stats.c
+++ b/net/rds/iw_stats.c
@@ -37,9 +37,9 @@
#include "rds.h"
#include "iw.h"
-DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
-static char *rds_iw_stat_names[] = {
+static const char *const rds_iw_stat_names[] = {
"iw_connect_raced",
"iw_listen_closed_stale",
"iw_tx_cq_call",
diff --git a/net/rds/message.c b/net/rds/message.c
index 5a15dc8d0cd..ca50a8ec974 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -50,6 +50,7 @@ void rds_message_addref(struct rds_message *rm)
rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
atomic_inc(&rm->m_refcount);
}
+EXPORT_SYMBOL_GPL(rds_message_addref);
/*
* This relies on dma_map_sg() not touching sg[].page during merging.
@@ -92,6 +93,7 @@ void rds_message_put(struct rds_message *rm)
kfree(rm);
}
}
+EXPORT_SYMBOL_GPL(rds_message_put);
void rds_message_inc_free(struct rds_incoming *inc)
{
@@ -108,6 +110,7 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
hdr->h_sequence = cpu_to_be64(seq);
hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
}
+EXPORT_SYMBOL_GPL(rds_message_populate_header);
int rds_message_add_extension(struct rds_header *hdr,
unsigned int type, const void *data, unsigned int len)
@@ -133,6 +136,7 @@ int rds_message_add_extension(struct rds_header *hdr,
dst[len] = RDS_EXTHDR_NONE;
return 1;
}
+EXPORT_SYMBOL_GPL(rds_message_add_extension);
/*
* If a message has extension headers, retrieve them here.
@@ -208,6 +212,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
}
+EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
{
@@ -399,4 +404,5 @@ void rds_message_unmapped(struct rds_message *rm)
if (waitqueue_active(&rds_message_flush_waitq))
wake_up(&rds_message_flush_waitq);
}
+EXPORT_SYMBOL_GPL(rds_message_unmapped);
diff --git a/net/rds/page.c b/net/rds/page.c
index c460743a89a..36790122dfd 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -39,7 +39,7 @@ struct rds_page_remainder {
unsigned long r_offset;
};
-DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
/*
* returns 0 on success or -errno on failure.
@@ -81,6 +81,7 @@ int rds_page_copy_user(struct page *page, unsigned long offset,
return 0;
}
+EXPORT_SYMBOL_GPL(rds_page_copy_user);
/*
* Message allocation uses this to build up regions of a message.
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 7d0f901c93d..9ece910ea39 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -101,7 +101,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
break;
case RDMA_CM_EVENT_DISCONNECTED:
- printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
+ printk(KERN_WARNING "RDS/RDMA: DISCONNECT event - dropping connection "
"%pI4->%pI4\n", &conn->c_laddr,
&conn->c_faddr);
rds_conn_drop(conn);
@@ -132,12 +132,12 @@ static int __init rds_rdma_listen_init(void)
cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
- printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
"rdma_create_id() returned %d\n", ret);
goto out;
}
- sin.sin_family = PF_INET,
+ sin.sin_family = AF_INET,
sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
sin.sin_port = (__force u16)htons(RDS_PORT);
@@ -147,14 +147,14 @@ static int __init rds_rdma_listen_init(void)
*/
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
if (ret) {
- printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
"rdma_bind_addr() returned %d\n", ret);
goto out;
}
ret = rdma_listen(cm_id, 128);
if (ret) {
- printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
"rdma_listen() returned %d\n", ret);
goto out;
}
@@ -203,6 +203,7 @@ err_iw_init:
out:
return ret;
}
+module_init(rds_rdma_init);
void rds_rdma_exit(void)
{
@@ -211,4 +212,9 @@ void rds_rdma_exit(void)
rds_ib_exit();
rds_iw_exit();
}
+module_exit(rds_rdma_exit);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: IB/iWARP transport");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/rds/rds.h b/net/rds/rds.h
index dbe11123678..85d6f897ecc 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -311,11 +311,17 @@ struct rds_notifier {
* flag and header.
*/
+#define RDS_TRANS_IB 0
+#define RDS_TRANS_IWARP 1
+#define RDS_TRANS_TCP 2
+#define RDS_TRANS_COUNT 3
+
struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
struct module *t_owner;
unsigned int t_prefer_loopback:1;
+ unsigned int t_type;
int (*laddr_check)(__be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
@@ -652,7 +658,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
int __init rds_stats_init(void);
void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter,
- uint64_t *values, char **names, size_t nr);
+ uint64_t *values, const char *const *names,
+ size_t nr);
/* sysctl.c */
int __init rds_sysctl_init(void);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index f2118c51cfa..fdff33c7b43 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -46,12 +46,14 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
inc->i_saddr = saddr;
inc->i_rdma_cookie = 0;
}
+EXPORT_SYMBOL_GPL(rds_inc_init);
void rds_inc_addref(struct rds_incoming *inc)
{
rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
atomic_inc(&inc->i_refcount);
}
+EXPORT_SYMBOL_GPL(rds_inc_addref);
void rds_inc_put(struct rds_incoming *inc)
{
@@ -62,6 +64,7 @@ void rds_inc_put(struct rds_incoming *inc)
inc->i_conn->c_trans->inc_free(inc);
}
}
+EXPORT_SYMBOL_GPL(rds_inc_put);
static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
struct rds_cong_map *map,
@@ -237,6 +240,7 @@ out:
if (rs)
rds_sock_put(rs);
}
+EXPORT_SYMBOL_GPL(rds_recv_incoming);
/*
* be very careful here. This is being called as the condition in
@@ -409,18 +413,18 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
if (msg_flags & MSG_OOB)
goto out;
- /* If there are pending notifications, do those - and nothing else */
- if (!list_empty(&rs->rs_notify_queue)) {
- ret = rds_notify_queue_get(rs, msg);
- goto out;
- }
+ while (1) {
+ /* If there are pending notifications, do those - and nothing else */
+ if (!list_empty(&rs->rs_notify_queue)) {
+ ret = rds_notify_queue_get(rs, msg);
+ break;
+ }
- if (rs->rs_cong_notify) {
- ret = rds_notify_cong(rs, msg);
- goto out;
- }
+ if (rs->rs_cong_notify) {
+ ret = rds_notify_cong(rs, msg);
+ break;
+ }
- while (1) {
if (!rds_next_incoming(rs, &inc)) {
if (nonblock) {
ret = -EAGAIN;
@@ -428,7 +432,9 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
}
timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
- rds_next_incoming(rs, &inc),
+ (!list_empty(&rs->rs_notify_queue)
+ || rs->rs_cong_notify
+ || rds_next_incoming(rs, &inc)),
timeo);
rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
timeo);
diff --git a/net/rds/send.c b/net/rds/send.c
index a4a7f428cd7..28c88ff3d03 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -439,6 +439,7 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
sock_put(rds_rs_to_sk(rs));
}
}
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
/*
* This is the same as rds_rdma_send_complete except we
@@ -494,6 +495,7 @@ out:
return found;
}
+EXPORT_SYMBOL_GPL(rds_send_get_message);
/*
* This removes messages from the socket's list if they're on it. The list
@@ -610,6 +612,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
/* now remove the messages from the sock list as needed */
rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
+EXPORT_SYMBOL_GPL(rds_send_drop_acked);
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 637146893cf..7598eb07cfb 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -37,10 +37,11 @@
#include "rds.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
-static char *rds_stat_names[] = {
+static const char *const rds_stat_names[] = {
"conn_reset",
"recv_drop_bad_checksum",
"recv_drop_old_seq",
@@ -77,7 +78,7 @@ static char *rds_stat_names[] = {
};
void rds_stats_info_copy(struct rds_info_iterator *iter,
- uint64_t *values, char **names, size_t nr)
+ uint64_t *values, const char *const *names, size_t nr)
{
struct rds_info_counter ctr;
size_t i;
@@ -90,6 +91,7 @@ void rds_stats_info_copy(struct rds_info_iterator *iter,
rds_info_copy(iter, &ctr, sizeof(ctr));
}
}
+EXPORT_SYMBOL_GPL(rds_stats_info_copy);
/*
* This gives global counters across all the transports. The strings
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
new file mode 100644
index 00000000000..b5198aee45d
--- /dev/null
+++ b/net/rds/tcp.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/* only for info exporting */
+static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
+static LIST_HEAD(rds_tcp_tc_list);
+unsigned int rds_tcp_tc_count;
+
+/* Track rds_tcp_connection structs so they can be cleaned up */
+static DEFINE_SPINLOCK(rds_tcp_conn_lock);
+static LIST_HEAD(rds_tcp_conn_list);
+
+static struct kmem_cache *rds_tcp_conn_slab;
+
+#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
+
+/* doing it this way avoids calling tcp_sk() */
+void rds_tcp_nonagle(struct socket *sock)
+{
+ mm_segment_t oldfs = get_fs();
+ int val = 1;
+
+ set_fs(KERNEL_DS);
+ sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
+ sizeof(val));
+ set_fs(oldfs);
+}
+
+void rds_tcp_tune(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ rds_tcp_nonagle(sock);
+
+ /*
+ * We're trying to saturate gigabit with the default,
+ * see svc_sock_setbufsize().
+ */
+ lock_sock(sk);
+ sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
+ sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
+ release_sock(sk);
+}
+
+u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
+{
+ return tcp_sk(tc->t_sock->sk)->snd_nxt;
+}
+
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
+{
+ return tcp_sk(tc->t_sock->sk)->snd_una;
+}
+
+void rds_tcp_restore_callbacks(struct socket *sock,
+ struct rds_tcp_connection *tc)
+{
+ rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
+ write_lock_bh(&sock->sk->sk_callback_lock);
+
+ /* done under the callback_lock to serialize with write_space */
+ spin_lock(&rds_tcp_tc_list_lock);
+ list_del_init(&tc->t_list_item);
+ rds_tcp_tc_count--;
+ spin_unlock(&rds_tcp_tc_list_lock);
+
+ tc->t_sock = NULL;
+
+ sock->sk->sk_write_space = tc->t_orig_write_space;
+ sock->sk->sk_data_ready = tc->t_orig_data_ready;
+ sock->sk->sk_state_change = tc->t_orig_state_change;
+ sock->sk->sk_user_data = NULL;
+
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+/*
+ * This is the only path that sets tc->t_sock. Send and receive trust that
+ * it is set. The RDS_CONN_CONNECTED bit protects those paths from being
+ * called while it isn't set.
+ */
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+
+ rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
+ write_lock_bh(&sock->sk->sk_callback_lock);
+
+ /* done under the callback_lock to serialize with write_space */
+ spin_lock(&rds_tcp_tc_list_lock);
+ list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
+ rds_tcp_tc_count++;
+ spin_unlock(&rds_tcp_tc_list_lock);
+
+ /* accepted sockets need our listen data ready undone */
+ if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
+ sock->sk->sk_data_ready = sock->sk->sk_user_data;
+
+ tc->t_sock = sock;
+ tc->conn = conn;
+ tc->t_orig_data_ready = sock->sk->sk_data_ready;
+ tc->t_orig_write_space = sock->sk->sk_write_space;
+ tc->t_orig_state_change = sock->sk->sk_state_change;
+
+ sock->sk->sk_user_data = conn;
+ sock->sk->sk_data_ready = rds_tcp_data_ready;
+ sock->sk->sk_write_space = rds_tcp_write_space;
+ sock->sk->sk_state_change = rds_tcp_state_change;
+
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_info_tcp_socket tsinfo;
+ struct rds_tcp_connection *tc;
+ unsigned long flags;
+ struct sockaddr_in sin;
+ int sinlen;
+
+ spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+ if (len / sizeof(tsinfo) < rds_tcp_tc_count)
+ goto out;
+
+ list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+
+ sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
+ tsinfo.local_addr = sin.sin_addr.s_addr;
+ tsinfo.local_port = sin.sin_port;
+ sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
+ tsinfo.peer_addr = sin.sin_addr.s_addr;
+ tsinfo.peer_port = sin.sin_port;
+
+ tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
+ tsinfo.data_rem = tc->t_tinc_data_rem;
+ tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
+ tsinfo.last_expected_una = tc->t_last_expected_una;
+ tsinfo.last_seen_una = tc->t_last_seen_una;
+
+ rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
+ }
+
+out:
+ lens->nr = rds_tcp_tc_count;
+ lens->each = sizeof(tsinfo);
+
+ spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+
+static int rds_tcp_laddr_check(__be32 addr)
+{
+ if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
+ return 0;
+ return -EADDRNOTAVAIL;
+}
+
+static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_tcp_connection *tc;
+
+ tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
+ if (tc == NULL)
+ return -ENOMEM;
+
+ tc->t_sock = NULL;
+ tc->t_tinc = NULL;
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+
+ conn->c_transport_data = tc;
+
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+ spin_unlock_irq(&rds_tcp_conn_lock);
+
+ rdsdebug("alloced tc %p\n", conn->c_transport_data);
+ return 0;
+}
+
+static void rds_tcp_conn_free(void *arg)
+{
+ struct rds_tcp_connection *tc = arg;
+ rdsdebug("freeing tc %p\n", tc);
+ kmem_cache_free(rds_tcp_conn_slab, tc);
+}
+
+static void rds_tcp_destroy_conns(void)
+{
+ struct rds_tcp_connection *tc, *_tc;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_splice(&rds_tcp_conn_list, &tmp_list);
+ INIT_LIST_HEAD(&rds_tcp_conn_list);
+ spin_unlock_irq(&rds_tcp_conn_lock);
+
+ list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
+ if (tc->conn->c_passive)
+ rds_conn_destroy(tc->conn->c_passive);
+ rds_conn_destroy(tc->conn);
+ }
+}
+
+void rds_tcp_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+ rds_tcp_listen_stop();
+ rds_tcp_destroy_conns();
+ rds_trans_unregister(&rds_tcp_transport);
+ rds_tcp_recv_exit();
+ kmem_cache_destroy(rds_tcp_conn_slab);
+}
+module_exit(rds_tcp_exit);
+
+struct rds_transport rds_tcp_transport = {
+ .laddr_check = rds_tcp_laddr_check,
+ .xmit_prepare = rds_tcp_xmit_prepare,
+ .xmit_complete = rds_tcp_xmit_complete,
+ .xmit_cong_map = rds_tcp_xmit_cong_map,
+ .xmit = rds_tcp_xmit,
+ .recv = rds_tcp_recv,
+ .conn_alloc = rds_tcp_conn_alloc,
+ .conn_free = rds_tcp_conn_free,
+ .conn_connect = rds_tcp_conn_connect,
+ .conn_shutdown = rds_tcp_conn_shutdown,
+ .inc_copy_to_user = rds_tcp_inc_copy_to_user,
+ .inc_purge = rds_tcp_inc_purge,
+ .inc_free = rds_tcp_inc_free,
+ .stats_info_copy = rds_tcp_stats_info_copy,
+ .exit = rds_tcp_exit,
+ .t_owner = THIS_MODULE,
+ .t_name = "tcp",
+ .t_type = RDS_TRANS_TCP,
+ .t_prefer_loopback = 1,
+};
+
+int __init rds_tcp_init(void)
+{
+ int ret;
+
+ rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
+ sizeof(struct rds_tcp_connection),
+ 0, 0, NULL);
+ if (rds_tcp_conn_slab == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = rds_tcp_recv_init();
+ if (ret)
+ goto out_slab;
+
+ ret = rds_trans_register(&rds_tcp_transport);
+ if (ret)
+ goto out_recv;
+
+ ret = rds_tcp_listen_init();
+ if (ret)
+ goto out_register;
+
+ rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+
+ goto out;
+
+out_register:
+ rds_trans_unregister(&rds_tcp_transport);
+out_recv:
+ rds_tcp_recv_exit();
+out_slab:
+ kmem_cache_destroy(rds_tcp_conn_slab);
+out:
+ return ret;
+}
+module_init(rds_tcp_init);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: TCP transport");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
new file mode 100644
index 00000000000..844fa6b9cf5
--- /dev/null
+++ b/net/rds/tcp.h
@@ -0,0 +1,93 @@
+#ifndef _RDS_TCP_H
+#define _RDS_TCP_H
+
+#define RDS_TCP_PORT 16385
+
+struct rds_tcp_incoming {
+ struct rds_incoming ti_inc;
+ struct sk_buff_head ti_skb_list;
+};
+
+struct rds_tcp_connection {
+
+ struct list_head t_tcp_node;
+ struct rds_connection *conn;
+ struct socket *t_sock;
+ void *t_orig_write_space;
+ void *t_orig_data_ready;
+ void *t_orig_state_change;
+
+ struct rds_tcp_incoming *t_tinc;
+ size_t t_tinc_hdr_rem;
+ size_t t_tinc_data_rem;
+
+ /* XXX error report? */
+ struct work_struct t_conn_w;
+ struct work_struct t_send_w;
+ struct work_struct t_down_w;
+ struct work_struct t_recv_w;
+
+ /* for info exporting only */
+ struct list_head t_list_item;
+ u32 t_last_sent_nxt;
+ u32 t_last_expected_una;
+ u32 t_last_seen_una;
+};
+
+struct rds_tcp_statistics {
+ uint64_t s_tcp_data_ready_calls;
+ uint64_t s_tcp_write_space_calls;
+ uint64_t s_tcp_sndbuf_full;
+ uint64_t s_tcp_connect_raced;
+ uint64_t s_tcp_listen_closed_stale;
+};
+
+/* tcp.c */
+int __init rds_tcp_init(void);
+void rds_tcp_exit(void);
+void rds_tcp_tune(struct socket *sock);
+void rds_tcp_nonagle(struct socket *sock);
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_restore_callbacks(struct socket *sock,
+ struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
+u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
+extern struct rds_transport rds_tcp_transport;
+
+/* tcp_connect.c */
+int rds_tcp_conn_connect(struct rds_connection *conn);
+void rds_tcp_conn_shutdown(struct rds_connection *conn);
+void rds_tcp_state_change(struct sock *sk);
+
+/* tcp_listen.c */
+int __init rds_tcp_listen_init(void);
+void rds_tcp_listen_stop(void);
+void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
+
+/* tcp_recv.c */
+int __init rds_tcp_recv_init(void);
+void rds_tcp_recv_exit(void);
+void rds_tcp_data_ready(struct sock *sk, int bytes);
+int rds_tcp_recv(struct rds_connection *conn);
+void rds_tcp_inc_purge(struct rds_incoming *inc);
+void rds_tcp_inc_free(struct rds_incoming *inc);
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+
+/* tcp_send.c */
+void rds_tcp_xmit_prepare(struct rds_connection *conn);
+void rds_tcp_xmit_complete(struct rds_connection *conn);
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_tcp_write_space(struct sock *sk);
+int rds_tcp_xmit_cong_map(struct rds_connection *conn,
+ struct rds_cong_map *map, unsigned long offset);
+
+/* tcp_stats.c */
+DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
+#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+#endif
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
new file mode 100644
index 00000000000..211522f9a9a
--- /dev/null
+++ b/net/rds/tcp_connect.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+void rds_tcp_state_change(struct sock *sk)
+{
+ void (*state_change)(struct sock *sk);
+ struct rds_connection *conn;
+ struct rds_tcp_connection *tc;
+
+ read_lock(&sk->sk_callback_lock);
+ conn = sk->sk_user_data;
+ if (conn == NULL) {
+ state_change = sk->sk_state_change;
+ goto out;
+ }
+ tc = conn->c_transport_data;
+ state_change = tc->t_orig_state_change;
+
+ rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
+
+ switch(sk->sk_state) {
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ rds_connect_complete(conn);
+ break;
+ case TCP_CLOSE:
+ rds_conn_drop(conn);
+ default:
+ break;
+ }
+out:
+ read_unlock(&sk->sk_callback_lock);
+ state_change(sk);
+}
+
+int rds_tcp_conn_connect(struct rds_connection *conn)
+{
+ struct socket *sock = NULL;
+ struct sockaddr_in src, dest;
+ int ret;
+
+ ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret < 0)
+ goto out;
+
+ rds_tcp_tune(sock);
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+ src.sin_port = (__force u16)htons(0);
+
+ ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
+ if (ret) {
+ rdsdebug("bind failed with %d at address %u.%u.%u.%u\n",
+ ret, NIPQUAD(conn->c_laddr));
+ goto out;
+ }
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+ dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
+
+ /*
+ * once we call connect() we can start getting callbacks and they
+ * own the socket
+ */
+ rds_tcp_set_callbacks(sock, conn);
+ ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
+ O_NONBLOCK);
+ sock = NULL;
+
+ rdsdebug("connect to address %u.%u.%u.%u returned %d\n",
+ NIPQUAD(conn->c_faddr), ret);
+ if (ret == -EINPROGRESS)
+ ret = 0;
+
+out:
+ if (sock)
+ sock_release(sock);
+ return ret;
+}
+
+/*
+ * Before killing the tcp socket this needs to serialize with callbacks. The
+ * caller has already grabbed the sending sem so we're serialized with other
+ * senders.
+ *
+ * TCP calls the callbacks with the sock lock so we hold it while we reset the
+ * callbacks to those set by TCP. Our callbacks won't execute again once we
+ * hold the sock lock.
+ */
+void rds_tcp_conn_shutdown(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *sock = tc->t_sock;
+
+ rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
+
+ if (sock) {
+ sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
+ lock_sock(sock->sk);
+ rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
+
+ release_sock(sock->sk);
+ sock_release(sock);
+ };
+
+ if (tc->t_tinc) {
+ rds_inc_put(&tc->t_tinc->ti_inc);
+ tc->t_tinc = NULL;
+ }
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
new file mode 100644
index 00000000000..24b743eb0b1
--- /dev/null
+++ b/net/rds/tcp_listen.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/*
+ * cheesy, but simple..
+ */
+static void rds_tcp_accept_worker(struct work_struct *work);
+static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
+static struct socket *rds_tcp_listen_sock;
+
+static int rds_tcp_accept_one(struct socket *sock)
+{
+ struct socket *new_sock = NULL;
+ struct rds_connection *conn;
+ int ret;
+ struct inet_sock *inet;
+
+ ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+ sock->sk->sk_protocol, &new_sock);
+ if (ret)
+ goto out;
+
+ new_sock->type = sock->type;
+ new_sock->ops = sock->ops;
+ ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+ if (ret < 0)
+ goto out;
+
+ rds_tcp_tune(new_sock);
+
+ inet = inet_sk(new_sock->sk);
+
+ rdsdebug("accepted tcp %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",
+ NIPQUAD(inet->saddr), ntohs(inet->sport),
+ NIPQUAD(inet->daddr), ntohs(inet->dport));
+
+ conn = rds_conn_create(inet->saddr, inet->daddr, &rds_tcp_transport,
+ GFP_KERNEL);
+ if (IS_ERR(conn)) {
+ ret = PTR_ERR(conn);
+ goto out;
+ }
+
+ /*
+ * see the comment above rds_queue_delayed_reconnect()
+ */
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP)
+ rds_tcp_stats_inc(s_tcp_listen_closed_stale);
+ else
+ rds_tcp_stats_inc(s_tcp_connect_raced);
+ rds_conn_drop(conn);
+ ret = 0;
+ goto out;
+ }
+
+ rds_tcp_set_callbacks(new_sock, conn);
+ rds_connect_complete(conn);
+ new_sock = NULL;
+ ret = 0;
+
+out:
+ if (new_sock)
+ sock_release(new_sock);
+ return ret;
+}
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+ while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
+ cond_resched();
+}
+
+void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
+{
+ void (*ready)(struct sock *sk, int bytes);
+
+ rdsdebug("listen data ready sk %p\n", sk);
+
+ read_lock(&sk->sk_callback_lock);
+ ready = sk->sk_user_data;
+ if (ready == NULL) { /* check for teardown race */
+ ready = sk->sk_data_ready;
+ goto out;
+ }
+
+ /*
+ * ->sk_data_ready is also called for a newly established child socket
+ * before it has been accepted and the accepter has set up their
+ * data_ready.. we only want to queue listen work for our listening
+ * socket
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ queue_work(rds_wq, &rds_tcp_listen_work);
+
+out:
+ read_unlock(&sk->sk_callback_lock);
+ ready(sk, bytes);
+}
+
+int __init rds_tcp_listen_init(void)
+{
+ struct sockaddr_in sin;
+ struct socket *sock = NULL;
+ int ret;
+
+ ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret < 0)
+ goto out;
+
+ sock->sk->sk_reuse = 1;
+ rds_tcp_nonagle(sock);
+
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ sock->sk->sk_user_data = sock->sk->sk_data_ready;
+ sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+
+ sin.sin_family = PF_INET,
+ sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+ sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+
+ ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+ if (ret < 0)
+ goto out;
+
+ ret = sock->ops->listen(sock, 64);
+ if (ret < 0)
+ goto out;
+
+ rds_tcp_listen_sock = sock;
+ sock = NULL;
+out:
+ if (sock)
+ sock_release(sock);
+ return ret;
+}
+
+void rds_tcp_listen_stop(void)
+{
+ struct socket *sock = rds_tcp_listen_sock;
+ struct sock *sk;
+
+ if (sock == NULL)
+ return;
+
+ sk = sock->sk;
+
+ /* serialize with and prevent further callbacks */
+ lock_sock(sk);
+ write_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_user_data) {
+ sk->sk_data_ready = sk->sk_user_data;
+ sk->sk_user_data = NULL;
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
+
+ /* wait for accepts to stop and close the socket */
+ flush_workqueue(rds_wq);
+ sock_release(sock);
+ rds_tcp_listen_sock = NULL;
+}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
new file mode 100644
index 00000000000..c00dafffbb5
--- /dev/null
+++ b/net/rds/tcp_recv.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+static struct kmem_cache *rds_tcp_incoming_slab;
+
+void rds_tcp_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_tcp_incoming *tinc;
+ tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+ rdsdebug("purging tinc %p inc %p\n", tinc, inc);
+ skb_queue_purge(&tinc->ti_skb_list);
+}
+
+void rds_tcp_inc_free(struct rds_incoming *inc)
+{
+ struct rds_tcp_incoming *tinc;
+ tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+ rds_tcp_inc_purge(inc);
+ rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
+ kmem_cache_free(rds_tcp_incoming_slab, tinc);
+}
+
+/*
+ * this is pretty lame, but, whatever.
+ */
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
+{
+ struct rds_tcp_incoming *tinc;
+ struct iovec *iov, tmp;
+ struct sk_buff *skb;
+ unsigned long to_copy, skb_off;
+ int ret = 0;
+
+ if (size == 0)
+ goto out;
+
+ tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+ iov = first_iov;
+ tmp = *iov;
+
+ skb_queue_walk(&tinc->ti_skb_list, skb) {
+ skb_off = 0;
+ while (skb_off < skb->len) {
+ while (tmp.iov_len == 0) {
+ iov++;
+ tmp = *iov;
+ }
+
+ to_copy = min(tmp.iov_len, size);
+ to_copy = min(to_copy, skb->len - skb_off);
+
+ rdsdebug("ret %d size %zu skb %p skb_off %lu "
+ "skblen %d iov_base %p iov_len %zu cpy %lu\n",
+ ret, size, skb, skb_off, skb->len,
+ tmp.iov_base, tmp.iov_len, to_copy);
+
+ /* modifies tmp as it copies */
+ if (skb_copy_datagram_iovec(skb, skb_off, &tmp,
+ to_copy)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ size -= to_copy;
+ ret += to_copy;
+ skb_off += to_copy;
+ if (size == 0)
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+/*
+ * We have a series of skbs that have fragmented pieces of the congestion
+ * bitmap. They must add up to the exact size of the congestion bitmap. We
+ * use the skb helpers to copy those into the pages that make up the in-memory
+ * congestion bitmap for the remote address of this connection. We then tell
+ * the congestion core that the bitmap has been changed so that it can wake up
+ * sleepers.
+ *
+ * This is racing with sending paths which are using test_bit to see if the
+ * bitmap indicates that their recipient is congested.
+ */
+
+static void rds_tcp_cong_recv(struct rds_connection *conn,
+ struct rds_tcp_incoming *tinc)
+{
+ struct sk_buff *skb;
+ unsigned int to_copy, skb_off;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_cong_map *map;
+ int ret;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map_page = 0;
+ map_off = 0;
+ map = conn->c_fcong;
+
+ skb_queue_walk(&tinc->ti_skb_list, skb) {
+ skb_off = 0;
+ while (skb_off < skb->len) {
+ to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
+ skb->len - skb_off);
+
+ BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
+
+ /* only returns 0 or -error */
+ ret = skb_copy_bits(skb, skb_off,
+ (void *)map->m_page_addrs[map_page] + map_off,
+ to_copy);
+ BUG_ON(ret != 0);
+
+ skb_off += to_copy;
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+ }
+ }
+
+ rds_cong_map_updated(map, ~(u64) 0);
+}
+
+struct rds_tcp_desc_arg {
+ struct rds_connection *conn;
+ gfp_t gfp;
+ enum km_type km;
+};
+
+static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct rds_tcp_desc_arg *arg = desc->arg.data;
+ struct rds_connection *conn = arg->conn;
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_incoming *tinc = tc->t_tinc;
+ struct sk_buff *clone;
+ size_t left = len, to_copy;
+
+ rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
+ len);
+
+ /*
+ * tcp_read_sock() interprets partial progress as an indication to stop
+ * processing.
+ */
+ while (left) {
+ if (tinc == NULL) {
+ tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
+ arg->gfp);
+ if (tinc == NULL) {
+ desc->error = -ENOMEM;
+ goto out;
+ }
+ tc->t_tinc = tinc;
+ rdsdebug("alloced tinc %p\n", tinc);
+ rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
+ /*
+ * XXX * we might be able to use the __ variants when
+ * we've already serialized at a higher level.
+ */
+ skb_queue_head_init(&tinc->ti_skb_list);
+ }
+
+ if (left && tc->t_tinc_hdr_rem) {
+ to_copy = min(tc->t_tinc_hdr_rem, left);
+ rdsdebug("copying %zu header from skb %p\n", to_copy,
+ skb);
+ skb_copy_bits(skb, offset,
+ (char *)&tinc->ti_inc.i_hdr +
+ sizeof(struct rds_header) -
+ tc->t_tinc_hdr_rem,
+ to_copy);
+ tc->t_tinc_hdr_rem -= to_copy;
+ left -= to_copy;
+ offset += to_copy;
+
+ if (tc->t_tinc_hdr_rem == 0) {
+ /* could be 0 for a 0 len message */
+ tc->t_tinc_data_rem =
+ be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+ }
+ }
+
+ if (left && tc->t_tinc_data_rem) {
+ clone = skb_clone(skb, arg->gfp);
+ if (clone == NULL) {
+ desc->error = -ENOMEM;
+ goto out;
+ }
+
+ to_copy = min(tc->t_tinc_data_rem, left);
+ pskb_pull(clone, offset);
+ pskb_trim(clone, to_copy);
+ skb_queue_tail(&tinc->ti_skb_list, clone);
+
+ rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
+ "clone %p data %p len %d\n",
+ skb, skb->data, skb->len, offset, to_copy,
+ clone, clone->data, clone->len);
+
+ tc->t_tinc_data_rem -= to_copy;
+ left -= to_copy;
+ offset += to_copy;
+ }
+
+ if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
+ if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_tcp_cong_recv(conn, tinc);
+ else
+ rds_recv_incoming(conn, conn->c_faddr,
+ conn->c_laddr, &tinc->ti_inc,
+ arg->gfp, arg->km);
+
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+ tc->t_tinc = NULL;
+ rds_inc_put(&tinc->ti_inc);
+ tinc = NULL;
+ }
+ }
+out:
+ rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
+ len, left, skb->len,
+ skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
+ return len - left;
+}
+
+/* the caller has to hold the sock lock */
+int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *sock = tc->t_sock;
+ read_descriptor_t desc;
+ struct rds_tcp_desc_arg arg;
+
+ /* It's like glib in the kernel! */
+ arg.conn = conn;
+ arg.gfp = gfp;
+ arg.km = km;
+ desc.arg.data = &arg;
+ desc.error = 0;
+ desc.count = 1; /* give more than one skb per call */
+
+ tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
+ rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
+ desc.error);
+
+ return desc.error;
+}
+
+/*
+ * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
+ * data_ready.
+ *
+ * if we fail to allocate we're in trouble.. blindly wait some time before
+ * trying again to see if the VM can free up something for us.
+ */
+int rds_tcp_recv(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *sock = tc->t_sock;
+ int ret = 0;
+
+ rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
+
+ lock_sock(sock->sk);
+ ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0);
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+void rds_tcp_data_ready(struct sock *sk, int bytes)
+{
+ void (*ready)(struct sock *sk, int bytes);
+ struct rds_connection *conn;
+ struct rds_tcp_connection *tc;
+
+ rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
+
+ read_lock(&sk->sk_callback_lock);
+ conn = sk->sk_user_data;
+ if (conn == NULL) { /* check for teardown race */
+ ready = sk->sk_data_ready;
+ goto out;
+ }
+
+ tc = conn->c_transport_data;
+ ready = tc->t_orig_data_ready;
+ rds_tcp_stats_inc(s_tcp_data_ready_calls);
+
+ if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+out:
+ read_unlock(&sk->sk_callback_lock);
+ ready(sk, bytes);
+}
+
+int __init rds_tcp_recv_init(void)
+{
+ rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
+ sizeof(struct rds_tcp_incoming),
+ 0, 0, NULL);
+ if (rds_tcp_incoming_slab == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void rds_tcp_recv_exit(void)
+{
+ kmem_cache_destroy(rds_tcp_incoming_slab);
+}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
new file mode 100644
index 00000000000..ab545e0cd5d
--- /dev/null
+++ b/net/rds/tcp_send.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+static void rds_tcp_cork(struct socket *sock, int val)
+{
+ mm_segment_t oldfs;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
+ sizeof(val));
+ set_fs(oldfs);
+}
+
+void rds_tcp_xmit_prepare(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+
+ rds_tcp_cork(tc->t_sock, 1);
+}
+
+void rds_tcp_xmit_complete(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+
+ rds_tcp_cork(tc->t_sock, 0);
+}
+
+/* the core send_sem serializes this with other xmit and shutdown */
+int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
+{
+ struct kvec vec = {
+ .iov_base = data,
+ .iov_len = len,
+ };
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
+ };
+
+ return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
+}
+
+/* the core send_sem serializes this with other xmit and shutdown */
+int rds_tcp_xmit_cong_map(struct rds_connection *conn,
+ struct rds_cong_map *map, unsigned long offset)
+{
+ static struct rds_header rds_tcp_map_header = {
+ .h_flags = RDS_FLAG_CONG_BITMAP,
+ };
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ unsigned long i;
+ int ret;
+ int copied = 0;
+
+ /* Some problem claims cpu_to_be32(constant) isn't a constant. */
+ rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
+
+ if (offset < sizeof(struct rds_header)) {
+ ret = rds_tcp_sendmsg(tc->t_sock,
+ (void *)&rds_tcp_map_header + offset,
+ sizeof(struct rds_header) - offset);
+ if (ret <= 0)
+ return ret;
+ offset += ret;
+ copied = ret;
+ if (offset < sizeof(struct rds_header))
+ return ret;
+ }
+
+ offset -= sizeof(struct rds_header);
+ i = offset / PAGE_SIZE;
+ offset = offset % PAGE_SIZE;
+ BUG_ON(i >= RDS_CONG_MAP_PAGES);
+
+ do {
+ ret = tc->t_sock->ops->sendpage(tc->t_sock,
+ virt_to_page(map->m_page_addrs[i]),
+ offset, PAGE_SIZE - offset,
+ MSG_DONTWAIT);
+ if (ret <= 0)
+ break;
+ copied += ret;
+ offset += ret;
+ if (offset == PAGE_SIZE) {
+ offset = 0;
+ i++;
+ }
+ } while (i < RDS_CONG_MAP_PAGES);
+
+ return copied ? copied : ret;
+}
+
+/* the core send_sem serializes this with other xmit and shutdown */
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ int done = 0;
+ int ret = 0;
+
+ if (hdr_off == 0) {
+ /*
+ * m_ack_seq is set to the sequence number of the last byte of
+ * header and data. see rds_tcp_is_acked().
+ */
+ tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
+ rm->m_ack_seq = tc->t_last_sent_nxt +
+ sizeof(struct rds_header) +
+ be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
+ smp_mb__before_clear_bit();
+ set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
+ tc->t_last_expected_una = rm->m_ack_seq + 1;
+
+ rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
+ rm, rds_tcp_snd_nxt(tc),
+ (unsigned long long)rm->m_ack_seq);
+ }
+
+ if (hdr_off < sizeof(struct rds_header)) {
+ /* see rds_tcp_write_space() */
+ set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
+
+ ret = rds_tcp_sendmsg(tc->t_sock,
+ (void *)&rm->m_inc.i_hdr + hdr_off,
+ sizeof(rm->m_inc.i_hdr) - hdr_off);
+ if (ret < 0)
+ goto out;
+ done += ret;
+ if (hdr_off + done != sizeof(struct rds_header))
+ goto out;
+ }
+
+ while (sg < rm->m_nents) {
+ ret = tc->t_sock->ops->sendpage(tc->t_sock,
+ sg_page(&rm->m_sg[sg]),
+ rm->m_sg[sg].offset + off,
+ rm->m_sg[sg].length - off,
+ MSG_DONTWAIT|MSG_NOSIGNAL);
+ rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]),
+ rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off,
+ ret);
+ if (ret <= 0)
+ break;
+
+ off += ret;
+ done += ret;
+ if (off == rm->m_sg[sg].length) {
+ off = 0;
+ sg++;
+ }
+ }
+
+out:
+ if (ret <= 0) {
+ /* write_space will hit after EAGAIN, all else fatal */
+ if (ret == -EAGAIN) {
+ rds_tcp_stats_inc(s_tcp_sndbuf_full);
+ ret = 0;
+ } else {
+ printk(KERN_WARNING "RDS/tcp: send to %u.%u.%u.%u "
+ "returned %d, disconnecting and reconnecting\n",
+ NIPQUAD(conn->c_faddr), ret);
+ rds_conn_drop(conn);
+ }
+ }
+ if (done == 0)
+ done = ret;
+ return done;
+}
+
+/*
+ * rm->m_ack_seq is set to the tcp sequence number that corresponds to the
+ * last byte of the message, including the header. This means that the
+ * entire message has been received if rm->m_ack_seq is "before" the next
+ * unacked byte of the TCP sequence space. We have to do very careful
+ * wrapping 32bit comparisons here.
+ */
+static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
+{
+ if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
+ return 0;
+ return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
+}
+
+void rds_tcp_write_space(struct sock *sk)
+{
+ void (*write_space)(struct sock *sk);
+ struct rds_connection *conn;
+ struct rds_tcp_connection *tc;
+
+ read_lock(&sk->sk_callback_lock);
+ conn = sk->sk_user_data;
+ if (conn == NULL) {
+ write_space = sk->sk_write_space;
+ goto out;
+ }
+
+ tc = conn->c_transport_data;
+ rdsdebug("write_space for tc %p\n", tc);
+ write_space = tc->t_orig_write_space;
+ rds_tcp_stats_inc(s_tcp_write_space_calls);
+
+ rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
+ tc->t_last_seen_una = rds_tcp_snd_una(tc);
+ rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
+
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+out:
+ read_unlock(&sk->sk_callback_lock);
+
+ /*
+ * write_space is only called when data leaves tcp's send queue if
+ * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put
+ * data in tcp's send queue because we use write_space to parse the
+ * sequence numbers and notice that rds messages have been fully
+ * received.
+ *
+ * tcp's write_space clears SOCK_NOSPACE if the send queue has more
+ * than a certain amount of space. So we need to set it again *after*
+ * we call tcp's write_space or else we might only get called on the
+ * first of a series of incoming tcp acks.
+ */
+ write_space(sk);
+
+ if (sk->sk_socket)
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c
new file mode 100644
index 00000000000..d5898d03cd6
--- /dev/null
+++ b/net/rds/tcp_stats.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
+ ____cacheline_aligned;
+
+static const char const *rds_tcp_stat_names[] = {
+ "tcp_data_ready_calls",
+ "tcp_write_space_calls",
+ "tcp_sndbuf_full",
+ "tcp_connect_raced",
+ "tcp_listen_closed_stale",
+};
+
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_tcp_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ if (avail < ARRAY_SIZE(rds_tcp_stat_names))
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
+ ARRAY_SIZE(rds_tcp_stat_names));
+out:
+ return ARRAY_SIZE(rds_tcp_stat_names);
+}
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 828a1bf9ea9..dd7e0cad1e7 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -68,6 +68,7 @@
* (TCP, IB/RDMA) to provide the necessary synchronisation.
*/
struct workqueue_struct *rds_wq;
+EXPORT_SYMBOL_GPL(rds_wq);
void rds_connect_complete(struct rds_connection *conn)
{
@@ -89,6 +90,7 @@ void rds_connect_complete(struct rds_connection *conn)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
+EXPORT_SYMBOL_GPL(rds_connect_complete);
/*
* This random exponential backoff is relied on to eventually resolve racing
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 767da61ad2f..7e106790135 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -37,7 +37,7 @@
#include "rds.h"
#include "loop.h"
-static LIST_HEAD(rds_transports);
+static struct rds_transport *transports[RDS_TRANS_COUNT];
static DECLARE_RWSEM(rds_trans_sem);
int rds_trans_register(struct rds_transport *trans)
@@ -46,36 +46,44 @@ int rds_trans_register(struct rds_transport *trans)
down_write(&rds_trans_sem);
- list_add_tail(&trans->t_item, &rds_transports);
- printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
+ if (transports[trans->t_type])
+ printk(KERN_ERR "RDS Transport type %d already registered\n",
+ trans->t_type);
+ else {
+ transports[trans->t_type] = trans;
+ printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
+ }
up_write(&rds_trans_sem);
return 0;
}
+EXPORT_SYMBOL_GPL(rds_trans_register);
void rds_trans_unregister(struct rds_transport *trans)
{
down_write(&rds_trans_sem);
- list_del_init(&trans->t_item);
+ transports[trans->t_type] = NULL;
printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
up_write(&rds_trans_sem);
}
+EXPORT_SYMBOL_GPL(rds_trans_unregister);
struct rds_transport *rds_trans_get_preferred(__be32 addr)
{
- struct rds_transport *trans;
struct rds_transport *ret = NULL;
+ int i;
if (IN_LOOPBACK(ntohl(addr)))
return &rds_loop_transport;
down_read(&rds_trans_sem);
- list_for_each_entry(trans, &rds_transports, t_item) {
- if (trans->laddr_check(addr) == 0) {
- ret = trans;
+ for (i = 0; i < RDS_TRANS_COUNT; i++)
+ {
+ if (transports[i] && (transports[i]->laddr_check(addr) == 0)) {
+ ret = transports[i];
break;
}
}
@@ -97,12 +105,15 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
struct rds_transport *trans;
unsigned int total = 0;
unsigned int part;
+ int i;
rds_info_iter_unmap(iter);
down_read(&rds_trans_sem);
- list_for_each_entry(trans, &rds_transports, t_item) {
- if (trans->stats_info_copy == NULL)
+ for (i = 0; i < RDS_TRANS_COUNT; i++)
+ {
+ trans = transports[i];
+ if (!trans || !trans->stats_info_copy)
continue;
part = trans->stats_info_copy(iter, avail);