Diffstat (limited to 'net')
-rw-r--r--  net/bluetooth/rfcomm/tty.c         |   2
-rw-r--r--  net/core/skbuff.c                  |   3
-rw-r--r--  net/ipv4/Kconfig                   |   6
-rw-r--r--  net/ipv4/arp.c                     |   2
-rw-r--r--  net/ipv4/ipconfig.c                |  16
-rw-r--r--  net/ipv4/ipvs/Kconfig              |   4
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c         |   2
-rw-r--r--  net/ipv4/netfilter/Kconfig         |   2
-rw-r--r--  net/ipv4/tcp_probe.c               |   6
-rw-r--r--  net/irda/ircomm/ircomm_tty.c       |   2
-rw-r--r--  net/rxrpc/transport.c              |   3
-rw-r--r--  net/socket.c                       |  87
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c     |   2
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c  |  67
-rw-r--r--  net/sunrpc/clnt.c                  |   4
-rw-r--r--  net/sunrpc/rpc_pipe.c              |   6
-rw-r--r--  net/sunrpc/sunrpc_syms.c           |   2
-rw-r--r--  net/sunrpc/svc.c                   | 536
-rw-r--r--  net/sunrpc/svcauth_unix.c          |  52
-rw-r--r--  net/sunrpc/svcsock.c               | 426
20 files changed, 928 insertions, 302 deletions
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 26f322737db..1958ad1b854 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -1011,7 +1011,7 @@ static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsign /* ---- TTY structure ---- */ -static struct tty_operations rfcomm_ops = { +static const struct tty_operations rfcomm_ops = { .open = rfcomm_tty_open, .close = rfcomm_tty_close, .write = rfcomm_tty_write, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c448c7f6fde..3c23760c582 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -156,7 +156,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, /* Get the DATA. Size must match skb_add_mtu(). */ size = SKB_DATA_ALIGN(size); - data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + data = kmalloc_track_caller(size + sizeof(struct skb_shared_info), + gfp_mask); if (!data) goto nodata; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 30af4a4dfcc..d172a980444 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -64,7 +64,7 @@ config ASK_IP_FIB_HASH config IP_FIB_TRIE bool "FIB_TRIE" ---help--- - Use new experimental LC-trie as FIB lookup algoritm. + Use new experimental LC-trie as FIB lookup algorithm. This improves lookup performance if you have a large number of routes. @@ -526,7 +526,7 @@ config TCP_CONG_HYBLA ---help--- TCP-Hybla is a sender-side only change that eliminates penalization of long-RTT, large-bandwidth connections, like when satellite legs are - involved, expecially when sharing a common bottleneck with normal + involved, especially when sharing a common bottleneck with normal terrestrial connections. config TCP_CONG_VEGAS @@ -556,7 +556,7 @@ config TCP_CONG_LP default n ---help--- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is - to utiliza only the excess network bandwidth as compared to the + to utilize only the excess network bandwidth as compared to the ``fair share`` of bandwidth as targeted by TCP. 
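In the skbuff hunk above, ____kmalloc() becomes kmalloc_track_caller(), so that slab debugging attributes the data buffer to the code that asked for the skb rather than to the allocation helper itself. A rough userspace sketch of the same trick; alloc_track_caller(), my_alloc() and make_buffer() are invented names, not kernel interfaces:

#include <stdio.h>
#include <stdlib.h>

static void *alloc_track_caller(size_t size, void *caller)
{
        void *p = malloc(size);

        fprintf(stderr, "%zu bytes allocated for caller %p\n", size, caller);
        return p;
}

/* the macro expands in its caller, so __builtin_return_address(0) names that caller's caller */
#define my_alloc(size) alloc_track_caller((size), __builtin_return_address(0))

static void *make_buffer(void)
{
        return my_alloc(128);   /* attributed to whoever called make_buffer() */
}

int main(void)
{
        free(make_buffer());
        return 0;
}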
See http://www-ece.rice.edu/networks/TCP-LP/ diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index cfe5c847428..cfb5d3de9c8 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,4 +1,4 @@ -/* linux/net/inet/arp.c +/* linux/net/ipv4/arp.c * * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $ * diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 1fbb38415b1..f8ce8475915 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -366,7 +366,7 @@ static int __init ic_defaults(void) */ if (!ic_host_name_set) - sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + sprintf(init_utsname()->nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); if (root_server_addr == INADDR_NONE) root_server_addr = ic_servaddr; @@ -805,7 +805,7 @@ static void __init ic_do_bootp_ext(u8 *ext) } break; case 12: /* Host name */ - ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN); + ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN); ic_host_name_set = 1; break; case 15: /* Domain name (DNS) */ @@ -816,7 +816,7 @@ static void __init ic_do_bootp_ext(u8 *ext) ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); break; case 40: /* NIS Domain name (_not_ DNS) */ - ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN); + ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); break; } } @@ -1368,7 +1368,7 @@ static int __init ip_auto_config(void) printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask)); printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway)); printk(",\n host=%s, domain=%s, nis-domain=%s", - system_utsname.nodename, ic_domain, system_utsname.domainname); + utsname()->nodename, ic_domain, utsname()->domainname); printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr)); printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr)); printk(", rootpath=%s", root_server_path); @@ -1478,11 +1478,11 @@ static int __init ip_auto_config_setup(char *addrs) case 4: if ((dp = strchr(ip, '.'))) { *dp++ = '\0'; - strlcpy(system_utsname.domainname, dp, - sizeof(system_utsname.domainname)); + strlcpy(utsname()->domainname, dp, + sizeof(utsname()->domainname)); } - strlcpy(system_utsname.nodename, ip, - sizeof(system_utsname.nodename)); + strlcpy(utsname()->nodename, ip, + sizeof(utsname()->nodename)); ic_host_name_set = 1; break; case 5: diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index c9820bfc493..891b9355cf9 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -81,7 +81,7 @@ config IP_VS_PROTO_ESP bool "ESP load balancing support" depends on IP_VS ---help--- - This option enables support for load balancing ESP (Encapsultion + This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. Say Y if unsure. config IP_VS_PROTO_AH @@ -204,7 +204,7 @@ config IP_VS_SED connections to the server with the shortest expected delay. The expected delay that the job will experience is (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of connections - on the the ith server and Ui is the fixed service rate (weight) + on the ith server and Ui is the fixed service rate (weight) of the ith server. If you want to compile it in kernel, say Y. 
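The rxrpc hunk above folds a kmalloc()+memcpy() pair into kmemdup(), which simply allocates and copies in one step. For reference, a minimal userspace equivalent; memdup() here is an invented name:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* allocate len bytes and copy src into them; NULL on allocation failure */
static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

int main(void)
{
        const char msg[] = "jumbo packet header";
        char *copy = memdup(msg, sizeof(msg));

        if (copy)
                printf("%s\n", copy);
        free(copy);
        return 0;
}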
To compile it as a diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 6ab57d72b61..91a075edd68 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -836,7 +836,7 @@ static int fork_sync_thread(void *startup) int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) { - DECLARE_COMPLETION(startup); + DECLARE_COMPLETION_ONSTACK(startup); pid_t pid; if ((state == IP_VS_STATE_MASTER && sync_master_pid) || diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a55b8ff70de..d88c292f118 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -373,7 +373,7 @@ config IP_NF_TARGET_ULOG daemon using netlink multicast sockets; unlike the LOG target which can only be viewed through syslog. - The apropriate userspace logging daemon (ulogd) may be obtained from + The appropriate userspace logging daemon (ulogd) may be obtained from <http://www.gnumonks.org/projects/ulogd/> To compile it as a module, choose M here. If unsure, say N. diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index dab37d2f65f..4be336f1788 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -99,8 +99,10 @@ static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, } static struct jprobe tcp_send_probe = { - .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, - .entry = (kprobe_opcode_t *) &jtcp_sendmsg, + .kp = { + .symbol_name = "tcp_sendmsg", + }, + .entry = JPROBE_ENTRY(jtcp_sendmsg), }; diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 3bcdb467efc..d50a02030ad 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -79,7 +79,7 @@ static struct tty_driver *driver; hashbin_t *ircomm_tty = NULL; -static struct tty_operations ops = { +static const struct tty_operations ops = { .open = ircomm_tty_open, .close = ircomm_tty_close, .write = ircomm_tty_write, diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 465efc86fcc..94b2e2fe6fd 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -381,11 +381,10 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans, /* allocate a new message record */ ret = -ENOMEM; - msg = kmalloc(sizeof(struct rxrpc_message), GFP_KERNEL); + msg = kmemdup(jumbomsg, sizeof(struct rxrpc_message), GFP_KERNEL); if (!msg) goto error; - memcpy(msg, jumbomsg, sizeof(*msg)); list_add_tail(&msg->link, msgq); /* adjust the jumbo packet */ diff --git a/net/socket.c b/net/socket.c index 1bc4167e0da..6c9b9b326d7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -95,10 +95,10 @@ #include <linux/netfilter.h> static int sock_no_open(struct inode *irrelevant, struct file *dontcare); -static ssize_t sock_aio_read(struct kiocb *iocb, char __user *buf, - size_t size, loff_t pos); -static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *buf, - size_t size, loff_t pos); +static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); @@ -110,10 +110,6 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #endif static int sock_fasync(int fd, struct file *filp, int on); -static ssize_t sock_readv(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos); -static ssize_t 
sock_writev(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more); @@ -136,8 +132,6 @@ static struct file_operations socket_file_ops = { .open = sock_no_open, /* special open code to disallow open via /proc */ .release = sock_close, .fasync = sock_fasync, - .readv = sock_readv, - .writev = sock_writev, .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, }; @@ -664,7 +658,6 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, } static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, - char __user *ubuf, size_t size, struct sock_iocb *siocb) { if (!is_sync_kiocb(iocb)) { @@ -675,16 +668,13 @@ static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, } siocb->kiocb = iocb; - siocb->async_iov.iov_base = ubuf; - siocb->async_iov.iov_len = size; - iocb->private = siocb; return siocb; } static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, - struct file *file, struct iovec *iov, - unsigned long nr_segs) + struct file *file, const struct iovec *iov, + unsigned long nr_segs) { struct socket *sock = file->private_data; size_t size = 0; @@ -704,43 +694,27 @@ static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); } -static ssize_t sock_readv(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb iocb; - struct sock_iocb siocb; - struct msghdr msg; - int ret; - - init_sync_kiocb(&iocb, NULL); - iocb.private = &siocb; - - ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); - return ret; -} - -static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, - size_t count, loff_t pos) +static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct sock_iocb siocb, *x; if (pos != 0) return -ESPIPE; - if (count == 0) /* Match SYS5 behaviour */ + + if (iocb->ki_left == 0) /* Match SYS5 behaviour */ return 0; - x = alloc_sock_iocb(iocb, ubuf, count, &siocb); + + x = alloc_sock_iocb(iocb, &siocb); if (!x) return -ENOMEM; - return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, - &x->async_iov, 1); + return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); } static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, - struct file *file, struct iovec *iov, - unsigned long nr_segs) + struct file *file, const struct iovec *iov, + unsigned long nr_segs) { struct socket *sock = file->private_data; size_t size = 0; @@ -762,39 +736,22 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, return __sock_sendmsg(iocb, sock, msg, size); } -static ssize_t sock_writev(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct msghdr msg; - struct kiocb iocb; - struct sock_iocb siocb; - int ret; - - init_sync_kiocb(&iocb, NULL); - iocb.private = &siocb; - - ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); - return ret; -} - -static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, - size_t count, loff_t pos) +static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct sock_iocb siocb, *x; if (pos != 0) return -ESPIPE; - if (count == 
0) /* Match SYS5 behaviour */ + + if (iocb->ki_left == 0) /* Match SYS5 behaviour */ return 0; - x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb); + x = alloc_sock_iocb(iocb, &siocb); if (!x) return -ENOMEM; - return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, - &x->async_iov, 1); + return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); } /* @@ -868,7 +825,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; case FIOGETOWN: case SIOCGPGRP: - err = put_user(sock->file->f_owner.pid, + err = put_user(f_getown(sock->file), (int __user *)argp); break; case SIOCGIFBR: diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index a6ed2d22a6e..b36b9463f5a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1,5 +1,5 @@ /* - * linux/net/sunrpc/auth_gss.c + * linux/net/sunrpc/auth_gss/auth_gss.c * * RPCSEC_GSS client authentication. * diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 638c0b57620..447d9aef460 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -903,9 +903,9 @@ out_seq: struct gss_svc_data { /* decoded gss client cred: */ struct rpc_gss_wire_cred clcred; - /* pointer to the beginning of the procedure-specific results, - * which may be encrypted/checksummed in svcauth_gss_release: */ - __be32 *body_start; + /* save a pointer to the beginning of the encoded verifier, + * for use in encryption/checksumming in svcauth_gss_release: */ + __be32 *verf_start; struct rsc *rsci; }; @@ -968,7 +968,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; - svcdata->body_start = NULL; + svcdata->verf_start = NULL; svcdata->rsci = NULL; gc = &svcdata->clcred; @@ -1097,6 +1097,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) goto complete; case RPC_GSS_PROC_DATA: *authp = rpcsec_gsserr_ctxproblem; + svcdata->verf_start = resv->iov_base + resv->iov_len; if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; rqstp->rq_cred = rsci->cred; @@ -1110,7 +1111,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) gc->gc_seq, rsci->mechctx)) goto auth_err; /* placeholders for length and seq. number: */ - svcdata->body_start = resv->iov_base + resv->iov_len; svc_putnl(resv, 0); svc_putnl(resv, 0); break; @@ -1119,7 +1119,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) gc->gc_seq, rsci->mechctx)) goto auth_err; /* placeholders for length and seq. 
number: */ - svcdata->body_start = resv->iov_base + resv->iov_len; svc_putnl(resv, 0); svc_putnl(resv, 0); break; @@ -1147,6 +1146,32 @@ out: return ret; } +u32 * +svcauth_gss_prepare_to_wrap(struct xdr_buf *resbuf, struct gss_svc_data *gsd) +{ + u32 *p, verf_len; + + p = gsd->verf_start; + gsd->verf_start = NULL; + + /* If the reply stat is nonzero, don't wrap: */ + if (*(p-1) != rpc_success) + return NULL; + /* Skip the verifier: */ + p += 1; + verf_len = ntohl(*p++); + p += XDR_QUADLEN(verf_len); + /* move accept_stat to right place: */ + memcpy(p, p + 2, 4); + /* Also don't wrap if the accept stat is nonzero: */ + if (*p != rpc_success) { + resbuf->head[0].iov_len -= 2 * 4; + return NULL; + } + p++; + return p; +} + static inline int svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) { @@ -1160,17 +1185,9 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) int integ_offset, integ_len; int stat = -EINVAL; - p = gsd->body_start; - gsd->body_start = NULL; - /* move accept_stat to right place: */ - memcpy(p, p + 2, 4); - /* Don't wrap in failure case: */ - /* Counting on not getting here if call was not even accepted! */ - if (*p != rpc_success) { - resbuf->head[0].iov_len -= 2 * 4; + p = svcauth_gss_prepare_to_wrap(resbuf, gsd); + if (p == NULL) goto out; - } - p++; integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; integ_len = resbuf->len - integ_offset; BUG_ON(integ_len % 4); @@ -1191,7 +1208,6 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) resbuf->tail[0].iov_base = resbuf->head[0].iov_base + resbuf->head[0].iov_len; resbuf->tail[0].iov_len = 0; - rqstp->rq_restailpage = 0; resv = &resbuf->tail[0]; } else { resv = &resbuf->tail[0]; @@ -1223,24 +1239,16 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) int offset; int pad; - p = gsd->body_start; - gsd->body_start = NULL; - /* move accept_stat to right place: */ - memcpy(p, p + 2, 4); - /* Don't wrap in failure case: */ - /* Counting on not getting here if call was not even accepted! */ - if (*p != rpc_success) { - resbuf->head[0].iov_len -= 2 * 4; + p = svcauth_gss_prepare_to_wrap(resbuf, gsd); + if (p == NULL) return 0; - } - p++; len = p++; offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base; *p++ = htonl(gc->gc_seq); inpages = resbuf->pages; /* XXX: Would be better to write some xdr helper functions for * nfs{2,3,4}xdr.c that place the data right, instead of copying: */ - if (resbuf->tail[0].iov_base && rqstp->rq_restailpage == 0) { + if (resbuf->tail[0].iov_base) { BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base + PAGE_SIZE); BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base); @@ -1258,7 +1266,6 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) resbuf->tail[0].iov_base = resbuf->head[0].iov_base + resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE; resbuf->tail[0].iov_len = 0; - rqstp->rq_restailpage = 0; } if (gss_wrap(gsd->rsci->mechctx, offset, resbuf, inpages)) return -ENOMEM; @@ -1282,7 +1289,7 @@ svcauth_gss_release(struct svc_rqst *rqstp) if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; /* Release can be called twice, but we only wrap once. */ - if (gsd->body_start == NULL) + if (gsd->verf_start == NULL) goto out; /* normally not set till svc_send, but we need it here: */ /* XXX: what for? 
Do we mess it up the moment we call svc_putu32 diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 124ff0ceb55..78696f2dc7d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -161,10 +161,10 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s } /* save the nodename */ - clnt->cl_nodelen = strlen(system_utsname.nodename); + clnt->cl_nodelen = strlen(utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); return clnt; out_no_auth: diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 700c6e061a0..9a0b41a97f9 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -494,7 +494,7 @@ rpc_get_inode(struct super_block *sb, int mode) case S_IFDIR: inode->i_fop = &simple_dir_operations; inode->i_op = &simple_dir_inode_operations; - inode->i_nlink++; + inc_nlink(inode); default: break; } @@ -571,7 +571,7 @@ rpc_populate(struct dentry *parent, if (private) rpc_inode_setowner(inode, private); if (S_ISDIR(mode)) - dir->i_nlink++; + inc_nlink(dir); d_add(dentry, inode); } mutex_unlock(&dir->i_mutex); @@ -593,7 +593,7 @@ __rpc_mkdir(struct inode *dir, struct dentry *dentry) goto out_err; inode->i_ino = iunique(dir->i_sb, 100); d_instantiate(dentry, inode); - dir->i_nlink++; + inc_nlink(dir); inode_dir_notify(dir, DN_CREATE); return 0; out_err: diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 26c0531d7e2..192dff5dabc 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -70,6 +70,8 @@ EXPORT_SYMBOL(put_rpccred); /* RPC server stuff */ EXPORT_SYMBOL(svc_create); EXPORT_SYMBOL(svc_create_thread); +EXPORT_SYMBOL(svc_create_pooled); +EXPORT_SYMBOL(svc_set_num_threads); EXPORT_SYMBOL(svc_exit_thread); EXPORT_SYMBOL(svc_destroy); EXPORT_SYMBOL(svc_drop); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 44b8d9d4c18..c2c8bb20d07 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -4,6 +4,10 @@ * High-level RPC service routines * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + * + * Multiple threads pools and NUMAisation + * Copyright (c) 2006 Silicon Graphics, Inc. + * by Greg Banks <gnb@melbourne.sgi.com> */ #include <linux/linkage.h> @@ -12,6 +16,8 @@ #include <linux/net.h> #include <linux/in.h> #include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/module.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/xdr.h> @@ -23,14 +29,252 @@ #define RPC_PARANOIA 1 /* + * Mode for mapping cpus to pools. + */ +enum { + SVC_POOL_NONE = -1, /* uninitialised, choose one of the others */ + SVC_POOL_GLOBAL, /* no mapping, just a single global pool + * (legacy & UP mode) */ + SVC_POOL_PERCPU, /* one pool per cpu */ + SVC_POOL_PERNODE /* one pool per numa node */ +}; + +/* + * Structure for mapping cpus to pools and vice versa. + * Setup once during sunrpc initialisation. + */ +static struct svc_pool_map { + int mode; /* Note: int not enum to avoid + * warnings about "enumeration value + * not handled in switch" */ + unsigned int npools; + unsigned int *pool_to; /* maps pool id to cpu or node */ + unsigned int *to_pool; /* maps cpu or node to pool id */ +} svc_pool_map = { + .mode = SVC_POOL_NONE +}; + + +/* + * Detect best pool mapping mode heuristically, + * according to the machine's topology. 
+ */ +static int +svc_pool_map_choose_mode(void) +{ + unsigned int node; + + if (num_online_nodes() > 1) { + /* + * Actually have multiple NUMA nodes, + * so split pools on NUMA node boundaries + */ + return SVC_POOL_PERNODE; + } + + node = any_online_node(node_online_map); + if (nr_cpus_node(node) > 2) { + /* + * Non-trivial SMP, or CONFIG_NUMA on + * non-NUMA hardware, e.g. with a generic + * x86_64 kernel on Xeons. In this case we + * want to divide the pools on cpu boundaries. + */ + return SVC_POOL_PERCPU; + } + + /* default: one global pool */ + return SVC_POOL_GLOBAL; +} + +/* + * Allocate the to_pool[] and pool_to[] arrays. + * Returns 0 on success or an errno. + */ +static int +svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools) +{ + m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); + if (!m->to_pool) + goto fail; + m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); + if (!m->pool_to) + goto fail_free; + + return 0; + +fail_free: + kfree(m->to_pool); +fail: + return -ENOMEM; +} + +/* + * Initialise the pool map for SVC_POOL_PERCPU mode. + * Returns number of pools or <0 on error. + */ +static int +svc_pool_map_init_percpu(struct svc_pool_map *m) +{ + unsigned int maxpools = highest_possible_processor_id()+1; + unsigned int pidx = 0; + unsigned int cpu; + int err; + + err = svc_pool_map_alloc_arrays(m, maxpools); + if (err) + return err; + + for_each_online_cpu(cpu) { + BUG_ON(pidx > maxpools); + m->to_pool[cpu] = pidx; + m->pool_to[pidx] = cpu; + pidx++; + } + /* cpus brought online later all get mapped to pool0, sorry */ + + return pidx; +}; + + +/* + * Initialise the pool map for SVC_POOL_PERNODE mode. + * Returns number of pools or <0 on error. + */ +static int +svc_pool_map_init_pernode(struct svc_pool_map *m) +{ + unsigned int maxpools = highest_possible_node_id()+1; + unsigned int pidx = 0; + unsigned int node; + int err; + + err = svc_pool_map_alloc_arrays(m, maxpools); + if (err) + return err; + + for_each_node_with_cpus(node) { + /* some architectures (e.g. SN2) have cpuless nodes */ + BUG_ON(pidx > maxpools); + m->to_pool[node] = pidx; + m->pool_to[pidx] = node; + pidx++; + } + /* nodes brought online later all get mapped to pool0, sorry */ + + return pidx; +} + + +/* + * Build the global map of cpus to pools and vice versa. + */ +static unsigned int +svc_pool_map_init(void) +{ + struct svc_pool_map *m = &svc_pool_map; + int npools = -1; + + if (m->mode != SVC_POOL_NONE) + return m->npools; + + m->mode = svc_pool_map_choose_mode(); + + switch (m->mode) { + case SVC_POOL_PERCPU: + npools = svc_pool_map_init_percpu(m); + break; + case SVC_POOL_PERNODE: + npools = svc_pool_map_init_pernode(m); + break; + } + + if (npools < 0) { + /* default, or memory allocation failure */ + npools = 1; + m->mode = SVC_POOL_GLOBAL; + } + m->npools = npools; + + return m->npools; +} + +/* + * Set the current thread's cpus_allowed mask so that it + * will only run on cpus in the given pool. + * + * Returns 1 and fills in oldmask iff a cpumask was applied. + */ +static inline int +svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) +{ + struct svc_pool_map *m = &svc_pool_map; + unsigned int node; /* or cpu */ + + /* + * The caller checks for sv_nrpools > 1, which + * implies that we've been initialized and the + * map mode is not NONE. 
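The pool map being built above is nothing more than two flat arrays filled in from the online cpu (or node) list. A standalone sketch of the per-cpu case, with invented names and a hard-coded topology standing in for the kernel's cpumask iterators:

#include <stdio.h>
#include <stdlib.h>

struct pool_map {
        unsigned int npools;
        unsigned int *to_pool;  /* cpu id  -> pool id */
        unsigned int *pool_to;  /* pool id -> cpu id  */
};

static int pool_map_init_percpu(struct pool_map *m, const int *online,
                                unsigned int maxcpus)
{
        unsigned int cpu, pidx = 0;

        m->to_pool = calloc(maxcpus, sizeof(*m->to_pool));
        m->pool_to = calloc(maxcpus, sizeof(*m->pool_to));
        if (!m->to_pool || !m->pool_to)
                return -1;

        for (cpu = 0; cpu < maxcpus; cpu++) {
                if (!online[cpu])
                        continue;       /* offline cpus implicitly map to pool 0 */
                m->to_pool[cpu] = pidx;
                m->pool_to[pidx] = cpu;
                pidx++;
        }
        m->npools = pidx;
        return 0;
}

int main(void)
{
        int online[4] = { 1, 1, 0, 1 }; /* pretend cpu2 is offline */
        struct pool_map m;

        if (pool_map_init_percpu(&m, online, 4) == 0)
                printf("%u pools, cpu3 served by pool %u\n",
                       m.npools, m.to_pool[3]);
        return 0;
}

Incoming requests are then steered with to_pool[cpu], taken modulo the number of pools as a guard, which is what svc_pool_for_cpu() does further down.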
+ */ + BUG_ON(m->mode == SVC_POOL_NONE); + + switch (m->mode) + { + default: + return 0; + case SVC_POOL_PERCPU: + node = m->pool_to[pidx]; + *oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(node)); + return 1; + case SVC_POOL_PERNODE: + node = m->pool_to[pidx]; + *oldmask = current->cpus_allowed; + set_cpus_allowed(current, node_to_cpumask(node)); + return 1; + } +} + +/* + * Use the mapping mode to choose a pool for a given CPU. + * Used when enqueueing an incoming RPC. Always returns + * a non-NULL pool pointer. + */ +struct svc_pool * +svc_pool_for_cpu(struct svc_serv *serv, int cpu) +{ + struct svc_pool_map *m = &svc_pool_map; + unsigned int pidx = 0; + + /* + * SVC_POOL_NONE happens in a pure client when + * lockd is brought up, so silently treat it the + * same as SVC_POOL_GLOBAL. + */ + + switch (m->mode) { + case SVC_POOL_PERCPU: + pidx = m->to_pool[cpu]; + break; + case SVC_POOL_PERNODE: + pidx = m->to_pool[cpu_to_node(cpu)]; + break; + } + return &serv->sv_pools[pidx % serv->sv_nrpools]; +} + + +/* * Create an RPC service */ -struct svc_serv * -svc_create(struct svc_program *prog, unsigned int bufsize) +static struct svc_serv * +__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, + void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; int vers; unsigned int xdrsize; + unsigned int i; if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; @@ -39,6 +283,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize) serv->sv_nrthreads = 1; serv->sv_stats = prog->pg_stats; serv->sv_bufsz = bufsize? bufsize : 4096; + serv->sv_shutdown = shutdown; xdrsize = 0; while (prog) { prog->pg_lovers = prog->pg_nvers-1; @@ -53,20 +298,68 @@ svc_create(struct svc_program *prog, unsigned int bufsize) prog = prog->pg_next; } serv->sv_xdrsize = xdrsize; - INIT_LIST_HEAD(&serv->sv_threads); - INIT_LIST_HEAD(&serv->sv_sockets); INIT_LIST_HEAD(&serv->sv_tempsocks); INIT_LIST_HEAD(&serv->sv_permsocks); + init_timer(&serv->sv_temptimer); spin_lock_init(&serv->sv_lock); + serv->sv_nrpools = npools; + serv->sv_pools = + kcalloc(sizeof(struct svc_pool), serv->sv_nrpools, + GFP_KERNEL); + if (!serv->sv_pools) { + kfree(serv); + return NULL; + } + + for (i = 0; i < serv->sv_nrpools; i++) { + struct svc_pool *pool = &serv->sv_pools[i]; + + dprintk("initialising pool %u for %s\n", + i, serv->sv_name); + + pool->sp_id = i; + INIT_LIST_HEAD(&pool->sp_threads); + INIT_LIST_HEAD(&pool->sp_sockets); + INIT_LIST_HEAD(&pool->sp_all_threads); + spin_lock_init(&pool->sp_lock); + } + + /* Remove any stale portmap registrations */ svc_register(serv, 0, 0); return serv; } +struct svc_serv * +svc_create(struct svc_program *prog, unsigned int bufsize, + void (*shutdown)(struct svc_serv *serv)) +{ + return __svc_create(prog, bufsize, /*npools*/1, shutdown); +} + +struct svc_serv * +svc_create_pooled(struct svc_program *prog, unsigned int bufsize, + void (*shutdown)(struct svc_serv *serv), + svc_thread_fn func, int sig, struct module *mod) +{ + struct svc_serv *serv; + unsigned int npools = svc_pool_map_init(); + + serv = __svc_create(prog, bufsize, npools, shutdown); + + if (serv != NULL) { + serv->sv_function = func; + serv->sv_kill_signal = sig; + serv->sv_module = mod; + } + + return serv; +} + /* - * Destroy an RPC service + * Destroy an RPC service. 
Should be called with the BKL held */ void svc_destroy(struct svc_serv *serv) @@ -85,12 +378,17 @@ svc_destroy(struct svc_serv *serv) } else printk("svc_destroy: no threads for serv=%p!\n", serv); + del_timer_sync(&serv->sv_temptimer); + while (!list_empty(&serv->sv_tempsocks)) { svsk = list_entry(serv->sv_tempsocks.next, struct svc_sock, sk_list); svc_delete_socket(svsk); } + if (serv->sv_shutdown) + serv->sv_shutdown(serv); + while (!list_empty(&serv->sv_permsocks)) { svsk = list_entry(serv->sv_permsocks.next, struct svc_sock, @@ -102,6 +400,7 @@ svc_destroy(struct svc_serv *serv) /* Unregister service with the portmapper */ svc_register(serv, 0, 0); + kfree(serv->sv_pools); kfree(serv); } @@ -118,18 +417,15 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) if (size > RPCSVC_MAXPAYLOAD) size = RPCSVC_MAXPAYLOAD; pages = 2 + (size+ PAGE_SIZE -1) / PAGE_SIZE; - rqstp->rq_argused = 0; - rqstp->rq_resused = 0; arghi = 0; BUG_ON(pages > RPCSVC_MAXPAGES); while (pages) { struct page *p = alloc_page(GFP_KERNEL); if (!p) break; - rqstp->rq_argpages[arghi++] = p; + rqstp->rq_pages[arghi++] = p; pages--; } - rqstp->rq_arghi = arghi; return ! pages; } @@ -139,24 +435,25 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) static void svc_release_buffer(struct svc_rqst *rqstp) { - while (rqstp->rq_arghi) - put_page(rqstp->rq_argpages[--rqstp->rq_arghi]); - while (rqstp->rq_resused) { - if (rqstp->rq_respages[--rqstp->rq_resused] == NULL) - continue; - put_page(rqstp->rq_respages[rqstp->rq_resused]); - } - rqstp->rq_argused = 0; + int i; + for (i=0; i<ARRAY_SIZE(rqstp->rq_pages); i++) + if (rqstp->rq_pages[i]) + put_page(rqstp->rq_pages[i]); } /* - * Create a server thread + * Create a thread in the given pool. Caller must hold BKL. + * On a NUMA or SMP machine, with a multi-pool serv, the thread + * will be restricted to run on the cpus belonging to the pool. */ -int -svc_create_thread(svc_thread_fn func, struct svc_serv *serv) +static int +__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, + struct svc_pool *pool) { struct svc_rqst *rqstp; int error = -ENOMEM; + int have_oldmask = 0; + cpumask_t oldmask; rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); if (!rqstp) @@ -170,8 +467,21 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv) goto out_thread; serv->sv_nrthreads++; + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads++; + list_add(&rqstp->rq_all, &pool->sp_all_threads); + spin_unlock_bh(&pool->sp_lock); rqstp->rq_server = serv; + rqstp->rq_pool = pool; + + if (serv->sv_nrpools > 1) + have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); + error = kernel_thread((int (*)(void *)) func, rqstp, 0); + + if (have_oldmask) + set_cpus_allowed(current, oldmask); + if (error < 0) goto out_thread; svc_sock_update_bufs(serv); @@ -185,17 +495,136 @@ out_thread: } /* - * Destroy an RPC server thread + * Create a thread in the default pool. Caller must hold BKL. 
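__svc_create_thread() above saves current->cpus_allowed, narrows it to the pool's cpu or node so the forked worker inherits the restriction, and then restores the old mask. The same save/pin/restore pattern in userspace, using the glibc affinity calls; a sketch that pins to cpu0 for the demo, not the kernel path:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t oldmask, poolmask;
        pthread_t self = pthread_self();

        /* remember the unrestricted mask, like saving current->cpus_allowed */
        pthread_getaffinity_np(self, sizeof(oldmask), &oldmask);

        CPU_ZERO(&poolmask);
        CPU_SET(0, &poolmask);          /* "this pool's" cpu */
        pthread_setaffinity_np(self, sizeof(poolmask), &poolmask);

        /* a worker thread created here would inherit the restricted mask */
        printf("running pinned to cpu0\n");

        /* put the original mask back, like set_cpus_allowed(current, oldmask) */
        pthread_setaffinity_np(self, sizeof(oldmask), &oldmask);
        return 0;
}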
+ */ +int +svc_create_thread(svc_thread_fn func, struct svc_serv *serv) +{ + return __svc_create_thread(func, serv, &serv->sv_pools[0]); +} + +/* + * Choose a pool in which to create a new thread, for svc_set_num_threads + */ +static inline struct svc_pool * +choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + if (pool != NULL) + return pool; + + return &serv->sv_pools[(*state)++ % serv->sv_nrpools]; +} + +/* + * Choose a thread to kill, for svc_set_num_threads + */ +static inline struct task_struct * +choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + unsigned int i; + struct task_struct *task = NULL; + + if (pool != NULL) { + spin_lock_bh(&pool->sp_lock); + } else { + /* choose a pool in round-robin fashion */ + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_all_threads)) + goto found_pool; + spin_unlock_bh(&pool->sp_lock); + } + return NULL; + } + +found_pool: + if (!list_empty(&pool->sp_all_threads)) { + struct svc_rqst *rqstp; + + /* + * Remove from the pool->sp_all_threads list + * so we don't try to kill it again. + */ + rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); + list_del_init(&rqstp->rq_all); + task = rqstp->rq_task; + } + spin_unlock_bh(&pool->sp_lock); + + return task; +} + +/* + * Create or destroy enough new threads to make the number + * of threads the given number. If `pool' is non-NULL, applies + * only to threads in that pool, otherwise round-robins between + * all pools. Must be called with a svc_get() reference and + * the BKL held. + * + * Destroying threads relies on the service threads filling in + * rqstp->rq_task, which only the nfs ones do. Assumes the serv + * has been created using svc_create_pooled(). + * + * Based on code that used to be in nfsd_svc() but tweaked + * to be pool-aware. + */ +int +svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *victim; + int error = 0; + unsigned int state = serv->sv_nrthreads-1; + + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + /* create new threads */ + while (nrservs > 0) { + nrservs--; + __module_get(serv->sv_module); + error = __svc_create_thread(serv->sv_function, serv, + choose_pool(serv, pool, &state)); + if (error < 0) { + module_put(serv->sv_module); + break; + } + } + /* destroy old threads */ + while (nrservs < 0 && + (victim = choose_victim(serv, pool, &state)) != NULL) { + send_sig(serv->sv_kill_signal, victim, 1); + nrservs++; + } + + return error; +} + +/* + * Called from a server thread as it's exiting. Caller must hold BKL. */ void svc_exit_thread(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; svc_release_buffer(rqstp); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); + + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads--; + list_del(&rqstp->rq_all); + spin_unlock_bh(&pool->sp_lock); + kfree(rqstp); /* Release the server */ @@ -215,23 +644,32 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) unsigned long flags; int i, error = 0, dummy; - progp = serv->sv_program; - - dprintk("RPC: svc_register(%s, %s, %d)\n", - progp->pg_name, proto == IPPROTO_UDP? 
"udp" : "tcp", port); - if (!port) clear_thread_flag(TIF_SIGPENDING); - for (i = 0; i < progp->pg_nvers; i++) { - if (progp->pg_vers[i] == NULL) - continue; - error = rpc_register(progp->pg_prog, i, proto, port, &dummy); - if (error < 0) - break; - if (port && !dummy) { - error = -EACCES; - break; + for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + + dprintk("RPC: svc_register(%s, %s, %d, %d)%s\n", + progp->pg_name, + proto == IPPROTO_UDP? "udp" : "tcp", + port, + i, + progp->pg_vers[i]->vs_hidden? + " (but not telling portmap)" : ""); + + if (progp->pg_vers[i]->vs_hidden) + continue; + + error = rpc_register(progp->pg_prog, i, proto, port, &dummy); + if (error < 0) + break; + if (port && !dummy) { + error = -EACCES; + break; + } } } @@ -248,19 +686,20 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) * Process the RPC request. */ int -svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_process(struct svc_rqst *rqstp) { struct svc_program *progp; struct svc_version *versp = NULL; /* compiler food */ struct svc_procedure *procp = NULL; struct kvec * argv = &rqstp->rq_arg.head[0]; struct kvec * resv = &rqstp->rq_res.head[0]; + struct svc_serv *serv = rqstp->rq_server; kxdrproc_t xdr; __be32 *statp; u32 dir, prog, vers, proc; __be32 auth_stat, rpc_stat; int auth_res; - __be32 *accept_statp; + __be32 *reply_statp; rpc_stat = rpc_success; @@ -270,10 +709,10 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) /* setup response xdr_buf. * Initially it has just one page */ - svc_take_page(rqstp); /* must succeed */ + rqstp->rq_resused = 1; resv->iov_base = page_address(rqstp->rq_respages[0]); resv->iov_len = 0; - rqstp->rq_res.pages = rqstp->rq_respages+1; + rqstp->rq_res.pages = rqstp->rq_respages + 1; rqstp->rq_res.len = 0; rqstp->rq_res.page_base = 0; rqstp->rq_res.page_len = 0; @@ -301,7 +740,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) goto err_bad_rpc; /* Save position in case we later decide to reject: */ - accept_statp = resv->iov_base + resv->iov_len; + reply_statp = resv->iov_base + resv->iov_len; svc_putnl(resv, 0); /* ACCEPT */ @@ -449,7 +888,7 @@ err_bad_auth: dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of accept status: */ - xdr_ressize_check(rqstp, accept_statp); + xdr_ressize_check(rqstp, reply_statp); svc_putnl(resv, 1); /* REJECT */ svc_putnl(resv, 1); /* AUTH_ERROR */ svc_putnl(resv, ntohl(auth_stat)); /* status */ @@ -489,3 +928,18 @@ err_bad: svc_putnl(resv, ntohl(rpc_stat)); goto sendit; } + +/* + * Return (transport-specific) limit on the rpc payload. 
+ */ +u32 svc_max_payload(const struct svc_rqst *rqstp) +{ + int max = RPCSVC_MAXPAYLOAD_TCP; + + if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM) + max = RPCSVC_MAXPAYLOAD_UDP; + if (rqstp->rq_server->sv_bufsz < max) + max = rqstp->rq_server->sv_bufsz; + return max; +} +EXPORT_SYMBOL_GPL(svc_max_payload); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 1020d54b01d..e1bd933629f 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/hash.h> #include <linux/string.h> +#include <net/sock.h> #define RPCDBG_FACILITY RPCDBG_AUTH @@ -348,12 +349,9 @@ int auth_unix_forget_old(struct auth_domain *dom) struct auth_domain *auth_unix_lookup(struct in_addr addr) { - struct ip_map key, *ipm; + struct ip_map *ipm; struct auth_domain *rv; - strcpy(key.m_class, "nfsd"); - key.m_addr = addr; - ipm = ip_map_lookup("nfsd", addr); if (!ipm) @@ -378,6 +376,44 @@ void svcauth_unix_purge(void) cache_purge(&ip_map_cache); } +static inline struct ip_map * +ip_map_cached_get(struct svc_rqst *rqstp) +{ + struct ip_map *ipm = rqstp->rq_sock->sk_info_authunix; + if (ipm != NULL) { + if (!cache_valid(&ipm->h)) { + /* + * The entry has been invalidated since it was + * remembered, e.g. by a second mount from the + * same IP address. + */ + rqstp->rq_sock->sk_info_authunix = NULL; + cache_put(&ipm->h, &ip_map_cache); + return NULL; + } + cache_get(&ipm->h); + } + return ipm; +} + +static inline void +ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) +{ + struct svc_sock *svsk = rqstp->rq_sock; + + if (svsk->sk_sock->type == SOCK_STREAM && svsk->sk_info_authunix == NULL) + svsk->sk_info_authunix = ipm; /* newly cached, keep the reference */ + else + cache_put(&ipm->h, &ip_map_cache); +} + +void +svcauth_unix_info_release(void *info) +{ + struct ip_map *ipm = info; + cache_put(&ipm->h, &ip_map_cache); +} + static int svcauth_unix_set_client(struct svc_rqst *rqstp) { @@ -387,8 +423,10 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) if (rqstp->rq_proc == 0) return SVC_OK; - ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class, - rqstp->rq_addr.sin_addr); + ipm = ip_map_cached_get(rqstp); + if (ipm == NULL) + ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class, + rqstp->rq_addr.sin_addr); if (ipm == NULL) return SVC_DENIED; @@ -403,7 +441,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) case 0: rqstp->rq_client = &ipm->m_client->h; kref_get(&rqstp->rq_client->ref); - cache_put(&ipm->h, &ip_map_cache); + ip_map_cached_put(rqstp, ipm); break; } return SVC_OK; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5b0fe1b66a2..b39e7e2b648 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/file.h> #include <net/sock.h> #include <net/checksum.h> #include <net/ip.h> @@ -45,13 +46,16 @@ /* SMP locking strategy: * - * svc_serv->sv_lock protects most stuff for that service. + * svc_pool->sp_lock protects most of the fields of that pool. + * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. + * when both need to be taken (rare), svc_serv->sv_lock is first. + * BKL protects svc_serv->sv_nrthread. + * svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list + * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. 
* * Some flags can be set to certain values at any time * providing that certain rules are followed: * - * SK_BUSY can be set to 0 at any time. - * svc_sock_enqueue must be called afterwards * SK_CONN, SK_DATA, can be set or cleared at any time. * after a set, svc_sock_enqueue must be called. * after a clear, the socket must be read/accepted @@ -73,23 +77,30 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + /* - * Queue up an idle server thread. Must have serv->sv_lock held. + * Queue up an idle server thread. Must have pool->sp_lock held. * Note: this is really a stack rather than a queue, so that we only - * use as many different threads as we need, and the rest don't polute + * use as many different threads as we need, and the rest don't pollute * the cache. */ static inline void -svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) { - list_add(&rqstp->rq_list, &serv->sv_threads); + list_add(&rqstp->rq_list, &pool->sp_threads); } /* - * Dequeue an nfsd thread. Must have serv->sv_lock held. + * Dequeue an nfsd thread. Must have pool->sp_lock held. */ static inline void -svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) { list_del(&rqstp->rq_list); } @@ -140,7 +151,9 @@ static void svc_sock_enqueue(struct svc_sock *svsk) { struct svc_serv *serv = svsk->sk_server; + struct svc_pool *pool; struct svc_rqst *rqstp; + int cpu; if (!(svsk->sk_flags & ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) @@ -148,10 +161,14 @@ svc_sock_enqueue(struct svc_sock *svsk) if (test_bit(SK_DEAD, &svsk->sk_flags)) return; - spin_lock_bh(&serv->sv_lock); + cpu = get_cpu(); + pool = svc_pool_for_cpu(svsk->sk_server, cpu); + put_cpu(); + + spin_lock_bh(&pool->sp_lock); - if (!list_empty(&serv->sv_threads) && - !list_empty(&serv->sv_sockets)) + if (!list_empty(&pool->sp_threads) && + !list_empty(&pool->sp_sockets)) printk(KERN_ERR "svc_sock_enqueue: threads and sockets both waiting??\n"); @@ -161,73 +178,79 @@ svc_sock_enqueue(struct svc_sock *svsk) goto out_unlock; } - if (test_bit(SK_BUSY, &svsk->sk_flags)) { - /* Don't enqueue socket while daemon is receiving */ + /* Mark socket as busy. It will remain in this state until the + * server has processed all pending data and put the socket back + * on the idle list. We update SK_BUSY atomically because + * it also guards against trying to enqueue the svc_sock twice. 
+ */ + if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { + /* Don't enqueue socket while already enqueued */ dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); goto out_unlock; } + BUG_ON(svsk->sk_pool != NULL); + svsk->sk_pool = pool; set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - if (((svsk->sk_reserved + serv->sv_bufsz)*2 + if (((atomic_read(&svsk->sk_reserved) + serv->sv_bufsz)*2 > svc_sock_wspace(svsk)) && !test_bit(SK_CLOSE, &svsk->sk_flags) && !test_bit(SK_CONN, &svsk->sk_flags)) { /* Don't enqueue while not enough space for reply */ dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", - svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz, + svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_bufsz, svc_sock_wspace(svsk)); + svsk->sk_pool = NULL; + clear_bit(SK_BUSY, &svsk->sk_flags); goto out_unlock; } clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - /* Mark socket as busy. It will remain in this state until the - * server has processed all pending data and put the socket back - * on the idle list. - */ - set_bit(SK_BUSY, &svsk->sk_flags); - if (!list_empty(&serv->sv_threads)) { - rqstp = list_entry(serv->sv_threads.next, + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); dprintk("svc: socket %p served by daemon %p\n", svsk->sk_sk, rqstp); - svc_serv_dequeue(serv, rqstp); + svc_thread_dequeue(pool, rqstp); if (rqstp->rq_sock) printk(KERN_ERR "svc_sock_enqueue: server %p, rq_sock=%p!\n", rqstp, rqstp->rq_sock); rqstp->rq_sock = svsk; - svsk->sk_inuse++; + atomic_inc(&svsk->sk_inuse); rqstp->rq_reserved = serv->sv_bufsz; - svsk->sk_reserved += rqstp->rq_reserved; + atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); + BUG_ON(svsk->sk_pool != pool); wake_up(&rqstp->rq_wait); } else { dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &serv->sv_sockets); + list_add_tail(&svsk->sk_ready, &pool->sp_sockets); + BUG_ON(svsk->sk_pool != pool); } out_unlock: - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); } /* - * Dequeue the first socket. Must be called with the serv->sv_lock held. + * Dequeue the first socket. Must be called with the pool->sp_lock held. 
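The enqueue path above leans on test_and_set_bit(SK_BUSY) being a single atomic operation: whichever caller sets the bit first owns the socket and everyone else backs off, so the socket can never sit on a queue twice. The same idiom expressed with C11 atomics; try_enqueue() and mark_idle() are invented names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag busy = ATOMIC_FLAG_INIT;

/* returns 1 if we won the right to enqueue, 0 if it was already claimed */
static int try_enqueue(void)
{
        if (atomic_flag_test_and_set(&busy))
                return 0;       /* already busy: someone else enqueued it */
        /* ... add the object to the ready queue here ... */
        return 1;
}

/* called once processing is finished, like clear_bit(SK_BUSY) in svc_sock_received() */
static void mark_idle(void)
{
        atomic_flag_clear(&busy);
}

int main(void)
{
        printf("first caller enqueues: %d\n", try_enqueue());  /* 1 */
        printf("second caller backs off: %d\n", try_enqueue()); /* 0 */
        mark_idle();
        return 0;
}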
*/ static inline struct svc_sock * -svc_sock_dequeue(struct svc_serv *serv) +svc_sock_dequeue(struct svc_pool *pool) { struct svc_sock *svsk; - if (list_empty(&serv->sv_sockets)) + if (list_empty(&pool->sp_sockets)) return NULL; - svsk = list_entry(serv->sv_sockets.next, + svsk = list_entry(pool->sp_sockets.next, struct svc_sock, sk_ready); list_del_init(&svsk->sk_ready); dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, svsk->sk_inuse); + svsk->sk_sk, atomic_read(&svsk->sk_inuse)); return svsk; } @@ -241,6 +264,7 @@ svc_sock_dequeue(struct svc_serv *serv) static inline void svc_sock_received(struct svc_sock *svsk) { + svsk->sk_pool = NULL; clear_bit(SK_BUSY, &svsk->sk_flags); svc_sock_enqueue(svsk); } @@ -262,10 +286,8 @@ void svc_reserve(struct svc_rqst *rqstp, int space) if (space < rqstp->rq_reserved) { struct svc_sock *svsk = rqstp->rq_sock; - spin_lock_bh(&svsk->sk_server->sv_lock); - svsk->sk_reserved -= (rqstp->rq_reserved - space); + atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); rqstp->rq_reserved = space; - spin_unlock_bh(&svsk->sk_server->sv_lock); svc_sock_enqueue(svsk); } @@ -277,17 +299,11 @@ void svc_reserve(struct svc_rqst *rqstp, int space) static inline void svc_sock_put(struct svc_sock *svsk) { - struct svc_serv *serv = svsk->sk_server; - - spin_lock_bh(&serv->sv_lock); - if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { - spin_unlock_bh(&serv->sv_lock); + if (atomic_dec_and_test(&svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { dprintk("svc: releasing dead socket\n"); sock_release(svsk->sk_sock); kfree(svsk); } - else - spin_unlock_bh(&serv->sv_lock); } static void @@ -297,7 +313,7 @@ svc_sock_release(struct svc_rqst *rqstp) svc_release_skb(rqstp); - svc_free_allpages(rqstp); + svc_free_res_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -321,25 +337,33 @@ svc_sock_release(struct svc_rqst *rqstp) /* * External function to wake up a server waiting for data + * This really only makes sense for services like lockd + * which have exactly one thread anyway. 
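svc_sock_put() above becomes lock-free: sk_inuse is now an atomic_t and whoever drops the final reference does the cleanup (the kernel version additionally requires SK_DEAD to be set). A compact illustration of the same get/put pattern with C11 atomics, on a toy object rather than a svc_sock:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        atomic_int refcount;
};

static void obj_get(struct obj *o)
{
        atomic_fetch_add(&o->refcount, 1);
}

static void obj_put(struct obj *o)
{
        /* the thread that drops the last reference does the cleanup */
        if (atomic_fetch_sub(&o->refcount, 1) == 1) {
                printf("last put, releasing\n");
                free(o);
        }
}

int main(void)
{
        struct obj *o = malloc(sizeof(*o));

        atomic_init(&o->refcount, 1);   /* creator holds one reference */
        obj_get(o);                     /* e.g. handing it to a worker */
        obj_put(o);                     /* worker done */
        obj_put(o);                     /* creator done: freed here */
        return 0;
}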
*/ void svc_wake_up(struct svc_serv *serv) { struct svc_rqst *rqstp; - - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_threads)) { - rqstp = list_entry(serv->sv_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: daemon %p woken up.\n", rqstp); - /* - svc_serv_dequeue(serv, rqstp); - rqstp->rq_sock = NULL; - */ - wake_up(&rqstp->rq_wait); + unsigned int i; + struct svc_pool *pool; + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_thread_dequeue(pool, rqstp); + rqstp->rq_sock = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&pool->sp_lock); } - spin_unlock_bh(&serv->sv_lock); } /* @@ -388,7 +412,8 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) /* send head */ if (slen == xdr->head[0].iov_len) flags = 0; - len = kernel_sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags); + len = kernel_sendpage(sock, rqstp->rq_respages[0], 0, + xdr->head[0].iov_len, flags); if (len != xdr->head[0].iov_len) goto out; slen -= xdr->head[0].iov_len; @@ -413,8 +438,9 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) } /* send tail */ if (xdr->tail[0].iov_len) { - result = kernel_sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage], - ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1), + result = kernel_sendpage(sock, rqstp->rq_respages[0], + ((unsigned long)xdr->tail[0].iov_base) + & (PAGE_SIZE-1), xdr->tail[0].iov_len, 0); if (result > 0) @@ -429,6 +455,56 @@ out: } /* + * Report socket names for nfsdfs + */ +static int one_sock_name(char *buf, struct svc_sock *svsk) +{ + int len; + + switch(svsk->sk_sk->sk_family) { + case AF_INET: + len = sprintf(buf, "ipv4 %s %u.%u.%u.%u %d\n", + svsk->sk_sk->sk_protocol==IPPROTO_UDP? + "udp" : "tcp", + NIPQUAD(inet_sk(svsk->sk_sk)->rcv_saddr), + inet_sk(svsk->sk_sk)->num); + break; + default: + len = sprintf(buf, "*unknown-%d*\n", + svsk->sk_sk->sk_family); + } + return len; +} + +int +svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) +{ + struct svc_sock *svsk, *closesk = NULL; + int len = 0; + + if (!serv) + return 0; + spin_lock(&serv->sv_lock); + list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { + int onelen = one_sock_name(buf+len, svsk); + if (toclose && strcmp(toclose, buf+len) == 0) + closesk = svsk; + else + len += onelen; + } + spin_unlock(&serv->sv_lock); + if (closesk) + /* Should unregister with portmap, but you cannot + * unregister just one protocol... + */ + svc_delete_socket(closesk); + else if (toclose) + return -ENOENT; + return len; +} +EXPORT_SYMBOL(svc_sock_names); + +/* * Check input queue length */ static int @@ -557,7 +633,10 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* udp sockets need large rcvbuf as all pending * requests are still in that buffer. sndbuf must * also be large enough that there is enough space - * for one reply per thread. + * for one reply per thread. We count all threads + * rather than threads in a particular pool, which + * provides an upper bound on the number of threads + * which will access the socket. 
*/ svc_sock_setbufsize(svsk->sk_sock, (serv->sv_nrthreads+3) * serv->sv_bufsz, @@ -631,9 +710,11 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) if (len <= rqstp->rq_arg.head[0].iov_len) { rqstp->rq_arg.head[0].iov_len = len; rqstp->rq_arg.page_len = 0; + rqstp->rq_respages = rqstp->rq_pages+1; } else { rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; - rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; + rqstp->rq_respages = rqstp->rq_pages + 1 + + (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; } if (serv->sv_stats) @@ -844,7 +925,7 @@ svc_tcp_accept(struct svc_sock *svsk) struct svc_sock, sk_list); set_bit(SK_CLOSE, &svsk->sk_flags); - svsk->sk_inuse ++; + atomic_inc(&svsk->sk_inuse); } spin_unlock_bh(&serv->sv_lock); @@ -874,7 +955,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) struct svc_sock *svsk = rqstp->rq_sock; struct svc_serv *serv = svsk->sk_server; int len; - struct kvec vec[RPCSVC_MAXPAGES]; + struct kvec *vec; int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", @@ -902,6 +983,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the * network isn't a bottleneck. + * + * We count all threads rather than threads in a + * particular pool, which provides an upper bound + * on the number of threads which will access the socket. + * * rcvbuf just needs to be able to hold a few requests. * Normally they will be removed from the queue * as soon a a complete request arrives. @@ -967,15 +1053,17 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) len = svsk->sk_reclen; set_bit(SK_DATA, &svsk->sk_flags); + vec = rqstp->rq_vec; vec[0] = rqstp->rq_arg.head[0]; vlen = PAGE_SIZE; pnum = 1; while (vlen < len) { - vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]); + vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]); vec[pnum].iov_len = PAGE_SIZE; pnum++; vlen += PAGE_SIZE; } + rqstp->rq_respages = &rqstp->rq_pages[pnum]; /* Now receive data */ len = svc_recvfrom(rqstp, vec, pnum, len); @@ -1117,13 +1205,17 @@ svc_sock_update_bufs(struct svc_serv *serv) } /* - * Receive the next request on any socket. + * Receive the next request on any socket. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. */ int -svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) +svc_recv(struct svc_rqst *rqstp, long timeout) { struct svc_sock *svsk =NULL; - int len; + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; + int len, i; int pages; struct xdr_buf *arg; DECLARE_WAITQUEUE(wait, current); @@ -1140,27 +1232,22 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) "svc_recv: service %p, wait queue active!\n", rqstp); - /* Initialize the buffers */ - /* first reclaim pages that were moved to response list */ - svc_pushback_allpages(rqstp); /* now allocate needed pages. 
If we get a failure, sleep briefly */ pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; - while (rqstp->rq_arghi < pages) { - struct page *p = alloc_page(GFP_KERNEL); - if (!p) { - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - continue; + for (i=0; i < pages ; i++) + while (rqstp->rq_pages[i] == NULL) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + rqstp->rq_pages[i] = p; } - rqstp->rq_argpages[rqstp->rq_arghi++] = p; - } /* Make arg->head point to first page and arg->pages point to rest */ arg = &rqstp->rq_arg; - arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]); + arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); arg->head[0].iov_len = PAGE_SIZE; - rqstp->rq_argused = 1; - arg->pages = rqstp->rq_argpages + 1; + arg->pages = rqstp->rq_pages + 1; arg->page_base = 0; /* save at least one page for response */ arg->page_len = (pages-2)*PAGE_SIZE; @@ -1172,32 +1259,15 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) if (signalled()) return -EINTR; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - svsk = list_entry(serv->sv_tempsocks.next, - struct svc_sock, sk_list); - /* apparently the "standard" is that clients close - * idle connections after 5 minutes, servers after - * 6 minutes - * http://www.connectathon.org/talks96/nfstcp.pdf - */ - if (get_seconds() - svsk->sk_lastrecv < 6*60 - || test_bit(SK_BUSY, &svsk->sk_flags)) - svsk = NULL; - } - if (svsk) { - set_bit(SK_BUSY, &svsk->sk_flags); - set_bit(SK_CLOSE, &svsk->sk_flags); - rqstp->rq_sock = svsk; - svsk->sk_inuse++; - } else if ((svsk = svc_sock_dequeue(serv)) != NULL) { + spin_lock_bh(&pool->sp_lock); + if ((svsk = svc_sock_dequeue(pool)) != NULL) { rqstp->rq_sock = svsk; - svsk->sk_inuse++; + atomic_inc(&svsk->sk_inuse); rqstp->rq_reserved = serv->sv_bufsz; - svsk->sk_reserved += rqstp->rq_reserved; + atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); } else { /* No data pending. Go to sleep */ - svc_serv_enqueue(serv, rqstp); + svc_thread_enqueue(pool, rqstp); /* * We have to be able to interrupt this wait @@ -1205,26 +1275,26 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) */ set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&rqstp->rq_wait, &wait); - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); schedule_timeout(timeout); try_to_freeze(); - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&pool->sp_lock); remove_wait_queue(&rqstp->rq_wait, &wait); if (!(svsk = rqstp->rq_sock)) { - svc_serv_dequeue(serv, rqstp); - spin_unlock_bh(&serv->sv_lock); + svc_thread_dequeue(pool, rqstp); + spin_unlock_bh(&pool->sp_lock); dprintk("svc: server %p, no data yet\n", rqstp); return signalled()? 
-EINTR : -EAGAIN; } } - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, socket %p, inuse=%d\n", - rqstp, svsk, svsk->sk_inuse); + dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", + rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); len = svsk->sk_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); @@ -1235,13 +1305,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) return -EAGAIN; } svsk->sk_lastrecv = get_seconds(); - if (test_bit(SK_TEMP, &svsk->sk_flags)) { - /* push active sockets to end of list */ - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&svsk->sk_list)) - list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); - spin_unlock_bh(&serv->sv_lock); - } + clear_bit(SK_OLD, &svsk->sk_flags); rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; rqstp->rq_chandle.defer = svc_defer; @@ -1301,6 +1365,58 @@ svc_send(struct svc_rqst *rqstp) } /* + * Timer function to close old temporary sockets, using + * a mark-and-sweep algorithm. + */ +static void +svc_age_temp_sockets(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_sock *svsk; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_sockets\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_sockets: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + svsk = list_entry(le, struct svc_sock, sk_list); + + if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) + continue; + if (atomic_read(&svsk->sk_inuse) || test_bit(SK_BUSY, &svsk->sk_flags)) + continue; + atomic_inc(&svsk->sk_inuse); + list_move(le, &to_be_aged); + set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(SK_DETACHED, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ + list_del_init(le); + svsk = list_entry(le, struct svc_sock, sk_list); + + dprintk("queuing svsk %p for closing, %lu seconds old\n", + svsk, get_seconds() - svsk->sk_lastrecv); + + /* a thread will dequeue and close it soon */ + svc_sock_enqueue(svsk); + svc_sock_put(svsk); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +/* * Initialize socket for RPC use and create svc_sock struct * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
*/ @@ -1337,7 +1453,9 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; svsk->sk_server = serv; + atomic_set(&svsk->sk_inuse, 0); svsk->sk_lastrecv = get_seconds(); + spin_lock_init(&svsk->sk_defer_lock); INIT_LIST_HEAD(&svsk->sk_deferred); INIT_LIST_HEAD(&svsk->sk_ready); mutex_init(&svsk->sk_mutex); @@ -1353,6 +1471,13 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, set_bit(SK_TEMP, &svsk->sk_flags); list_add(&svsk->sk_list, &serv->sv_tempsocks); serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp sockets */ + setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } } else { clear_bit(SK_TEMP, &svsk->sk_flags); list_add(&svsk->sk_list, &serv->sv_permsocks); @@ -1367,6 +1492,38 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, return svsk; } +int svc_addsock(struct svc_serv *serv, + int fd, + char *name_return, + int *proto) +{ + int err = 0; + struct socket *so = sockfd_lookup(fd, &err); + struct svc_sock *svsk = NULL; + + if (!so) + return err; + if (so->sk->sk_family != AF_INET) + err = -EAFNOSUPPORT; + else if (so->sk->sk_protocol != IPPROTO_TCP && + so->sk->sk_protocol != IPPROTO_UDP) + err = -EPROTONOSUPPORT; + else if (so->state > SS_UNCONNECTED) + err = -EISCONN; + else { + svsk = svc_setup_socket(serv, so, &err, 1); + if (svsk) + err = 0; + } + if (err) { + sockfd_put(so); + return err; + } + if (proto) *proto = so->sk->sk_protocol; + return one_sock_name(name_return, svsk); +} +EXPORT_SYMBOL_GPL(svc_addsock); + /* * Create socket for RPC service. */ @@ -1434,15 +1591,27 @@ svc_delete_socket(struct svc_sock *svsk) spin_lock_bh(&serv->sv_lock); - list_del_init(&svsk->sk_list); - list_del_init(&svsk->sk_ready); + if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) + list_del_init(&svsk->sk_list); + /* + * We used to delete the svc_sock from whichever list + * it's sk_ready node was on, but we don't actually + * need to. This is because the only time we're called + * while still attached to a queue, the queue itself + * is about to be destroyed (in svc_destroy). 
+ */ if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) if (test_bit(SK_TEMP, &svsk->sk_flags)) serv->sv_tmpcnt--; - if (!svsk->sk_inuse) { + if (!atomic_read(&svsk->sk_inuse)) { spin_unlock_bh(&serv->sv_lock); - sock_release(svsk->sk_sock); + if (svsk->sk_sock->file) + sockfd_put(svsk->sk_sock); + else + sock_release(svsk->sk_sock); + if (svsk->sk_info_authunix != NULL) + svcauth_unix_info_release(svsk->sk_info_authunix); kfree(svsk); } else { spin_unlock_bh(&serv->sv_lock); @@ -1473,7 +1642,6 @@ svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) static void svc_revisit(struct cache_deferred_req *dreq, int too_many) { struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); - struct svc_serv *serv = dreq->owner; struct svc_sock *svsk; if (too_many) { @@ -1484,9 +1652,9 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) dprintk("revisit queued\n"); svsk = dr->svsk; dr->svsk = NULL; - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&svsk->sk_defer_lock); list_add(&dr->handle.recent, &svsk->sk_deferred); - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&svsk->sk_defer_lock); set_bit(SK_DEFERRED, &svsk->sk_flags); svc_sock_enqueue(svsk); svc_sock_put(svsk); @@ -1518,10 +1686,8 @@ svc_defer(struct cache_req *req) dr->argslen = rqstp->rq_arg.len >> 2; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); } - spin_lock_bh(&rqstp->rq_server->sv_lock); - rqstp->rq_sock->sk_inuse++; + atomic_inc(&rqstp->rq_sock->sk_inuse); dr->svsk = rqstp->rq_sock; - spin_unlock_bh(&rqstp->rq_server->sv_lock); dr->handle.revisit = svc_revisit; return &dr->handle; @@ -1541,6 +1707,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_prot = dr->prot; rqstp->rq_addr = dr->addr; rqstp->rq_daddr = dr->daddr; + rqstp->rq_respages = rqstp->rq_pages; return dr->argslen<<2; } @@ -1548,11 +1715,10 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) { struct svc_deferred_req *dr = NULL; - struct svc_serv *serv = svsk->sk_server; if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) return NULL; - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&svsk->sk_defer_lock); clear_bit(SK_DEFERRED, &svsk->sk_flags); if (!list_empty(&svsk->sk_deferred)) { dr = list_entry(svsk->sk_deferred.next, @@ -1561,6 +1727,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) list_del_init(&dr->handle.recent); set_bit(SK_DEFERRED, &svsk->sk_flags); } - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&svsk->sk_defer_lock); return dr; } |
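
Editor's note, not part of the patch: the SK_OLD handling above (svc_age_temp_sockets() sets the bit on every pass, svc_recv() clears it whenever a request arrives, and a socket still marked on the next pass has been idle for a full aging period) is a classic mark-and-sweep reaper. A minimal, self-contained userspace sketch of the same idea follows; the struct conn, conn_used() and reap_pass() names are illustrative only and do not appear in the kernel code.

	/*
	 * Mark-and-sweep idle-connection reaper, standalone sketch.
	 *
	 * Pass N marks every live entry as "old"; any use of an entry
	 * clears the mark; pass N+1 reaps entries whose mark is still
	 * set and which are not currently busy.
	 */
	#include <stdio.h>
	#include <stdbool.h>

	struct conn {
		int	id;
		bool	live;	/* still open */
		bool	old;	/* set by the reaper, cleared on use */
		bool	busy;	/* currently being served, never reap */
	};

	/* Request path: using a connection un-marks it (cf. clear_bit(SK_OLD)). */
	static void conn_used(struct conn *c)
	{
		c->old = false;
	}

	/* Aging timer: mark everything, reap what was already marked. */
	static void reap_pass(struct conn *conns, int n)
	{
		for (int i = 0; i < n; i++) {
			struct conn *c = &conns[i];

			if (!c->live)
				continue;
			if (!c->old) {		/* first sighting: just mark it */
				c->old = true;
				continue;
			}
			if (c->busy)		/* marked but in use: leave it */
				continue;
			printf("reaping idle conn %d\n", c->id);
			c->live = false;
		}
	}

	int main(void)
	{
		struct conn conns[3] = {
			{ .id = 0, .live = true },
			{ .id = 1, .live = true },
			{ .id = 2, .live = true, .busy = true },
		};

		reap_pass(conns, 3);	/* pass 1: everything gets marked */
		conn_used(&conns[1]);	/* conn 1 sees traffic in between */
		reap_pass(conns, 3);	/* pass 2: only conn 0 is reaped */
		return 0;
	}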
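
A second illustrative sketch (again not part of the patch): the newly exported svc_addsock() takes an already-open TCP or UDP socket by file descriptor, validates it, and attaches it to an RPC service, returning either a negative errno or the length of the text that one_sock_name() wrote into the caller's buffer. A hypothetical in-kernel caller might look like the code below; my_add_listener, the 64-byte name buffer, and the header paths are assumptions for illustration.

	#include <linux/kernel.h>
	#include <linux/sunrpc/svc.h>
	#include <linux/sunrpc/svcsock.h>

	static int my_add_listener(struct svc_serv *serv, int fd)
	{
		char name[64];		/* receives a short textual description of the socket */
		int proto = 0;
		int err;

		err = svc_addsock(serv, fd, name, &proto);
		if (err < 0)
			return err;	/* -EAFNOSUPPORT, -EPROTONOSUPPORT, -EISCONN, or fd lookup error */

		/* On success, err is the length of the text in name[]
		 * and proto holds IPPROTO_TCP or IPPROTO_UDP. */
		return 0;
	}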